From 0adef07413bbc7c1af9c51fdacd416e38699df8c Mon Sep 17 00:00:00 2001 From: Molly He Date: Fri, 6 Mar 2026 14:35:14 -0800 Subject: [PATCH 01/11] Intermediary checkpoint --- .../src/sagemaker/core/resources.py | 22 +- sagemaker-train/pyproject.toml | 3 + .../src/sagemaker/train/__init__.py | 12 + .../sagemaker/train/common_utils/constants.py | 1 + .../train/common_utils/finetune_utils.py | 1 + .../train/common_utils/metrics_visualizer.py | 142 ++++ .../train/common_utils/mlflow_metrics_util.py | 4 +- .../train/common_utils/trainer_wait.py | 139 +++- ...dpo_trainer_example_notebook_v3_prod.ipynb | 6 +- ...uning_example_notebook_pysdk_prod_v3.ipynb | 509 ++++++++++++-- ...io-nova-training-job-sample-notebook.ipynb | 637 +++++++++++++++++- 11 files changed, 1379 insertions(+), 97 deletions(-) create mode 100644 sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py diff --git a/sagemaker-core/src/sagemaker/core/resources.py b/sagemaker-core/src/sagemaker/core/resources.py index 66b13e112a..61e0f9c677 100644 --- a/sagemaker-core/src/sagemaker/core/resources.py +++ b/sagemaker-core/src/sagemaker/core/resources.py @@ -35788,7 +35788,7 @@ def stop(self) -> None: ResourceNotFound: Resource being access is not found. """ - client = SageMakerClient().client + client = SageMakerClient().sagemaker_client operation_input_args = { "TrainingJobName": self.training_job_name, @@ -35833,15 +35833,17 @@ def wait( progress.add_task("Waiting for TrainingJob...") status = Status("Current status:") - instance_count = ( - sum( - instance_group.instance_count - for instance_group in self.resource_config.instance_groups - ) - if self.resource_config.instance_groups - and not isinstance(self.resource_config.instance_groups, Unassigned) - else self.resource_config.instance_count - ) + instance_count = 1 # Default + if not isinstance(self.resource_config, Unassigned): + if (hasattr(self.resource_config, 'instance_groups') and + self.resource_config.instance_groups and + not isinstance(self.resource_config.instance_groups, Unassigned)): + instance_count = sum( + instance_group.instance_count + for instance_group in self.resource_config.instance_groups + ) + elif hasattr(self.resource_config, 'instance_count'): + instance_count = self.resource_config.instance_count if logs: multi_stream_logger = MultiLogStreamHandler( diff --git a/sagemaker-train/pyproject.toml b/sagemaker-train/pyproject.toml index 9a51ef9aa2..64eace3585 100644 --- a/sagemaker-train/pyproject.toml +++ b/sagemaker-train/pyproject.toml @@ -43,6 +43,9 @@ dependencies = [ "sagemaker-mlflow>=0.0.1,<1.0.0", "mlflow>=3.0.0,<4.0.0", "nest_asyncio>=1.5.0", + "ipywidgets>=8.0.0", + "rich>=13.0.0", + "matplotlib>=3.5.0", ] [project.urls] diff --git a/sagemaker-train/src/sagemaker/train/__init__.py b/sagemaker-train/src/sagemaker/train/__init__.py index 74518dc65a..38a6fda76d 100644 --- a/sagemaker-train/src/sagemaker/train/__init__.py +++ b/sagemaker-train/src/sagemaker/train/__init__.py @@ -56,4 +56,16 @@ def __getattr__(name): elif name == "get_builtin_metrics": from sagemaker.train.evaluate import get_builtin_metrics return get_builtin_metrics + elif name == "plot_training_metrics": + from sagemaker.train.common_utils.metrics_visualizer import plot_training_metrics + return plot_training_metrics + elif name == "get_available_metrics": + from sagemaker.train.common_utils.metrics_visualizer import get_available_metrics + return get_available_metrics + elif name == "get_studio_url": + from sagemaker.train.common_utils.metrics_visualizer import get_studio_url + return get_studio_url + elif name == "get_mlflow_url": + from sagemaker.train.common_utils.trainer_wait import get_mlflow_url + return get_mlflow_url raise AttributeError(f"module '{__name__}' has no attribute '{name}'") diff --git a/sagemaker-train/src/sagemaker/train/common_utils/constants.py b/sagemaker-train/src/sagemaker/train/common_utils/constants.py index 8de3ab4638..b96c58134c 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/constants.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/constants.py @@ -20,6 +20,7 @@ class _MLflowConstants: # Metric names TOTAL_LOSS_METRIC = 'total_loss' + LOSS_METRIC_KEYWORDS = ('loss',) EPOCH_KEYWORD = 'epoch' # MLflow run tags diff --git a/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py b/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py index 3fd17c3ac0..c6e89e19c8 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py @@ -376,6 +376,7 @@ def _get_fine_tuning_options_and_model_arn(model_name: str, customization_techni except Exception as e: logger.error("Exception getting fine-tuning options: %s", e) + raise def _create_input_channels(dataset: str, content_type: Optional[str] = None, diff --git a/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py b/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py new file mode 100644 index 0000000000..0b617601e2 --- /dev/null +++ b/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py @@ -0,0 +1,142 @@ +"""MLflow metrics visualization utilities for SageMaker training jobs.""" + +import logging +from typing import Optional, List, Dict, Any +import boto3 +from sagemaker.core.resources import TrainingJob + +logger = logging.getLogger(__name__) + + +def get_studio_url(training_job: TrainingJob, domain_id: str = None) -> str: + """Get SageMaker Studio URL for training job logs. + + Args: + training_job: SageMaker TrainingJob object or job name string + domain_id: Studio domain ID (e.g., 'd-xxxxxxxxxxxx'). If not provided, attempts to auto-detect + + Returns: + Studio URL pointing to the training job details + + Example: + >>> from sagemaker.train import get_studio_url + >>> url = get_studio_url('my-training-job') + """ + if isinstance(training_job, str): + training_job = TrainingJob.get(training_job_name=training_job) + + region = training_job.region if hasattr(training_job, 'region') and training_job.region else 'us-east-1' + job_name = training_job.training_job_name + + sm_client = boto3.client('sagemaker', region_name=region) + + # Auto-detect domain if not provided + if not domain_id: + try: + domains = sm_client.list_domains()['Domains'] + if domains: + domain_id = domains[0]['DomainId'] + except Exception: + pass + + if not domain_id: + # Fallback to console URL + return f"https://{region}.console.aws.amazon.com/sagemaker/home?region={region}#/jobs/{job_name}" + + # Studio URL format: https://studio-{domain_id}.studio.{region}.sagemaker.aws/jobs/train/{job_name} + return f"https://studio-{domain_id}.studio.{region}.sagemaker.aws/jobs/train/{job_name}" + + +def plot_training_metrics( + training_job: TrainingJob, + metrics: Optional[List[str]] = None, + figsize: tuple = (12, 6) +) -> None: + """Plot training metrics from MLflow for a completed training job. + + Args: + training_job: SageMaker TrainingJob object or job name string + metrics: List of metric names to plot. If None, plots all available metrics. + figsize: Figure size as (width, height) + """ + import matplotlib.pyplot as plt + import mlflow + from mlflow.tracking import MlflowClient + from IPython.display import display + import logging + + logging.getLogger('botocore.credentials').setLevel(logging.WARNING) + + if isinstance(training_job, str): + training_job = TrainingJob.get(training_job_name=training_job) + + run_id = training_job.mlflow_details.mlflow_run_id + + mlflow.set_tracking_uri(training_job.mlflow_config.mlflow_resource_arn) + client = MlflowClient() + + run = mlflow.get_run(run_id) + available_metrics = list(run.data.metrics.keys()) + metrics_to_plot = metrics if metrics else available_metrics + + # Fetch metric histories + metric_data = {} + for metric_name in metrics_to_plot: + history = client.get_metric_history(run_id, metric_name) + if history: + metric_data[metric_name] = history + + # Plot + num_metrics = len(metric_data) + rows = (num_metrics + 1) // 2 + fig, axes = plt.subplots(rows, 2, figsize=(figsize[0], figsize[1] * rows)) + axes = axes.flatten() if num_metrics > 1 else [axes] + + for idx, (metric_name, history) in enumerate(metric_data.items()): + steps = [h.step for h in history] + values = [h.value for h in history] + axes[idx].plot(steps, values, linewidth=2, marker='o', markersize=4) + axes[idx].set_xlabel('Step') + axes[idx].set_ylabel('Value') + axes[idx].set_title(metric_name, fontweight='bold') + axes[idx].grid(True, alpha=0.3) + + for idx in range(len(metric_data), len(axes)): + axes[idx].set_visible(False) + + plt.suptitle(f'Training Metrics: {training_job.training_job_name}', fontweight='bold', fontsize=14) + plt.tight_layout(rect=[0, 0, 1, 0.98]) # Leave small space for suptitle + display(fig) + plt.close() + + +def get_available_metrics(training_job: TrainingJob) -> List[str]: + """Get list of available metrics for a training job. + + Args: + training_job: SageMaker TrainingJob object or job name string + + Returns: + List of metric names + """ + try: + import mlflow + except ImportError: + logger.error("mlflow package not installed") + return [] + + # Handle string input + if isinstance(training_job, str): + training_job = TrainingJob.get(training_job_name=training_job) + + if not hasattr(training_job, 'mlflow_config') or not training_job.mlflow_config: + return [] + + mlflow_details = training_job.mlflow_details + if not mlflow_details or not mlflow_details.mlflow_run_id: + return [] + + mlflow.set_tracking_uri(training_job.mlflow_config.mlflow_resource_arn) + run = mlflow.get_run(mlflow_details.mlflow_run_id) + + return list(run.data.metrics.keys()) diff --git a/sagemaker-train/src/sagemaker/train/common_utils/mlflow_metrics_util.py b/sagemaker-train/src/sagemaker/train/common_utils/mlflow_metrics_util.py index 43cc5e3847..8d15731977 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/mlflow_metrics_util.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/mlflow_metrics_util.py @@ -154,7 +154,7 @@ def _get_loss_metrics( loss_data = [] for metric_key in run.data.metrics: - if _MLflowConstants.TOTAL_LOSS_METRIC == metric_key.lower(): + if any(kw in metric_key.lower() for kw in _MLflowConstants.LOSS_METRIC_KEYWORDS): metric_history = client.get_metric_history(rid, metric_key) loss_data.append({ 'metric_name': metric_key, @@ -335,7 +335,7 @@ def _get_most_recent_total_loss( for rid, metrics in loss_metrics.items(): for metric in metrics: - if metric['metric_name'].lower() == _MLflowConstants.TOTAL_LOSS_METRIC: + if any(kw in metric['metric_name'].lower() for kw in _MLflowConstants.LOSS_METRIC_KEYWORDS): if metric['history']: # Get the most recent entry (last in history) return metric['history'][-1]['value'] diff --git a/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py b/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py index 900f13d4c6..002793fab1 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py @@ -40,6 +40,10 @@ def _setup_mlflow_integration(training_job: TrainingJob) -> Tuple[ try: import boto3 + # Check if mlflow_config exists and is assigned + if not hasattr(training_job, 'mlflow_config') or _is_unassigned_attribute(training_job.mlflow_config): + return None, None, None + sm_client = boto3.client('sagemaker') mlflow_arn = training_job.mlflow_config.mlflow_resource_arn @@ -56,7 +60,11 @@ def _setup_mlflow_integration(training_job: TrainingJob) -> Tuple[ return mlflow_url, metrics_util, mlflow_run_name - except Exception: + except Exception as e: + # Log the exception for debugging + import logging + logger = logging.getLogger(__name__) + logger.debug(f"MLflow integration setup failed: {e}") return None, None, None @@ -154,6 +162,59 @@ def _calculate_transition_duration(trans) -> Tuple[str, str]: return duration, check +def get_mlflow_url(training_job) -> str: + """Get presigned MLflow URL for training job experiment. + + Args: + training_job: SageMaker TrainingJob object or job name string + + Returns: + Presigned MLflow URL to experiment (valid for 5 minutes) + + Example: + >>> from sagemaker.train import get_mlflow_url + >>> url = get_mlflow_url('my-training-job') + >>> print(url) + """ + if isinstance(training_job, str): + training_job = TrainingJob.get(training_job_name=training_job) + + if not hasattr(training_job, 'mlflow_config') or _is_unassigned_attribute(training_job.mlflow_config): + raise ValueError("Training job does not have MLflow configured") + + import boto3 + import os + from mlflow.tracking import MlflowClient + import mlflow + + mlflow_arn = training_job.mlflow_config.mlflow_resource_arn + exp_name = training_job.mlflow_config.mlflow_experiment_name + + # Get presigned base URL + sm_client = boto3.client('sagemaker') + response = sm_client.create_presigned_mlflow_app_url(Arn=mlflow_arn) + base_url = response.get('AuthorizedUrl') + + # Try to get experiment ID and append to URL + try: + os.environ['MLFLOW_TRACKING_URI'] = mlflow_arn + mlflow.set_tracking_uri(mlflow_arn) + + mlflow_client = MlflowClient(tracking_uri=mlflow_arn) + experiment = mlflow_client.get_experiment_by_name(exp_name) + + if experiment: + # Format: base_url#/experiments/{id} + # The base_url already has /auth?authToken=... + return f"{base_url}#/experiments/{experiment.experiment_id}" + except Exception: + pass + + return base_url + + + + def wait( training_job: TrainingJob, poll: int = 5, @@ -188,28 +249,84 @@ def wait( from rich.console import Group with _suppress_info_logging(): console = Console(force_jupyter=True) + + # MLflow link caching + mlflow_link_cache = {'url': None, 'timestamp': 0, 'error': None} + has_mlflow_config = (hasattr(training_job, 'mlflow_config') and + not _is_unassigned_attribute(training_job.mlflow_config)) + + def get_cached_mlflow_url(): + """Get cached MLflow URL or generate new one if expired.""" + current_time = time.time() + # Regenerate every 4 minutes (before 5-minute expiration) + if mlflow_link_cache['url'] is None or (current_time - mlflow_link_cache['timestamp']) > 240: + try: + mlflow_link_cache['url'] = get_mlflow_url(training_job) + mlflow_link_cache['error'] = None + except Exception as e: + mlflow_link_cache['error'] = str(e) + mlflow_link_cache['timestamp'] = current_time + return mlflow_link_cache['url'] + + # Track last rendered state to avoid unnecessary refreshes + last_status = None + last_secondary_status = None iteration = 0 while True: iteration += 1 - time.sleep(1) - if iteration == poll: + time.sleep(0.5) + if iteration >= poll * 2: training_job.refresh() iteration = 0 - clear_output(wait=True) - + status = training_job.training_job_status secondary_status = training_job.secondary_status elapsed = time.time() - start_time + + # Only re-render if status changed or every 2 seconds (for elapsed time) + should_render = ( + status != last_status or + secondary_status != last_secondary_status or + iteration % 4 == 0 # Every 2 seconds (4 * 0.5s) + ) + + if not should_render: + continue + + last_status = status + last_secondary_status = secondary_status + + clear_output(wait=True) - # Header section with training job name and MLFlow URL + # Header section with training job name header_table = Table(show_header=False, box=None, padding=(0, 1)) header_table.add_column("Property", style="cyan bold", width=20) - header_table.add_column("Value", style="white") - header_table.add_row("TrainingJob Name", f"[bold green]{training_job.training_job_name}[/bold green]") - if mlflow_url: - header_table.add_row("MLFlow URL", - f"[link={mlflow_url}][bold bright_blue underline]{mlflow_run_name}(link valid for 5 mins)[/bright_blue bold underline][/link]") + header_table.add_column("Value", style="white", overflow="fold") + + # Add Studio job link + try: + from sagemaker.train.common_utils.metrics_visualizer import get_studio_url + studio_url = get_studio_url(training_job) + header_table.add_row("TrainingJob Name", f"[link={studio_url}]🔗 {training_job.training_job_name}[/link]") + except Exception: + header_table.add_row("TrainingJob Name", f"[bold green]{training_job.training_job_name}[/bold green]") + + header_table.add_row("TrainingJob ARN", f"[dim]{training_job.training_job_arn}[/dim]") + + # Add MLflow link to header if available + if has_mlflow_config: + cached_url = get_cached_mlflow_url() + if cached_url: + exp_name = training_job.mlflow_config.mlflow_experiment_name if hasattr(training_job, 'mlflow_config') else None + if exp_name and not _is_unassigned_attribute(exp_name): + link_text = exp_name + else: + link_text = "MLflow Experiment" + + header_table.add_row("MLflow Experiment", f"[link={cached_url}]🔗 {link_text}[/link]") + elif mlflow_link_cache['error']: + header_table.add_row("MLflow Experiment", f"[red]{mlflow_link_cache['error']}[/red]") status_table = Table(show_header=False, box=None, padding=(0, 1)) status_table.add_column("Property", style="cyan bold", width=20) diff --git a/v3-examples/model-customization-examples/dpo_trainer_example_notebook_v3_prod.ipynb b/v3-examples/model-customization-examples/dpo_trainer_example_notebook_v3_prod.ipynb index e5fbe4cd99..130991834a 100644 --- a/v3-examples/model-customization-examples/dpo_trainer_example_notebook_v3_prod.ipynb +++ b/v3-examples/model-customization-examples/dpo_trainer_example_notebook_v3_prod.ipynb @@ -58,8 +58,8 @@ "# This creates a versioned dataset that can be referenced by ARN\n", "# Provide a source (it can be local file path or S3 URL)\n", "dataset = DataSet.create(\n", - " name=\"demo-6\",\n", - " source=\"s3://nova-mlflow-us-west-2/dataset/preference_dataset_train_256.jsonl\"\n", + " name=\"demo-2\",\n", + " source=\"s3://sagemaker-us-west-2-529088288990/dataset/preference_dataset_train_256.jsonl\"\n", ")\n", "\n", "print(f\"Dataset ARN: {dataset.arn}\")" @@ -292,7 +292,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.10" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/v3-examples/model-customization-examples/sft_finetuning_example_notebook_pysdk_prod_v3.ipynb b/v3-examples/model-customization-examples/sft_finetuning_example_notebook_pysdk_prod_v3.ipynb index 946debc7d7..887f60a3d9 100644 --- a/v3-examples/model-customization-examples/sft_finetuning_example_notebook_pysdk_prod_v3.ipynb +++ b/v3-examples/model-customization-examples/sft_finetuning_example_notebook_pysdk_prod_v3.ipynb @@ -35,10 +35,30 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "a51be0b5-fd33-4fa0-af2b-d08ce0dc7a8e", + "execution_count": 1, + "id": "989646bf", "metadata": {}, "outputs": [], + "source": [ + "import logging\n", + "logging.basicConfig(level=logging.WARNING)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a51be0b5-fd33-4fa0-af2b-d08ce0dc7a8e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml\n", + "sagemaker.config INFO - Not applying SDK defaults from location: /Users/mollyhe/Library/Application Support/sagemaker/config.yaml\n" + ] + } + ], "source": [ "from sagemaker.train.sft_trainer import SFTTrainer\n", "from sagemaker.train.common import TrainingType\n", @@ -53,7 +73,7 @@ "\n", "\n", "# For MLFlow native metrics in Trainer wait, run below line with approriate region\n", - "os.environ[\"SAGEMAKER_MLFLOW_CUSTOM_ENDPOINT\"] = \"https://mlflow.sagemaker.us-west-2.app.aws\"\n", + "os.environ[\"SAGEMAKER_MLFLOW_CUSTOM_ENDPOINT\"] = \"https://mlflow.sagemaker.us-east-1.app.aws\"\n", "\n" ] }, @@ -76,10 +96,55 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "ef4f0e61-de4d-4228-b7a1-ea7497dad547", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b7310bb3f485478f905417b421e7ae11", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Final Resource Status: Available\n",
+       "
\n" + ], + "text/plain": [ + "Final Resource Status: Available\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset ARN: arn:aws:sagemaker:us-east-1:529088288990:hub-content/G82247CBEQ6TN0FI3J1SAJU59UURMT3H3EKG48C6VQGDINNRNGU0/DataSet/demo-1/28.0.0\n"
+     ]
+    }
+   ],
    "source": [
     "from sagemaker.ai_registry.dataset import DataSet\n",
     "from sagemaker.ai_registry.dataset_utils import CustomizationTechnique\n",
@@ -91,7 +156,7 @@
     "# Provide a source (it can be local file path or S3 URL)\n",
     "dataset = DataSet.create(\n",
     "    name=\"demo-1\",\n",
-    "    source=\"s3://mc-flows-sdk-testing/input_data/sft/sample_data_256_final.jsonl\"\n",
+    "    source=\"s3://sagemaker-us-east-1-529088288990/nova1_SFT.jsonl\"\n",
     ")\n",
     "\n",
     "print(f\"Dataset ARN: {dataset.arn}\")"
@@ -107,14 +172,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "d6937550-f721-43ff-82dd-c513c328dd17",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:sagemaker.core.utils.utils:No region provided. Using default region.\n"
+     ]
+    }
+   ],
    "source": [
     "from sagemaker.core.resources import ModelPackage, ModelPackageGroup\n",
     "\n",
-    "model_package_group=ModelPackageGroup.create(model_package_group_name=\"test-model-package-group\")"
+    "# model_package_group=ModelPackageGroup.create(model_package_group_name=\"model-package-group-nova13\")\n",
+    "model_package_group=ModelPackageGroup.get(model_package_group_name=\"model-package-group-nova13\")"
    ]
   },
   {
@@ -150,20 +224,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "id": "88fe8360-de50-481d-932f-564a32be66a0",
    "metadata": {},
    "outputs": [],
    "source": [
     "# For fine-tuning \n",
     "sft_trainer = SFTTrainer(\n",
-    "    model=\"meta-textgeneration-llama-3-2-1b-instruct\", \n",
-    "    training_type=TrainingType.LORA, \n",
+    "    model=\"nova-textgeneration-lite\", \n",
+    "    training_type=TrainingType.FULL, \n",
     "    model_package_group=model_package_group, # or use an existing model package group arn\n",
     "    mlflow_experiment_name=\"test-finetuned-models-exp\", \n",
     "    mlflow_run_name=\"test-finetuned-models-run\", \n",
     "    training_dataset=dataset.arn, \n",
-    "    s3_output_path=\"s3://mc-flows-sdk-testing/output/\",\n",
+    "    s3_output_path=\"s3://sagemaker-us-east-1-529088288990/output/\",\n",
     "    accept_eula=True\n",
     ")\n"
    ]
@@ -201,7 +275,7 @@
    "outputs": [],
    "source": [
     "# To update any hyperparameter, simply assign the value, example:\n",
-    "sft_trainer.hyperparameters.global_batch_size=16"
+    "sft_trainer.hyperparameters.global_batch_size=32"
    ]
   },
   {
@@ -214,7 +288,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "id": "4d3b6441-9abb-447b-9307-9606a8c0fabd",
    "metadata": {
     "jupyter": {
@@ -222,28 +296,327 @@
     },
     "scrolled": true
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
╭────────────────────────────────── Training Job Status ───────────────────────────────────╮\n",
+       "  TrainingJob Name      🔗 nova-textgeneration-lite-sft-20260305152839                    \n",
+       "  TrainingJob ARN       arn:aws:sagemaker:us-east-1:529088288990:training-job/nova-textg  \n",
+       "                        eneration-lite-sft-20260305152839                                 \n",
+       "  MLflow Experiment     🔗 test-finetuned-models-exp                                      \n",
+       "                                                                                          \n",
+       "  Job Status            InProgress                                                        \n",
+       "  Secondary Status      Pending                                                           \n",
+       "  Elapsed Time          10.3s                                                             \n",
+       "                                                                                          \n",
+       " Status Transitions                                                                       \n",
+       "                                                                                          \n",
+       "        Step              Details                               Duration                  \n",
+       "  ───────────────────────────────────────────────────────────────────────────             \n",
+       "    Starting          Starting the training job             1.8s                      \n",
+       "    Pending           Training job waiting for capacity     Running...                \n",
+       "                                                                                          \n",
+       "╰──────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;5;172m╭─\u001b[0m\u001b[38;5;172m─────────────────────────────────\u001b[0m\u001b[38;5;172m \u001b[0m\u001b[1;94mTraining Job Status\u001b[0m\u001b[38;5;172m \u001b[0m\u001b[38;5;172m──────────────────────────────────\u001b[0m\u001b[38;5;172m─╮\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mTrainingJob Name \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b]8;id=315550;https://studio-d-ingyyaeglvki.studio.us-east-1.sagemaker.aws/jobs/train/nova-textgeneration-lite-sft-20260305152839\u001b\\\u001b[37m🔗 nova-textgeneration-lite-sft-20260305152839\u001b[0m\u001b]8;;\u001b\\\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mTrainingJob ARN \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[2;37marn:aws:sagemaker:us-east-1:529088288990:training-job/nova-textg\u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[2;37meneration-lite-sft-20260305152839\u001b[0m\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mMLflow Experiment \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b]8;id=794600;https://app-2SV6Q35HTCBO.mlflow.sagemaker.us-east-1.app.aws/auth?authToken=eyJhbGciOiJIUzI1NiJ9.eyJhdXRoVG9rZW5JZCI6IkVCNVFXNCIsImZhc0NyZWRlbnRpYWxzIjoiQWdWNHdycUhQdW9VQVJiTzR4cXhXNHRWeHlVcmpuNjRXaisyUXRWSFRka2s0c3NBWHdBQkFCVmhkM010WTNKNWNIUnZMWEIxWW14cFl5MXJaWGtBUkVGeEswRm9NelpIVDNWVWQyWjVVakl4U1hsT1kwTlNkSEZWYlZrNFVHdFNVMkZ3TjI0ck1sWkRjSFZpZUZjcmFXRm5kMk5OTkROSVNGTnhNMWRVTDBsWVVUMDlBQUVBQjJGM2N5MXJiWE1BUzJGeWJqcGhkM002YTIxek9uVnpMV1ZoYzNRdE1Ub3pNVFF4TkRZek1EWTBPREk2YTJWNUx6Y3dOMkpoTmpjeExUUXpZamd0TkRFeU5DMWhaVFUzTFRrMFlqTXdZbUptT1RJNU13QzRBUUlCQUhnQjRVMDBTK3ErVE51d1gydlFlaGtxQnVneWQ3YnNrb0pWdWQ2NmZjVENVd0g5eFBDUDFZQy9RcHg3eG5oS1ZjcWZBQUFBZmpCOEJna3Foa2lHOXcwQkJ3YWdiekJ0QWdFQU1HZ0dDU3FHU0liM0RRRUhBVEFlQmdsZ2hrZ0JaUU1FQVM0d0VRUU1odjk4eG9YSE52QjlWOHBlQWdFUWdEc0Z6Q2dETXN1Njl4dms3eVY4bVFFVE41a2Vmc0pXRnhLcE9HL1BiVlgxMFNZM1BlT3VzL29QSm1jSWVGV3FZQ3dpWlhjU1hjSmgxVnBiY3dJQUFCQUFkODB0d04vbERjMWRLdnMxY01UWll4eExGU0xMUkw5RW4zVk9WR2U0L1R4cWcvS1BqUlY3N0NJR0NseFBUZzFGLy8vLy93QUFBQUVBQUFBQUFBQUFBQUFBQUFFQUFBUTNhOXAwZkRLaXgrSE1sOEtKRXdYNkMyVkdXMlk0ZERURnJtTmZOekkyai8rTVIrei9rMGhPUTRFczYwQ2VqS0pRU0ZkY0p4TUNiK2ZzZllYVzl6amw1Q1RLNGx6K3djcCtDTnhoNjFNUXU2Yk9taWVvQm1LYnVsMGhuSWllaEhZQmV4cjFPU2JXaFF3L1k0Ui82emVlZFNadFNRalpON2d1K0ZFWnhweTBjTXhsYnBYVE5PVXdZQW9Ec1RGejVJSzdoeG1HRk9CTEdnYUtBUmM1WkFlVUxSWFB4L2hJQklQUXhaTG1Gejd2cUxqa3pLVjc3aG94Q3VKdjF3R1hEUm01NnJvZ0xibjlZR3FZVHpBMlVoVXVDa05wVVZyb09aZThKL2JpMnlXbHJ1NU5pcDFnWkJzWjVhSlQ0a055QTNFZS8vak5tT1dZbCtRbjBnVCtvR2dOM2VLMFhvcTlaQUJ4bngrNmQrSFlhckVjRXU3S3AyTFNwVTVtUEQ5TndORHJKcC9SZ0FjQ3lDV2lYVlVPbGhPL1l5bEVld0h0c2NDYkc2N2JvRTFiaS9aeks4L0czL2g2b0dyTWlGWUFHcmo3NS9BZUVrMTNuNWtuako5S3BYNnVYd0Ywdm5wNVA3ZzJ4ZDR5UVA1WEpQOExpM0NldStwRTBwV1VvbHcxcklXOXViemRKOVN2UnVNNmJzWDE3ZHBBcGxvV2hzaGZMWnhjMlU0ODZ2SlY4SytuZEpwT055Q0x5ZGRVdWpHbndZQ3dzOEdVUDVsL3o1NzUzaWpSaG4vT1cvK0owT0xMU3g2ekV3d0tIWm1jVnRQOXFFRUViNmN1WGtRUGMyWVNuTlRFb01XUm1OQ0JZME1hNnZWS3dnSXJnTVFvNVM5Nkw2TUhYVjRrWFdhOFJ0V3o3ak1zNWh3NXpPSFo5S2Q1MVp0TEdFbU1iQzVMcnpuSnRnaU1mRkJ3dk9Rak5WOGpUUHhOUFVMVUNpWWxQZ2k2TS9GbTYybE1VVXpIK3Ewd1J2VjNZOHhiUElHZVRMMUwwWEt4VDNOeXRORElaN2dZejhkTDhBeWFPVzlKRkttSGdtMm9meUNxSGZuZktvdytwZjM1TVpveUpsUUJ2MnU5RHVpNFNnT1lMckZlblFreHVYNVVVQ0dmY3lRMXNkSERRSjZyaUl1cGlTazVITHU5YllpbEdpUVI5VWZkR08wYnltSnFCcGNUYUR3Y2ZVWkVLR3dmdU9KUWxweG5XME0wT1pnTnRjbm9VVEY1SHpScTZFcEJsaXBub1ZEZjk2M2dWUWowRUdpZzNUWGx6cHBzWXl5d04zcTlSMDlUbENUUzNnNkJTOWo4UnZ6eStvTmpyMTRKTisvdExKejRCUThOVzFYMkt3L29FWFZybzZTTHVQbGp3aE1aWFVDbjU5bXdER2x3Y1AwRUVlQndhcUw5bmg1aXYvdmVmbnpBaWlUSmhOditIaGU2SzZqRzY2QkhLZ3NLc0ZRSFJCU1JCQTQweTNnU2U1SmhtbDYwQTFGM3VYLzdnYXRWdFFhbTRwM0lIT2ZCdVc5eWNiRmpPVUZGVVc2dU53eXpaZVhiYWp5QWpwKzg2TWVDODViTEpMRmZ2NHlVa2dZU0lNSkdrVEZwanNSMGMrWkFHcUZnRjJqSzNEWlFzRHBWSGtYTEtQeWZUTldOTk1UV3pGL1J0bjdKM1VwWHpoMEE1ZWdzV3lXWU1lMllOU2lPSytJWmJlS0U1ZTd3NjhuaXBsU0k4ZTFDZks2aE4wNDNaeldCbkl1U0txMFNEeXB2cVAwZ3MzTjRBdldaYmVHdENJOWVBT2I3akYzQUFRVDYyTUM3VnhHT285Qmo1aVo2ckJYVUpqQU1VOGF5cXFiQTdEdkZFVnpxWGJhbHJjUjVpNjR3Y0o3ZHB1aVBseDFEZG41NnlwWkFuOHgrRXpETEFHY3daUUl4QU1lczlNb3RJOEFnSGFTRm90dWM3MDZaL0JNOE1pd2VUUjQ3US9yb3hIWkMraG4zRzFDaVlGQ2xrakRqTGpSeTlBSXdFbnVNWitqcXFnRkRvTzcyMnJmWjBTQklTb1AyKzFQTCtUdTExL0pvZU1hUUxWZUdPd1hzNzhOd3JseE45TGI1IiwiY2lwaGVyVGV4dCI6IkFRSUJBSGdCNFUwMFMrcStUTnV3WDJ2UWVoa3FCdWd5ZDdic2tvSlZ1ZDY2ZmNUQ1V3R0xCblpkUmZWWlpqN3hTS3hiMFBUUUFBQUFvakNCbndZSktvWklodmNOQVFjR29JR1JNSUdPQWdFQU1JR0lCZ2txaGtpRzl3MEJCd0V3SGdZSllJWklBV1VEQkFFdU1CRUVETmtCbnRsUzdSbGhkSjJkN2dJQkVJQmJBN0xFYitRTVZyQnR4VmIzOVhkc1RudXNBdkhQYzBNanNidmpjaDlLVHhVc0xKcFo3VlZYVlJvVDFnS0lCd2R0aHFRKzFqcWFkUHFTeHNjQmE4K2tJS1NON05RNms2SEdtcXlGWVpDc3lPWlZZQ01KSXpUSHdSZjBDZz09Iiwic3ViIjoiYXJuOmF3czpzYWdlbWFrZXI6dXMtZWFzdC0xOjUyOTA4ODI4ODk5MDptbGZsb3ctYXBwL2FwcC0yU1Y2UTM1SFRDQk8iLCJpYXQiOjE3NzI3NTMzMjUsImV4cCI6MTc3Mjc1MzYyNX0.X2ERSgvF5-bTlp7a-wWjG7vQiNNi9RJ-dreP8RUI4n8#/experiments/2\u001b\\\u001b[37m🔗 test-finetuned-models-exp\u001b[0m\u001b]8;;\u001b\\\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mJob Status \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;38;5;172mInProgress\u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mSecondary Status \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;33mPending\u001b[0m\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mElapsed Time \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;91m10.3s\u001b[0m\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;35mStatus Transitions\u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mStep \u001b[0m\u001b[1;35m \u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mDetails \u001b[0m\u001b[1;35m \u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mDuration \u001b[0m\u001b[1;35m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m ─────────────────────────────────────────────────────────────────────────── \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m\u001b[32m✓ \u001b[0m\u001b[32m \u001b[0m \u001b[36m \u001b[0m\u001b[36mStarting \u001b[0m\u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mStarting the training job \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m\u001b[32m1.8s \u001b[0m\u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m\u001b[32m⋯ \u001b[0m\u001b[32m \u001b[0m \u001b[36m \u001b[0m\u001b[36mPending \u001b[0m\u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mTraining job waiting for capacity \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m\u001b[32mRunning... \u001b[0m\u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m╰──────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+       " in <module>:1                                                                                    \n",
+       "                                                                                                  \n",
+       " 1 training_job = sft_trainer.train(                                                            \n",
+       "   2 wait=True                                                                                \n",
+       "   3 )                                                                                            \n",
+       "   4                                                                                              \n",
+       "                                                                                                  \n",
+       " /Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-core/src/sagemaker/core/ \n",
+       " telemetry/telemetry_logging.py:180 in wrapper                                                    \n",
+       "                                                                                                  \n",
+       "   177 │   │   │   │   │   \"sagemaker_session is not provided or not valid.\",                     \n",
+       "   178 │   │   │   │   │   func_name,                                                             \n",
+       "   179 │   │   │   │   )                                                                          \n",
+       " 180 │   │   │   │   return func(*args, **kwargs)                                               \n",
+       "   181 │   │                                                                                      \n",
+       "   182 │   │   return wrapper                                                                     \n",
+       "   183                                                                                            \n",
+       "                                                                                                  \n",
+       " /Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-train/src/sagemaker/trai \n",
+       " n/sft_trainer.py:280 in train                                                                    \n",
+       "                                                                                                  \n",
+       "   277 │   │   │   from sagemaker.train.common_utils.trainer_wait import wait as _wait            \n",
+       "   278 │   │   │   from sagemaker.core.utils.exceptions import TimeoutExceededError               \n",
+       "   279 │   │   │   try :                                                                          \n",
+       " 280 │   │   │   │   _wait(training_job)                                                        \n",
+       "   281 │   │   │   except TimeoutExceededError as e:                                              \n",
+       "   282 │   │   │   │   logger.error(\"Error: %s\", e)                                               \n",
+       "   283                                                                                            \n",
+       "                                                                                                  \n",
+       " /Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-train/src/sagemaker/trai \n",
+       " n/common_utils/trainer_wait.py:272 in wait                                                       \n",
+       "                                                                                                  \n",
+       "   269 │   │   │   │   iteration = 0                                                              \n",
+       "   270 │   │   │   │   while True:                                                                \n",
+       "   271 │   │   │   │   │   iteration += 1                                                         \n",
+       " 272 │   │   │   │   │   time.sleep(0.5)                                                        \n",
+       "   273 │   │   │   │   │   if iteration >= poll * 2:                                              \n",
+       "   274 │   │   │   │   │   │   training_job.refresh()                                             \n",
+       "   275 │   │   │   │   │   │   iteration = 0                                                      \n",
+       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "KeyboardInterrupt\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;255;0;0m╭─\u001b[0m\u001b[38;2;255;0;0m──────────────────────────────\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[1;38;2;255;0;0mTraceback \u001b[0m\u001b[1;2;38;2;255;0;0m(most recent call last)\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[38;2;255;0;0m───────────────────────────────\u001b[0m\u001b[38;2;255;0;0m─╮\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m in \u001b[92m\u001b[0m:\u001b[94m1\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m1 training_job = sft_trainer.train( \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m2 \u001b[0m\u001b[2m│ \u001b[0mwait=\u001b[94mTrue\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m3 \u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m4 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-core/src/sagemaker/core/\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mtelemetry/\u001b[0m\u001b[1;33mtelemetry_logging.py\u001b[0m:\u001b[94m180\u001b[0m in \u001b[92mwrapper\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m177 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[33m\"\u001b[0m\u001b[33msagemaker_session is not provided or not valid.\u001b[0m\u001b[33m\"\u001b[0m, \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m178 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mfunc_name, \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m179 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m180 \u001b[2m│ │ │ │ \u001b[0m\u001b[94mreturn\u001b[0m func(*args, **kwargs) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m181 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m182 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m wrapper \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m183 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-train/src/sagemaker/trai\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mn/\u001b[0m\u001b[1;33msft_trainer.py\u001b[0m:\u001b[94m280\u001b[0m in \u001b[92mtrain\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m277 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mfrom\u001b[0m\u001b[90m \u001b[0m\u001b[4;96msagemaker\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mtrain\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mcommon_utils\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mtrainer_wait\u001b[0m\u001b[90m \u001b[0m\u001b[94mimport\u001b[0m wait \u001b[94mas\u001b[0m _wait \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m278 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mfrom\u001b[0m\u001b[90m \u001b[0m\u001b[4;96msagemaker\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mcore\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mutils\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mexceptions\u001b[0m\u001b[90m \u001b[0m\u001b[94mimport\u001b[0m TimeoutExceededError \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m279 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mtry\u001b[0m : \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m280 \u001b[2m│ │ │ │ \u001b[0m_wait(training_job) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m281 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mexcept\u001b[0m TimeoutExceededError \u001b[94mas\u001b[0m e: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m282 \u001b[0m\u001b[2m│ │ │ │ \u001b[0mlogger.error(\u001b[33m\"\u001b[0m\u001b[33mError: \u001b[0m\u001b[33m%s\u001b[0m\u001b[33m\"\u001b[0m, e) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m283 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-train/src/sagemaker/trai\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mn/common_utils/\u001b[0m\u001b[1;33mtrainer_wait.py\u001b[0m:\u001b[94m272\u001b[0m in \u001b[92mwait\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m269 \u001b[0m\u001b[2m│ │ │ │ \u001b[0miteration = \u001b[94m0\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m270 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mwhile\u001b[0m \u001b[94mTrue\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m271 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0miteration += \u001b[94m1\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m272 \u001b[2m│ │ │ │ │ \u001b[0mtime.sleep(\u001b[94m0.5\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m273 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m iteration >= poll * \u001b[94m2\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m274 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0mtraining_job.refresh() \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m275 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0miteration = \u001b[94m0\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", + "\u001b[1;91mKeyboardInterrupt\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "training_job = sft_trainer.train(\n", - " wait=True,\n", + " wait=True\n", ")" ] }, + { + "cell_type": "code", + "execution_count": 1, + "id": "64d68d6f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[03/05/26 15:52:42] WARNING  No region provided. Using default region.                                 utils.py:356\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[03/05/26 15:52:42]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;215;175;0mWARNING \u001b[0m No region provided. Using default region. \u001b]8;id=398978;file:///Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-core/src/sagemaker/core/utils/utils.py\u001b\\\u001b[2mutils.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=927295;file:///Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-core/src/sagemaker/core/utils/utils.py#356\u001b\\\u001b[2m356\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    INFO     Runs on sagemaker prod, region:us-east-1                                  utils.py:370\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Runs on sagemaker prod, region:us-east-\u001b[1;36m1\u001b[0m \u001b]8;id=459862;file:///Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-core/src/sagemaker/core/utils/utils.py\u001b\\\u001b[2mutils.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=910567;file:///Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-core/src/sagemaker/core/utils/utils.py#370\u001b\\\u001b[2m370\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    INFO     Found credentials in shared credentials file: ~/.aws/credentials   credentials.py:1392\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials in shared credentials file: ~\u001b[38;2;225;0;225m/.aws/\u001b[0m\u001b[38;2;225;0;225mcredentials\u001b[0m \u001b]8;id=500109;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=207687;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py#1392\u001b\\\u001b[2m1392\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    INFO     Found credentials in shared credentials file: ~/.aws/credentials   credentials.py:1392\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials in shared credentials file: ~\u001b[38;2;225;0;225m/.aws/\u001b[0m\u001b[38;2;225;0;225mcredentials\u001b[0m \u001b]8;id=615220;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=20168;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py#1392\u001b\\\u001b[2m1392\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[03/05/26 15:52:46] INFO     Found credentials in shared credentials file: ~/.aws/credentials   credentials.py:1392\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[03/05/26 15:52:46]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials in shared credentials file: ~\u001b[38;2;225;0;225m/.aws/\u001b[0m\u001b[38;2;225;0;225mcredentials\u001b[0m \u001b]8;id=639090;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=958918;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py#1392\u001b\\\u001b[2m1392\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "https://app-2SV6Q35HTCBO.mlflow.sagemaker.us-east-1.app.aws/auth?authToken=eyJhbGciOiJIUzI1NiJ9.eyJhdXRoVG9rZW5JZCI6Ilg3NzVSUyIsImZhc0NyZWRlbnRpYWxzIjoiQWdWNFRoSVRsSm9mYWNBZGpHWWtHSms2dnBjbDAwMFNkQmJSRTR6ek5PbmhuTzRBWHdBQkFCVmhkM010WTNKNWNIUnZMWEIxWW14cFl5MXJaWGtBUkVGMlVGZEJSV3RrUzFKMGQzQm1jVWRNUkhrdllYTkNiblJIWkV4YWFtTXhZakZPVEdaaGMwVjNlREJ4WlRGdlN6Z3JWME5GU0RNMk0yTk9UMFZXVkVneFVUMDlBQUVBQjJGM2N5MXJiWE1BUzJGeWJqcGhkM002YTIxek9uVnpMV1ZoYzNRdE1Ub3pNVFF4TkRZek1EWTBPREk2YTJWNUx6Y3dOMkpoTmpjeExUUXpZamd0TkRFeU5DMWhaVFUzTFRrMFlqTXdZbUptT1RJNU13QzRBUUlCQUhnQjRVMDBTK3ErVE51d1gydlFlaGtxQnVneWQ3YnNrb0pWdWQ2NmZjVENVd0Uxb2ZRYnB0UUNKRmgrK3lDR1ZTOW9BQUFBZmpCOEJna3Foa2lHOXcwQkJ3YWdiekJ0QWdFQU1HZ0dDU3FHU0liM0RRRUhBVEFlQmdsZ2hrZ0JaUU1FQVM0d0VRUU1aTTlCUVV2NFluOEZiUXVyQWdFUWdEc1FCbTNoMFN1L3IyL2hWYktoY0phUzRvZ2dCdHMzelJ4T0lXenhUSnNjaVFTQk9JVVN2RHltb29aQWZTRU9pV0RDUnZDeFZnZWFCNGVvWGdJQUFCQUE2ZEVyc1IvWUFRUDYzSVZmY25qTE53SGhmalZjcmMzUUdwZkRtZ2pBVlJhSENPRURES3RtcHZIRmtIYWk3YmRVLy8vLy93QUFBQUVBQUFBQUFBQUFBQUFBQUFFQUFBUTNLNzgyRUdXYnpjY2RZMWJaMUZvS08zRGJPVjM1ZUlGUFdyR3V6YjNvQjdNYWZJZzUzTFRVemtDcjExNmFFK3JPc21CcmIzaTY5cGowck91Tm9TVXQ0d3Q4d2pURzBocjcwS1Q5NmFWTUROVVJKVGZENjVYZENPc25QbGFkbTV2S2lFSURueDdPZ1cyemNDVUd0TlpVYUU5Y1U2bkp1MzlOeWFDVTVlNWFYRDM2ZW5tNHRjSCtCbmhkU1o0WkEzMmM4bVhPMTduQmQ3Mzl6cGZtTFNjMzdGY2tBbnlCcit3dndKSDFES2V5dEJvTTdrUzlhNWpiR0wzeXdnV0h1b2JwODBkdmEzRVpGY3VCRU5XM0lXemFVQkc0R0pCdWdmT2UwQ2wySTlzd2lWbkhlQVc3bFpObUhpVDBlZnJMYXA4dDVZbG5sbzNzSWVYdzlrSG1uVldpakFWa3dxRU1zMTYrNFBRMmVGNzkvM0wyREM4SkJ0QmFESEw4K2FhdTd5TEIvenNQYkZEOEVWY0w0RkZlMjZwZjY0NkdMeW4xMTVnYTNvcmNCREtkMmZ1REE2YlV5Zis2ZzJOOEsxZWlrdTFFQmljZW5URjRRZFJiZlEvMm9Ba2ZXanJPdW9sRlYxWjZrVTVBRnl0dHpHUGhwOVZnOFY3T2VCSE84R2grWHUrbTBTS3hnOEd0NDdyanA5NXo5QnRuSDNGbTdPK3F2b3oyVlo0bGhsU0RGYzN4SWQ2L2IxcnVUVDZxOGFhTjRNNzFqYytQQy9KQ3NKNlI1VlJONFVMeHBuRTZXUmxwZUI1Y3ZaZEFGTE9wVSt5RUV4K21pR0hsUnltNm5laGNidGtaemc5S3ZBUUZUMUFwS0RTMXhFV0RROWxRTkVyaFMrVW4xdFRNQVRlWFpYTzkwRjFjR3dRVmQzZXRUQzlRM2syWFJrYnhrbmY5cmNRc2w0RTE1UER5YWdVTE9aWHJTOGxyVytMbGlsYkxBMExCWnV2VjkraWRrS1F6eGhPUjZSSjBEYVl4ZnJBanFXc2xpdDNvMWxSMy85bUdRMVNOREVQWFVpd2Q1Q05mZW9lSm5QOTJqc29GQ3Z2Sm5mY2wrYmVtRjMwd0ovOGgyRkliQ1RiOUhOaWtUV2wvWllRK2t6V2dhMEhEQ3hWZ0J2Q1J5V1J6K0liUVpZQ1dIL1FJV2dHN1BRazBMNlZWc0VTeURnNnFmdkF3RWs1eEF1cXNzRVhKR1AwRldEMmtYUGg3N0FQTWtuOGNiaGorV2Myc2UyU0xxTEt0MzQweTVud2hiU00xRGJDTWY1NVZ3STBTWWRFRStCRXRlUlZLdlp3RXU3NnFmb1pBMXluVVQvMlZ4NWdDcyt0OEFlNENnUDF6VzZNSGZEeWhUSURjVjM4SHZsbUtVdWE5Sy81ZXorS3Q2M2pyUlhydTlPTzEwYmo4L3Y5eEhxdHdDNjI3RTJTd3BCWU9ncTllTXRwNlo0eGxDdUh6bUhVTkZ0MEtnMC83YW1IcitLUWdnT29Wd3hXdStWcHh2am9EaGxnZUZvUjBicFBtQjI0OFIrRXRoQUdzdlZmVGFlNXoyenlTdGh4UHVmNVRQNGFMTnlQd25zYTVDODVwcExSd3JwUUhHVktPZktYNWVCZjJaTGdueUtpcmR1dlV6dVhmZ29IUTlrRDY4WlVLVXI2Q2h6ZFB0cERSU3AvcCtObXhOb05nT2J5Vy9zS1ZIcVZEUzNCSUZ3M1NaWGkwOXl3OWNSV2xxckVDbXF0S3JsS2JoV0d0UFlyQ2tMM3Q3YjdyTTE2c3UvTEtxKy9lL3FmYitraTB0N0ZUd285Y1pYV240ZmN3dnZNVEYvcXJuMEpXZE9EYWJNMmdOVTdtYmFzd0FHa0xBSHkxM25RTUFIcW9PTnRSUlZsczM2cjhJTmZSWmFGMmN4amNFaGpOKysyWEFHY3daUUl4QU9YNzVLcFY0Sk9WaDU5REZ2T2V6bFVNT1ZZVkxzS3ZFa0cydmE2REc2cXZLV1hCKytIUGFlQVJtaEpMbzA0U3hRSXdHby9sOFJkMWQzSC9Wd1hITUNzckFNcHNQb1hqVDJVdklaN29uTGJqWFNETGtManIyMEx1cTUzUXh1S3prdkpkIiwiY2lwaGVyVGV4dCI6IkFRSUJBSGdCNFUwMFMrcStUTnV3WDJ2UWVoa3FCdWd5ZDdic2tvSlZ1ZDY2ZmNUQ1V3SDl6Tm9yR2JzcjRmMUFUR3FJY1VBVEFBQUFvakNCbndZSktvWklodmNOQVFjR29JR1JNSUdPQWdFQU1JR0lCZ2txaGtpRzl3MEJCd0V3SGdZSllJWklBV1VEQkFFdU1CRUVEUE1PSFVvTlVJTWhyd1IxdFFJQkVJQmIyRlJyQ1dRS0Z0NVZ6cUdJUVRKK0UyZDB6UnBVVGxHWTJVdmMyVmxmd04xVm1WekpzL2JQYThvbUxwT3o3NERERk93NWdDSFByK0hZalJuNjFqMGtxTWFKcFpsVTF0VCtBYU5nZWdiM1l4a1ZXN3RHWm8yVndFazE1dz09Iiwic3ViIjoiYXJuOmF3czpzYWdlbWFrZXI6dXMtZWFzdC0xOjUyOTA4ODI4ODk5MDptbGZsb3ctYXBwL2FwcC0yU1Y2UTM1SFRDQk8iLCJpYXQiOjE3NzI3NTQ3NjQsImV4cCI6MTc3Mjc1NTA2NH0.4TwUDjNP53N-JIHofgyV8hq_6HKftycvEWoOxoqk7fc#/experiments/2\n" + ] + } + ], + "source": [ + "from sagemaker.train import get_mlflow_url\n", + "\n", + "# Get MLflow URL for a training job\n", + "url = get_mlflow_url(\"nova-textgeneration-micro-sft-20260303164831\")\n", + "print(url)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fef3d01d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "https://studio-d-ingyyaeglvki.studio.us-east-1.sagemaker.aws/jobs/train/nova-textgeneration-micro-sft-20260302144611\n" + ] + } + ], + "source": [ + "from sagemaker.train import get_studio_url\n", + "\n", + "\n", + "# Still working on getting the specific training job\n", + "url = get_studio_url('nova-textgeneration-micro-sft-20260302144611')\n", + "print(url)" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "0373cea6-7419-47f1-a59e-1cb441324dc3", + "id": "9b50b630", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.core.resources import TrainingJob\n", + "\n", + "training_job = TrainingJob.get(training_job_name='nova-textgeneration-micro-sft-20260303161941')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8fa20618", "metadata": {}, "outputs": [], "source": [ + "from sagemaker.train import get_studio_url\n", "\n", + "# Studio URL (job details)\n", + "studio_url = get_studio_url(training_job)\n", + "print(studio_url)\n", + "\n", + "# CloudWatch Logs URL (view logs directly)\n", + "url = get_studio_url(training_job, direct=True)\n", + "print(url)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "591d5e98", + "metadata": {}, + "outputs": [], + "source": [ + "job.stop()" + ] + }, + { + "cell_type": "markdown", + "id": "da489ad0-36b8-44e7-9f65-2ffd359e5225", + "metadata": {}, + "source": [ + "### View any Training job details\n", + "\n", + "We can get any training job details and its status with TrainingJob.get(...)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0373cea6-7419-47f1-a59e-1cb441324dc3", + "metadata": {}, + "outputs": [], + "source": [ "\n", "import json\n", "import re\n", "from sagemaker.core.utils.utils import Unassigned\n", - "from sagemaker.core.resources import TrainingJob\n", - "\n", - "response = TrainingJob.get(training_job_name=\"meta-textgeneration-llama-3-2-1b-instruct-sft-20251201114921\")\n", "\n", "def pretty_print(obj):\n", " def parse_unassigned(item):\n", @@ -260,62 +633,78 @@ " return item\n", "\n", " cleaned = parse_unassigned(obj.__dict__ if hasattr(obj, '__dict__') else obj)\n", - " print(json.dumps(cleaned, indent=2, default=str))\n", - "\n", - "pretty_print(response)" + " print(json.dumps(cleaned, indent=2, default=str))" ] }, { "cell_type": "code", "execution_count": null, - "id": "e9ee7f8e-b26c-4579-9dbc-f08124f2e944", - "metadata": {}, + "id": "6bbe96b4-c8cd-4de3-b4c0-a66fd3086eb2", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "#In order to skip waiting and monitor the training Job later\n", + "from sagemaker.core.resources import TrainingJob\n", "\n", - "'''\n", - "training_job = sft_trainer.train(\n", - " wait=False,\n", - ")\n", - "'''" + "response = TrainingJob.get(training_job_name=\"nova-textgeneration-micro-sft-20260302144611\")\n", + "pretty_print(response)" + ] + }, + { + "cell_type": "markdown", + "id": "8d25735e", + "metadata": {}, + "source": [ + "## Visualize Training Metrics\n", + "\n", + "After training completes, you can visualize metrics logged to MLflow:\n", + "\n", + "- **`get_available_metrics(job)`** - List all available metrics\n", + "- **`plot_training_metrics(job)`** - Plot all metrics\n", + "- **`plot_training_metrics(job, metrics=['reduced_train_loss', 'global_step'])`** - Plot specific metrics" ] }, { "cell_type": "code", "execution_count": null, - "id": "0d99f212-f0bd-43c1-be21-30202fb4a152", - "metadata": { - "scrolled": true - }, + "id": "b8a9256c", + "metadata": {}, "outputs": [], "source": [ - "pretty_print(training_job)" + "from sagemaker.train import get_available_metrics\n", + "\n", + "get_available_metrics('nova-textgeneration-micro-sft-20260302144611')" ] }, { - "cell_type": "markdown", - "id": "da489ad0-36b8-44e7-9f65-2ffd359e5225", + "cell_type": "code", + "execution_count": null, + "id": "f1a794ba", "metadata": {}, + "outputs": [], "source": [ - "### View any Training job details\n", + "from sagemaker.train import plot_training_metrics\n", "\n", - "We can get any training job details and its status with TrainingJob.get(...)" + "# Simple - plot all metrics\n", + "plot_training_metrics('nova-textgeneration-micro-sft-20260302144611')" ] }, { "cell_type": "code", "execution_count": null, - "id": "6bbe96b4-c8cd-4de3-b4c0-a66fd3086eb2", - "metadata": { - "scrolled": true - }, + "id": "2f7ed9d7", + "metadata": {}, "outputs": [], "source": [ - "from sagemaker.core.resources import TrainingJob\n", + "from sagemaker.train import plot_training_metrics\n", "\n", - "response = TrainingJob.get(training_job_name=\"meta-textgeneration-llama-3-2-1b-instruct-sft-20251123162832\")\n", - "pretty_print(response)" + "# Advanced - plot specific metrics\n", + "plot_training_metrics(\n", + " training_job='nova-textgeneration-micro-sft-20260302144611',\n", + " metrics=['reduced_train_loss', 'global_step'],\n", + " figsize=(14, 6)\n", + ")" ] }, { @@ -464,11 +853,31 @@ " wait=True,\n", ")" ] + }, + { + "cell_type": "markdown", + "id": "be4dc095", + "metadata": {}, + "source": [ + "### Stop Training Job" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb92815b", + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.core.resources import TrainingJob\n", + "job = TrainingJob.get(training_job_name='nova-textgeneration-pro-sft-20260304135114')\n", + "job.stop()" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "py3.10.14", "language": "python", "name": "python3" }, @@ -482,7 +891,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.10" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/v3-examples/model-customization-examples/sm-studio-nova-training-job-sample-notebook.ipynb b/v3-examples/model-customization-examples/sm-studio-nova-training-job-sample-notebook.ipynb index 4e49266323..a5f83647cc 100644 --- a/v3-examples/model-customization-examples/sm-studio-nova-training-job-sample-notebook.ipynb +++ b/v3-examples/model-customization-examples/sm-studio-nova-training-job-sample-notebook.ipynb @@ -47,10 +47,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "234f7398-fd6b-4d02-a406-0491924c461d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
[03/02/26 14:23:42] INFO     Found credentials in shared credentials file: ~/.aws/credentials   credentials.py:1392\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[03/02/26 14:23:42]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials in shared credentials file: ~\u001b[38;2;225;0;225m/.aws/\u001b[0m\u001b[38;2;225;0;225mcredentials\u001b[0m \u001b]8;id=981430;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=408099;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py#1392\u001b\\\u001b[2m1392\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml\n", + "sagemaker.config INFO - Not applying SDK defaults from location: /Users/mollyhe/Library/Application Support/sagemaker/config.yaml\n", + "Region: us-east-1\n" + ] + } + ], "source": [ "\n", "import os\n", @@ -82,10 +105,110 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "39aaeb1d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
[03/02/26 14:23:45] INFO     Found credentials in shared credentials file: ~/.aws/credentials   credentials.py:1392\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[03/02/26 14:23:45]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials in shared credentials file: ~\u001b[38;2;225;0;225m/.aws/\u001b[0m\u001b[38;2;225;0;225mcredentials\u001b[0m \u001b]8;id=437588;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=570403;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py#1392\u001b\\\u001b[2m1392\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[03/02/26 14:23:46] INFO     SageMaker Python SDK will collect telemetry to help us better  telemetry_logging.py:92\n",
+       "                             understand our user's needs, diagnose issues, and deliver                             \n",
+       "                             additional features.                                                                  \n",
+       "                             To opt out of telemetry, please disable via TelemetryOptOut                           \n",
+       "                             parameter in SDK defaults config. For more information, refer                         \n",
+       "                             to                                                                                    \n",
+       "                             https://sagemaker.readthedocs.io/en/stable/overview.html#confi                        \n",
+       "                             guring-and-using-defaults-with-the-sagemaker-python-sdk.                              \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[03/02/26 14:23:46]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m SageMaker Python SDK will collect telemetry to help us better \u001b]8;id=719707;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/core/telemetry/telemetry_logging.py\u001b\\\u001b[2mtelemetry_logging.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=805235;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/core/telemetry/telemetry_logging.py#92\u001b\\\u001b[2m92\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m understand our user's needs, diagnose issues, and deliver \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m additional features. \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m To opt out of telemetry, please disable via TelemetryOptOut \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m parameter in SDK defaults config. For more information, refer \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m to \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m \u001b[4;38;2;0;105;255mhttps://sagemaker.readthedocs.io/en/stable/overview.html#confi\u001b[0m \u001b[2m \u001b[0m\n", + "\u001b[2;36m \u001b[0m \u001b[4;38;2;0;105;255mguring-and-using-defaults-with-the-sagemaker-python-sdk.\u001b[0m \u001b[2m \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[03/02/26 14:23:47] INFO     Role not provided. Using default role:                                  defaults.py:75\n",
+       "                             arn:aws:iam::529088288990:role/Admin                                                  \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[03/02/26 14:23:47]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Role not provided. Using default role: \u001b]8;id=193859;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/train/defaults.py\u001b\\\u001b[2mdefaults.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=892655;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/train/defaults.py#75\u001b\\\u001b[2m75\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m arn:aws:iam::\u001b[1;36m529088288990\u001b[0m:role/Admin \u001b[2m \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "521e87aea90849e48e5312e3160c5a23", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Final Resource Status: Available\n",
+       "
\n" + ], + "text/plain": [ + "Final Resource Status: Available\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "TRAINING_DATASET ARN: arn:aws:sagemaker:us-east-1:529088288990:hub-content/G82247CBEQ6TN0FI3J1SAJU59UURMT3H3EKG48C6VQGDINNRNGU0/DataSet/demo-sft-dataset/3.0.0\n"
+     ]
+    }
+   ],
    "source": [
     "from sagemaker.ai_registry.dataset import DataSet\n",
     "from sagemaker.ai_registry.dataset_utils import CustomizationTechnique\n",
@@ -93,34 +216,34 @@
     "# Register dataset in SageMaker AI Registry. This creates a versioned dataset that can be referenced by ARN\n",
     "dataset = DataSet.create(\n",
     "    name=\"demo-sft-dataset\",\n",
-    "    source=\"s3://your-bucket/dataset/training_dataset.jsonl\", # source can be S3 or local path\n",
+    "    source=\"s3://sagemaker-us-east-1-529088288990/nova1_SFT.jsonl\", # source can be S3 or local path\n",
     "    #customization_technique=CUSTOMIZATION_TECHNIQUE.SFT # or DPO or RLVR\n",
     "        # Optional technique name for minimal dataset format check.\n",
     "    wait=True\n",
     ")\n",
     "\n",
     "print(f\"TRAINING_DATASET ARN: {dataset.arn}\")\n",
-    "# TRAINING_DATASET = dataset.arn"
+    "TRAINING_DATASET = dataset.arn"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "ea22bd22",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Required Configs\n",
-    "BASE_MODEL = \"\"\n",
+    "BASE_MODEL = \"nova-textgeneration-micro\"\n",
     "\n",
     "# MODEL_PACKAGE_GROUP_NAME is same as CUSTOM_MODEL_NAME\n",
-    "MODEL_PACKAGE_GROUP_NAME = \"\"\n",
+    "MODEL_PACKAGE_GROUP_NAME = \"model-pacakge-group-nova4\"\n",
     "\n",
-    "TRAINING_DATASET = \"\"\n",
+    "# TRAINING_DATASET = \"\"\n",
     "\n",
-    "S3_OUTPUT_PATH = \"\"\n",
+    "S3_OUTPUT_PATH = \"s3://sagemaker-us-east-1-529088288990/output/\"\n",
     "\n",
-    "ROLE_ARN = \"\""
+    "ROLE_ARN = \"arn:aws:iam::529088288990:role/Admin\""
    ]
   },
   {
@@ -133,15 +256,68 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "90a1069d19eeee7",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
[03/02/26 14:23:59] INFO     Creating model_package_group resource.                              resources.py:25559\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[03/02/26 14:23:59]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Creating model_package_group resource. \u001b]8;id=924503;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/core/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=629107;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/core/resources.py#25559\u001b\\\u001b[2m25559\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    WARNING  No region provided. Using default region.                                 utils.py:340\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;215;175;0mWARNING \u001b[0m No region provided. Using default region. \u001b]8;id=617102;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/core/utils/utils.py\u001b\\\u001b[2mutils.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=566991;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/core/utils/utils.py#340\u001b\\\u001b[2m340\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    INFO     Runs on sagemaker prod, region:us-east-1                                  utils.py:354\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Runs on sagemaker prod, region:us-east-\u001b[1;36m1\u001b[0m \u001b]8;id=132575;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/core/utils/utils.py\u001b\\\u001b[2mutils.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=459422;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/core/utils/utils.py#354\u001b\\\u001b[2m354\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
                    INFO     Found credentials in shared credentials file: ~/.aws/credentials   credentials.py:1392\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials in shared credentials file: ~\u001b[38;2;225;0;225m/.aws/\u001b[0m\u001b[38;2;225;0;225mcredentials\u001b[0m \u001b]8;id=803081;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=146617;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py#1392\u001b\\\u001b[2m1392\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "from sagemaker.core.resources import ModelPackageGroup\n", "model_package_group = ModelPackageGroup.create(\n", " model_package_group_name=MODEL_PACKAGE_GROUP_NAME,\n", - " model_package_group_description='' # Required Description\n", + " model_package_group_description='test nova textgeneration micro' # Required Description\n", ")" ] }, @@ -210,7 +386,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "062953d8", "metadata": { "editable": true, @@ -401,10 +577,84 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "2f6eeb5e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Default Finetuning Options:\n" + ] + }, + { + "data": { + "text/html": [ + "
{\n",
+       "'name': 'my-lora-run-wkxk5',\n",
+       "'global_batch_size': '64',\n",
+       "'max_epochs': '2',\n",
+       "'learning_rate': '1e-05',\n",
+       "'lora_alpha': '128',\n",
+       "'learning_rate_ratio': '16.0',\n",
+       "'max_context_length': '8192'\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'name'\u001b[0m: \u001b[38;2;0;135;0m'my-lora-run-wkxk5'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'global_batch_size'\u001b[0m: \u001b[38;2;0;135;0m'64'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'max_epochs'\u001b[0m: \u001b[38;2;0;135;0m'2'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'learning_rate'\u001b[0m: \u001b[38;2;0;135;0m'1e-05'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'lora_alpha'\u001b[0m: \u001b[38;2;0;135;0m'128'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'learning_rate_ratio'\u001b[0m: \u001b[38;2;0;135;0m'16.0'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'max_context_length'\u001b[0m: \u001b[38;2;0;135;0m'8192'\u001b[0m\n", + "\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Modified/User defined Options:\n" + ] + }, + { + "data": { + "text/html": [ + "
{\n",
+       "'name': 'my-lora-run-wkxk5',\n",
+       "'global_batch_size': '64',\n",
+       "'max_epochs': '2',\n",
+       "'learning_rate': '0.0002',\n",
+       "'lora_alpha': '128',\n",
+       "'learning_rate_ratio': '16.0',\n",
+       "'max_context_length': '8192'\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'name'\u001b[0m: \u001b[38;2;0;135;0m'my-lora-run-wkxk5'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'global_batch_size'\u001b[0m: \u001b[38;2;0;135;0m'64'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'max_epochs'\u001b[0m: \u001b[38;2;0;135;0m'2'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'learning_rate'\u001b[0m: \u001b[38;2;0;135;0m'0.0002'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'lora_alpha'\u001b[0m: \u001b[38;2;0;135;0m'128'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'learning_rate_ratio'\u001b[0m: \u001b[38;2;0;135;0m'16.0'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'max_context_length'\u001b[0m: \u001b[38;2;0;135;0m'8192'\u001b[0m\n", + "\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "print(\"Default Finetuning Options:\")\n", "pprint(trainer.hyperparameters.to_dict())\n", @@ -426,10 +676,355 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "31690f41", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
╭────────────────────────────────── Training Job Status ───────────────────────────────────╮\n",
+       "  TrainingJob Name      nova-textgeneration-micro-sft-20260302142433                      \n",
+       "                                                                                          \n",
+       "  Job Status            InProgress                                                        \n",
+       "  Secondary Status      Pending                                                           \n",
+       "  Elapsed Time          814.3s                                                            \n",
+       "                                                                                          \n",
+       " Status Transitions                                                                       \n",
+       "                                                                                          \n",
+       "        Step              Details                               Duration                  \n",
+       "  ───────────────────────────────────────────────────────────────────────────             \n",
+       "    Starting          Starting the training job             2.2s                      \n",
+       "        Pending           Training job waiting for capacity                               \n",
+       "                                                                                          \n",
+       "╰──────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;5;172m╭─\u001b[0m\u001b[38;5;172m─────────────────────────────────\u001b[0m\u001b[38;5;172m \u001b[0m\u001b[1;94mTraining Job Status\u001b[0m\u001b[38;5;172m \u001b[0m\u001b[38;5;172m──────────────────────────────────\u001b[0m\u001b[38;5;172m─╮\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mTrainingJob Name \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;32mnova-textgeneration-micro-sft-20260302142433\u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mJob Status \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;38;5;172mInProgress\u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mSecondary Status \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;33mPending\u001b[0m\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mElapsed Time \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;91m814.3s\u001b[0m\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;35mStatus Transitions\u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mStep \u001b[0m\u001b[1;35m \u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mDetails \u001b[0m\u001b[1;35m \u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mDuration \u001b[0m\u001b[1;35m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m ─────────────────────────────────────────────────────────────────────────── \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m\u001b[32m✓ \u001b[0m\u001b[32m \u001b[0m \u001b[36m \u001b[0m\u001b[36mStarting \u001b[0m\u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mStarting the training job \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m\u001b[32m2.2s \u001b[0m\u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m\u001b[32m \u001b[0m\u001b[32m \u001b[0m \u001b[36m \u001b[0m\u001b[36mPending \u001b[0m\u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mTraining job waiting for capacity \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m\u001b[32m \u001b[0m\u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m╰──────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/tra \n",
+       " in/common_utils/trainer_wait.py:197 in wait                                                      \n",
+       "                                                                                                  \n",
+       "   194 │   │   │   │   │   iteration += 1                                                         \n",
+       "   195 │   │   │   │   │   time.sleep(1)                                                          \n",
+       "   196 │   │   │   │   │   if iteration == poll:                                                  \n",
+       " 197 │   │   │   │   │   │   training_job.refresh()                                             \n",
+       "   198 │   │   │   │   │   │   iteration = 0                                                      \n",
+       "   199 │   │   │   │   │   clear_output(wait=True)                                                \n",
+       "   200                                                                                            \n",
+       "                                                                                                  \n",
+       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/cor \n",
+       " e/resources.py:143 in wrapper                                                                    \n",
+       "                                                                                                  \n",
+       "     140 │   │   @functools.wraps(func)                                                           \n",
+       "     141 │   │   def wrapper(*args, **kwargs):                                                    \n",
+       "     142 │   │   │   config = dict(arbitrary_types_allowed=True)                                  \n",
+       "   143 │   │   │   return validate_call(config=config)(func)(*args, **kwargs)                   \n",
+       "     144 │   │                                                                                    \n",
+       "     145 │   │   return wrapper                                                                   \n",
+       "     146                                                                                          \n",
+       "                                                                                                  \n",
+       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/pydantic/_int \n",
+       " ernal/_validate_call.py:39 in wrapper_function                                                   \n",
+       "                                                                                                  \n",
+       "    36 │   │                                                                                      \n",
+       "    37 │   │   @functools.wraps(wrapped)                                                          \n",
+       "    38 │   │   def wrapper_function(*args, **kwargs):                                             \n",
+       "  39 │   │   │   return wrapper(*args, **kwargs)                                                \n",
+       "    40                                                                                        \n",
+       "    41 # We need to manually update this because `partial` object has no `__name__` and `__   \n",
+       "    42 wrapper_function.__name__ = extract_function_name(wrapped)                             \n",
+       "                                                                                                  \n",
+       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/pydantic/_int \n",
+       " ernal/_validate_call.py:136 in __call__                                                          \n",
+       "                                                                                                  \n",
+       "   133 │   │   if not self.__pydantic_complete__:                                                 \n",
+       "   134 │   │   │   self._create_validators()                                                      \n",
+       "   135 │   │                                                                                      \n",
+       " 136 │   │   res = self.__pydantic_validator__.validate_python(pydantic_core.ArgsKwargs(args,   \n",
+       "   137 │   │   if self.__return_pydantic_validator__:                                             \n",
+       "   138 │   │   │   return self.__return_pydantic_validator__(res)                                 \n",
+       "   139 │   │   else:                                                                              \n",
+       "                                                                                                  \n",
+       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/cor \n",
+       " e/resources.py:35682 in refresh                                                                  \n",
+       "                                                                                                  \n",
+       "   35679 │   │   logger.debug(f\"Serialized input request: {operation_input_args}\")                \n",
+       "   35680 │   │                                                                                    \n",
+       "   35681 │   │   client = Base.get_sagemaker_client()                                             \n",
+       " 35682 │   │   response = client.describe_training_job(**operation_input_args)                  \n",
+       "   35683 │   │                                                                                    \n",
+       "   35684 │   │   # deserialize response and update self                                           \n",
+       "   35685 │   │   transform(response, \"DescribeTrainingJobResponse\", self)                         \n",
+       "                                                                                                  \n",
+       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/clie \n",
+       " nt.py:602 in _api_call                                                                           \n",
+       "                                                                                                  \n",
+       "    599 │   │   │   │   │   f\"{py_operation_name}() only accepts keyword arguments.\"              \n",
+       "    600 │   │   │   │   )                                                                         \n",
+       "    601 │   │   │   # The \"self\" in this scope is referring to the BaseClient.                    \n",
+       "  602 │   │   │   return self._make_api_call(operation_name, kwargs)                            \n",
+       "    603 │   │                                                                                     \n",
+       "    604 │   │   _api_call.__name__ = str(py_operation_name)                                       \n",
+       "    605                                                                                           \n",
+       "                                                                                                  \n",
+       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/cont \n",
+       " ext.py:123 in wrapper                                                                            \n",
+       "                                                                                                  \n",
+       "   120 │   │   │   with start_as_current_context():                                               \n",
+       "   121 │   │   │   │   if hook:                                                                   \n",
+       "   122 │   │   │   │   │   hook()                                                                 \n",
+       " 123 │   │   │   │   return func(*args, **kwargs)                                               \n",
+       "   124 │   │                                                                                      \n",
+       "   125 │   │   return wrapper                                                                     \n",
+       "   126                                                                                            \n",
+       "                                                                                                  \n",
+       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/clie \n",
+       " nt.py:1078 in _make_api_call                                                                     \n",
+       "                                                                                                  \n",
+       "   1075 │   │   │   │   'error_code_override'                                                     \n",
+       "   1076 │   │   │   ) or error_info.get(\"Code\")                                                   \n",
+       "   1077 │   │   │   error_class = self.exceptions.from_code(error_code)                           \n",
+       " 1078 │   │   │   raise error_class(parsed_response, operation_name)                            \n",
+       "   1079 │   │   else:                                                                             \n",
+       "   1080 │   │   │   return parsed_response                                                        \n",
+       "   1081                                                                                           \n",
+       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "ClientError: An error occurred (ExpiredTokenException) when calling the DescribeTrainingJob operation: The security\n",
+       "token included in the request is expired\n",
+       "\n",
+       "The above exception was the direct cause of the following exception:\n",
+       "\n",
+       "╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+       " in <module>:1                                                                                    \n",
+       "                                                                                                  \n",
+       " 1 training_job = trainer.train(wait=True)                                                      \n",
+       "   2                                                                                              \n",
+       "   3 TRAINING_JOB_NAME = training_job.training_job_name                                           \n",
+       "   4                                                                                              \n",
+       "                                                                                                  \n",
+       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/cor \n",
+       " e/telemetry/telemetry_logging.py:168 in wrapper                                                  \n",
+       "                                                                                                  \n",
+       "   165 │   │   │   │   │   caught_ex = e                                                          \n",
+       "   166 │   │   │   │   finally:                                                                   \n",
+       "   167 │   │   │   │   │   if caught_ex:                                                          \n",
+       " 168 │   │   │   │   │   │   raise caught_ex                                                    \n",
+       "   169 │   │   │   │   │   return response  # pylint: disable=W0150                               \n",
+       "   170 │   │   │   else:                                                                          \n",
+       "   171 │   │   │   │   logger.debug(                                                              \n",
+       "                                                                                                  \n",
+       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/cor \n",
+       " e/telemetry/telemetry_logging.py:139 in wrapper                                                  \n",
+       "                                                                                                  \n",
+       "   136 │   │   │   │   start_timer = perf_counter()                                               \n",
+       "   137 │   │   │   │   try:                                                                       \n",
+       "   138 │   │   │   │   │   # Call the original function                                           \n",
+       " 139 │   │   │   │   │   response = func(*args, **kwargs)                                       \n",
+       "   140 │   │   │   │   │   stop_timer = perf_counter()                                            \n",
+       "   141 │   │   │   │   │   elapsed = stop_timer - start_timer                                     \n",
+       "   142 │   │   │   │   │   extra += f\"&x-latency={round(elapsed, 2)}\"                             \n",
+       "                                                                                                  \n",
+       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/tra \n",
+       " in/sft_trainer.py:267 in train                                                                   \n",
+       "                                                                                                  \n",
+       "   264 │   │   │   from sagemaker.train.common_utils.trainer_wait import wait as _wait            \n",
+       "   265 │   │   │   from sagemaker.core.utils.exceptions import TimeoutExceededError               \n",
+       "   266 │   │   │   try :                                                                          \n",
+       " 267 │   │   │   │   _wait(training_job)                                                        \n",
+       "   268 │   │   │   except TimeoutExceededError as e:                                              \n",
+       "   269 │   │   │   │   logger.error(\"Error: %s\", e)                                               \n",
+       "   270                                                                                            \n",
+       "                                                                                                  \n",
+       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/tra \n",
+       " in/common_utils/trainer_wait.py:374 in wait                                                      \n",
+       "                                                                                                  \n",
+       "   371 except (FailedStatusError, TimeoutExceededError):                                      \n",
+       "   372 │   │   raise                                                                              \n",
+       "   373 except Exception as e:                                                                 \n",
+       " 374 │   │   raise RuntimeError(f\"Training job monitoring failed: {e}\") from e                  \n",
+       "   375                                                                                            \n",
+       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "RuntimeError: Training job monitoring failed: An error occurred (ExpiredTokenException) when calling the \n",
+       "DescribeTrainingJob operation: The security token included in the request is expired\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;255;0;0m╭─\u001b[0m\u001b[38;2;255;0;0m──────────────────────────────\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[1;38;2;255;0;0mTraceback \u001b[0m\u001b[1;2;38;2;255;0;0m(most recent call last)\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[38;2;255;0;0m───────────────────────────────\u001b[0m\u001b[38;2;255;0;0m─╮\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/tra\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33min/common_utils/\u001b[0m\u001b[1;33mtrainer_wait.py\u001b[0m:\u001b[94m197\u001b[0m in \u001b[92mwait\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m194 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0miteration += \u001b[94m1\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m195 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mtime.sleep(\u001b[94m1\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m196 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m iteration == poll: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m197 \u001b[2m│ │ │ │ │ │ \u001b[0mtraining_job.refresh() \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m198 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0miteration = \u001b[94m0\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m199 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mclear_output(wait=\u001b[94mTrue\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m200 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/cor\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33me/\u001b[0m\u001b[1;33mresources.py\u001b[0m:\u001b[94m143\u001b[0m in \u001b[92mwrapper\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 140 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[1;95m@functools\u001b[0m.wraps(func) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 141 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mdef\u001b[0m\u001b[90m \u001b[0m\u001b[92mwrapper\u001b[0m(*args, **kwargs): \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 142 \u001b[0m\u001b[2m│ │ │ \u001b[0mconfig = \u001b[96mdict\u001b[0m(arbitrary_types_allowed=\u001b[94mTrue\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m 143 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m validate_call(config=config)(func)(*args, **kwargs) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 144 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 145 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m wrapper \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 146 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/pydantic/_int\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mernal/\u001b[0m\u001b[1;33m_validate_call.py\u001b[0m:\u001b[94m39\u001b[0m in \u001b[92mwrapper_function\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 36 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 37 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[1;95m@functools\u001b[0m.wraps(wrapped) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 38 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mdef\u001b[0m\u001b[90m \u001b[0m\u001b[92mwrapper_function\u001b[0m(*args, **kwargs): \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m 39 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m wrapper(*args, **kwargs) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 40 \u001b[0m\u001b[2m│ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 41 \u001b[0m\u001b[2m│ \u001b[0m\u001b[2m# We need to manually update this because `partial` object has no `__name__` and `__\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 42 \u001b[0m\u001b[2m│ \u001b[0mwrapper_function.\u001b[91m__name__\u001b[0m = extract_function_name(wrapped) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/pydantic/_int\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mernal/\u001b[0m\u001b[1;33m_validate_call.py\u001b[0m:\u001b[94m136\u001b[0m in \u001b[92m__call__\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m133 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[95mnot\u001b[0m \u001b[96mself\u001b[0m.__pydantic_complete__: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m134 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[96mself\u001b[0m._create_validators() \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m135 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m136 \u001b[2m│ │ \u001b[0mres = \u001b[96mself\u001b[0m.__pydantic_validator__.validate_python(pydantic_core.ArgsKwargs(args, \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m137 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.__return_pydantic_validator__: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m138 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[96mself\u001b[0m.__return_pydantic_validator__(res) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m139 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/cor\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33me/\u001b[0m\u001b[1;33mresources.py\u001b[0m:\u001b[94m35682\u001b[0m in \u001b[92mrefresh\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m35679 \u001b[0m\u001b[2m│ │ \u001b[0mlogger.debug(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mSerialized input request: \u001b[0m\u001b[33m{\u001b[0moperation_input_args\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m35680 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m35681 \u001b[0m\u001b[2m│ │ \u001b[0mclient = Base.get_sagemaker_client() \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m35682 \u001b[2m│ │ \u001b[0mresponse = client.describe_training_job(**operation_input_args) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m35683 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m35684 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# deserialize response and update self\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m35685 \u001b[0m\u001b[2m│ │ \u001b[0mtransform(response, \u001b[33m\"\u001b[0m\u001b[33mDescribeTrainingJobResponse\u001b[0m\u001b[33m\"\u001b[0m, \u001b[96mself\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/\u001b[0m\u001b[1;33mclie\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[1;33mnt.py\u001b[0m:\u001b[94m602\u001b[0m in \u001b[92m_api_call\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 599 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33m{\u001b[0mpy_operation_name\u001b[33m}\u001b[0m\u001b[33m() only accepts keyword arguments.\u001b[0m\u001b[33m\"\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 600 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 601 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[2m# The \"self\" in this scope is referring to the BaseClient.\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m 602 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[96mself\u001b[0m._make_api_call(operation_name, kwargs) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 603 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 604 \u001b[0m\u001b[2m│ │ \u001b[0m_api_call.\u001b[91m__name__\u001b[0m = \u001b[96mstr\u001b[0m(py_operation_name) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 605 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/\u001b[0m\u001b[1;33mcont\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[1;33mext.py\u001b[0m:\u001b[94m123\u001b[0m in \u001b[92mwrapper\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m120 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mwith\u001b[0m start_as_current_context(): \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m121 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mif\u001b[0m hook: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m122 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mhook() \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m123 \u001b[2m│ │ │ │ \u001b[0m\u001b[94mreturn\u001b[0m func(*args, **kwargs) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m124 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m125 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m wrapper \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m126 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/\u001b[0m\u001b[1;33mclie\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[1;33mnt.py\u001b[0m:\u001b[94m1078\u001b[0m in \u001b[92m_make_api_call\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1075 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[33m'\u001b[0m\u001b[33merror_code_override\u001b[0m\u001b[33m'\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1076 \u001b[0m\u001b[2m│ │ │ \u001b[0m) \u001b[95mor\u001b[0m error_info.get(\u001b[33m\"\u001b[0m\u001b[33mCode\u001b[0m\u001b[33m\"\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1077 \u001b[0m\u001b[2m│ │ │ \u001b[0merror_class = \u001b[96mself\u001b[0m.exceptions.from_code(error_code) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m1078 \u001b[2m│ │ │ \u001b[0m\u001b[94mraise\u001b[0m error_class(parsed_response, operation_name) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1079 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1080 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m parsed_response \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1081 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", + "\u001b[1;91mClientError: \u001b[0mAn error occurred \u001b[1m(\u001b[0mExpiredTokenException\u001b[1m)\u001b[0m when calling the DescribeTrainingJob operation: The security\n", + "token included in the request is expired\n", + "\n", + "\u001b[3mThe above exception was the direct cause of the following exception:\u001b[0m\n", + "\n", + "\u001b[38;2;255;0;0m╭─\u001b[0m\u001b[38;2;255;0;0m──────────────────────────────\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[1;38;2;255;0;0mTraceback \u001b[0m\u001b[1;2;38;2;255;0;0m(most recent call last)\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[38;2;255;0;0m───────────────────────────────\u001b[0m\u001b[38;2;255;0;0m─╮\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m in \u001b[92m\u001b[0m:\u001b[94m1\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m1 training_job = trainer.train(wait=\u001b[94mTrue\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m2 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m3 \u001b[0mTRAINING_JOB_NAME = training_job.training_job_name \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m4 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/cor\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33me/telemetry/\u001b[0m\u001b[1;33mtelemetry_logging.py\u001b[0m:\u001b[94m168\u001b[0m in \u001b[92mwrapper\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m165 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mcaught_ex = e \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m166 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mfinally\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m167 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m caught_ex: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m168 \u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m caught_ex \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m169 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mreturn\u001b[0m response \u001b[2m# pylint: disable=W0150\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m170 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m171 \u001b[0m\u001b[2m│ │ │ │ \u001b[0mlogger.debug( \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/cor\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33me/telemetry/\u001b[0m\u001b[1;33mtelemetry_logging.py\u001b[0m:\u001b[94m139\u001b[0m in \u001b[92mwrapper\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m136 \u001b[0m\u001b[2m│ │ │ │ \u001b[0mstart_timer = perf_counter() \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m137 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m138 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[2m# Call the original function\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m139 \u001b[2m│ │ │ │ │ \u001b[0mresponse = func(*args, **kwargs) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m140 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mstop_timer = perf_counter() \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m141 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0melapsed = stop_timer - start_timer \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m142 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mextra += \u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33m&x-latency=\u001b[0m\u001b[33m{\u001b[0m\u001b[96mround\u001b[0m(elapsed,\u001b[90m \u001b[0m\u001b[94m2\u001b[0m)\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/tra\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33min/\u001b[0m\u001b[1;33msft_trainer.py\u001b[0m:\u001b[94m267\u001b[0m in \u001b[92mtrain\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m264 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mfrom\u001b[0m\u001b[90m \u001b[0m\u001b[4;96msagemaker\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mtrain\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mcommon_utils\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mtrainer_wait\u001b[0m\u001b[90m \u001b[0m\u001b[94mimport\u001b[0m wait \u001b[94mas\u001b[0m _wait \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m265 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mfrom\u001b[0m\u001b[90m \u001b[0m\u001b[4;96msagemaker\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mcore\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mutils\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mexceptions\u001b[0m\u001b[90m \u001b[0m\u001b[94mimport\u001b[0m TimeoutExceededError \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m266 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mtry\u001b[0m : \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m267 \u001b[2m│ │ │ │ \u001b[0m_wait(training_job) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m268 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mexcept\u001b[0m TimeoutExceededError \u001b[94mas\u001b[0m e: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m269 \u001b[0m\u001b[2m│ │ │ │ \u001b[0mlogger.error(\u001b[33m\"\u001b[0m\u001b[33mError: \u001b[0m\u001b[33m%s\u001b[0m\u001b[33m\"\u001b[0m, e) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m270 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/tra\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33min/common_utils/\u001b[0m\u001b[1;33mtrainer_wait.py\u001b[0m:\u001b[94m374\u001b[0m in \u001b[92mwait\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m371 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mexcept\u001b[0m (FailedStatusError, TimeoutExceededError): \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m372 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m373 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mException\u001b[0m \u001b[94mas\u001b[0m e: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m374 \u001b[2m│ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mRuntimeError\u001b[0m(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mTraining job monitoring failed: \u001b[0m\u001b[33m{\u001b[0me\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m) \u001b[94mfrom\u001b[0m\u001b[90m \u001b[0m\u001b[4;96me\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m375 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", + "\u001b[1;91mRuntimeError: \u001b[0mTraining job monitoring failed: An error occurred \u001b[1m(\u001b[0mExpiredTokenException\u001b[1m)\u001b[0m when calling the \n", + "DescribeTrainingJob operation: The security token included in the request is expired\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "training_job = trainer.train(wait=True)\n", "\n", @@ -1069,7 +1664,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.11" + "version": "3.10.14" } }, "nbformat": 4, From 7b5f8b14c42ec0a387c76837a38d809f32cdb2b2 Mon Sep 17 00:00:00 2001 From: Molly He Date: Tue, 10 Mar 2026 17:55:52 -0700 Subject: [PATCH 02/11] Evaluation job update --- .../evaluate/benchmark_demo.ipynb | 549 +++++++++++++++++- .../train/common_utils/metrics_visualizer.py | 102 +++- .../src/sagemaker/train/evaluate/execution.py | 69 ++- .../train/evaluate/pipeline_templates.py | 9 +- ...uning_example_notebook_pysdk_prod_v3.ipynb | 222 +++---- 5 files changed, 757 insertions(+), 194 deletions(-) diff --git a/sagemaker-train/example_notebooks/evaluate/benchmark_demo.ipynb b/sagemaker-train/example_notebooks/evaluate/benchmark_demo.ipynb index e0133a9272..b78d35b8e9 100644 --- a/sagemaker-train/example_notebooks/evaluate/benchmark_demo.ipynb +++ b/sagemaker-train/example_notebooks/evaluate/benchmark_demo.ipynb @@ -24,23 +24,94 @@ "execution_count": null, "metadata": {}, "outputs": [], + "source": [ + "import logging\n", + "logging.basicConfig(level=logging.WARNING)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[\n",
+       "<_Benchmark.MMLU: 'mmlu'>,\n",
+       "<_Benchmark.MMLU_PRO: 'mmlu_pro'>,\n",
+       "<_Benchmark.BBH: 'bbh'>,\n",
+       "<_Benchmark.GPQA: 'gpqa'>,\n",
+       "<_Benchmark.MATH: 'math'>,\n",
+       "<_Benchmark.STRONG_REJECT: 'strong_reject'>,\n",
+       "<_Benchmark.IFEVAL: 'ifeval'>,\n",
+       "<_Benchmark.MMMU: 'mmmu'>,\n",
+       "<_Benchmark.LLM_JUDGE: 'llm_judge'>\n",
+       "]\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m<\u001b[0m\u001b[1;38;2;225;0;225m_Benchmark.MMLU:\u001b[0m\u001b[39m \u001b[0m\u001b[38;2;0;135;0m'mmlu'\u001b[0m\u001b[39m>,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[39m<_Benchmark.MMLU_PRO: \u001b[0m\u001b[38;2;0;135;0m'mmlu_pro'\u001b[0m\u001b[39m>,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[39m<_Benchmark.BBH: \u001b[0m\u001b[38;2;0;135;0m'bbh'\u001b[0m\u001b[39m>,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[39m<_Benchmark.GPQA: \u001b[0m\u001b[38;2;0;135;0m'gpqa'\u001b[0m\u001b[39m>,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[39m<_Benchmark.MATH: \u001b[0m\u001b[38;2;0;135;0m'math'\u001b[0m\u001b[39m>,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[39m<_Benchmark.STRONG_REJECT: \u001b[0m\u001b[38;2;0;135;0m'strong_reject'\u001b[0m\u001b[39m>,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[39m<_Benchmark.IFEVAL: \u001b[0m\u001b[38;2;0;135;0m'ifeval'\u001b[0m\u001b[39m>,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[39m<_Benchmark.MMMU: \u001b[0m\u001b[38;2;0;135;0m'mmmu'\u001b[0m\u001b[39m>,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[39m<_Benchmark.LLM_JUDGE: \u001b[0m\u001b[38;2;0;135;0m'llm_judge'\u001b[0m\u001b[1m>\u001b[0m\n", + "\u001b[1m]\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
{\n",
+       "'modality': 'Text',\n",
+       "'description': 'General Physics Question Answering – Assesses comprehension of physics concepts and related problem-solving abilities.',\n",
+       "'metrics': ['accuracy'],\n",
+       "'strategy': 'zs_cot',\n",
+       "'subtask_available': False,\n",
+       "'subtasks': None\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'modality'\u001b[0m: \u001b[38;2;0;135;0m'Text'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'description'\u001b[0m: \u001b[38;2;0;135;0m'General Physics Question Answering – Assesses comprehension of physics concepts and related problem-solving abilities.'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'metrics'\u001b[0m: \u001b[1m[\u001b[0m\u001b[38;2;0;135;0m'accuracy'\u001b[0m\u001b[1m]\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'strategy'\u001b[0m: \u001b[38;2;0;135;0m'zs_cot'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'subtask_available'\u001b[0m: \u001b[3;38;2;215;0;0mFalse\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'subtasks'\u001b[0m: \u001b[3;38;2;225;0;225mNone\u001b[0m\n", + "\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "from sagemaker.train.evaluate import get_benchmarks, get_benchmark_properties\n", "from rich.pretty import pprint\n", "\n", "# Configure logging to show INFO messages\n", - "import logging\n", - "logging.basicConfig(\n", - " level=logging.INFO,\n", - " format='%(levelname)s - %(name)s - %(message)s'\n", - ")\n", + "# import logging\n", + "# logging.basicConfig(\n", + "# level=logging.INFO,\n", + "# format='%(levelname)s - %(name)s - %(message)s'\n", + "# )\n", "\n", "# Get available benchmarks\n", "Benchmark = get_benchmarks()\n", "pprint(list(Benchmark))\n", "\n", "# Print properties for a specific benchmark\n", - "pprint(get_benchmark_properties(benchmark=Benchmark.GEN_QA))" + "pprint(get_benchmark_properties(benchmark=Benchmark.GPQA))" ] }, { @@ -67,22 +138,200 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:sagemaker.core.utils.utils:No region provided. Using default region.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "model_package_group_name='model-package-group-nova13' model_package_group_arn='arn:aws:sagemaker:us-east-1:529088288990:model-package-group/model-package-group-nova13' model_package_group_description=Unassigned() creation_time=datetime.datetime(2026, 3, 4, 13, 13, 33, 33000, tzinfo=tzlocal()) created_by=UserContext(user_profile_arn=Unassigned(), user_profile_name=Unassigned(), domain_id=Unassigned(), iam_identity=IamIdentity(arn='arn:aws:sts::529088288990:assumed-role/Admin/mollyhe-Isengard', principal_id='AROAXWMA6TDPKWL3QDB2W:mollyhe-Isengard', source_identity=Unassigned())) model_package_group_status='Completed'\n" + ] + } + ], + "source": [ + "from sagemaker.core.resources import ModelPackage, ModelPackageGroup\n", + "\n", + "# model_package_group=ModelPackageGroup.create(model_package_group_name=\"model-package-group-nova13\")\n", + "model_package_group=ModelPackageGroup.get(model_package_group_name=\"model-package-group-nova13\")\n", + "print(model_package_group)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, "metadata": {}, "outputs": [], + "source": [ + "\n", + "import json\n", + "import re\n", + "from sagemaker.core.utils.utils import Unassigned\n", + "\n", + "def pretty_print(obj):\n", + " def parse_unassigned(item):\n", + " if isinstance(item, Unassigned):\n", + " return None\n", + " if isinstance(item, dict):\n", + " return {k: parse_unassigned(v) for k, v in item.items() if parse_unassigned(v) is not None}\n", + " if isinstance(item, list):\n", + " return [parse_unassigned(x) for x in item if parse_unassigned(x) is not None]\n", + " if isinstance(item, str) and \"Unassigned object\" in item:\n", + " pairs = re.findall(r\"(\\w+)=([^<][^=]*?)(?=\\s+\\w+=|$)\", item)\n", + " result = {k: v.strip(\"'\\\"\") for k, v in pairs}\n", + " return result if result else None\n", + " return item\n", + "\n", + " cleaned = parse_unassigned(obj.__dict__ if hasattr(obj, '__dict__') else obj)\n", + " print(json.dumps(cleaned, indent=2, default=str))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"training_job_name\": \"nova-textgeneration-micro-sft-20260302144611\",\n", + " \"training_job_arn\": \"arn:aws:sagemaker:us-east-1:529088288990:training-job/nova-textgeneration-micro-sft-20260302144611\",\n", + " \"model_artifacts\": \"s3_model_artifacts='s3://sagemaker-us-east-1-529088288990/output/nova-textgeneration-micro-sft-20260302144611/output/model'\",\n", + " \"training_job_output\": \"s3_training_job_output='s3://sagemaker-us-east-1-529088288990/output/nova-textgeneration-micro-sft-20260302144611/output/output'\",\n", + " \"training_job_status\": \"Completed\",\n", + " \"secondary_status\": \"Completed\",\n", + " \"hyper_parameters\": {\n", + " \"global_batch_size\": \"64\",\n", + " \"learning_rate\": \"1e-05\",\n", + " \"learning_rate_ratio\": \"16.0\",\n", + " \"lora_alpha\": \"128\",\n", + " \"max_context_length\": \"8192\",\n", + " \"max_epochs\": \"2\",\n", + " \"name\": \"my-lora-run-wkxk5\"\n", + " },\n", + " \"role_arn\": \"arn:aws:iam::529088288990:role/Admin\",\n", + " \"input_data_config\": [\n", + " \"channel_name='train' data_source=DataSource(s3_data_source=Unassigned(), file_system_data_source=Unassigned(), dataset_source=DatasetSource(dataset_arn='arn:aws:sagemaker:us-east-1:529088288990:hub-content/G82247CBEQ6TN0FI3J1SAJU59UURMT3H3EKG48C6VQGDINNRNGU0/DataSet/demo-1/5.0.0')) content_type=Unassigned() compression_type='None' record_wrapper_type='None' input_mode=Unassigned() shuffle_config=Unassigned() enable_ffm=False\"\n", + " ],\n", + " \"output_data_config\": \"s3_output_path='s3://sagemaker-us-east-1-529088288990/output/' kms_key_id='' compression_type='NONE' remove_job_name_from_s3_output_path=False disable_model_upload=False channels=Unassigned()\",\n", + " \"stopping_condition\": \"max_runtime_in_seconds=86400 max_wait_time_in_seconds=Unassigned() max_pending_time_in_seconds=Unassigned()\",\n", + " \"creation_time\": \"2026-03-02 14:46:12.873000-08:00\",\n", + " \"training_start_time\": \"2026-03-02 14:58:59.629000-08:00\",\n", + " \"training_end_time\": \"2026-03-02 15:22:26.851000-08:00\",\n", + " \"last_modified_time\": \"2026-03-02 15:22:26.851000-08:00\",\n", + " \"secondary_status_transitions\": [\n", + " \"status='Starting' start_time=datetime.datetime(2026, 3, 2, 14, 46, 12, 873000, tzinfo=tzlocal()) end_time=datetime.datetime(2026, 3, 2, 14, 46, 14, 829000, tzinfo=tzlocal()) status_message='Starting the training job'\",\n", + " \"status='Pending' start_time=datetime.datetime(2026, 3, 2, 14, 46, 14, 829000, tzinfo=tzlocal()) end_time=datetime.datetime(2026, 3, 2, 14, 58, 59, 629000, tzinfo=tzlocal()) status_message='Preparing the instances for training'\",\n", + " \"status='Downloading' start_time=datetime.datetime(2026, 3, 2, 14, 58, 59, 629000, tzinfo=tzlocal()) end_time=datetime.datetime(2026, 3, 2, 15, 7, 4, 705000, tzinfo=tzlocal()) status_message='Downloading the training image'\",\n", + " \"status='Training' start_time=datetime.datetime(2026, 3, 2, 15, 7, 4, 705000, tzinfo=tzlocal()) end_time=datetime.datetime(2026, 3, 2, 15, 21, 2, 752000, tzinfo=tzlocal()) status_message='Training image download completed. Training in progress.'\",\n", + " \"status='Uploading' start_time=datetime.datetime(2026, 3, 2, 15, 21, 2, 752000, tzinfo=tzlocal()) end_time=datetime.datetime(2026, 3, 2, 15, 22, 26, 851000, tzinfo=tzlocal()) status_message='Uploading generated training model'\",\n", + " \"status='Completed' start_time=datetime.datetime(2026, 3, 2, 15, 22, 26, 851000, tzinfo=tzlocal()) end_time=datetime.datetime(2026, 3, 2, 15, 22, 26, 851000, tzinfo=tzlocal()) status_message='Training job completed'\"\n", + " ],\n", + " \"enable_network_isolation\": false,\n", + " \"enable_inter_container_traffic_encryption\": false,\n", + " \"enable_managed_spot_training\": false,\n", + " \"training_time_in_seconds\": 1407,\n", + " \"billable_time_in_seconds\": 0,\n", + " \"billable_token_count\": 153604,\n", + " \"disable_efa\": false,\n", + " \"image_metadata\": \"image_type='BYOImage'\",\n", + " \"serverless_job_config\": \"base_model_arn='arn:aws:sagemaker:us-east-1:aws:hub-content/SageMakerPublicHub/Model/nova-textgeneration-micro/2.45.1' job_type='FineTuning' accept_eula=True customization_technique='SFT' peft='LORA' evaluation_type=Unassigned() evaluator_arn=Unassigned() job_spec=Unassigned()\",\n", + " \"mlflow_config\": \"mlflow_resource_arn='arn:aws:sagemaker:us-east-1:529088288990:mlflow-app/app-2SV6Q35HTCBO' mlflow_tracking_server_arn=Unassigned() mlflow_experiment_name='test-finetuned-models-exp' mlflow_run_name='test-finetuned-models-run'\",\n", + " \"model_package_config\": \"model_package_group_arn='arn:aws:sagemaker:us-east-1:529088288990:model-package-group/model-package-group-nova6' source_model_package_arn=Unassigned()\",\n", + " \"mlflow_details\": \"mlflow_experiment_id='2' mlflow_run_id='1f13027c62db4ac198a7d8452a682d5d'\",\n", + " \"progress_info\": \"total_step_count_per_epoch=1 current_step=1 current_epoch=2 max_epoch=2\",\n", + " \"output_model_package_arn\": \"arn:aws:sagemaker:us-east-1:529088288990:model-package/model-package-group-nova6/1\"\n", + "}\n" + ] + } + ], + "source": [ + "from sagemaker.core.resources import TrainingJob\n", + "\n", + "response = TrainingJob.get(training_job_name=\"nova-textgeneration-micro-sft-20260302144611\")\n", + "pretty_print(response)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml\n", + "sagemaker.config INFO - Not applying SDK defaults from location: /Users/mollyhe/Library/Application Support/sagemaker/config.yaml\n" + ] + }, + { + "data": { + "text/html": [ + "
BenchMarkEvaluator(\n",
+       "region=None,\n",
+       "sagemaker_session=<sagemaker.core.helper.session_helper.Session object at 0x17252afe0>,\n",
+       "model='arn:aws:sagemaker:us-east-1:529088288990:model-package/model-package-group-nova6/1',\n",
+       "base_eval_name='gpqa-eval-demo',\n",
+       "s3_output_path='s3://sagemaker-us-east-1-529088288990/eval',\n",
+       "mlflow_resource_arn='arn:aws:sagemaker:us-east-1:529088288990:mlflow-app/app-2SV6Q35HTCBO',\n",
+       "mlflow_experiment_name='test-eval-models-exp',\n",
+       "mlflow_run_name=None,\n",
+       "networking=None,\n",
+       "kms_key_id=None,\n",
+       "model_package_group='arn:aws:sagemaker:us-east-1:529088288990:model-package-group/model-package-group-nova13',\n",
+       "benchmark=<_Benchmark.GPQA: 'gpqa'>,\n",
+       "subtasks=None,\n",
+       "evaluate_base_model=False\n",
+       ")\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;38;2;225;0;225mBenchMarkEvaluator\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mregion\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0msagemaker_session\u001b[0m=\u001b[1m<\u001b[0m\u001b[1;38;2;225;0;225msagemaker.core.helper.session_helper.Session\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x17252afe0\u001b[0m\u001b[39m>,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mmodel\u001b[0m\u001b[39m=\u001b[0m\u001b[38;2;0;135;0m'arn:aws:sagemaker:us-east-1:529088288990:model-package/model-package-group-nova6/1'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mbase_eval_name\u001b[0m\u001b[39m=\u001b[0m\u001b[38;2;0;135;0m'gpqa-eval-demo'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0ms3_output_path\u001b[0m\u001b[39m=\u001b[0m\u001b[38;2;0;135;0m's3://sagemaker-us-east-1-529088288990/eval'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mmlflow_resource_arn\u001b[0m\u001b[39m=\u001b[0m\u001b[38;2;0;135;0m'arn:aws:sagemaker:us-east-1:529088288990:mlflow-app/app-2SV6Q35HTCBO'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mmlflow_experiment_name\u001b[0m\u001b[39m=\u001b[0m\u001b[38;2;0;135;0m'test-eval-models-exp'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mmlflow_run_name\u001b[0m\u001b[39m=\u001b[0m\u001b[3;38;2;225;0;225mNone\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mnetworking\u001b[0m\u001b[39m=\u001b[0m\u001b[3;38;2;225;0;225mNone\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mkms_key_id\u001b[0m\u001b[39m=\u001b[0m\u001b[3;38;2;225;0;225mNone\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mmodel_package_group\u001b[0m\u001b[39m=\u001b[0m\u001b[38;2;0;135;0m'arn:aws:sagemaker:us-east-1:529088288990:model-package-group/model-package-group-nova13'\u001b[0m\u001b[39m,\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mbenchmark\u001b[0m\u001b[39m=<_Benchmark.GPQA: \u001b[0m\u001b[38;2;0;135;0m'gpqa'\u001b[0m\u001b[1m>\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0msubtasks\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mevaluate_base_model\u001b[0m=\u001b[3;38;2;215;0;0mFalse\u001b[0m\n", + "\u001b[1m)\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "from sagemaker.train.evaluate import BenchMarkEvaluator\n", "\n", "# Create evaluator with GEN_QA benchmark\n", "# These values match our successfully tested configuration\n", "evaluator = BenchMarkEvaluator(\n", - " benchmark=Benchmark.GEN_QA,\n", - " model=\"arn:aws:sagemaker:us-west-2:052150106756:model-package/test-finetuned-models-gamma/28\",\n", - " s3_output_path=\"s3://mufi-test-serverless-smtj/eval/\",\n", - " mlflow_resource_arn=\"arn:aws:sagemaker:us-west-2:052150106756:mlflow-tracking-server/mmlu-eval-experiment\",\n", - " dataset=\"s3://sagemaker-us-west-2-052150106756/studio-users/d20251107t195443/datasets/2025-11-07T19-55-37-609Z/zc_test.jsonl\",\n", - " model_package_group=\"arn:aws:sagemaker:us-west-2:052150106756:model-package-group/example-name-aovqo\", # Optional inferred from model if model package\n", - " base_eval_name=\"gen-qa-eval-demo\",\n", + " benchmark=Benchmark.GPQA,\n", + " model=\"arn:aws:sagemaker:us-east-1:529088288990:model-package/model-package-group-nova6/1\",\n", + " s3_output_path=\"s3://sagemaker-us-east-1-529088288990/eval\",\n", + " mlflow_experiment_name=\"test-eval-models-exp\", \n", + " # mlflow_resource_arn=\"arn:aws:sagemaker:us-west-2:052150106756:mlflow-tracking-server/mmlu-eval-experiment\",\n", + " dataset=\"s3://sagemaker-us-east-1-529088288990/eval.jsonl\",\n", + " model_package_group=\"arn:aws:sagemaker:us-east-1:529088288990:model-package-group/model-package-group-nova13\", # Optional inferred from model if model package\n", + " base_eval_name=\"gpqa-eval-demo\",\n", " # Note: sagemaker_session is optional and will be auto-created if not provided\n", " # Note: region is optional and will be auto deduced using environment variables - SAGEMAKER_REGION, AWS_REGION\n", ")\n", @@ -92,7 +341,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -119,7 +368,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -155,9 +404,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
{'max_new_tokens': '8196', 'temperature': '0', 'top_k': '-1', 'top_p': '1.0'}\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\u001b[38;2;0;135;0m'max_new_tokens'\u001b[0m: \u001b[38;2;0;135;0m'8196'\u001b[0m, \u001b[38;2;0;135;0m'temperature'\u001b[0m: \u001b[38;2;0;135;0m'0'\u001b[0m, \u001b[38;2;0;135;0m'top_k'\u001b[0m: \u001b[38;2;0;135;0m'-1'\u001b[0m, \u001b[38;2;0;135;0m'top_p'\u001b[0m: \u001b[38;2;0;135;0m'1.0'\u001b[0m\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "pprint(evaluator.hyperparameters.to_dict())\n", "\n", @@ -188,9 +451,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
BenchmarkEvaluationExecution(\n",
+       "arn='arn:aws:sagemaker:us-east-1:529088288990:pipeline/SagemakerEvaluation-BenchmarkEvaluation-d47edc79-1d45-4b8f-9ff6-df8e2c8c92be/execution/ab0k65fkztid',\n",
+       "name='gpqa-eval-demo',\n",
+       "status=PipelineExecutionStatus(overall_status='Executing', step_details=[], failure_reason=None),\n",
+       "last_modified_time=datetime.datetime(2026, 3, 10, 17, 48, 10, 308000, tzinfo=tzlocal()),\n",
+       "eval_type=<EvalType.BENCHMARK: 'benchmark'>,\n",
+       "s3_output_path='s3://sagemaker-us-east-1-529088288990/eval',\n",
+       "steps=[]\n",
+       ")\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;38;2;225;0;225mBenchmarkEvaluationExecution\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0marn\u001b[0m=\u001b[38;2;0;135;0m'arn:aws:sagemaker:us-east-1:529088288990:pipeline/SagemakerEvaluation-BenchmarkEvaluation-d47edc79-1d45-4b8f-9ff6-df8e2c8c92be/execution/ab0k65fkztid'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mname\u001b[0m=\u001b[38;2;0;135;0m'gpqa-eval-demo'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mstatus\u001b[0m=\u001b[1;38;2;225;0;225mPipelineExecutionStatus\u001b[0m\u001b[1m(\u001b[0m\u001b[38;2;215;175;0moverall_status\u001b[0m=\u001b[38;2;0;135;0m'Executing'\u001b[0m, \u001b[38;2;215;175;0mstep_details\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m, \u001b[38;2;215;175;0mfailure_reason\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mlast_modified_time\u001b[0m=\u001b[1;38;2;225;0;225mdatetime\u001b[0m\u001b[1;38;2;225;0;225m.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2026\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m10\u001b[0m, \u001b[1;36m17\u001b[0m, \u001b[1;36m48\u001b[0m, \u001b[1;36m10\u001b[0m, \u001b[1;36m308000\u001b[0m, \u001b[38;2;215;175;0mtzinfo\u001b[0m=\u001b[1;38;2;225;0;225mtzlocal\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0meval_type\u001b[0m=\u001b[1m<\u001b[0m\u001b[1;38;2;225;0;225mEvalType.BENCHMARK:\u001b[0m\u001b[39m \u001b[0m\u001b[38;2;0;135;0m'benchmark'\u001b[0m\u001b[1m>\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0ms3_output_path\u001b[0m=\u001b[38;2;0;135;0m's3://sagemaker-us-east-1-529088288990/eval'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0msteps\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", + "\u001b[1m)\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Pipeline Execution ARN: arn:aws:sagemaker:us-east-1:529088288990:pipeline/SagemakerEvaluation-BenchmarkEvaluation-d47edc79-1d45-4b8f-9ff6-df8e2c8c92be/execution/ab0k65fkztid\n", + "Initial Status: Executing\n" + ] + } + ], "source": [ "# Run evaluation with configured parameters\n", "execution = evaluator.evaluate()\n", @@ -211,7 +513,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -231,9 +533,79 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
PipelineExecutionStatus(\n",
+       "overall_status='Executing',\n",
+       "step_details=[\n",
+       "│   │   StepDetail(\n",
+       "│   │   │   name='CreateEvaluationAction',\n",
+       "│   │   │   status='Starting',\n",
+       "│   │   │   start_time='2026-03-10T17:48:10.807000-07:00',\n",
+       "│   │   │   end_time=None,\n",
+       "│   │   │   display_name=None,\n",
+       "│   │   │   failure_reason=None,\n",
+       "│   │   │   job_arn=None\n",
+       "│   │   ),\n",
+       "│   │   StepDetail(\n",
+       "│   │   │   name='EvaluateCustomModel',\n",
+       "│   │   │   status='Starting',\n",
+       "│   │   │   start_time='2026-03-10T17:48:10.807000-07:00',\n",
+       "│   │   │   end_time=None,\n",
+       "│   │   │   display_name=None,\n",
+       "│   │   │   failure_reason=None,\n",
+       "│   │   │   job_arn=None\n",
+       "│   │   )\n",
+       "],\n",
+       "failure_reason=None\n",
+       ")\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1;38;2;225;0;225mPipelineExecutionStatus\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0moverall_status\u001b[0m=\u001b[38;2;0;135;0m'Executing'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mstep_details\u001b[0m=\u001b[1m[\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1;38;2;225;0;225mStepDetail\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mname\u001b[0m=\u001b[38;2;0;135;0m'CreateEvaluationAction'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mstatus\u001b[0m=\u001b[38;2;0;135;0m'Starting'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mstart_time\u001b[0m=\u001b[38;2;0;135;0m'2026-03-10T17:48:10.807000-07:00'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mend_time\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mdisplay_name\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mfailure_reason\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mjob_arn\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m,\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1;38;2;225;0;225mStepDetail\u001b[0m\u001b[1m(\u001b[0m\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mname\u001b[0m=\u001b[38;2;0;135;0m'EvaluateCustomModel'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mstatus\u001b[0m=\u001b[38;2;0;135;0m'Starting'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mstart_time\u001b[0m=\u001b[38;2;0;135;0m'2026-03-10T17:48:10.807000-07:00'\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mend_time\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mdisplay_name\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mfailure_reason\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m,\n", + "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mjob_arn\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m\n", + "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mfailure_reason\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m\n", + "\u001b[1m)\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Step Details:\n", + " CreateEvaluationAction: Starting\n", + " EvaluateCustomModel: Starting\n" + ] + } + ], "source": [ "# Refresh status\n", "execution.refresh()\n", @@ -259,9 +631,128 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
╭─────────────────────────────────────────── Pipeline Execution Status ───────────────────────────────────────────╮\n",
+       "  Overall Status        Executing                                                                                \n",
+       "  Target Status         Succeeded                                                                                \n",
+       "  Elapsed Time          33.4s                                                                                    \n",
+       "                                                                                                                 \n",
+       " Pipeline Steps                                                                                                  \n",
+       "  Step Name                       Status           Duration                                                      \n",
+       "  CreateEvaluationAction          Succeeded        1.1s                                                          \n",
+       "  EvaluateCustomModel             Executing        Running...                                                    \n",
+       "                                                                                                                 \n",
+       " Job ARNs                                                                                                        \n",
+       "  Step                  Job Link      Job ARN                                                                    \n",
+       "  EvaluateCustomModel   🔗 link       arn:aws:sagemaker:us-east-1:529088288990:training-job/pipelines-ab0k65fkz  \n",
+       "                                      tid-EvaluateCustomModel-5XaO1EW1hQ                                         \n",
+       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[34m╭─\u001b[0m\u001b[34m──────────────────────────────────────────\u001b[0m\u001b[34m \u001b[0m\u001b[1;34mPipeline Execution Status\u001b[0m\u001b[34m \u001b[0m\u001b[34m──────────────────────────────────────────\u001b[0m\u001b[34m─╮\u001b[0m\n", + "\u001b[34m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mOverall Status \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;37mExecuting\u001b[0m\u001b[37m \u001b[0m \u001b[34m│\u001b[0m\n", + "\u001b[34m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mTarget Status \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;37mSucceeded\u001b[0m\u001b[37m \u001b[0m \u001b[34m│\u001b[0m\n", + "\u001b[34m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mElapsed Time \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[37m33.4s \u001b[0m\u001b[37m \u001b[0m \u001b[34m│\u001b[0m\n", + "\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", + "\u001b[34m│\u001b[0m \u001b[1;35mPipeline Steps\u001b[0m \u001b[34m│\u001b[0m\n", + "\u001b[34m│\u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mStep Name \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35mStatus \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35mDuration \u001b[0m\u001b[1;35m \u001b[0m \u001b[34m│\u001b[0m\n", + "\u001b[34m│\u001b[0m \u001b[36m \u001b[0m\u001b[36mCreateEvaluationAction \u001b[0m\u001b[36m \u001b[0m\u001b[33m \u001b[0m\u001b[32mSucceeded\u001b[0m\u001b[33m \u001b[0m\u001b[33m \u001b[0m\u001b[32m \u001b[0m\u001b[32m1.1s \u001b[0m\u001b[32m \u001b[0m \u001b[34m│\u001b[0m\n", + "\u001b[34m│\u001b[0m \u001b[36m \u001b[0m\u001b[36mEvaluateCustomModel \u001b[0m\u001b[36m \u001b[0m\u001b[33m \u001b[0m\u001b[33mExecuting\u001b[0m\u001b[33m \u001b[0m\u001b[33m \u001b[0m\u001b[32m \u001b[0m\u001b[32mRunning... \u001b[0m\u001b[32m \u001b[0m \u001b[34m│\u001b[0m\n", + "\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", + "\u001b[34m│\u001b[0m \u001b[1;35mJob ARNs\u001b[0m \u001b[34m│\u001b[0m\n", + "\u001b[34m│\u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mStep \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35mJob Link \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35mJob ARN \u001b[0m\u001b[1;35m \u001b[0m \u001b[34m│\u001b[0m\n", + "\u001b[34m│\u001b[0m \u001b[36m \u001b[0m\u001b[36mEvaluateCustomModel \u001b[0m\u001b[36m \u001b[0m \u001b]8;id=840181;https://studio-d-ingyyaeglvki.studio.us-east-1.sagemaker.aws/jobs/train/pipelines-ab0k65fkztid-EvaluateCustomModel-5XaO1EW1hQ\u001b\\🔗 link\u001b]8;;\u001b\\ \u001b[2m \u001b[0m\u001b[2marn:aws:sagemaker:us-east-1:529088288990:training-job/pipelines-ab0k65fkz\u001b[0m\u001b[2m \u001b[0m \u001b[34m│\u001b[0m\n", + "\u001b[34m│\u001b[0m \u001b[36m \u001b[0m \u001b[2m \u001b[0m\u001b[2mtid-EvaluateCustomModel-5XaO1EW1hQ \u001b[0m\u001b[2m \u001b[0m \u001b[34m│\u001b[0m\n", + "\u001b[34m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+       " in <module>:3                                                                                    \n",
+       "                                                                                                  \n",
+       "   1 # Wait for job completion with progress updates                                              \n",
+       "   2 # This will show a rich progress display in Jupyter                                          \n",
+       " 3 execution.wait(target_status=\"Succeeded\", poll=5, timeout=3600)                              \n",
+       "   4                                                                                              \n",
+       "   5 print(f\"\\nFinal Status: {execution.status.overall_status}\")                                  \n",
+       "   6                                                                                              \n",
+       "                                                                                                  \n",
+       " /Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk/sagemaker-core/src/sagemaker/core/teleme \n",
+       " try/telemetry_logging.py:180 in wrapper                                                          \n",
+       "                                                                                                  \n",
+       "   177 │   │   │   │   │   \"sagemaker_session is not provided or not valid.\",                     \n",
+       "   178 │   │   │   │   │   func_name,                                                             \n",
+       "   179 │   │   │   │   )                                                                          \n",
+       " 180 │   │   │   │   return func(*args, **kwargs)                                               \n",
+       "   181 │   │                                                                                      \n",
+       "   182 │   │   return wrapper                                                                     \n",
+       "   183                                                                                            \n",
+       "                                                                                                  \n",
+       " /Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-train/src/sagemaker/trai \n",
+       " n/evaluate/execution.py:1061 in wait                                                             \n",
+       "                                                                                                  \n",
+       "   1058 │   │   │   │   │   │   status=current_status                                             \n",
+       "   1059 │   │   │   │   │   )                                                                     \n",
+       "   1060 │   │   │   │                                                                             \n",
+       " 1061 │   │   │   │   time.sleep(poll)                                                          \n",
+       "   1062 │   │   else:                                                                             \n",
+       "   1063 │   │   │   # Terminal experience with rich library                                       \n",
+       "   1064 │   │   │   try:                                                                          \n",
+       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "KeyboardInterrupt\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;255;0;0m╭─\u001b[0m\u001b[38;2;255;0;0m──────────────────────────────\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[1;38;2;255;0;0mTraceback \u001b[0m\u001b[1;2;38;2;255;0;0m(most recent call last)\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[38;2;255;0;0m───────────────────────────────\u001b[0m\u001b[38;2;255;0;0m─╮\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m in \u001b[92m\u001b[0m:\u001b[94m3\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1 \u001b[0m\u001b[2m# Wait for job completion with progress updates\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m2 \u001b[0m\u001b[2m# This will show a rich progress display in Jupyter\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m3 execution.wait(target_status=\u001b[33m\"\u001b[0m\u001b[33mSucceeded\u001b[0m\u001b[33m\"\u001b[0m, poll=\u001b[94m5\u001b[0m, timeout=\u001b[94m3600\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m4 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m5 \u001b[0m\u001b[96mprint\u001b[0m(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33m\\n\u001b[0m\u001b[33mFinal Status: \u001b[0m\u001b[33m{\u001b[0mexecution.status.overall_status\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m6 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk/sagemaker-core/src/sagemaker/core/teleme\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mtry/\u001b[0m\u001b[1;33mtelemetry_logging.py\u001b[0m:\u001b[94m180\u001b[0m in \u001b[92mwrapper\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m177 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[33m\"\u001b[0m\u001b[33msagemaker_session is not provided or not valid.\u001b[0m\u001b[33m\"\u001b[0m, \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m178 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mfunc_name, \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m179 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m180 \u001b[2m│ │ │ │ \u001b[0m\u001b[94mreturn\u001b[0m func(*args, **kwargs) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m181 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m182 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m wrapper \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m183 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-train/src/sagemaker/trai\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mn/evaluate/\u001b[0m\u001b[1;33mexecution.py\u001b[0m:\u001b[94m1061\u001b[0m in \u001b[92mwait\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1058 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0mstatus=current_status \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1059 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1060 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m1061 \u001b[2m│ │ │ │ \u001b[0mtime.sleep(poll) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1062 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1063 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[2m# Terminal experience with rich library\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1064 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", + "\u001b[1;91mKeyboardInterrupt\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Wait for job completion with progress updates\n", "# This will show a rich progress display in Jupyter\n", @@ -338,13 +829,13 @@ "\n", "# Get an existing job by ARN\n", "# Replace with your actual pipeline execution ARN\n", - "existing_arn = \"arn:aws:sagemaker:us-west-2:052150106756:pipeline/SagemakerEvaluation-BenchmarkEvaluation-c344c91d-6f62-4907-85cc-7e6b29171c42/execution/inlsexrd7jes\"\n", + "existing_arn = \"arn:aws:sagemaker:us-east-1:529088288990:pipeline/SagemakerEvaluation-BenchmarkEvaluation-d47edc79-1d45-4b8f-9ff6-df8e2c8c92be/execution/bhtdxm5tenya\"\n", "\n", "# base model only example\n", "# existing_arn = \"arn:aws:sagemaker:us-west-2:052150106756:pipeline/SagemakerEvaluation-benchmark/execution/gdp9f4dbv2vi\"\n", "existing_execution = EvaluationPipelineExecution.get(\n", " arn=existing_arn,\n", - " region=\"us-west-2\"\n", + " region=\"us-east-1\"\n", ")\n", "\n", "pprint(existing_execution)\n", @@ -442,7 +933,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "py3.10.14", "language": "python", "name": "python3" }, @@ -456,7 +947,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.12" + "version": "3.10.14" } }, "nbformat": 4, diff --git a/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py b/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py index 0b617601e2..2b48149871 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py @@ -8,25 +8,40 @@ logger = logging.getLogger(__name__) -def get_studio_url(training_job: TrainingJob, domain_id: str = None) -> str: +def get_studio_url(training_job, domain_id: str = None) -> str: """Get SageMaker Studio URL for training job logs. Args: - training_job: SageMaker TrainingJob object or job name string + training_job: SageMaker TrainingJob object, job name string, or job ARN string domain_id: Studio domain ID (e.g., 'd-xxxxxxxxxxxx'). If not provided, attempts to auto-detect Returns: - Studio URL pointing to the training job details + Studio URL pointing to the training job details, or empty string if not resolvable Example: >>> from sagemaker.train import get_studio_url >>> url = get_studio_url('my-training-job') + >>> url = get_studio_url('arn:aws:sagemaker:us-east-1:123456789:training-job/my-job') """ + import re + + # Handle ARN string — extract region and job name directly if isinstance(training_job, str): - training_job = TrainingJob.get(training_job_name=training_job) - - region = training_job.region if hasattr(training_job, 'region') and training_job.region else 'us-east-1' - job_name = training_job.training_job_name + arn_match = re.match( + r'arn:aws(?:-[a-z]+)?:sagemaker:([a-z0-9-]+):\d+:training-job/(.+)', + training_job, + ) + if arn_match: + region = arn_match.group(1) + job_name = arn_match.group(2) + else: + # Treat as job name, need to fetch the object + training_job = TrainingJob.get(training_job_name=training_job) + region = training_job.region if hasattr(training_job, 'region') and training_job.region else 'us-east-1' + job_name = training_job.training_job_name + else: + region = training_job.region if hasattr(training_job, 'region') and training_job.region else 'us-east-1' + job_name = training_job.training_job_name sm_client = boto3.client('sagemaker', region_name=region) @@ -40,13 +55,82 @@ def get_studio_url(training_job: TrainingJob, domain_id: str = None) -> str: pass if not domain_id: - # Fallback to console URL - return f"https://{region}.console.aws.amazon.com/sagemaker/home?region={region}#/jobs/{job_name}" + return "" # Studio URL format: https://studio-{domain_id}.studio.{region}.sagemaker.aws/jobs/train/{job_name} return f"https://studio-{domain_id}.studio.{region}.sagemaker.aws/jobs/train/{job_name}" +def display_job_links_html(rows: list, as_html: bool = False): + """Render job/resource links with copy-to-clipboard buttons as a Jupyter HTML table. + + Args: + rows: List of dicts, each with keys: + - label (str): Row label (e.g. step name, "Training Job", "MLflow Experiment") + - arn (str): The ARN or URI to display and copy + - url (Optional[str]): Clickable link URL. If None, resolved via get_studio_url for job ARNs. + - url_text (Optional[str]): Link display text. Defaults to "🔗 link" + - url_hint (Optional[str]): Hint text after link. Defaults to "(please sign in to Studio first)" + as_html: If True, return HTML object instead of displaying it. + + Returns: + HTML object if as_html=True, otherwise None. + """ + from IPython.display import display, HTML + import html as html_mod + + html_rows = "" + for row in rows: + escaped_arn = html_mod.escape(row['arn']) + escaped_label = html_mod.escape(row['label']) + + url = row.get('url') + if url is None: + url = get_studio_url(row['arn']) + url_text = row.get('url_text', '🔗 link') + url_hint = row.get('url_hint', '(please sign in to Studio first)') + + link_html = "" + if url: + link_html = ( + f'{html_mod.escape(url_text)}' + f' {html_mod.escape(url_hint)}' + ) + + copy_btn = ( + f'' + ) + + html_rows += ( + f'' + f'{escaped_label}' + f'{link_html}' + f'' + f'{escaped_arn}' + f' {copy_btn}' + f'' + ) + + result = HTML( + f'' + f'' + f'' + f'' + f'' + f'{html_rows}
StepJob LinkJob ARN
' + ) + + if as_html: + return result + display(result) + + def plot_training_metrics( training_job: TrainingJob, metrics: Optional[List[str]] = None, diff --git a/sagemaker-train/src/sagemaker/train/evaluate/execution.py b/sagemaker-train/src/sagemaker/train/evaluate/execution.py index e2388ef313..38dc63b4ec 100644 --- a/sagemaker-train/src/sagemaker/train/evaluate/execution.py +++ b/sagemaker-train/src/sagemaker/train/evaluate/execution.py @@ -447,6 +447,7 @@ class StepDetail(BaseModel): end_time: Optional[str] = Field(None, description="Step end time") display_name: Optional[str] = Field(None, description="Display name for the step") failure_reason: Optional[str] = Field(None, description="Reason for failure if step failed") + job_arn: Optional[str] = Field(None, description="ARN of the underlying job (training, processing, transform, etc.)") class PipelineExecutionStatus(BaseModel): @@ -938,7 +939,6 @@ def wait( # Create steps table if steps exist if self.status.step_details: - # Check if any step has a failure has_failures = any(step.failure_reason for step in self.status.step_details) steps_table = Table(show_header=True, header_style="bold magenta", box=None, padding=(0, 1)) @@ -946,10 +946,10 @@ def wait( steps_table.add_column("Status", style="yellow", width=15) steps_table.add_column("Duration", style="green", width=12) - failed_steps = [] # Track steps with failures for detailed display + failed_steps = [] + job_arn_entries = [] for step in self.status.step_details: - # Calculate duration if both times are available duration = "" if step.start_time and step.end_time: try: @@ -963,7 +963,6 @@ def wait( elif step.start_time: duration = "Running..." - # Color code status status_display = step.status if "succeeded" in step.status.lower() or "completed" in step.status.lower(): status_display = f"[green]{step.status}[/green]" @@ -972,14 +971,18 @@ def wait( elif "executing" in step.status.lower() or "running" in step.status.lower(): status_display = f"[yellow]{step.status}[/yellow]" - # Build row data + if step.job_arn: + job_arn_entries.append({ + 'step_name': step.display_name or step.name, + 'job_arn': step.job_arn, + }) + row_data = [ step.display_name or step.name, status_display, duration ] - # Add error indicator if failures exist if has_failures: if step.failure_reason: row_data.append("❌") @@ -989,35 +992,47 @@ def wait( steps_table.add_row(*row_data) - # Build combined content from rich.console import Group content_parts = [ status_table, - Text(""), # Empty line for spacing + Text(""), Text("Pipeline Steps", style="bold magenta"), steps_table ] - # Add failure details section if there are any failures if failed_steps: - content_parts.append(Text("")) # Empty line + content_parts.append(Text("")) content_parts.append(Text("Step Failure Details", style="bold red")) for step in failed_steps: - content_parts.append(Text("")) # Empty line before each failure + content_parts.append(Text("")) content_parts.append(Text(f"• {step.display_name or step.name}:", style="bold red")) content_parts.append(Text(f" {step.failure_reason}", style="red")) - combined_content = Group(*content_parts) + # Add job links table if any steps have ARNs + if job_arn_entries: + links_table = Table(show_header=True, header_style="bold magenta", box=None, padding=(0, 1)) + links_table.add_column("Step", style="cyan", width=20) + links_table.add_column("Job Link", width=12) + links_table.add_column("Job ARN", style="dim", overflow="fold") + for entry in job_arn_entries: + try: + from sagemaker.train.common_utils.metrics_visualizer import get_studio_url + url = get_studio_url(entry['job_arn']) + link_col = f"[link={url}]🔗 link[/link]" if url else "" + except Exception: + link_col = "" + links_table.add_row(entry['step_name'], link_col, entry['job_arn']) + content_parts.append(Text("")) + content_parts.append(Text("Job ARNs", style="bold magenta")) + content_parts.append(links_table) - # Display combined content in a single panel console.print(Panel( - combined_content, + Group(*content_parts), title="[bold blue]Pipeline Execution Status[/bold blue]", border_style="blue" )) else: - # Display only status table if no steps console.print(Panel( status_table, title="[bold blue]Pipeline Execution Status[/bold blue]", @@ -1204,7 +1219,22 @@ def _convert_to_subclass(self, eval_type: EvalType) -> 'EvaluationPipelineExecut execution._pipeline_execution = pipeline_execution_ref return execution - + + @staticmethod + def _extract_job_arn_from_metadata(step) -> Optional[str]: + """Extract the underlying job ARN from a pipeline step's metadata.""" + metadata = getattr(step, 'metadata', None) + if metadata is None or 'Unassigned' in metadata.__class__.__name__: + return None + for attr in ('training_job', 'processing_job', 'transform_job', 'tuning_job', + 'auto_ml_job', 'compilation_job'): + job_meta = getattr(metadata, attr, None) + if job_meta is not None and not ('Unassigned' in job_meta.__class__.__name__): + arn = getattr(job_meta, 'arn', None) + if arn and not ('Unassigned' in arn.__class__.__name__): + return str(arn) + return None + def _update_step_details_from_raw_steps(self, raw_steps: List[Any]) -> None: """Internal method to update step_details from raw pipeline execution steps @@ -1246,7 +1276,8 @@ def _update_step_details_from_raw_steps(self, raw_steps: List[Any]) -> None: start_time=start_time, end_time=end_time, display_name=step_display_name, - failure_reason=failure_reason + failure_reason=failure_reason, + job_arn=self._extract_job_arn_from_metadata(step) ) step_details.append(step_detail) @@ -1256,8 +1287,8 @@ def _update_step_details_from_raw_steps(self, raw_steps: List[Any]) -> None: logger.warning(f"Failed to process pipeline step: {str(e)}") continue - # Update the job's step details - self.status.step_details = step_details + # Update the job's step details (reverse so earliest step appears first) + self.status.step_details = list(reversed(step_details)) # ============================================================================ diff --git a/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py b/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py index b14145ff31..9112856ce6 100644 --- a/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py +++ b/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py @@ -11,8 +11,7 @@ "Metadata": {}, "MlflowConfig": { "MlflowResourceArn": "{{ mlflow_resource_arn }}"{% if mlflow_experiment_name %}, - "MlflowExperimentName": "{{ mlflow_experiment_name }}"{% endif %}{% if mlflow_run_name %}, - "MlflowRunName": "{{ mlflow_run_name }}"{% endif %} + "MlflowExperimentName": "{{ mlflow_experiment_name }}"{% endif %} }, "Parameters": [], "Steps": [ @@ -531,8 +530,7 @@ "Metadata": {}, "MlflowConfig": { "MlflowResourceArn": "{{ mlflow_resource_arn }}"{% if mlflow_experiment_name %}, - "MlflowExperimentName": "{{ mlflow_experiment_name }}"{% endif %}{% if mlflow_run_name %}, - "MlflowRunName": "{{ mlflow_run_name }}"{% endif %} + "MlflowExperimentName": "{{ mlflow_experiment_name }}"{% endif %} }, "Parameters": [], "Steps": [ @@ -925,8 +923,7 @@ "Metadata": {}, "MlflowConfig": { "MlflowResourceArn": "{{ mlflow_resource_arn }}"{% if mlflow_experiment_name %}, - "MlflowExperimentName": "{{ mlflow_experiment_name }}"{% endif %}{% if mlflow_run_name %}, - "MlflowRunName": "{{ mlflow_run_name }}"{% endif %} + "MlflowExperimentName": "{{ mlflow_experiment_name }}"{% endif %} }, "Parameters": [], "Steps": [ diff --git a/v3-examples/model-customization-examples/sft_finetuning_example_notebook_pysdk_prod_v3.ipynb b/v3-examples/model-customization-examples/sft_finetuning_example_notebook_pysdk_prod_v3.ipynb index 887f60a3d9..7b4348d0e4 100644 --- a/v3-examples/model-customization-examples/sft_finetuning_example_notebook_pysdk_prod_v3.ipynb +++ b/v3-examples/model-customization-examples/sft_finetuning_example_notebook_pysdk_prod_v3.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "87aa2004556ad7c6", "metadata": {}, "outputs": [], @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "989646bf", "metadata": {}, "outputs": [], @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "a51be0b5-fd33-4fa0-af2b-d08ce0dc7a8e", "metadata": {}, "outputs": [ @@ -96,14 +96,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "ef4f0e61-de4d-4228-b7a1-ea7497dad547", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "b7310bb3f485478f905417b421e7ae11", + "model_id": "e98bb5da0450400ab462efafe4fd3bb8", "version_major": 2, "version_minor": 0 }, @@ -141,7 +141,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Dataset ARN: arn:aws:sagemaker:us-east-1:529088288990:hub-content/G82247CBEQ6TN0FI3J1SAJU59UURMT3H3EKG48C6VQGDINNRNGU0/DataSet/demo-1/28.0.0\n" + "Dataset ARN: arn:aws:sagemaker:us-east-1:529088288990:hub-content/G82247CBEQ6TN0FI3J1SAJU59UURMT3H3EKG48C6VQGDINNRNGU0/DataSet/demo-1/45.0.0\n" ] } ], @@ -172,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "d6937550-f721-43ff-82dd-c513c328dd17", "metadata": {}, "outputs": [ @@ -224,7 +224,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "88fe8360-de50-481d-932f-564a32be66a0", "metadata": {}, "outputs": [], @@ -254,12 +254,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "de183042-bb92-4947-9acd-78d7231bda13", "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Default Finetuning Options:\n" + ] + }, + { + "data": { + "text/html": [ + "
{\n",
+       "'name': 'my-fullrank-run-k3174',\n",
+       "'global_batch_size': '16',\n",
+       "'max_epochs': '1',\n",
+       "'learning_rate': '5e-06',\n",
+       "'max_context_length': '8192'\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'name'\u001b[0m: \u001b[38;2;0;135;0m'my-fullrank-run-k3174'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'global_batch_size'\u001b[0m: \u001b[38;2;0;135;0m'16'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'max_epochs'\u001b[0m: \u001b[38;2;0;135;0m'1'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'learning_rate'\u001b[0m: \u001b[38;2;0;135;0m'5e-06'\u001b[0m,\n", + "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'max_context_length'\u001b[0m: \u001b[38;2;0;135;0m'8192'\u001b[0m\n", + "\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "print(\"Default Finetuning Options:\")\n", "pprint(sft_trainer.hyperparameters.to_dict()) # rename as hyperparameters" @@ -267,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "6b57838f-81ac-4fbe-9ddf-5588e42bcce1", "metadata": { "scrolled": true @@ -288,7 +321,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "id": "4d3b6441-9abb-447b-9307-9606a8c0fabd", "metadata": { "jupyter": { @@ -301,42 +334,50 @@ "data": { "text/html": [ "
╭────────────────────────────────── Training Job Status ───────────────────────────────────╮\n",
-       "  TrainingJob Name      🔗 nova-textgeneration-lite-sft-20260305152839                    \n",
+       "  TrainingJob Name      🔗 nova-textgeneration-lite-sft-20260310173212                    \n",
        "  TrainingJob ARN       arn:aws:sagemaker:us-east-1:529088288990:training-job/nova-textg  \n",
-       "                        eneration-lite-sft-20260305152839                                 \n",
-       "  MLflow Experiment     🔗 test-finetuned-models-exp                                      \n",
+       "                        eneration-lite-sft-20260310173212                                 \n",
+       "  MLflow Experiment     🔗 test-finetuned-models-exp                                      \n",
        "                                                                                          \n",
        "  Job Status            InProgress                                                        \n",
-       "  Secondary Status      Pending                                                           \n",
-       "  Elapsed Time          10.3s                                                             \n",
+       "  Secondary Status      Training                                                          \n",
+       "  Elapsed Time          990.7s                                                            \n",
        "                                                                                          \n",
        " Status Transitions                                                                       \n",
        "                                                                                          \n",
        "        Step              Details                               Duration                  \n",
        "  ───────────────────────────────────────────────────────────────────────────             \n",
-       "    Starting          Starting the training job             1.8s                      \n",
-       "    Pending           Training job waiting for capacity     Running...                \n",
+       "    Starting          Starting the training job             4.7s                      \n",
+       "    Pending           Preparing the instances for           656.3s                    \n",
+       "                          training                                                        \n",
+       "    Downloading       Downloading the training image        253.4s                    \n",
+       "    Training          Training image download completed.    Running...                \n",
+       "                          Training in progress.                                           \n",
        "                                                                                          \n",
        "╰──────────────────────────────────────────────────────────────────────────────────────────╯\n",
        "
\n" ], "text/plain": [ "\u001b[38;5;172m╭─\u001b[0m\u001b[38;5;172m─────────────────────────────────\u001b[0m\u001b[38;5;172m \u001b[0m\u001b[1;94mTraining Job Status\u001b[0m\u001b[38;5;172m \u001b[0m\u001b[38;5;172m──────────────────────────────────\u001b[0m\u001b[38;5;172m─╮\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mTrainingJob Name \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b]8;id=315550;https://studio-d-ingyyaeglvki.studio.us-east-1.sagemaker.aws/jobs/train/nova-textgeneration-lite-sft-20260305152839\u001b\\\u001b[37m🔗 nova-textgeneration-lite-sft-20260305152839\u001b[0m\u001b]8;;\u001b\\\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mTrainingJob Name \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b]8;id=225350;https://studio-d-ingyyaeglvki.studio.us-east-1.sagemaker.aws/jobs/train/nova-textgeneration-lite-sft-20260310173212\u001b\\\u001b[37m🔗 nova-textgeneration-lite-sft-20260310173212\u001b[0m\u001b]8;;\u001b\\\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mTrainingJob ARN \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[2;37marn:aws:sagemaker:us-east-1:529088288990:training-job/nova-textg\u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[2;37meneration-lite-sft-20260305152839\u001b[0m\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mMLflow Experiment \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b]8;id=794600;https://app-2SV6Q35HTCBO.mlflow.sagemaker.us-east-1.app.aws/auth?authToken=eyJhbGciOiJIUzI1NiJ9.eyJhdXRoVG9rZW5JZCI6IkVCNVFXNCIsImZhc0NyZWRlbnRpYWxzIjoiQWdWNHdycUhQdW9VQVJiTzR4cXhXNHRWeHlVcmpuNjRXaisyUXRWSFRka2s0c3NBWHdBQkFCVmhkM010WTNKNWNIUnZMWEIxWW14cFl5MXJaWGtBUkVGeEswRm9NelpIVDNWVWQyWjVVakl4U1hsT1kwTlNkSEZWYlZrNFVHdFNVMkZ3TjI0ck1sWkRjSFZpZUZjcmFXRm5kMk5OTkROSVNGTnhNMWRVTDBsWVVUMDlBQUVBQjJGM2N5MXJiWE1BUzJGeWJqcGhkM002YTIxek9uVnpMV1ZoYzNRdE1Ub3pNVFF4TkRZek1EWTBPREk2YTJWNUx6Y3dOMkpoTmpjeExUUXpZamd0TkRFeU5DMWhaVFUzTFRrMFlqTXdZbUptT1RJNU13QzRBUUlCQUhnQjRVMDBTK3ErVE51d1gydlFlaGtxQnVneWQ3YnNrb0pWdWQ2NmZjVENVd0g5eFBDUDFZQy9RcHg3eG5oS1ZjcWZBQUFBZmpCOEJna3Foa2lHOXcwQkJ3YWdiekJ0QWdFQU1HZ0dDU3FHU0liM0RRRUhBVEFlQmdsZ2hrZ0JaUU1FQVM0d0VRUU1odjk4eG9YSE52QjlWOHBlQWdFUWdEc0Z6Q2dETXN1Njl4dms3eVY4bVFFVE41a2Vmc0pXRnhLcE9HL1BiVlgxMFNZM1BlT3VzL29QSm1jSWVGV3FZQ3dpWlhjU1hjSmgxVnBiY3dJQUFCQUFkODB0d04vbERjMWRLdnMxY01UWll4eExGU0xMUkw5RW4zVk9WR2U0L1R4cWcvS1BqUlY3N0NJR0NseFBUZzFGLy8vLy93QUFBQUVBQUFBQUFBQUFBQUFBQUFFQUFBUTNhOXAwZkRLaXgrSE1sOEtKRXdYNkMyVkdXMlk0ZERURnJtTmZOekkyai8rTVIrei9rMGhPUTRFczYwQ2VqS0pRU0ZkY0p4TUNiK2ZzZllYVzl6amw1Q1RLNGx6K3djcCtDTnhoNjFNUXU2Yk9taWVvQm1LYnVsMGhuSWllaEhZQmV4cjFPU2JXaFF3L1k0Ui82emVlZFNadFNRalpON2d1K0ZFWnhweTBjTXhsYnBYVE5PVXdZQW9Ec1RGejVJSzdoeG1HRk9CTEdnYUtBUmM1WkFlVUxSWFB4L2hJQklQUXhaTG1Gejd2cUxqa3pLVjc3aG94Q3VKdjF3R1hEUm01NnJvZ0xibjlZR3FZVHpBMlVoVXVDa05wVVZyb09aZThKL2JpMnlXbHJ1NU5pcDFnWkJzWjVhSlQ0a055QTNFZS8vak5tT1dZbCtRbjBnVCtvR2dOM2VLMFhvcTlaQUJ4bngrNmQrSFlhckVjRXU3S3AyTFNwVTVtUEQ5TndORHJKcC9SZ0FjQ3lDV2lYVlVPbGhPL1l5bEVld0h0c2NDYkc2N2JvRTFiaS9aeks4L0czL2g2b0dyTWlGWUFHcmo3NS9BZUVrMTNuNWtuako5S3BYNnVYd0Ywdm5wNVA3ZzJ4ZDR5UVA1WEpQOExpM0NldStwRTBwV1VvbHcxcklXOXViemRKOVN2UnVNNmJzWDE3ZHBBcGxvV2hzaGZMWnhjMlU0ODZ2SlY4SytuZEpwT055Q0x5ZGRVdWpHbndZQ3dzOEdVUDVsL3o1NzUzaWpSaG4vT1cvK0owT0xMU3g2ekV3d0tIWm1jVnRQOXFFRUViNmN1WGtRUGMyWVNuTlRFb01XUm1OQ0JZME1hNnZWS3dnSXJnTVFvNVM5Nkw2TUhYVjRrWFdhOFJ0V3o3ak1zNWh3NXpPSFo5S2Q1MVp0TEdFbU1iQzVMcnpuSnRnaU1mRkJ3dk9Rak5WOGpUUHhOUFVMVUNpWWxQZ2k2TS9GbTYybE1VVXpIK3Ewd1J2VjNZOHhiUElHZVRMMUwwWEt4VDNOeXRORElaN2dZejhkTDhBeWFPVzlKRkttSGdtMm9meUNxSGZuZktvdytwZjM1TVpveUpsUUJ2MnU5RHVpNFNnT1lMckZlblFreHVYNVVVQ0dmY3lRMXNkSERRSjZyaUl1cGlTazVITHU5YllpbEdpUVI5VWZkR08wYnltSnFCcGNUYUR3Y2ZVWkVLR3dmdU9KUWxweG5XME0wT1pnTnRjbm9VVEY1SHpScTZFcEJsaXBub1ZEZjk2M2dWUWowRUdpZzNUWGx6cHBzWXl5d04zcTlSMDlUbENUUzNnNkJTOWo4UnZ6eStvTmpyMTRKTisvdExKejRCUThOVzFYMkt3L29FWFZybzZTTHVQbGp3aE1aWFVDbjU5bXdER2x3Y1AwRUVlQndhcUw5bmg1aXYvdmVmbnpBaWlUSmhOditIaGU2SzZqRzY2QkhLZ3NLc0ZRSFJCU1JCQTQweTNnU2U1SmhtbDYwQTFGM3VYLzdnYXRWdFFhbTRwM0lIT2ZCdVc5eWNiRmpPVUZGVVc2dU53eXpaZVhiYWp5QWpwKzg2TWVDODViTEpMRmZ2NHlVa2dZU0lNSkdrVEZwanNSMGMrWkFHcUZnRjJqSzNEWlFzRHBWSGtYTEtQeWZUTldOTk1UV3pGL1J0bjdKM1VwWHpoMEE1ZWdzV3lXWU1lMllOU2lPSytJWmJlS0U1ZTd3NjhuaXBsU0k4ZTFDZks2aE4wNDNaeldCbkl1U0txMFNEeXB2cVAwZ3MzTjRBdldaYmVHdENJOWVBT2I3akYzQUFRVDYyTUM3VnhHT285Qmo1aVo2ckJYVUpqQU1VOGF5cXFiQTdEdkZFVnpxWGJhbHJjUjVpNjR3Y0o3ZHB1aVBseDFEZG41NnlwWkFuOHgrRXpETEFHY3daUUl4QU1lczlNb3RJOEFnSGFTRm90dWM3MDZaL0JNOE1pd2VUUjQ3US9yb3hIWkMraG4zRzFDaVlGQ2xrakRqTGpSeTlBSXdFbnVNWitqcXFnRkRvTzcyMnJmWjBTQklTb1AyKzFQTCtUdTExL0pvZU1hUUxWZUdPd1hzNzhOd3JseE45TGI1IiwiY2lwaGVyVGV4dCI6IkFRSUJBSGdCNFUwMFMrcStUTnV3WDJ2UWVoa3FCdWd5ZDdic2tvSlZ1ZDY2ZmNUQ1V3R0xCblpkUmZWWlpqN3hTS3hiMFBUUUFBQUFvakNCbndZSktvWklodmNOQVFjR29JR1JNSUdPQWdFQU1JR0lCZ2txaGtpRzl3MEJCd0V3SGdZSllJWklBV1VEQkFFdU1CRUVETmtCbnRsUzdSbGhkSjJkN2dJQkVJQmJBN0xFYitRTVZyQnR4VmIzOVhkc1RudXNBdkhQYzBNanNidmpjaDlLVHhVc0xKcFo3VlZYVlJvVDFnS0lCd2R0aHFRKzFqcWFkUHFTeHNjQmE4K2tJS1NON05RNms2SEdtcXlGWVpDc3lPWlZZQ01KSXpUSHdSZjBDZz09Iiwic3ViIjoiYXJuOmF3czpzYWdlbWFrZXI6dXMtZWFzdC0xOjUyOTA4ODI4ODk5MDptbGZsb3ctYXBwL2FwcC0yU1Y2UTM1SFRDQk8iLCJpYXQiOjE3NzI3NTMzMjUsImV4cCI6MTc3Mjc1MzYyNX0.X2ERSgvF5-bTlp7a-wWjG7vQiNNi9RJ-dreP8RUI4n8#/experiments/2\u001b\\\u001b[37m🔗 test-finetuned-models-exp\u001b[0m\u001b]8;;\u001b\\\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[2;37meneration-lite-sft-20260310173212\u001b[0m\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mMLflow Experiment \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b]8;id=448259;https://app-2SV6Q35HTCBO.mlflow.sagemaker.us-east-1.app.aws/auth?authToken=eyJhbGciOiJIUzI1NiJ9.eyJhdXRoVG9rZW5JZCI6IlRZTENXRiIsImZhc0NyZWRlbnRpYWxzIjoiQWdWNEs1RzV1QVRWaHc1T21BK041d1VlQkFpOGtrQVVBMWpWWjlGZFpUMjc5UFVBWHdBQkFCVmhkM010WTNKNWNIUnZMWEIxWW14cFl5MXJaWGtBUkVGeE9IRjRZa04xWm1FMmVFWnROWFJDUTJSUGIydGhVazFXYm1FNWNIUlhWM1JUSzFCdWFHVTJkMFJFYmpOWmRFVlRjbVEzTTJSMWJEZHpTelY1ZVZCdFFUMDlBQUVBQjJGM2N5MXJiWE1BUzJGeWJqcGhkM002YTIxek9uVnpMV1ZoYzNRdE1Ub3pNVFF4TkRZek1EWTBPREk2YTJWNUx6Y3dOMkpoTmpjeExUUXpZamd0TkRFeU5DMWhaVFUzTFRrMFlqTXdZbUptT1RJNU13QzRBUUlCQUhnQjRVMDBTK3ErVE51d1gydlFlaGtxQnVneWQ3YnNrb0pWdWQ2NmZjVENVd0dxdHBxTHRyRmNNVTVGNzVWampMTENBQUFBZmpCOEJna3Foa2lHOXcwQkJ3YWdiekJ0QWdFQU1HZ0dDU3FHU0liM0RRRUhBVEFlQmdsZ2hrZ0JaUU1FQVM0d0VRUU1Tek05SFM3Wjl2MkFlNnBnQWdFUWdEdEsxTkVFSm9weW96cWVtcmNEM05kUDdwcERFclplMmVFRWx2UFVhY2JzNTZnekF2USs2eWl1cmR0d0pwYjZydTBoSldweDBiQnR4ZHJqR0FJQUFCQUFzVW5xcysxdnpBV1luNFducGhQSENXb2YyamFkOHhqYWg2Zmg3OUl6Z0FEdE5lMno1eDAwUkF0OXRrRGRIWkMzLy8vLy93QUFBQUVBQUFBQUFBQUFBQUFBQUFFQUFBUkRNT01zOUUyaENlOUpCc0RraWFTMG8wam54bnYzVzVwUU5Fak15bXVsS1JtZDdlRlJTWkJKQmRRTDJ0eUdZdTNmUStQZFhjeWFNYk5FcUN6VTZaM3RPM3c3RGlIVHV6Q0gvamFJNWNjaFRIL1hIdWZsc0NpS2NwaUphMVV5MEZESVhDc2FDKzVIbVBlUmloZHNkdEFibk5LRGp3WXB6NEYwZklVc2VVSmFDTm10NkxmdjM5Ky9VTktVbU9PSXFqQWJvUllqVTh6eW9Bdnc3cUUrMVptbXA2YWtrVXcvVzBNZGJmRGtsK0U4bmFJNG5NZEtkNDVCU2hpYXR1YUpNUnhZbTd0UFRvZHdJRTdCWWVOaUd4R0thSkZiVkNmTXg1WXNqZ1VZcHlTbUF2ZzU3RTdtOGw0cWp0ZnlGVDNKT2NQaUwra3c4NWdrQ1lIclpoYnVhTzlvSzFUVS96QUxPb0dmOHlQbEN5Ym1kVFJReEE2V3ZQREROWkJFcWg5YlZUSGNOQnk2K1JPWjBOWDVMYUNtdDYxY1Y0SUZtNlFRVTFCRVBmWFhIVFhqd3ZYMEVLUXpNM2VjOTNhRHVKcWJFMldFaXl3VHE2R3dNSTR3WG9YMzUwb1NZc2VVZ2RwcWNPWFgxME1tU2dtQUtuUnZrQkc5NElyUzNTV1BIZ0NKb3A3K3pxNVlIZE1EcXVpR04zVlJhWHZDRFlRK0dQRHlOWFBnc0NsZE9rdWdPWmdVV0FVNTloOHp2VGJlOGY2ZmdDdmNCcnVsWTFKeWl4WGpESnd2WTJBeXRzMkdncUh1Z25BZWlwWUdldkVnN2hsV2pvQmwwY3ZxR0w0WGtlMFY4WGpXT1Q2MW1EVVdXOVlwNi9HdXlaUUVKdHZQaFBzT2ZTTjNSamdwaXVNWGM0UzdmR2pxbjYzY3Vnenl3cVRYcDVmaEtJUVFMUzBaWEVPUjdMT2V3WTl4WlczQTgyd2k3MnYyR2FNS1JRSy9MNFZNSjl3QWQ3bWtzQ21OZVNKeVlaS0RlQlVwSGoyYS83dHNQNVNDdXNMMitnaHZLbW03Q3RWQnRLbjlDZ1NJUVBkWEU4U1drai9CVkx0SEp0VG1iSlhXSlNYS2tkRjgwVitud2VaR0FTMUl6RHZNWnlmYXY3WTdzTGlHdWJsRHoxenh3WkxZZ3JHMy9ObUVVbWdxM0hXNDF1QzVWK1NKWE10WEdCbk13SUZ2aS95ZVcya1ZPT2htTWFJV1hXSHhYK2p5cVl0NGl0amVZYWZiZW5qUytCMUJOdERBT29JS0ZMck9rbUYzZTRkcEdCaG9wSXZpenIxcG1TRXNZVFVrOGQwTE5TWEt4Mm41QzNNV3krc1JEUmRtOHhTaXBxYmhvbzdqbWRzQ3Fkb3BvNm5qYkl2SU00M2V5eGF3WC9ZT1o0YXhYTTNGTGZ0Q1M0UWZNL0ZNNGhPRUNXdHRtYVlQN0ZUTjIwUzRWMTB0QUNVbzJERHdoOU1nRTBGZlBucUpXOEw2SzJqVmhSQjRiVjhYZEd0OVNKdWt3N3RSZnhjVU1sM0Vmd2YxWU50YzJ6L24xYzhHU0VtSC9SRUNrc3IwaHRDRmhaZXZBZGR1Qm5GTmxlQ3l5MXdaWnR2VGM4K2QzK1hwU3laOUk1Qk5ndzVyMTF5aUlEaENaWlU4WGxjVmtkNkdqaEkveWxrb2swVGpBTFFMYk83MjcyVEsxbXJsb3NLODkxaEN1RTUrS2s5UktYMktIZ3BsZEVDUU1jeWwvWjBXTy9vUGVQejNKUnMzSk9VUkpBV3k4ZlJZWDlCMm0zOE9yenFzaGtDOVkvN2xHQmVKKzU1Zit2NVo3MWpLZmN6SUE1bEN5YjNqdm1sZjhELy9jMCtEYys3QWgyK2hTSnozYnZYWnBZSFVNdGFzdlhSVEVUUlRVT284am04WVpGSTR4UmtPTGdtQ0FBSzdiMFhIUkMwUVMySDBBR2N3WlFJd1ZRczVHT1I3OW5YV044MldXV3F3dnNjY1p1ZmduZlFRbkRrdjg4bEg3V0dFalpva3dTR3QvWDY3YjRBWDRqSk9BakVBOXJRKzNGOWN4M3FiRWFhcHo3TWhIVHpRRFpKdEJQc201dHIzR08yQ293Sk02UnFVYWRRaU5jNGlBTWRXQjVkVCIsImNpcGhlclRleHQiOiJBUUlCQUhnQjRVMDBTK3ErVE51d1gydlFlaGtxQnVneWQ3YnNrb0pWdWQ2NmZjVENVd0UwS2ZNVEVXU0RzdHE5aWRmWk1zSGRBQUFBb2pDQm53WUpLb1pJaHZjTkFRY0dvSUdSTUlHT0FnRUFNSUdJQmdrcWhraUc5dzBCQndFd0hnWUpZSVpJQVdVREJBRXVNQkVFRExMY0xVTXkrSGpuWlB4YTFBSUJFSUJiMFhWc0RjcWovZVN5WVh5QlJKMFdnQ2xmT0RVY2dLWmJSUlVlNkc1OCsxZkNaTDU2TmoydEx3c1U4V0lwUkVqNkxya1hBb0pnRWVtTUZLMnJwcTVpZmJrMHgySjEwYVd2aFZSQWtMS2lFTDZDMmxJRDhvVEhSSko5Qnc9PSIsInN1YiI6ImFybjphd3M6c2FnZW1ha2VyOnVzLWVhc3QtMTo1MjkwODgyODg5OTA6bWxmbG93LWFwcC9hcHAtMlNWNlEzNUhUQ0JPIiwiaWF0IjoxNzczMTkwMTAxLCJleHAiOjE3NzMxOTA0MDF9._WYatAneAubx7fMUElOyogdDg5Ktk_Y_AT-tgBKt1bQ#/experiments/2\u001b\\\u001b[37m🔗 test-finetuned-models-exp\u001b[0m\u001b]8;;\u001b\\\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mJob Status \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;38;5;172mInProgress\u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mSecondary Status \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;33mPending\u001b[0m\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mElapsed Time \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;91m10.3s\u001b[0m\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mSecondary Status \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;33mTraining\u001b[0m\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mElapsed Time \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;91m990.7s\u001b[0m\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", "\u001b[38;5;172m│\u001b[0m \u001b[1;35mStatus Transitions\u001b[0m \u001b[38;5;172m│\u001b[0m\n", "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", "\u001b[38;5;172m│\u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mStep \u001b[0m\u001b[1;35m \u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mDetails \u001b[0m\u001b[1;35m \u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mDuration \u001b[0m\u001b[1;35m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", "\u001b[38;5;172m│\u001b[0m ─────────────────────────────────────────────────────────────────────────── \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m\u001b[32m✓ \u001b[0m\u001b[32m \u001b[0m \u001b[36m \u001b[0m\u001b[36mStarting \u001b[0m\u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mStarting the training job \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m\u001b[32m1.8s \u001b[0m\u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m\u001b[32m⋯ \u001b[0m\u001b[32m \u001b[0m \u001b[36m \u001b[0m\u001b[36mPending \u001b[0m\u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mTraining job waiting for capacity \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m\u001b[32mRunning... \u001b[0m\u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m\u001b[32m✓ \u001b[0m\u001b[32m \u001b[0m \u001b[36m \u001b[0m\u001b[36mStarting \u001b[0m\u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mStarting the training job \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m\u001b[32m4.7s \u001b[0m\u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m\u001b[32m✓ \u001b[0m\u001b[32m \u001b[0m \u001b[36m \u001b[0m\u001b[36mPending \u001b[0m\u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mPreparing the instances for \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m\u001b[32m656.3s \u001b[0m\u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m \u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mtraining \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m\u001b[32m✓ \u001b[0m\u001b[32m \u001b[0m \u001b[36m \u001b[0m\u001b[36mDownloading \u001b[0m\u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mDownloading the training image \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m\u001b[32m253.4s \u001b[0m\u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m\u001b[32m⋯ \u001b[0m\u001b[32m \u001b[0m \u001b[36m \u001b[0m\u001b[36mTraining \u001b[0m\u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mTraining image download completed. \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m\u001b[32mRunning... \u001b[0m\u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", + "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m \u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mTraining in progress. \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", "\u001b[38;5;172m╰──────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" ] @@ -355,8 +396,8 @@ " 3 ) \n", " 4 \n", " \n", - " /Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-core/src/sagemaker/core/ \n", - " telemetry/telemetry_logging.py:180 in wrapper \n", + " /Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk/sagemaker-core/src/sagemaker/core/teleme \n", + " try/telemetry_logging.py:180 in wrapper \n", " \n", " 177 │ │ │ │ │ \"sagemaker_session is not provided or not valid.\", \n", " 178 │ │ │ │ │ func_name, \n", @@ -378,15 +419,15 @@ " 283 \n", " \n", " /Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-train/src/sagemaker/trai \n", - " n/common_utils/trainer_wait.py:272 in wait \n", + " n/common_utils/trainer_wait.py:278 in wait \n", " \n", - " 269 │ │ │ │ iteration = 0 \n", - " 270 │ │ │ │ while True: \n", - " 271 │ │ │ │ │ iteration += 1 \n", - " 272 │ │ │ │ │ time.sleep(0.5) \n", - " 273 │ │ │ │ │ if iteration >= poll * 2: \n", - " 274 │ │ │ │ │ │ training_job.refresh() \n", - " 275 │ │ │ │ │ │ iteration = 0 \n", + " 275 │ │ │ │ iteration = 0 \n", + " 276 │ │ │ │ while True: \n", + " 277 │ │ │ │ │ iteration += 1 \n", + " 278 │ │ │ │ │ time.sleep(0.5) \n", + " 279 │ │ │ │ │ if iteration >= poll * 2: \n", + " 280 │ │ │ │ │ │ training_job.refresh() \n", + " 281 │ │ │ │ │ │ iteration = 0 \n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", "KeyboardInterrupt\n", "\n" @@ -400,8 +441,8 @@ "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m3 \u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m4 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-core/src/sagemaker/core/\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mtelemetry/\u001b[0m\u001b[1;33mtelemetry_logging.py\u001b[0m:\u001b[94m180\u001b[0m in \u001b[92mwrapper\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk/sagemaker-core/src/sagemaker/core/teleme\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mtry/\u001b[0m\u001b[1;33mtelemetry_logging.py\u001b[0m:\u001b[94m180\u001b[0m in \u001b[92mwrapper\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m177 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[33m\"\u001b[0m\u001b[33msagemaker_session is not provided or not valid.\u001b[0m\u001b[33m\"\u001b[0m, \u001b[38;2;255;0;0m│\u001b[0m\n", "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m178 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mfunc_name, \u001b[38;2;255;0;0m│\u001b[0m\n", @@ -423,15 +464,15 @@ "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m283 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-train/src/sagemaker/trai\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mn/common_utils/\u001b[0m\u001b[1;33mtrainer_wait.py\u001b[0m:\u001b[94m272\u001b[0m in \u001b[92mwait\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mn/common_utils/\u001b[0m\u001b[1;33mtrainer_wait.py\u001b[0m:\u001b[94m278\u001b[0m in \u001b[92mwait\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m269 \u001b[0m\u001b[2m│ │ │ │ \u001b[0miteration = \u001b[94m0\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m270 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mwhile\u001b[0m \u001b[94mTrue\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m271 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0miteration += \u001b[94m1\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m272 \u001b[2m│ │ │ │ │ \u001b[0mtime.sleep(\u001b[94m0.5\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m273 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m iteration >= poll * \u001b[94m2\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m274 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0mtraining_job.refresh() \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m275 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0miteration = \u001b[94m0\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m275 \u001b[0m\u001b[2m│ │ │ │ \u001b[0miteration = \u001b[94m0\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m276 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mwhile\u001b[0m \u001b[94mTrue\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m277 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0miteration += \u001b[94m1\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m278 \u001b[2m│ │ │ │ │ \u001b[0mtime.sleep(\u001b[94m0.5\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m279 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m iteration >= poll * \u001b[94m2\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m280 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0mtraining_job.refresh() \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m281 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0miteration = \u001b[94m0\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", "\u001b[38;2;255;0;0m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", "\u001b[1;91mKeyboardInterrupt\u001b[0m\n" ] @@ -448,83 +489,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "64d68d6f", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[03/05/26 15:52:42] WARNING  No region provided. Using default region.                                 utils.py:356\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m[03/05/26 15:52:42]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;215;175;0mWARNING \u001b[0m No region provided. Using default region. \u001b]8;id=398978;file:///Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-core/src/sagemaker/core/utils/utils.py\u001b\\\u001b[2mutils.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=927295;file:///Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-core/src/sagemaker/core/utils/utils.py#356\u001b\\\u001b[2m356\u001b[0m\u001b]8;;\u001b\\\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
                    INFO     Runs on sagemaker prod, region:us-east-1                                  utils.py:370\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Runs on sagemaker prod, region:us-east-\u001b[1;36m1\u001b[0m \u001b]8;id=459862;file:///Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-core/src/sagemaker/core/utils/utils.py\u001b\\\u001b[2mutils.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=910567;file:///Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-core/src/sagemaker/core/utils/utils.py#370\u001b\\\u001b[2m370\u001b[0m\u001b]8;;\u001b\\\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
                    INFO     Found credentials in shared credentials file: ~/.aws/credentials   credentials.py:1392\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials in shared credentials file: ~\u001b[38;2;225;0;225m/.aws/\u001b[0m\u001b[38;2;225;0;225mcredentials\u001b[0m \u001b]8;id=500109;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=207687;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py#1392\u001b\\\u001b[2m1392\u001b[0m\u001b]8;;\u001b\\\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
                    INFO     Found credentials in shared credentials file: ~/.aws/credentials   credentials.py:1392\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials in shared credentials file: ~\u001b[38;2;225;0;225m/.aws/\u001b[0m\u001b[38;2;225;0;225mcredentials\u001b[0m \u001b]8;id=615220;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=20168;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py#1392\u001b\\\u001b[2m1392\u001b[0m\u001b]8;;\u001b\\\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
[03/05/26 15:52:46] INFO     Found credentials in shared credentials file: ~/.aws/credentials   credentials.py:1392\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m[03/05/26 15:52:46]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials in shared credentials file: ~\u001b[38;2;225;0;225m/.aws/\u001b[0m\u001b[38;2;225;0;225mcredentials\u001b[0m \u001b]8;id=639090;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=958918;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py#1392\u001b\\\u001b[2m1392\u001b[0m\u001b]8;;\u001b\\\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "https://app-2SV6Q35HTCBO.mlflow.sagemaker.us-east-1.app.aws/auth?authToken=eyJhbGciOiJIUzI1NiJ9.eyJhdXRoVG9rZW5JZCI6Ilg3NzVSUyIsImZhc0NyZWRlbnRpYWxzIjoiQWdWNFRoSVRsSm9mYWNBZGpHWWtHSms2dnBjbDAwMFNkQmJSRTR6ek5PbmhuTzRBWHdBQkFCVmhkM010WTNKNWNIUnZMWEIxWW14cFl5MXJaWGtBUkVGMlVGZEJSV3RrUzFKMGQzQm1jVWRNUkhrdllYTkNiblJIWkV4YWFtTXhZakZPVEdaaGMwVjNlREJ4WlRGdlN6Z3JWME5GU0RNMk0yTk9UMFZXVkVneFVUMDlBQUVBQjJGM2N5MXJiWE1BUzJGeWJqcGhkM002YTIxek9uVnpMV1ZoYzNRdE1Ub3pNVFF4TkRZek1EWTBPREk2YTJWNUx6Y3dOMkpoTmpjeExUUXpZamd0TkRFeU5DMWhaVFUzTFRrMFlqTXdZbUptT1RJNU13QzRBUUlCQUhnQjRVMDBTK3ErVE51d1gydlFlaGtxQnVneWQ3YnNrb0pWdWQ2NmZjVENVd0Uxb2ZRYnB0UUNKRmgrK3lDR1ZTOW9BQUFBZmpCOEJna3Foa2lHOXcwQkJ3YWdiekJ0QWdFQU1HZ0dDU3FHU0liM0RRRUhBVEFlQmdsZ2hrZ0JaUU1FQVM0d0VRUU1aTTlCUVV2NFluOEZiUXVyQWdFUWdEc1FCbTNoMFN1L3IyL2hWYktoY0phUzRvZ2dCdHMzelJ4T0lXenhUSnNjaVFTQk9JVVN2RHltb29aQWZTRU9pV0RDUnZDeFZnZWFCNGVvWGdJQUFCQUE2ZEVyc1IvWUFRUDYzSVZmY25qTE53SGhmalZjcmMzUUdwZkRtZ2pBVlJhSENPRURES3RtcHZIRmtIYWk3YmRVLy8vLy93QUFBQUVBQUFBQUFBQUFBQUFBQUFFQUFBUTNLNzgyRUdXYnpjY2RZMWJaMUZvS08zRGJPVjM1ZUlGUFdyR3V6YjNvQjdNYWZJZzUzTFRVemtDcjExNmFFK3JPc21CcmIzaTY5cGowck91Tm9TVXQ0d3Q4d2pURzBocjcwS1Q5NmFWTUROVVJKVGZENjVYZENPc25QbGFkbTV2S2lFSURueDdPZ1cyemNDVUd0TlpVYUU5Y1U2bkp1MzlOeWFDVTVlNWFYRDM2ZW5tNHRjSCtCbmhkU1o0WkEzMmM4bVhPMTduQmQ3Mzl6cGZtTFNjMzdGY2tBbnlCcit3dndKSDFES2V5dEJvTTdrUzlhNWpiR0wzeXdnV0h1b2JwODBkdmEzRVpGY3VCRU5XM0lXemFVQkc0R0pCdWdmT2UwQ2wySTlzd2lWbkhlQVc3bFpObUhpVDBlZnJMYXA4dDVZbG5sbzNzSWVYdzlrSG1uVldpakFWa3dxRU1zMTYrNFBRMmVGNzkvM0wyREM4SkJ0QmFESEw4K2FhdTd5TEIvenNQYkZEOEVWY0w0RkZlMjZwZjY0NkdMeW4xMTVnYTNvcmNCREtkMmZ1REE2YlV5Zis2ZzJOOEsxZWlrdTFFQmljZW5URjRRZFJiZlEvMm9Ba2ZXanJPdW9sRlYxWjZrVTVBRnl0dHpHUGhwOVZnOFY3T2VCSE84R2grWHUrbTBTS3hnOEd0NDdyanA5NXo5QnRuSDNGbTdPK3F2b3oyVlo0bGhsU0RGYzN4SWQ2L2IxcnVUVDZxOGFhTjRNNzFqYytQQy9KQ3NKNlI1VlJONFVMeHBuRTZXUmxwZUI1Y3ZaZEFGTE9wVSt5RUV4K21pR0hsUnltNm5laGNidGtaemc5S3ZBUUZUMUFwS0RTMXhFV0RROWxRTkVyaFMrVW4xdFRNQVRlWFpYTzkwRjFjR3dRVmQzZXRUQzlRM2syWFJrYnhrbmY5cmNRc2w0RTE1UER5YWdVTE9aWHJTOGxyVytMbGlsYkxBMExCWnV2VjkraWRrS1F6eGhPUjZSSjBEYVl4ZnJBanFXc2xpdDNvMWxSMy85bUdRMVNOREVQWFVpd2Q1Q05mZW9lSm5QOTJqc29GQ3Z2Sm5mY2wrYmVtRjMwd0ovOGgyRkliQ1RiOUhOaWtUV2wvWllRK2t6V2dhMEhEQ3hWZ0J2Q1J5V1J6K0liUVpZQ1dIL1FJV2dHN1BRazBMNlZWc0VTeURnNnFmdkF3RWs1eEF1cXNzRVhKR1AwRldEMmtYUGg3N0FQTWtuOGNiaGorV2Myc2UyU0xxTEt0MzQweTVud2hiU00xRGJDTWY1NVZ3STBTWWRFRStCRXRlUlZLdlp3RXU3NnFmb1pBMXluVVQvMlZ4NWdDcyt0OEFlNENnUDF6VzZNSGZEeWhUSURjVjM4SHZsbUtVdWE5Sy81ZXorS3Q2M2pyUlhydTlPTzEwYmo4L3Y5eEhxdHdDNjI3RTJTd3BCWU9ncTllTXRwNlo0eGxDdUh6bUhVTkZ0MEtnMC83YW1IcitLUWdnT29Wd3hXdStWcHh2am9EaGxnZUZvUjBicFBtQjI0OFIrRXRoQUdzdlZmVGFlNXoyenlTdGh4UHVmNVRQNGFMTnlQd25zYTVDODVwcExSd3JwUUhHVktPZktYNWVCZjJaTGdueUtpcmR1dlV6dVhmZ29IUTlrRDY4WlVLVXI2Q2h6ZFB0cERSU3AvcCtObXhOb05nT2J5Vy9zS1ZIcVZEUzNCSUZ3M1NaWGkwOXl3OWNSV2xxckVDbXF0S3JsS2JoV0d0UFlyQ2tMM3Q3YjdyTTE2c3UvTEtxKy9lL3FmYitraTB0N0ZUd285Y1pYV240ZmN3dnZNVEYvcXJuMEpXZE9EYWJNMmdOVTdtYmFzd0FHa0xBSHkxM25RTUFIcW9PTnRSUlZsczM2cjhJTmZSWmFGMmN4amNFaGpOKysyWEFHY3daUUl4QU9YNzVLcFY0Sk9WaDU5REZ2T2V6bFVNT1ZZVkxzS3ZFa0cydmE2REc2cXZLV1hCKytIUGFlQVJtaEpMbzA0U3hRSXdHby9sOFJkMWQzSC9Wd1hITUNzckFNcHNQb1hqVDJVdklaN29uTGJqWFNETGtManIyMEx1cTUzUXh1S3prdkpkIiwiY2lwaGVyVGV4dCI6IkFRSUJBSGdCNFUwMFMrcStUTnV3WDJ2UWVoa3FCdWd5ZDdic2tvSlZ1ZDY2ZmNUQ1V3SDl6Tm9yR2JzcjRmMUFUR3FJY1VBVEFBQUFvakNCbndZSktvWklodmNOQVFjR29JR1JNSUdPQWdFQU1JR0lCZ2txaGtpRzl3MEJCd0V3SGdZSllJWklBV1VEQkFFdU1CRUVEUE1PSFVvTlVJTWhyd1IxdFFJQkVJQmIyRlJyQ1dRS0Z0NVZ6cUdJUVRKK0UyZDB6UnBVVGxHWTJVdmMyVmxmd04xVm1WekpzL2JQYThvbUxwT3o3NERERk93NWdDSFByK0hZalJuNjFqMGtxTWFKcFpsVTF0VCtBYU5nZWdiM1l4a1ZXN3RHWm8yVndFazE1dz09Iiwic3ViIjoiYXJuOmF3czpzYWdlbWFrZXI6dXMtZWFzdC0xOjUyOTA4ODI4ODk5MDptbGZsb3ctYXBwL2FwcC0yU1Y2UTM1SFRDQk8iLCJpYXQiOjE3NzI3NTQ3NjQsImV4cCI6MTc3Mjc1NTA2NH0.4TwUDjNP53N-JIHofgyV8hq_6HKftycvEWoOxoqk7fc#/experiments/2\n" - ] - } - ], + "outputs": [], "source": [ "from sagemaker.train import get_mlflow_url\n", "\n", @@ -535,18 +503,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "fef3d01d", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "https://studio-d-ingyyaeglvki.studio.us-east-1.sagemaker.aws/jobs/train/nova-textgeneration-micro-sft-20260302144611\n" - ] - } - ], + "outputs": [], "source": [ "from sagemaker.train import get_studio_url\n", "\n", From ec76bffd908a3d00add44eea90d4cc0f1ce9b838 Mon Sep 17 00:00:00 2001 From: Molly He Date: Wed, 11 Mar 2026 12:08:34 -0700 Subject: [PATCH 03/11] Fix studio domain mismatch for url, update text color, add link of evaluation job --- .../train/common_utils/metrics_visualizer.py | 16 +++++- .../train/common_utils/trainer_wait.py | 4 +- .../src/sagemaker/train/evaluate/execution.py | 52 +++++++++++++++---- 3 files changed, 58 insertions(+), 14 deletions(-) diff --git a/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py b/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py index 2b48149871..8d98964c1b 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py @@ -43,13 +43,25 @@ def get_studio_url(training_job, domain_id: str = None) -> str: region = training_job.region if hasattr(training_job, 'region') and training_job.region else 'us-east-1' job_name = training_job.training_job_name - sm_client = boto3.client('sagemaker', region_name=region) - # Auto-detect domain if not provided if not domain_id: + # First try Studio metadata (when running inside Studio) + try: + import os, json as _json + metadata_path = '/opt/ml/metadata/resource-metadata.json' + if os.path.exists(metadata_path): + with open(metadata_path, 'r') as f: + domain_id = _json.load(f).get('DomainId') + except Exception: + pass + + if not domain_id: + # Fall back to list_domains, sorted by creation time for deterministic results try: + sm_client = boto3.client('sagemaker', region_name=region) domains = sm_client.list_domains()['Domains'] if domains: + domains.sort(key=lambda d: d.get('CreationTime', '')) domain_id = domains[0]['DomainId'] except Exception: pass diff --git a/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py b/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py index 002793fab1..b99014d07c 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py @@ -302,7 +302,7 @@ def get_cached_mlflow_url(): # Header section with training job name header_table = Table(show_header=False, box=None, padding=(0, 1)) header_table.add_column("Property", style="cyan bold", width=20) - header_table.add_column("Value", style="white", overflow="fold") + header_table.add_column("Value", style="dim", overflow="fold") # Add Studio job link try: @@ -330,7 +330,7 @@ def get_cached_mlflow_url(): status_table = Table(show_header=False, box=None, padding=(0, 1)) status_table.add_column("Property", style="cyan bold", width=20) - status_table.add_column("Value", style="white") + status_table.add_column("Value", style="dim") status_table.add_row("Job Status", f"[bold][orange3]{status}[/][/]") status_table.add_row("Secondary Status", f"[bold yellow]{secondary_status}[/bold yellow]") diff --git a/sagemaker-train/src/sagemaker/train/evaluate/execution.py b/sagemaker-train/src/sagemaker/train/evaluate/execution.py index 38dc63b4ec..0bf53bb782 100644 --- a/sagemaker-train/src/sagemaker/train/evaluate/execution.py +++ b/sagemaker-train/src/sagemaker/train/evaluate/execution.py @@ -915,6 +915,7 @@ def wait( from rich.panel import Panel from rich.text import Text from rich.layout import Layout + from rich.console import Group # Create console with Jupyter support console = Console(force_jupyter=True) @@ -925,14 +926,45 @@ def wait( current_status = self.status.overall_status elapsed = time.time() - start_time + # Create header table with pipeline name link + header_table = Table(show_header=False, box=None, padding=(0, 1)) + header_table.add_column("Property", style="cyan bold", width=20) + header_table.add_column("Value", style="dim", overflow="fold") + + # Extract pipeline name from execution ARN and build Studio link + pipeline_name = None + exec_id = '' + if self.arn: + arn_parts = self.arn.split('/') + if len(arn_parts) >= 4: + pipeline_name = arn_parts[-3] + exec_id = arn_parts[-1] + # Use execution display name if available, fall back to self.name + display_name = self.name + if self._pipeline_execution: + dn = getattr(self._pipeline_execution, 'pipeline_execution_display_name', None) + if dn and not (hasattr(dn, '__class__') and 'Unassigned' in dn.__class__.__name__): + display_name = dn + try: + from sagemaker.train.common_utils.metrics_visualizer import get_studio_url + dummy_url = get_studio_url(self.arn.split('/')[0].replace(':pipeline', ':training-job') + '/dummy' if self.arn else 'dummy') + if dummy_url and pipeline_name: + base = dummy_url.rsplit('/jobs/train/', 1)[0] + pipeline_url = f"{base}/jobs/evaluation/detail?pipeline_name={pipeline_name}&execution_id={exec_id}" + header_table.add_row("Evaluation Job", f"[link={pipeline_url}]🔗 {display_name}[/link]") + else: + header_table.add_row("Evaluation Job", str(display_name)) + except Exception: + header_table.add_row("Evaluation Job", str(display_name)) + # Create main status table status_table = Table(show_header=False, box=None, padding=(0, 1)) status_table.add_column("Property", style="cyan bold", width=20) - status_table.add_column("Value", style="white") + status_table.add_column("Value", style="dim") - status_table.add_row("Overall Status", f"[bold]{current_status}[/bold]") - status_table.add_row("Target Status", f"[bold]{target_status}[/bold]") - status_table.add_row("Elapsed Time", f"{elapsed:.1f}s") + status_table.add_row("Overall Status", f"[bold][orange3]{current_status}[/][/]") + status_table.add_row("Target Status", f"[bold yellow]{target_status}[/bold yellow]") + status_table.add_row("Elapsed Time", f"[bold bright_red]{elapsed:.1f}s[/bold bright_red]") if self.status.failure_reason: status_table.add_row("Failure Reason", f"[red]{self.status.failure_reason}[/red]") @@ -1028,15 +1060,15 @@ def wait( content_parts.append(links_table) console.print(Panel( - Group(*content_parts), - title="[bold blue]Pipeline Execution Status[/bold blue]", - border_style="blue" + Group(header_table, *content_parts), + title="[bold bright_blue]Pipeline Execution Status[/bold bright_blue]", + border_style="orange3" )) else: console.print(Panel( - status_table, - title="[bold blue]Pipeline Execution Status[/bold blue]", - border_style="blue" + Group(header_table, status_table), + title="[bold bright_blue]Pipeline Execution Status[/bold bright_blue]", + border_style="orange3" )) if target_status == current_status: From 0297d0a282af0ab976226fd8a06baead3e337c48 Mon Sep 17 00:00:00 2001 From: Molly He Date: Wed, 18 Mar 2026 11:48:22 -0700 Subject: [PATCH 04/11] Add underscore to fine-tune and eval job links --- .../src/sagemaker/train/common_utils/trainer_wait.py | 4 ++-- sagemaker-train/src/sagemaker/train/evaluate/execution.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py b/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py index b99014d07c..a61d3eba56 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py @@ -308,7 +308,7 @@ def get_cached_mlflow_url(): try: from sagemaker.train.common_utils.metrics_visualizer import get_studio_url studio_url = get_studio_url(training_job) - header_table.add_row("TrainingJob Name", f"[link={studio_url}]🔗 {training_job.training_job_name}[/link]") + header_table.add_row("TrainingJob Name", f"[underline][link={studio_url}]🔗 {training_job.training_job_name}[/link][/underline]") except Exception: header_table.add_row("TrainingJob Name", f"[bold green]{training_job.training_job_name}[/bold green]") @@ -324,7 +324,7 @@ def get_cached_mlflow_url(): else: link_text = "MLflow Experiment" - header_table.add_row("MLflow Experiment", f"[link={cached_url}]🔗 {link_text}[/link]") + header_table.add_row("MLflow Experiment", f"[underline][link={cached_url}]🔗 {link_text}[/link][/underline]") elif mlflow_link_cache['error']: header_table.add_row("MLflow Experiment", f"[red]{mlflow_link_cache['error']}[/red]") diff --git a/sagemaker-train/src/sagemaker/train/evaluate/execution.py b/sagemaker-train/src/sagemaker/train/evaluate/execution.py index 0bf53bb782..4945a2318e 100644 --- a/sagemaker-train/src/sagemaker/train/evaluate/execution.py +++ b/sagemaker-train/src/sagemaker/train/evaluate/execution.py @@ -951,7 +951,7 @@ def wait( if dummy_url and pipeline_name: base = dummy_url.rsplit('/jobs/train/', 1)[0] pipeline_url = f"{base}/jobs/evaluation/detail?pipeline_name={pipeline_name}&execution_id={exec_id}" - header_table.add_row("Evaluation Job", f"[link={pipeline_url}]🔗 {display_name}[/link]") + header_table.add_row("Evaluation Job", f"[underline][link={pipeline_url}]🔗 {display_name}[/link][/underline]") else: header_table.add_row("Evaluation Job", str(display_name)) except Exception: @@ -1051,7 +1051,7 @@ def wait( try: from sagemaker.train.common_utils.metrics_visualizer import get_studio_url url = get_studio_url(entry['job_arn']) - link_col = f"[link={url}]🔗 link[/link]" if url else "" + link_col = f"[underline][link={url}]🔗 link[/link][/underline]" if url else "" except Exception: link_col = "" links_table.add_row(entry['step_name'], link_col, entry['job_arn']) From b34d0d88981f1bde569eaaa366e9d08e82c63406 Mon Sep 17 00:00:00 2001 From: Molly He Date: Wed, 18 Mar 2026 16:04:24 -0700 Subject: [PATCH 05/11] Update link to console, conditionally display studio link, update link color to blue --- .gitignore | 1 + .../train/common_utils/metrics_visualizer.py | 105 +++++++++++++----- .../train/common_utils/trainer_wait.py | 45 +++++--- .../src/sagemaker/train/evaluate/execution.py | 67 ++++++++--- 4 files changed, 157 insertions(+), 61 deletions(-) diff --git a/.gitignore b/.gitignore index 7428f4f025..f87727e33e 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,4 @@ env/ sagemaker_train/src/**/container_drivers/sm_train.sh sagemaker_train/src/**/container_drivers/sourcecode.json sagemaker_train/src/**/container_drivers/distributed.json +.kiro diff --git a/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py b/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py index 8d98964c1b..fe861d0646 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py @@ -2,12 +2,85 @@ import logging from typing import Optional, List, Dict, Any -import boto3 from sagemaker.core.resources import TrainingJob logger = logging.getLogger(__name__) +def _is_in_studio() -> bool: + """Check if running inside SageMaker Studio.""" + from sagemaker.train.common_utils.finetune_utils import _read_domain_id_from_metadata + return _read_domain_id_from_metadata() is not None + + +def _get_studio_base_url(region: str) -> str: + """Get Studio base URL, or empty string if domain not resolvable.""" + from sagemaker.train.common_utils.finetune_utils import _read_domain_id_from_metadata + domain_id = _read_domain_id_from_metadata() + if not domain_id or not region: + return "" + return f"https://studio-{domain_id}.studio.{region}.sagemaker.aws" + + +def _parse_job_arn(job_arn: str): + """Parse a SageMaker job ARN into (region, resource) or None.""" + import re + m = re.match(r'arn:aws(?:-[a-z]+)?:sagemaker:([a-z0-9-]+):\d+:(\S+)', job_arn) + return (m.group(1), m.group(2)) if m else None + + +def get_console_job_url(job_arn: str) -> str: + """Get AWS Console URL for a SageMaker job ARN. + + Args: + job_arn: Full ARN like arn:aws:sagemaker:us-east-1:123:training-job/my-job + + Returns: + Console URL or empty string. + """ + parsed = _parse_job_arn(job_arn) + if not parsed: + return "" + region, resource = parsed + job_type_map = { + "training-job/": "#/jobs/", + "processing-job/": "#/processing-jobs/", + "transform-job/": "#/transform-jobs/", + } + for prefix, fragment in job_type_map.items(): + if resource.startswith(prefix): + job_name = resource.split("/", 1)[1] + return f"https://{region}.console.aws.amazon.com/sagemaker/home?region={region}{fragment}{job_name}" + return "" + + +def get_cloudwatch_logs_url(job_arn: str) -> str: + """Get CloudWatch Logs console URL for a SageMaker job ARN. + + Returns: + CloudWatch console URL or empty string. + """ + parsed = _parse_job_arn(job_arn) + if not parsed: + return "" + region, resource = parsed + log_group_map = { + "training-job/": "/aws/sagemaker/TrainingJobs", + "processing-job/": "/aws/sagemaker/ProcessingJobs", + "transform-job/": "/aws/sagemaker/TransformJobs", + } + for prefix, log_group in log_group_map.items(): + if resource.startswith(prefix): + job_name = resource.split("/", 1)[1] + encoded_group = log_group.replace("/", "$252F") + return ( + f"https://{region}.console.aws.amazon.com/cloudwatch/home?region={region}" + f"#logsV2:log-groups/log-group/{encoded_group}" + f"$3FlogStreamNameFilter$3D{job_name}" + ) + return "" + + def get_studio_url(training_job, domain_id: str = None) -> str: """Get SageMaker Studio URL for training job logs. @@ -43,34 +116,10 @@ def get_studio_url(training_job, domain_id: str = None) -> str: region = training_job.region if hasattr(training_job, 'region') and training_job.region else 'us-east-1' job_name = training_job.training_job_name - # Auto-detect domain if not provided - if not domain_id: - # First try Studio metadata (when running inside Studio) - try: - import os, json as _json - metadata_path = '/opt/ml/metadata/resource-metadata.json' - if os.path.exists(metadata_path): - with open(metadata_path, 'r') as f: - domain_id = _json.load(f).get('DomainId') - except Exception: - pass - - if not domain_id: - # Fall back to list_domains, sorted by creation time for deterministic results - try: - sm_client = boto3.client('sagemaker', region_name=region) - domains = sm_client.list_domains()['Domains'] - if domains: - domains.sort(key=lambda d: d.get('CreationTime', '')) - domain_id = domains[0]['DomainId'] - except Exception: - pass - - if not domain_id: + base = _get_studio_base_url(region) + if not base: return "" - - # Studio URL format: https://studio-{domain_id}.studio.{region}.sagemaker.aws/jobs/train/{job_name} - return f"https://studio-{domain_id}.studio.{region}.sagemaker.aws/jobs/train/{job_name}" + return f"{base}/jobs/train/{job_name}" def display_job_links_html(rows: list, as_html: bool = False): diff --git a/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py b/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py index a61d3eba56..8b0e6c2f3c 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py @@ -299,34 +299,45 @@ def get_cached_mlflow_url(): clear_output(wait=True) - # Header section with training job name + # Header section with training job info header_table = Table(show_header=False, box=None, padding=(0, 1)) header_table.add_column("Property", style="cyan bold", width=20) header_table.add_column("Value", style="dim", overflow="fold") - # Add Studio job link - try: - from sagemaker.train.common_utils.metrics_visualizer import get_studio_url - studio_url = get_studio_url(training_job) - header_table.add_row("TrainingJob Name", f"[underline][link={studio_url}]🔗 {training_job.training_job_name}[/link][/underline]") - except Exception: - header_table.add_row("TrainingJob Name", f"[bold green]{training_job.training_job_name}[/bold green]") - + header_table.add_row("TrainingJob Name", f"[bold green]{training_job.training_job_name}[/bold green]") header_table.add_row("TrainingJob ARN", f"[dim]{training_job.training_job_arn}[/dim]") - # Add MLflow link to header if available + # Build links row + links = [] + try: + from sagemaker.train.common_utils.metrics_visualizer import ( + _is_in_studio, get_console_job_url, get_cloudwatch_logs_url, get_studio_url + ) + if _is_in_studio(): + studio_url = get_studio_url(training_job) + if studio_url: + links.append(f"[bright_blue underline][link={studio_url}]🔗 Training Job (Studio)[/link][/bright_blue underline]") + else: + console_url = get_console_job_url(training_job.training_job_arn) + if console_url: + links.append(f"[bright_blue underline][link={console_url}]🔗 Training Job[/link][/bright_blue underline]") + cw_url = get_cloudwatch_logs_url(training_job.training_job_arn) + if cw_url: + links.append(f"[bright_blue underline][link={cw_url}]🔗 CloudWatch Logs[/link][/bright_blue underline]") + except Exception: + pass if has_mlflow_config: cached_url = get_cached_mlflow_url() if cached_url: - exp_name = training_job.mlflow_config.mlflow_experiment_name if hasattr(training_job, 'mlflow_config') else None - if exp_name and not _is_unassigned_attribute(exp_name): - link_text = exp_name - else: - link_text = "MLflow Experiment" - - header_table.add_row("MLflow Experiment", f"[underline][link={cached_url}]🔗 {link_text}[/link][/underline]") + links.append(f"[bright_blue underline][link={cached_url}]🔗 MLflow Experiment[/link][/bright_blue underline]") elif mlflow_link_cache['error']: header_table.add_row("MLflow Experiment", f"[red]{mlflow_link_cache['error']}[/red]") + if has_mlflow_config: + exp_name = training_job.mlflow_config.mlflow_experiment_name if hasattr(training_job, 'mlflow_config') else None + if exp_name and not _is_unassigned_attribute(exp_name): + header_table.add_row("MLflow Experiment", f"{exp_name}") + if links: + header_table.add_row("Links", " | ".join(links)) status_table = Table(show_header=False, box=None, padding=(0, 1)) status_table.add_column("Property", style="cyan bold", width=20) diff --git a/sagemaker-train/src/sagemaker/train/evaluate/execution.py b/sagemaker-train/src/sagemaker/train/evaluate/execution.py index 4945a2318e..c1630ea4c6 100644 --- a/sagemaker-train/src/sagemaker/train/evaluate/execution.py +++ b/sagemaker-train/src/sagemaker/train/evaluate/execution.py @@ -931,31 +931,37 @@ def wait( header_table.add_column("Property", style="cyan bold", width=20) header_table.add_column("Value", style="dim", overflow="fold") - # Extract pipeline name from execution ARN and build Studio link + # Extract pipeline name and region from execution ARN pipeline_name = None exec_id = '' + region = None if self.arn: arn_parts = self.arn.split('/') if len(arn_parts) >= 4: pipeline_name = arn_parts[-3] exec_id = arn_parts[-1] + region = self.arn.split(":")[3] if len(self.arn.split(":")) > 3 else None # Use execution display name if available, fall back to self.name display_name = self.name if self._pipeline_execution: dn = getattr(self._pipeline_execution, 'pipeline_execution_display_name', None) if dn and not (hasattr(dn, '__class__') and 'Unassigned' in dn.__class__.__name__): display_name = dn + header_table.add_row("Evaluation Job", str(display_name)) + + # Build links row + links = [] try: - from sagemaker.train.common_utils.metrics_visualizer import get_studio_url - dummy_url = get_studio_url(self.arn.split('/')[0].replace(':pipeline', ':training-job') + '/dummy' if self.arn else 'dummy') - if dummy_url and pipeline_name: - base = dummy_url.rsplit('/jobs/train/', 1)[0] - pipeline_url = f"{base}/jobs/evaluation/detail?pipeline_name={pipeline_name}&execution_id={exec_id}" - header_table.add_row("Evaluation Job", f"[underline][link={pipeline_url}]🔗 {display_name}[/link][/underline]") - else: - header_table.add_row("Evaluation Job", str(display_name)) + from sagemaker.train.common_utils.metrics_visualizer import _is_in_studio, _get_studio_base_url + if region and pipeline_name and _is_in_studio(): + base = _get_studio_base_url(region) + if base: + pipeline_url = f"{base}/jobs/evaluation/detail?pipeline_name={pipeline_name}&execution_id={exec_id}" + links.append(f"[bright_blue underline][link={pipeline_url}]🔗 Pipeline Execution (Studio)[/link][/bright_blue underline]") except Exception: - header_table.add_row("Evaluation Job", str(display_name)) + pass + if links: + header_table.add_row("Links", " | ".join(links)) # Create main status table status_table = Table(show_header=False, box=None, padding=(0, 1)) @@ -1045,16 +1051,45 @@ def wait( if job_arn_entries: links_table = Table(show_header=True, header_style="bold magenta", box=None, padding=(0, 1)) links_table.add_column("Step", style="cyan", width=20) - links_table.add_column("Job Link", width=12) + links_table.add_column("Job Link", style="dim") + links_table.add_column("Logs", style="dim") links_table.add_column("Job ARN", style="dim", overflow="fold") + from sagemaker.train.common_utils.metrics_visualizer import ( + _is_in_studio, _parse_job_arn, _get_studio_base_url, + get_console_job_url, get_cloudwatch_logs_url, + ) + in_studio = _is_in_studio() + studio_base = _get_studio_base_url(region) if in_studio else "" + studio_path_map = { + "training-job/": "jobs/train/", + "processing-job/": "jobs/processing/", + "transform-job/": "jobs/transform/", + } for entry in job_arn_entries: + job_link = "" + logs_link = "" try: - from sagemaker.train.common_utils.metrics_visualizer import get_studio_url - url = get_studio_url(entry['job_arn']) - link_col = f"[underline][link={url}]🔗 link[/link][/underline]" if url else "" + arn = entry['job_arn'] + if in_studio and studio_base: + parsed = _parse_job_arn(arn) + if parsed: + _, resource = parsed + for prefix, path in studio_path_map.items(): + if resource.startswith(prefix): + job_name = resource.split("/", 1)[1] + url = f"{studio_base}/{path}{job_name}" + job_link = f"[bright_blue underline][link={url}]🔗 link[/link][/bright_blue underline]" + break + else: + url = get_console_job_url(arn) + if url: + job_link = f"[bright_blue underline][link={url}]🔗 link[/link][/bright_blue underline]" + cw_url = get_cloudwatch_logs_url(arn) + if cw_url: + logs_link = f"[bright_blue underline][link={cw_url}]🔗 logs[/link][/bright_blue underline]" except Exception: - link_col = "" - links_table.add_row(entry['step_name'], link_col, entry['job_arn']) + pass + links_table.add_row(entry['step_name'], job_link, logs_link, entry['job_arn']) content_parts.append(Text("")) content_parts.append(Text("Job ARNs", style="bold magenta")) content_parts.append(links_table) From 1adce4ff95ae58e3c9c220dffc0307a86db6ffac Mon Sep 17 00:00:00 2001 From: Molly He Date: Wed, 18 Mar 2026 16:33:14 -0700 Subject: [PATCH 06/11] Always show console link, conditional show studio link --- .../train/common_utils/trainer_wait.py | 24 +++++++------ .../src/sagemaker/train/evaluate/execution.py | 34 +++++++++++-------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py b/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py index 8b0e6c2f3c..72537874f5 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py @@ -307,37 +307,39 @@ def get_cached_mlflow_url(): header_table.add_row("TrainingJob Name", f"[bold green]{training_job.training_job_name}[/bold green]") header_table.add_row("TrainingJob ARN", f"[dim]{training_job.training_job_arn}[/dim]") - # Build links row - links = [] + # Build links rows + links_row1 = [] + links_row2 = [] try: from sagemaker.train.common_utils.metrics_visualizer import ( _is_in_studio, get_console_job_url, get_cloudwatch_logs_url, get_studio_url ) + console_url = get_console_job_url(training_job.training_job_arn) + if console_url: + links_row1.append(f"[bright_blue underline][link={console_url}]🔗 Training Job (Console)[/link][/bright_blue underline]") if _is_in_studio(): studio_url = get_studio_url(training_job) if studio_url: - links.append(f"[bright_blue underline][link={studio_url}]🔗 Training Job (Studio)[/link][/bright_blue underline]") - else: - console_url = get_console_job_url(training_job.training_job_arn) - if console_url: - links.append(f"[bright_blue underline][link={console_url}]🔗 Training Job[/link][/bright_blue underline]") + links_row1.append(f"[bright_blue underline][link={studio_url}]🔗 Training Job (Studio)[/link][/bright_blue underline]") cw_url = get_cloudwatch_logs_url(training_job.training_job_arn) if cw_url: - links.append(f"[bright_blue underline][link={cw_url}]🔗 CloudWatch Logs[/link][/bright_blue underline]") + links_row2.append(f"[bright_blue underline][link={cw_url}]🔗 CloudWatch Logs[/link][/bright_blue underline]") except Exception: pass if has_mlflow_config: cached_url = get_cached_mlflow_url() if cached_url: - links.append(f"[bright_blue underline][link={cached_url}]🔗 MLflow Experiment[/link][/bright_blue underline]") + links_row2.append(f"[bright_blue underline][link={cached_url}]🔗 MLflow Experiment[/link][/bright_blue underline]") elif mlflow_link_cache['error']: header_table.add_row("MLflow Experiment", f"[red]{mlflow_link_cache['error']}[/red]") if has_mlflow_config: exp_name = training_job.mlflow_config.mlflow_experiment_name if hasattr(training_job, 'mlflow_config') else None if exp_name and not _is_unassigned_attribute(exp_name): header_table.add_row("MLflow Experiment", f"{exp_name}") - if links: - header_table.add_row("Links", " | ".join(links)) + if links_row1: + header_table.add_row("Links", " | ".join(links_row1)) + if links_row2: + header_table.add_row("" if links_row1 else "Links", " | ".join(links_row2)) status_table = Table(show_header=False, box=None, padding=(0, 1)) status_table.add_column("Property", style="cyan bold", width=20) diff --git a/sagemaker-train/src/sagemaker/train/evaluate/execution.py b/sagemaker-train/src/sagemaker/train/evaluate/execution.py index c1630ea4c6..9d683edd84 100644 --- a/sagemaker-train/src/sagemaker/train/evaluate/execution.py +++ b/sagemaker-train/src/sagemaker/train/evaluate/execution.py @@ -1051,25 +1051,34 @@ def wait( if job_arn_entries: links_table = Table(show_header=True, header_style="bold magenta", box=None, padding=(0, 1)) links_table.add_column("Step", style="cyan", width=20) - links_table.add_column("Job Link", style="dim") - links_table.add_column("Logs", style="dim") - links_table.add_column("Job ARN", style="dim", overflow="fold") + links_table.add_column("Console", style="dim") from sagemaker.train.common_utils.metrics_visualizer import ( _is_in_studio, _parse_job_arn, _get_studio_base_url, get_console_job_url, get_cloudwatch_logs_url, ) in_studio = _is_in_studio() studio_base = _get_studio_base_url(region) if in_studio else "" + if in_studio: + links_table.add_column("Studio", style="dim") + links_table.add_column("Logs", style="dim") + links_table.add_column("Job ARN", style="dim", overflow="fold") studio_path_map = { "training-job/": "jobs/train/", "processing-job/": "jobs/processing/", "transform-job/": "jobs/transform/", } for entry in job_arn_entries: - job_link = "" + console_link = "" logs_link = "" + studio_link = "" try: arn = entry['job_arn'] + url = get_console_job_url(arn) + if url: + console_link = f"[bright_blue underline][link={url}]🔗 link[/link][/bright_blue underline]" + cw_url = get_cloudwatch_logs_url(arn) + if cw_url: + logs_link = f"[bright_blue underline][link={cw_url}]🔗 logs[/link][/bright_blue underline]" if in_studio and studio_base: parsed = _parse_job_arn(arn) if parsed: @@ -1077,19 +1086,16 @@ def wait( for prefix, path in studio_path_map.items(): if resource.startswith(prefix): job_name = resource.split("/", 1)[1] - url = f"{studio_base}/{path}{job_name}" - job_link = f"[bright_blue underline][link={url}]🔗 link[/link][/bright_blue underline]" + s_url = f"{studio_base}/{path}{job_name}" + studio_link = f"[bright_blue underline][link={s_url}]🔗 studio[/link][/bright_blue underline]" break - else: - url = get_console_job_url(arn) - if url: - job_link = f"[bright_blue underline][link={url}]🔗 link[/link][/bright_blue underline]" - cw_url = get_cloudwatch_logs_url(arn) - if cw_url: - logs_link = f"[bright_blue underline][link={cw_url}]🔗 logs[/link][/bright_blue underline]" except Exception: pass - links_table.add_row(entry['step_name'], job_link, logs_link, entry['job_arn']) + row = [entry['step_name'], console_link] + if in_studio: + row.append(studio_link) + row.extend([logs_link, entry['job_arn']]) + links_table.add_row(*row) content_parts.append(Text("")) content_parts.append(Text("Job ARNs", style="bold magenta")) content_parts.append(links_table) From 928204bdf88307999b5ebcf1f29161bf065f5bf8 Mon Sep 17 00:00:00 2001 From: Molly He Date: Wed, 18 Mar 2026 16:37:01 -0700 Subject: [PATCH 07/11] Minor update to execution link names --- sagemaker-train/src/sagemaker/train/evaluate/execution.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sagemaker-train/src/sagemaker/train/evaluate/execution.py b/sagemaker-train/src/sagemaker/train/evaluate/execution.py index 9d683edd84..722461bb6c 100644 --- a/sagemaker-train/src/sagemaker/train/evaluate/execution.py +++ b/sagemaker-train/src/sagemaker/train/evaluate/execution.py @@ -1078,7 +1078,7 @@ def wait( console_link = f"[bright_blue underline][link={url}]🔗 link[/link][/bright_blue underline]" cw_url = get_cloudwatch_logs_url(arn) if cw_url: - logs_link = f"[bright_blue underline][link={cw_url}]🔗 logs[/link][/bright_blue underline]" + logs_link = f"[bright_blue underline][link={cw_url}]🔗 link[/link][/bright_blue underline]" if in_studio and studio_base: parsed = _parse_job_arn(arn) if parsed: @@ -1087,7 +1087,7 @@ def wait( if resource.startswith(prefix): job_name = resource.split("/", 1)[1] s_url = f"{studio_base}/{path}{job_name}" - studio_link = f"[bright_blue underline][link={s_url}]🔗 studio[/link][/bright_blue underline]" + studio_link = f"[bright_blue underline][link={s_url}]🔗 link[/link][/bright_blue underline]" break except Exception: pass From cd1658af8e0391abf180f45793b921418e0e4619 Mon Sep 17 00:00:00 2001 From: Molly He Date: Fri, 20 Mar 2026 16:00:25 -0700 Subject: [PATCH 08/11] Fix region issue for studio url --- .../train/common_utils/metrics_visualizer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py b/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py index fe861d0646..fe837a91fc 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/metrics_visualizer.py @@ -94,11 +94,10 @@ def get_studio_url(training_job, domain_id: str = None) -> str: Example: >>> from sagemaker.train import get_studio_url >>> url = get_studio_url('my-training-job') - >>> url = get_studio_url('arn:aws:sagemaker:us-east-1:123456789:training-job/my-job') + >>> url = get_studio_url('arn:aws:sagemaker:us-west-2:123456789:training-job/my-job') """ import re - # Handle ARN string — extract region and job name directly if isinstance(training_job, str): arn_match = re.match( r'arn:aws(?:-[a-z]+)?:sagemaker:([a-z0-9-]+):\d+:training-job/(.+)', @@ -108,12 +107,14 @@ def get_studio_url(training_job, domain_id: str = None) -> str: region = arn_match.group(1) job_name = arn_match.group(2) else: - # Treat as job name, need to fetch the object + # Plain job name — use session region training_job = TrainingJob.get(training_job_name=training_job) - region = training_job.region if hasattr(training_job, 'region') and training_job.region else 'us-east-1' + from sagemaker.core.utils.utils import SageMakerClient + region = SageMakerClient().region_name job_name = training_job.training_job_name else: - region = training_job.region if hasattr(training_job, 'region') and training_job.region else 'us-east-1' + from sagemaker.core.utils.utils import SageMakerClient + region = SageMakerClient().region_name job_name = training_job.training_job_name base = _get_studio_base_url(region) From c6ba964b986c2bf2d864ec710389d35e3a1c01e9 Mon Sep 17 00:00:00 2001 From: Molly He Date: Fri, 20 Mar 2026 16:47:30 -0700 Subject: [PATCH 09/11] Revert notebook change to original --- .../evaluate/benchmark_demo.ipynb | 545 +-------------- ...dpo_trainer_example_notebook_v3_prod.ipynb | 6 +- ...uning_example_notebook_pysdk_prod_v3.ipynb | 477 ++----------- ...io-nova-training-job-sample-notebook.ipynb | 637 +----------------- 4 files changed, 105 insertions(+), 1560 deletions(-) diff --git a/sagemaker-train/example_notebooks/evaluate/benchmark_demo.ipynb b/sagemaker-train/example_notebooks/evaluate/benchmark_demo.ipynb index b78d35b8e9..0719cbbab2 100644 --- a/sagemaker-train/example_notebooks/evaluate/benchmark_demo.ipynb +++ b/sagemaker-train/example_notebooks/evaluate/benchmark_demo.ipynb @@ -24,94 +24,23 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "import logging\n", - "logging.basicConfig(level=logging.WARNING)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[\n",
-       "<_Benchmark.MMLU: 'mmlu'>,\n",
-       "<_Benchmark.MMLU_PRO: 'mmlu_pro'>,\n",
-       "<_Benchmark.BBH: 'bbh'>,\n",
-       "<_Benchmark.GPQA: 'gpqa'>,\n",
-       "<_Benchmark.MATH: 'math'>,\n",
-       "<_Benchmark.STRONG_REJECT: 'strong_reject'>,\n",
-       "<_Benchmark.IFEVAL: 'ifeval'>,\n",
-       "<_Benchmark.MMMU: 'mmmu'>,\n",
-       "<_Benchmark.LLM_JUDGE: 'llm_judge'>\n",
-       "]\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m[\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[1m<\u001b[0m\u001b[1;38;2;225;0;225m_Benchmark.MMLU:\u001b[0m\u001b[39m \u001b[0m\u001b[38;2;0;135;0m'mmlu'\u001b[0m\u001b[39m>,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[39m<_Benchmark.MMLU_PRO: \u001b[0m\u001b[38;2;0;135;0m'mmlu_pro'\u001b[0m\u001b[39m>,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[39m<_Benchmark.BBH: \u001b[0m\u001b[38;2;0;135;0m'bbh'\u001b[0m\u001b[39m>,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[39m<_Benchmark.GPQA: \u001b[0m\u001b[38;2;0;135;0m'gpqa'\u001b[0m\u001b[39m>,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[39m<_Benchmark.MATH: \u001b[0m\u001b[38;2;0;135;0m'math'\u001b[0m\u001b[39m>,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[39m<_Benchmark.STRONG_REJECT: \u001b[0m\u001b[38;2;0;135;0m'strong_reject'\u001b[0m\u001b[39m>,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[39m<_Benchmark.IFEVAL: \u001b[0m\u001b[38;2;0;135;0m'ifeval'\u001b[0m\u001b[39m>,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[39m<_Benchmark.MMMU: \u001b[0m\u001b[38;2;0;135;0m'mmmu'\u001b[0m\u001b[39m>,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[39m<_Benchmark.LLM_JUDGE: \u001b[0m\u001b[38;2;0;135;0m'llm_judge'\u001b[0m\u001b[1m>\u001b[0m\n", - "\u001b[1m]\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
{\n",
-       "'modality': 'Text',\n",
-       "'description': 'General Physics Question Answering – Assesses comprehension of physics concepts and related problem-solving abilities.',\n",
-       "'metrics': ['accuracy'],\n",
-       "'strategy': 'zs_cot',\n",
-       "'subtask_available': False,\n",
-       "'subtasks': None\n",
-       "}\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m{\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'modality'\u001b[0m: \u001b[38;2;0;135;0m'Text'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'description'\u001b[0m: \u001b[38;2;0;135;0m'General Physics Question Answering – Assesses comprehension of physics concepts and related problem-solving abilities.'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'metrics'\u001b[0m: \u001b[1m[\u001b[0m\u001b[38;2;0;135;0m'accuracy'\u001b[0m\u001b[1m]\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'strategy'\u001b[0m: \u001b[38;2;0;135;0m'zs_cot'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'subtask_available'\u001b[0m: \u001b[3;38;2;215;0;0mFalse\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'subtasks'\u001b[0m: \u001b[3;38;2;225;0;225mNone\u001b[0m\n", - "\u001b[1m}\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], "source": [ "from sagemaker.train.evaluate import get_benchmarks, get_benchmark_properties\n", "from rich.pretty import pprint\n", "\n", "# Configure logging to show INFO messages\n", - "# import logging\n", - "# logging.basicConfig(\n", - "# level=logging.INFO,\n", - "# format='%(levelname)s - %(name)s - %(message)s'\n", - "# )\n", + "import logging\n", + "logging.basicConfig(\n", + " level=logging.INFO,\n", + " format='%(levelname)s - %(name)s - %(message)s'\n", + ")\n", "\n", "# Get available benchmarks\n", "Benchmark = get_benchmarks()\n", "pprint(list(Benchmark))\n", "\n", "# Print properties for a specific benchmark\n", - "pprint(get_benchmark_properties(benchmark=Benchmark.GPQA))" + "pprint(get_benchmark_properties(benchmark=Benchmark.GEN_QA))" ] }, { @@ -138,200 +67,22 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:sagemaker.core.utils.utils:No region provided. Using default region.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "model_package_group_name='model-package-group-nova13' model_package_group_arn='arn:aws:sagemaker:us-east-1:529088288990:model-package-group/model-package-group-nova13' model_package_group_description=Unassigned() creation_time=datetime.datetime(2026, 3, 4, 13, 13, 33, 33000, tzinfo=tzlocal()) created_by=UserContext(user_profile_arn=Unassigned(), user_profile_name=Unassigned(), domain_id=Unassigned(), iam_identity=IamIdentity(arn='arn:aws:sts::529088288990:assumed-role/Admin/mollyhe-Isengard', principal_id='AROAXWMA6TDPKWL3QDB2W:mollyhe-Isengard', source_identity=Unassigned())) model_package_group_status='Completed'\n" - ] - } - ], - "source": [ - "from sagemaker.core.resources import ModelPackage, ModelPackageGroup\n", - "\n", - "# model_package_group=ModelPackageGroup.create(model_package_group_name=\"model-package-group-nova13\")\n", - "model_package_group=ModelPackageGroup.get(model_package_group_name=\"model-package-group-nova13\")\n", - "print(model_package_group)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "\n", - "import json\n", - "import re\n", - "from sagemaker.core.utils.utils import Unassigned\n", - "\n", - "def pretty_print(obj):\n", - " def parse_unassigned(item):\n", - " if isinstance(item, Unassigned):\n", - " return None\n", - " if isinstance(item, dict):\n", - " return {k: parse_unassigned(v) for k, v in item.items() if parse_unassigned(v) is not None}\n", - " if isinstance(item, list):\n", - " return [parse_unassigned(x) for x in item if parse_unassigned(x) is not None]\n", - " if isinstance(item, str) and \"Unassigned object\" in item:\n", - " pairs = re.findall(r\"(\\w+)=([^<][^=]*?)(?=\\s+\\w+=|$)\", item)\n", - " result = {k: v.strip(\"'\\\"\") for k, v in pairs}\n", - " return result if result else None\n", - " return item\n", - "\n", - " cleaned = parse_unassigned(obj.__dict__ if hasattr(obj, '__dict__') else obj)\n", - " print(json.dumps(cleaned, indent=2, default=str))" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"training_job_name\": \"nova-textgeneration-micro-sft-20260302144611\",\n", - " \"training_job_arn\": \"arn:aws:sagemaker:us-east-1:529088288990:training-job/nova-textgeneration-micro-sft-20260302144611\",\n", - " \"model_artifacts\": \"s3_model_artifacts='s3://sagemaker-us-east-1-529088288990/output/nova-textgeneration-micro-sft-20260302144611/output/model'\",\n", - " \"training_job_output\": \"s3_training_job_output='s3://sagemaker-us-east-1-529088288990/output/nova-textgeneration-micro-sft-20260302144611/output/output'\",\n", - " \"training_job_status\": \"Completed\",\n", - " \"secondary_status\": \"Completed\",\n", - " \"hyper_parameters\": {\n", - " \"global_batch_size\": \"64\",\n", - " \"learning_rate\": \"1e-05\",\n", - " \"learning_rate_ratio\": \"16.0\",\n", - " \"lora_alpha\": \"128\",\n", - " \"max_context_length\": \"8192\",\n", - " \"max_epochs\": \"2\",\n", - " \"name\": \"my-lora-run-wkxk5\"\n", - " },\n", - " \"role_arn\": \"arn:aws:iam::529088288990:role/Admin\",\n", - " \"input_data_config\": [\n", - " \"channel_name='train' data_source=DataSource(s3_data_source=Unassigned(), file_system_data_source=Unassigned(), dataset_source=DatasetSource(dataset_arn='arn:aws:sagemaker:us-east-1:529088288990:hub-content/G82247CBEQ6TN0FI3J1SAJU59UURMT3H3EKG48C6VQGDINNRNGU0/DataSet/demo-1/5.0.0')) content_type=Unassigned() compression_type='None' record_wrapper_type='None' input_mode=Unassigned() shuffle_config=Unassigned() enable_ffm=False\"\n", - " ],\n", - " \"output_data_config\": \"s3_output_path='s3://sagemaker-us-east-1-529088288990/output/' kms_key_id='' compression_type='NONE' remove_job_name_from_s3_output_path=False disable_model_upload=False channels=Unassigned()\",\n", - " \"stopping_condition\": \"max_runtime_in_seconds=86400 max_wait_time_in_seconds=Unassigned() max_pending_time_in_seconds=Unassigned()\",\n", - " \"creation_time\": \"2026-03-02 14:46:12.873000-08:00\",\n", - " \"training_start_time\": \"2026-03-02 14:58:59.629000-08:00\",\n", - " \"training_end_time\": \"2026-03-02 15:22:26.851000-08:00\",\n", - " \"last_modified_time\": \"2026-03-02 15:22:26.851000-08:00\",\n", - " \"secondary_status_transitions\": [\n", - " \"status='Starting' start_time=datetime.datetime(2026, 3, 2, 14, 46, 12, 873000, tzinfo=tzlocal()) end_time=datetime.datetime(2026, 3, 2, 14, 46, 14, 829000, tzinfo=tzlocal()) status_message='Starting the training job'\",\n", - " \"status='Pending' start_time=datetime.datetime(2026, 3, 2, 14, 46, 14, 829000, tzinfo=tzlocal()) end_time=datetime.datetime(2026, 3, 2, 14, 58, 59, 629000, tzinfo=tzlocal()) status_message='Preparing the instances for training'\",\n", - " \"status='Downloading' start_time=datetime.datetime(2026, 3, 2, 14, 58, 59, 629000, tzinfo=tzlocal()) end_time=datetime.datetime(2026, 3, 2, 15, 7, 4, 705000, tzinfo=tzlocal()) status_message='Downloading the training image'\",\n", - " \"status='Training' start_time=datetime.datetime(2026, 3, 2, 15, 7, 4, 705000, tzinfo=tzlocal()) end_time=datetime.datetime(2026, 3, 2, 15, 21, 2, 752000, tzinfo=tzlocal()) status_message='Training image download completed. Training in progress.'\",\n", - " \"status='Uploading' start_time=datetime.datetime(2026, 3, 2, 15, 21, 2, 752000, tzinfo=tzlocal()) end_time=datetime.datetime(2026, 3, 2, 15, 22, 26, 851000, tzinfo=tzlocal()) status_message='Uploading generated training model'\",\n", - " \"status='Completed' start_time=datetime.datetime(2026, 3, 2, 15, 22, 26, 851000, tzinfo=tzlocal()) end_time=datetime.datetime(2026, 3, 2, 15, 22, 26, 851000, tzinfo=tzlocal()) status_message='Training job completed'\"\n", - " ],\n", - " \"enable_network_isolation\": false,\n", - " \"enable_inter_container_traffic_encryption\": false,\n", - " \"enable_managed_spot_training\": false,\n", - " \"training_time_in_seconds\": 1407,\n", - " \"billable_time_in_seconds\": 0,\n", - " \"billable_token_count\": 153604,\n", - " \"disable_efa\": false,\n", - " \"image_metadata\": \"image_type='BYOImage'\",\n", - " \"serverless_job_config\": \"base_model_arn='arn:aws:sagemaker:us-east-1:aws:hub-content/SageMakerPublicHub/Model/nova-textgeneration-micro/2.45.1' job_type='FineTuning' accept_eula=True customization_technique='SFT' peft='LORA' evaluation_type=Unassigned() evaluator_arn=Unassigned() job_spec=Unassigned()\",\n", - " \"mlflow_config\": \"mlflow_resource_arn='arn:aws:sagemaker:us-east-1:529088288990:mlflow-app/app-2SV6Q35HTCBO' mlflow_tracking_server_arn=Unassigned() mlflow_experiment_name='test-finetuned-models-exp' mlflow_run_name='test-finetuned-models-run'\",\n", - " \"model_package_config\": \"model_package_group_arn='arn:aws:sagemaker:us-east-1:529088288990:model-package-group/model-package-group-nova6' source_model_package_arn=Unassigned()\",\n", - " \"mlflow_details\": \"mlflow_experiment_id='2' mlflow_run_id='1f13027c62db4ac198a7d8452a682d5d'\",\n", - " \"progress_info\": \"total_step_count_per_epoch=1 current_step=1 current_epoch=2 max_epoch=2\",\n", - " \"output_model_package_arn\": \"arn:aws:sagemaker:us-east-1:529088288990:model-package/model-package-group-nova6/1\"\n", - "}\n" - ] - } - ], - "source": [ - "from sagemaker.core.resources import TrainingJob\n", - "\n", - "response = TrainingJob.get(training_job_name=\"nova-textgeneration-micro-sft-20260302144611\")\n", - "pretty_print(response)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml\n", - "sagemaker.config INFO - Not applying SDK defaults from location: /Users/mollyhe/Library/Application Support/sagemaker/config.yaml\n" - ] - }, - { - "data": { - "text/html": [ - "
BenchMarkEvaluator(\n",
-       "region=None,\n",
-       "sagemaker_session=<sagemaker.core.helper.session_helper.Session object at 0x17252afe0>,\n",
-       "model='arn:aws:sagemaker:us-east-1:529088288990:model-package/model-package-group-nova6/1',\n",
-       "base_eval_name='gpqa-eval-demo',\n",
-       "s3_output_path='s3://sagemaker-us-east-1-529088288990/eval',\n",
-       "mlflow_resource_arn='arn:aws:sagemaker:us-east-1:529088288990:mlflow-app/app-2SV6Q35HTCBO',\n",
-       "mlflow_experiment_name='test-eval-models-exp',\n",
-       "mlflow_run_name=None,\n",
-       "networking=None,\n",
-       "kms_key_id=None,\n",
-       "model_package_group='arn:aws:sagemaker:us-east-1:529088288990:model-package-group/model-package-group-nova13',\n",
-       "benchmark=<_Benchmark.GPQA: 'gpqa'>,\n",
-       "subtasks=None,\n",
-       "evaluate_base_model=False\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;38;2;225;0;225mBenchMarkEvaluator\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mregion\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0msagemaker_session\u001b[0m=\u001b[1m<\u001b[0m\u001b[1;38;2;225;0;225msagemaker.core.helper.session_helper.Session\u001b[0m\u001b[39m object at \u001b[0m\u001b[1;36m0x17252afe0\u001b[0m\u001b[39m>,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mmodel\u001b[0m\u001b[39m=\u001b[0m\u001b[38;2;0;135;0m'arn:aws:sagemaker:us-east-1:529088288990:model-package/model-package-group-nova6/1'\u001b[0m\u001b[39m,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mbase_eval_name\u001b[0m\u001b[39m=\u001b[0m\u001b[38;2;0;135;0m'gpqa-eval-demo'\u001b[0m\u001b[39m,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0ms3_output_path\u001b[0m\u001b[39m=\u001b[0m\u001b[38;2;0;135;0m's3://sagemaker-us-east-1-529088288990/eval'\u001b[0m\u001b[39m,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mmlflow_resource_arn\u001b[0m\u001b[39m=\u001b[0m\u001b[38;2;0;135;0m'arn:aws:sagemaker:us-east-1:529088288990:mlflow-app/app-2SV6Q35HTCBO'\u001b[0m\u001b[39m,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mmlflow_experiment_name\u001b[0m\u001b[39m=\u001b[0m\u001b[38;2;0;135;0m'test-eval-models-exp'\u001b[0m\u001b[39m,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mmlflow_run_name\u001b[0m\u001b[39m=\u001b[0m\u001b[3;38;2;225;0;225mNone\u001b[0m\u001b[39m,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mnetworking\u001b[0m\u001b[39m=\u001b[0m\u001b[3;38;2;225;0;225mNone\u001b[0m\u001b[39m,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mkms_key_id\u001b[0m\u001b[39m=\u001b[0m\u001b[3;38;2;225;0;225mNone\u001b[0m\u001b[39m,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mmodel_package_group\u001b[0m\u001b[39m=\u001b[0m\u001b[38;2;0;135;0m'arn:aws:sagemaker:us-east-1:529088288990:model-package-group/model-package-group-nova13'\u001b[0m\u001b[39m,\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mbenchmark\u001b[0m\u001b[39m=<_Benchmark.GPQA: \u001b[0m\u001b[38;2;0;135;0m'gpqa'\u001b[0m\u001b[1m>\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0msubtasks\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mevaluate_base_model\u001b[0m=\u001b[3;38;2;215;0;0mFalse\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], "source": [ "from sagemaker.train.evaluate import BenchMarkEvaluator\n", "\n", "# Create evaluator with GEN_QA benchmark\n", "# These values match our successfully tested configuration\n", "evaluator = BenchMarkEvaluator(\n", - " benchmark=Benchmark.GPQA,\n", - " model=\"arn:aws:sagemaker:us-east-1:529088288990:model-package/model-package-group-nova6/1\",\n", - " s3_output_path=\"s3://sagemaker-us-east-1-529088288990/eval\",\n", - " mlflow_experiment_name=\"test-eval-models-exp\", \n", - " # mlflow_resource_arn=\"arn:aws:sagemaker:us-west-2:052150106756:mlflow-tracking-server/mmlu-eval-experiment\",\n", - " dataset=\"s3://sagemaker-us-east-1-529088288990/eval.jsonl\",\n", - " model_package_group=\"arn:aws:sagemaker:us-east-1:529088288990:model-package-group/model-package-group-nova13\", # Optional inferred from model if model package\n", - " base_eval_name=\"gpqa-eval-demo\",\n", + " benchmark=Benchmark.GEN_QA,\n", + " model=\"arn:aws:sagemaker:us-west-2:052150106756:model-package/test-finetuned-models-gamma/28\",\n", + " s3_output_path=\"s3://mufi-test-serverless-smtj/eval/\",\n", + " mlflow_resource_arn=\"arn:aws:sagemaker:us-west-2:052150106756:mlflow-tracking-server/mmlu-eval-experiment\",\n", + " dataset=\"s3://sagemaker-us-west-2-052150106756/studio-users/d20251107t195443/datasets/2025-11-07T19-55-37-609Z/zc_test.jsonl\",\n", + " model_package_group=\"arn:aws:sagemaker:us-west-2:052150106756:model-package-group/example-name-aovqo\", # Optional inferred from model if model package\n", + " base_eval_name=\"gen-qa-eval-demo\",\n", " # Note: sagemaker_session is optional and will be auto-created if not provided\n", " # Note: region is optional and will be auto deduced using environment variables - SAGEMAKER_REGION, AWS_REGION\n", ")\n", @@ -341,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -368,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -404,23 +155,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
{'max_new_tokens': '8196', 'temperature': '0', 'top_k': '-1', 'top_p': '1.0'}\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m{\u001b[0m\u001b[38;2;0;135;0m'max_new_tokens'\u001b[0m: \u001b[38;2;0;135;0m'8196'\u001b[0m, \u001b[38;2;0;135;0m'temperature'\u001b[0m: \u001b[38;2;0;135;0m'0'\u001b[0m, \u001b[38;2;0;135;0m'top_k'\u001b[0m: \u001b[38;2;0;135;0m'-1'\u001b[0m, \u001b[38;2;0;135;0m'top_p'\u001b[0m: \u001b[38;2;0;135;0m'1.0'\u001b[0m\u001b[1m}\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "pprint(evaluator.hyperparameters.to_dict())\n", "\n", @@ -451,48 +188,9 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
BenchmarkEvaluationExecution(\n",
-       "arn='arn:aws:sagemaker:us-east-1:529088288990:pipeline/SagemakerEvaluation-BenchmarkEvaluation-d47edc79-1d45-4b8f-9ff6-df8e2c8c92be/execution/ab0k65fkztid',\n",
-       "name='gpqa-eval-demo',\n",
-       "status=PipelineExecutionStatus(overall_status='Executing', step_details=[], failure_reason=None),\n",
-       "last_modified_time=datetime.datetime(2026, 3, 10, 17, 48, 10, 308000, tzinfo=tzlocal()),\n",
-       "eval_type=<EvalType.BENCHMARK: 'benchmark'>,\n",
-       "s3_output_path='s3://sagemaker-us-east-1-529088288990/eval',\n",
-       "steps=[]\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;38;2;225;0;225mBenchmarkEvaluationExecution\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0marn\u001b[0m=\u001b[38;2;0;135;0m'arn:aws:sagemaker:us-east-1:529088288990:pipeline/SagemakerEvaluation-BenchmarkEvaluation-d47edc79-1d45-4b8f-9ff6-df8e2c8c92be/execution/ab0k65fkztid'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mname\u001b[0m=\u001b[38;2;0;135;0m'gpqa-eval-demo'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mstatus\u001b[0m=\u001b[1;38;2;225;0;225mPipelineExecutionStatus\u001b[0m\u001b[1m(\u001b[0m\u001b[38;2;215;175;0moverall_status\u001b[0m=\u001b[38;2;0;135;0m'Executing'\u001b[0m, \u001b[38;2;215;175;0mstep_details\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m, \u001b[38;2;215;175;0mfailure_reason\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mlast_modified_time\u001b[0m=\u001b[1;38;2;225;0;225mdatetime\u001b[0m\u001b[1;38;2;225;0;225m.datetime\u001b[0m\u001b[1m(\u001b[0m\u001b[1;36m2026\u001b[0m, \u001b[1;36m3\u001b[0m, \u001b[1;36m10\u001b[0m, \u001b[1;36m17\u001b[0m, \u001b[1;36m48\u001b[0m, \u001b[1;36m10\u001b[0m, \u001b[1;36m308000\u001b[0m, \u001b[38;2;215;175;0mtzinfo\u001b[0m=\u001b[1;38;2;225;0;225mtzlocal\u001b[0m\u001b[1m(\u001b[0m\u001b[1m)\u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0meval_type\u001b[0m=\u001b[1m<\u001b[0m\u001b[1;38;2;225;0;225mEvalType.BENCHMARK:\u001b[0m\u001b[39m \u001b[0m\u001b[38;2;0;135;0m'benchmark'\u001b[0m\u001b[1m>\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0ms3_output_path\u001b[0m=\u001b[38;2;0;135;0m's3://sagemaker-us-east-1-529088288990/eval'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0msteps\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Pipeline Execution ARN: arn:aws:sagemaker:us-east-1:529088288990:pipeline/SagemakerEvaluation-BenchmarkEvaluation-d47edc79-1d45-4b8f-9ff6-df8e2c8c92be/execution/ab0k65fkztid\n", - "Initial Status: Executing\n" - ] - } - ], + "outputs": [], "source": [ "# Run evaluation with configured parameters\n", "execution = evaluator.evaluate()\n", @@ -513,7 +211,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -533,79 +231,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
PipelineExecutionStatus(\n",
-       "overall_status='Executing',\n",
-       "step_details=[\n",
-       "│   │   StepDetail(\n",
-       "│   │   │   name='CreateEvaluationAction',\n",
-       "│   │   │   status='Starting',\n",
-       "│   │   │   start_time='2026-03-10T17:48:10.807000-07:00',\n",
-       "│   │   │   end_time=None,\n",
-       "│   │   │   display_name=None,\n",
-       "│   │   │   failure_reason=None,\n",
-       "│   │   │   job_arn=None\n",
-       "│   │   ),\n",
-       "│   │   StepDetail(\n",
-       "│   │   │   name='EvaluateCustomModel',\n",
-       "│   │   │   status='Starting',\n",
-       "│   │   │   start_time='2026-03-10T17:48:10.807000-07:00',\n",
-       "│   │   │   end_time=None,\n",
-       "│   │   │   display_name=None,\n",
-       "│   │   │   failure_reason=None,\n",
-       "│   │   │   job_arn=None\n",
-       "│   │   )\n",
-       "],\n",
-       "failure_reason=None\n",
-       ")\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1;38;2;225;0;225mPipelineExecutionStatus\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0moverall_status\u001b[0m=\u001b[38;2;0;135;0m'Executing'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mstep_details\u001b[0m=\u001b[1m[\u001b[0m\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[1;38;2;225;0;225mStepDetail\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mname\u001b[0m=\u001b[38;2;0;135;0m'CreateEvaluationAction'\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mstatus\u001b[0m=\u001b[38;2;0;135;0m'Starting'\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mstart_time\u001b[0m=\u001b[38;2;0;135;0m'2026-03-10T17:48:10.807000-07:00'\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mend_time\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mdisplay_name\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mfailure_reason\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mjob_arn\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m,\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[1;38;2;225;0;225mStepDetail\u001b[0m\u001b[1m(\u001b[0m\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mname\u001b[0m=\u001b[38;2;0;135;0m'EvaluateCustomModel'\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mstatus\u001b[0m=\u001b[38;2;0;135;0m'Starting'\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mstart_time\u001b[0m=\u001b[38;2;0;135;0m'2026-03-10T17:48:10.807000-07:00'\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mend_time\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mdisplay_name\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mfailure_reason\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m,\n", - "\u001b[2;32m│ │ │ \u001b[0m\u001b[38;2;215;175;0mjob_arn\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m\n", - "\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[1m]\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;215;175;0mfailure_reason\u001b[0m=\u001b[3;38;2;225;0;225mNone\u001b[0m\n", - "\u001b[1m)\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Step Details:\n", - " CreateEvaluationAction: Starting\n", - " EvaluateCustomModel: Starting\n" - ] - } - ], + "outputs": [], "source": [ "# Refresh status\n", "execution.refresh()\n", @@ -631,128 +259,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
╭─────────────────────────────────────────── Pipeline Execution Status ───────────────────────────────────────────╮\n",
-       "  Overall Status        Executing                                                                                \n",
-       "  Target Status         Succeeded                                                                                \n",
-       "  Elapsed Time          33.4s                                                                                    \n",
-       "                                                                                                                 \n",
-       " Pipeline Steps                                                                                                  \n",
-       "  Step Name                       Status           Duration                                                      \n",
-       "  CreateEvaluationAction          Succeeded        1.1s                                                          \n",
-       "  EvaluateCustomModel             Executing        Running...                                                    \n",
-       "                                                                                                                 \n",
-       " Job ARNs                                                                                                        \n",
-       "  Step                  Job Link      Job ARN                                                                    \n",
-       "  EvaluateCustomModel   🔗 link       arn:aws:sagemaker:us-east-1:529088288990:training-job/pipelines-ab0k65fkz  \n",
-       "                                      tid-EvaluateCustomModel-5XaO1EW1hQ                                         \n",
-       "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[34m╭─\u001b[0m\u001b[34m──────────────────────────────────────────\u001b[0m\u001b[34m \u001b[0m\u001b[1;34mPipeline Execution Status\u001b[0m\u001b[34m \u001b[0m\u001b[34m──────────────────────────────────────────\u001b[0m\u001b[34m─╮\u001b[0m\n", - "\u001b[34m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mOverall Status \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;37mExecuting\u001b[0m\u001b[37m \u001b[0m \u001b[34m│\u001b[0m\n", - "\u001b[34m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mTarget Status \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;37mSucceeded\u001b[0m\u001b[37m \u001b[0m \u001b[34m│\u001b[0m\n", - "\u001b[34m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mElapsed Time \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[37m33.4s \u001b[0m\u001b[37m \u001b[0m \u001b[34m│\u001b[0m\n", - "\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", - "\u001b[34m│\u001b[0m \u001b[1;35mPipeline Steps\u001b[0m \u001b[34m│\u001b[0m\n", - "\u001b[34m│\u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mStep Name \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35mStatus \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35mDuration \u001b[0m\u001b[1;35m \u001b[0m \u001b[34m│\u001b[0m\n", - "\u001b[34m│\u001b[0m \u001b[36m \u001b[0m\u001b[36mCreateEvaluationAction \u001b[0m\u001b[36m \u001b[0m\u001b[33m \u001b[0m\u001b[32mSucceeded\u001b[0m\u001b[33m \u001b[0m\u001b[33m \u001b[0m\u001b[32m \u001b[0m\u001b[32m1.1s \u001b[0m\u001b[32m \u001b[0m \u001b[34m│\u001b[0m\n", - "\u001b[34m│\u001b[0m \u001b[36m \u001b[0m\u001b[36mEvaluateCustomModel \u001b[0m\u001b[36m \u001b[0m\u001b[33m \u001b[0m\u001b[33mExecuting\u001b[0m\u001b[33m \u001b[0m\u001b[33m \u001b[0m\u001b[32m \u001b[0m\u001b[32mRunning... \u001b[0m\u001b[32m \u001b[0m \u001b[34m│\u001b[0m\n", - "\u001b[34m│\u001b[0m \u001b[34m│\u001b[0m\n", - "\u001b[34m│\u001b[0m \u001b[1;35mJob ARNs\u001b[0m \u001b[34m│\u001b[0m\n", - "\u001b[34m│\u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mStep \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35mJob Link \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35mJob ARN \u001b[0m\u001b[1;35m \u001b[0m \u001b[34m│\u001b[0m\n", - "\u001b[34m│\u001b[0m \u001b[36m \u001b[0m\u001b[36mEvaluateCustomModel \u001b[0m\u001b[36m \u001b[0m \u001b]8;id=840181;https://studio-d-ingyyaeglvki.studio.us-east-1.sagemaker.aws/jobs/train/pipelines-ab0k65fkztid-EvaluateCustomModel-5XaO1EW1hQ\u001b\\🔗 link\u001b]8;;\u001b\\ \u001b[2m \u001b[0m\u001b[2marn:aws:sagemaker:us-east-1:529088288990:training-job/pipelines-ab0k65fkz\u001b[0m\u001b[2m \u001b[0m \u001b[34m│\u001b[0m\n", - "\u001b[34m│\u001b[0m \u001b[36m \u001b[0m \u001b[2m \u001b[0m\u001b[2mtid-EvaluateCustomModel-5XaO1EW1hQ \u001b[0m\u001b[2m \u001b[0m \u001b[34m│\u001b[0m\n", - "\u001b[34m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
-       " in <module>:3                                                                                    \n",
-       "                                                                                                  \n",
-       "   1 # Wait for job completion with progress updates                                              \n",
-       "   2 # This will show a rich progress display in Jupyter                                          \n",
-       " 3 execution.wait(target_status=\"Succeeded\", poll=5, timeout=3600)                              \n",
-       "   4                                                                                              \n",
-       "   5 print(f\"\\nFinal Status: {execution.status.overall_status}\")                                  \n",
-       "   6                                                                                              \n",
-       "                                                                                                  \n",
-       " /Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk/sagemaker-core/src/sagemaker/core/teleme \n",
-       " try/telemetry_logging.py:180 in wrapper                                                          \n",
-       "                                                                                                  \n",
-       "   177 │   │   │   │   │   \"sagemaker_session is not provided or not valid.\",                     \n",
-       "   178 │   │   │   │   │   func_name,                                                             \n",
-       "   179 │   │   │   │   )                                                                          \n",
-       " 180 │   │   │   │   return func(*args, **kwargs)                                               \n",
-       "   181 │   │                                                                                      \n",
-       "   182 │   │   return wrapper                                                                     \n",
-       "   183                                                                                            \n",
-       "                                                                                                  \n",
-       " /Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-train/src/sagemaker/trai \n",
-       " n/evaluate/execution.py:1061 in wait                                                             \n",
-       "                                                                                                  \n",
-       "   1058 │   │   │   │   │   │   status=current_status                                             \n",
-       "   1059 │   │   │   │   │   )                                                                     \n",
-       "   1060 │   │   │   │                                                                             \n",
-       " 1061 │   │   │   │   time.sleep(poll)                                                          \n",
-       "   1062 │   │   else:                                                                             \n",
-       "   1063 │   │   │   # Terminal experience with rich library                                       \n",
-       "   1064 │   │   │   try:                                                                          \n",
-       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
-       "KeyboardInterrupt\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[38;2;255;0;0m╭─\u001b[0m\u001b[38;2;255;0;0m──────────────────────────────\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[1;38;2;255;0;0mTraceback \u001b[0m\u001b[1;2;38;2;255;0;0m(most recent call last)\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[38;2;255;0;0m───────────────────────────────\u001b[0m\u001b[38;2;255;0;0m─╮\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m in \u001b[92m\u001b[0m:\u001b[94m3\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1 \u001b[0m\u001b[2m# Wait for job completion with progress updates\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m2 \u001b[0m\u001b[2m# This will show a rich progress display in Jupyter\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m3 execution.wait(target_status=\u001b[33m\"\u001b[0m\u001b[33mSucceeded\u001b[0m\u001b[33m\"\u001b[0m, poll=\u001b[94m5\u001b[0m, timeout=\u001b[94m3600\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m4 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m5 \u001b[0m\u001b[96mprint\u001b[0m(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33m\\n\u001b[0m\u001b[33mFinal Status: \u001b[0m\u001b[33m{\u001b[0mexecution.status.overall_status\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m6 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk/sagemaker-core/src/sagemaker/core/teleme\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mtry/\u001b[0m\u001b[1;33mtelemetry_logging.py\u001b[0m:\u001b[94m180\u001b[0m in \u001b[92mwrapper\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m177 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[33m\"\u001b[0m\u001b[33msagemaker_session is not provided or not valid.\u001b[0m\u001b[33m\"\u001b[0m, \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m178 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mfunc_name, \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m179 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m180 \u001b[2m│ │ │ │ \u001b[0m\u001b[94mreturn\u001b[0m func(*args, **kwargs) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m181 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m182 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m wrapper \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m183 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-train/src/sagemaker/trai\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mn/evaluate/\u001b[0m\u001b[1;33mexecution.py\u001b[0m:\u001b[94m1061\u001b[0m in \u001b[92mwait\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1058 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0mstatus=current_status \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1059 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1060 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m1061 \u001b[2m│ │ │ │ \u001b[0mtime.sleep(poll) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1062 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1063 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[2m# Terminal experience with rich library\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1064 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", - "\u001b[1;91mKeyboardInterrupt\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Wait for job completion with progress updates\n", "# This will show a rich progress display in Jupyter\n", @@ -829,13 +338,13 @@ "\n", "# Get an existing job by ARN\n", "# Replace with your actual pipeline execution ARN\n", - "existing_arn = \"arn:aws:sagemaker:us-east-1:529088288990:pipeline/SagemakerEvaluation-BenchmarkEvaluation-d47edc79-1d45-4b8f-9ff6-df8e2c8c92be/execution/bhtdxm5tenya\"\n", + "existing_arn = \"arn:aws:sagemaker:us-west-2:052150106756:pipeline/SagemakerEvaluation-BenchmarkEvaluation-c344c91d-6f62-4907-85cc-7e6b29171c42/execution/inlsexrd7jes\"\n", "\n", "# base model only example\n", "# existing_arn = \"arn:aws:sagemaker:us-west-2:052150106756:pipeline/SagemakerEvaluation-benchmark/execution/gdp9f4dbv2vi\"\n", "existing_execution = EvaluationPipelineExecution.get(\n", " arn=existing_arn,\n", - " region=\"us-east-1\"\n", + " region=\"us-west-2\"\n", ")\n", "\n", "pprint(existing_execution)\n", diff --git a/v3-examples/model-customization-examples/dpo_trainer_example_notebook_v3_prod.ipynb b/v3-examples/model-customization-examples/dpo_trainer_example_notebook_v3_prod.ipynb index 130991834a..e5fbe4cd99 100644 --- a/v3-examples/model-customization-examples/dpo_trainer_example_notebook_v3_prod.ipynb +++ b/v3-examples/model-customization-examples/dpo_trainer_example_notebook_v3_prod.ipynb @@ -58,8 +58,8 @@ "# This creates a versioned dataset that can be referenced by ARN\n", "# Provide a source (it can be local file path or S3 URL)\n", "dataset = DataSet.create(\n", - " name=\"demo-2\",\n", - " source=\"s3://sagemaker-us-west-2-529088288990/dataset/preference_dataset_train_256.jsonl\"\n", + " name=\"demo-6\",\n", + " source=\"s3://nova-mlflow-us-west-2/dataset/preference_dataset_train_256.jsonl\"\n", ")\n", "\n", "print(f\"Dataset ARN: {dataset.arn}\")" @@ -292,7 +292,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.12.10" } }, "nbformat": 4, diff --git a/v3-examples/model-customization-examples/sft_finetuning_example_notebook_pysdk_prod_v3.ipynb b/v3-examples/model-customization-examples/sft_finetuning_example_notebook_pysdk_prod_v3.ipynb index 7b4348d0e4..946debc7d7 100644 --- a/v3-examples/model-customization-examples/sft_finetuning_example_notebook_pysdk_prod_v3.ipynb +++ b/v3-examples/model-customization-examples/sft_finetuning_example_notebook_pysdk_prod_v3.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "87aa2004556ad7c6", "metadata": {}, "outputs": [], @@ -35,30 +35,10 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "989646bf", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "logging.basicConfig(level=logging.WARNING)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "a51be0b5-fd33-4fa0-af2b-d08ce0dc7a8e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml\n", - "sagemaker.config INFO - Not applying SDK defaults from location: /Users/mollyhe/Library/Application Support/sagemaker/config.yaml\n" - ] - } - ], + "outputs": [], "source": [ "from sagemaker.train.sft_trainer import SFTTrainer\n", "from sagemaker.train.common import TrainingType\n", @@ -73,7 +53,7 @@ "\n", "\n", "# For MLFlow native metrics in Trainer wait, run below line with approriate region\n", - "os.environ[\"SAGEMAKER_MLFLOW_CUSTOM_ENDPOINT\"] = \"https://mlflow.sagemaker.us-east-1.app.aws\"\n", + "os.environ[\"SAGEMAKER_MLFLOW_CUSTOM_ENDPOINT\"] = \"https://mlflow.sagemaker.us-west-2.app.aws\"\n", "\n" ] }, @@ -96,55 +76,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "ef4f0e61-de4d-4228-b7a1-ea7497dad547", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e98bb5da0450400ab462efafe4fd3bb8", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
Final Resource Status: Available\n",
-       "
\n" - ], - "text/plain": [ - "Final Resource Status: Available\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Dataset ARN: arn:aws:sagemaker:us-east-1:529088288990:hub-content/G82247CBEQ6TN0FI3J1SAJU59UURMT3H3EKG48C6VQGDINNRNGU0/DataSet/demo-1/45.0.0\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from sagemaker.ai_registry.dataset import DataSet\n",
     "from sagemaker.ai_registry.dataset_utils import CustomizationTechnique\n",
@@ -156,7 +91,7 @@
     "# Provide a source (it can be local file path or S3 URL)\n",
     "dataset = DataSet.create(\n",
     "    name=\"demo-1\",\n",
-    "    source=\"s3://sagemaker-us-east-1-529088288990/nova1_SFT.jsonl\"\n",
+    "    source=\"s3://mc-flows-sdk-testing/input_data/sft/sample_data_256_final.jsonl\"\n",
     ")\n",
     "\n",
     "print(f\"Dataset ARN: {dataset.arn}\")"
@@ -172,23 +107,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "d6937550-f721-43ff-82dd-c513c328dd17",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "WARNING:sagemaker.core.utils.utils:No region provided. Using default region.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from sagemaker.core.resources import ModelPackage, ModelPackageGroup\n",
     "\n",
-    "# model_package_group=ModelPackageGroup.create(model_package_group_name=\"model-package-group-nova13\")\n",
-    "model_package_group=ModelPackageGroup.get(model_package_group_name=\"model-package-group-nova13\")"
+    "model_package_group=ModelPackageGroup.create(model_package_group_name=\"test-model-package-group\")"
    ]
   },
   {
@@ -224,20 +150,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "id": "88fe8360-de50-481d-932f-564a32be66a0",
    "metadata": {},
    "outputs": [],
    "source": [
     "# For fine-tuning \n",
     "sft_trainer = SFTTrainer(\n",
-    "    model=\"nova-textgeneration-lite\", \n",
-    "    training_type=TrainingType.FULL, \n",
+    "    model=\"meta-textgeneration-llama-3-2-1b-instruct\", \n",
+    "    training_type=TrainingType.LORA, \n",
     "    model_package_group=model_package_group, # or use an existing model package group arn\n",
     "    mlflow_experiment_name=\"test-finetuned-models-exp\", \n",
     "    mlflow_run_name=\"test-finetuned-models-run\", \n",
     "    training_dataset=dataset.arn, \n",
-    "    s3_output_path=\"s3://sagemaker-us-east-1-529088288990/output/\",\n",
+    "    s3_output_path=\"s3://mc-flows-sdk-testing/output/\",\n",
     "    accept_eula=True\n",
     ")\n"
    ]
@@ -254,45 +180,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "de183042-bb92-4947-9acd-78d7231bda13",
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Default Finetuning Options:\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "
{\n",
-       "'name': 'my-fullrank-run-k3174',\n",
-       "'global_batch_size': '16',\n",
-       "'max_epochs': '1',\n",
-       "'learning_rate': '5e-06',\n",
-       "'max_context_length': '8192'\n",
-       "}\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m{\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'name'\u001b[0m: \u001b[38;2;0;135;0m'my-fullrank-run-k3174'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'global_batch_size'\u001b[0m: \u001b[38;2;0;135;0m'16'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'max_epochs'\u001b[0m: \u001b[38;2;0;135;0m'1'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'learning_rate'\u001b[0m: \u001b[38;2;0;135;0m'5e-06'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'max_context_length'\u001b[0m: \u001b[38;2;0;135;0m'8192'\u001b[0m\n", - "\u001b[1m}\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "print(\"Default Finetuning Options:\")\n", "pprint(sft_trainer.hyperparameters.to_dict()) # rename as hyperparameters" @@ -300,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "6b57838f-81ac-4fbe-9ddf-5588e42bcce1", "metadata": { "scrolled": true @@ -308,7 +201,7 @@ "outputs": [], "source": [ "# To update any hyperparameter, simply assign the value, example:\n", - "sft_trainer.hyperparameters.global_batch_size=32" + "sft_trainer.hyperparameters.global_batch_size=16" ] }, { @@ -321,7 +214,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "4d3b6441-9abb-447b-9307-9606a8c0fabd", "metadata": { "jupyter": { @@ -329,243 +222,13 @@ }, "scrolled": true }, - "outputs": [ - { - "data": { - "text/html": [ - "
╭────────────────────────────────── Training Job Status ───────────────────────────────────╮\n",
-       "  TrainingJob Name      🔗 nova-textgeneration-lite-sft-20260310173212                    \n",
-       "  TrainingJob ARN       arn:aws:sagemaker:us-east-1:529088288990:training-job/nova-textg  \n",
-       "                        eneration-lite-sft-20260310173212                                 \n",
-       "  MLflow Experiment     🔗 test-finetuned-models-exp                                      \n",
-       "                                                                                          \n",
-       "  Job Status            InProgress                                                        \n",
-       "  Secondary Status      Training                                                          \n",
-       "  Elapsed Time          990.7s                                                            \n",
-       "                                                                                          \n",
-       " Status Transitions                                                                       \n",
-       "                                                                                          \n",
-       "        Step              Details                               Duration                  \n",
-       "  ───────────────────────────────────────────────────────────────────────────             \n",
-       "    Starting          Starting the training job             4.7s                      \n",
-       "    Pending           Preparing the instances for           656.3s                    \n",
-       "                          training                                                        \n",
-       "    Downloading       Downloading the training image        253.4s                    \n",
-       "    Training          Training image download completed.    Running...                \n",
-       "                          Training in progress.                                           \n",
-       "                                                                                          \n",
-       "╰──────────────────────────────────────────────────────────────────────────────────────────╯\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[38;5;172m╭─\u001b[0m\u001b[38;5;172m─────────────────────────────────\u001b[0m\u001b[38;5;172m \u001b[0m\u001b[1;94mTraining Job Status\u001b[0m\u001b[38;5;172m \u001b[0m\u001b[38;5;172m──────────────────────────────────\u001b[0m\u001b[38;5;172m─╮\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mTrainingJob Name \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b]8;id=225350;https://studio-d-ingyyaeglvki.studio.us-east-1.sagemaker.aws/jobs/train/nova-textgeneration-lite-sft-20260310173212\u001b\\\u001b[37m🔗 nova-textgeneration-lite-sft-20260310173212\u001b[0m\u001b]8;;\u001b\\\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mTrainingJob ARN \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[2;37marn:aws:sagemaker:us-east-1:529088288990:training-job/nova-textg\u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[2;37meneration-lite-sft-20260310173212\u001b[0m\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mMLflow Experiment \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b]8;id=448259;https://app-2SV6Q35HTCBO.mlflow.sagemaker.us-east-1.app.aws/auth?authToken=eyJhbGciOiJIUzI1NiJ9.eyJhdXRoVG9rZW5JZCI6IlRZTENXRiIsImZhc0NyZWRlbnRpYWxzIjoiQWdWNEs1RzV1QVRWaHc1T21BK041d1VlQkFpOGtrQVVBMWpWWjlGZFpUMjc5UFVBWHdBQkFCVmhkM010WTNKNWNIUnZMWEIxWW14cFl5MXJaWGtBUkVGeE9IRjRZa04xWm1FMmVFWnROWFJDUTJSUGIydGhVazFXYm1FNWNIUlhWM1JUSzFCdWFHVTJkMFJFYmpOWmRFVlRjbVEzTTJSMWJEZHpTelY1ZVZCdFFUMDlBQUVBQjJGM2N5MXJiWE1BUzJGeWJqcGhkM002YTIxek9uVnpMV1ZoYzNRdE1Ub3pNVFF4TkRZek1EWTBPREk2YTJWNUx6Y3dOMkpoTmpjeExUUXpZamd0TkRFeU5DMWhaVFUzTFRrMFlqTXdZbUptT1RJNU13QzRBUUlCQUhnQjRVMDBTK3ErVE51d1gydlFlaGtxQnVneWQ3YnNrb0pWdWQ2NmZjVENVd0dxdHBxTHRyRmNNVTVGNzVWampMTENBQUFBZmpCOEJna3Foa2lHOXcwQkJ3YWdiekJ0QWdFQU1HZ0dDU3FHU0liM0RRRUhBVEFlQmdsZ2hrZ0JaUU1FQVM0d0VRUU1Tek05SFM3Wjl2MkFlNnBnQWdFUWdEdEsxTkVFSm9weW96cWVtcmNEM05kUDdwcERFclplMmVFRWx2UFVhY2JzNTZnekF2USs2eWl1cmR0d0pwYjZydTBoSldweDBiQnR4ZHJqR0FJQUFCQUFzVW5xcysxdnpBV1luNFducGhQSENXb2YyamFkOHhqYWg2Zmg3OUl6Z0FEdE5lMno1eDAwUkF0OXRrRGRIWkMzLy8vLy93QUFBQUVBQUFBQUFBQUFBQUFBQUFFQUFBUkRNT01zOUUyaENlOUpCc0RraWFTMG8wam54bnYzVzVwUU5Fak15bXVsS1JtZDdlRlJTWkJKQmRRTDJ0eUdZdTNmUStQZFhjeWFNYk5FcUN6VTZaM3RPM3c3RGlIVHV6Q0gvamFJNWNjaFRIL1hIdWZsc0NpS2NwaUphMVV5MEZESVhDc2FDKzVIbVBlUmloZHNkdEFibk5LRGp3WXB6NEYwZklVc2VVSmFDTm10NkxmdjM5Ky9VTktVbU9PSXFqQWJvUllqVTh6eW9Bdnc3cUUrMVptbXA2YWtrVXcvVzBNZGJmRGtsK0U4bmFJNG5NZEtkNDVCU2hpYXR1YUpNUnhZbTd0UFRvZHdJRTdCWWVOaUd4R0thSkZiVkNmTXg1WXNqZ1VZcHlTbUF2ZzU3RTdtOGw0cWp0ZnlGVDNKT2NQaUwra3c4NWdrQ1lIclpoYnVhTzlvSzFUVS96QUxPb0dmOHlQbEN5Ym1kVFJReEE2V3ZQREROWkJFcWg5YlZUSGNOQnk2K1JPWjBOWDVMYUNtdDYxY1Y0SUZtNlFRVTFCRVBmWFhIVFhqd3ZYMEVLUXpNM2VjOTNhRHVKcWJFMldFaXl3VHE2R3dNSTR3WG9YMzUwb1NZc2VVZ2RwcWNPWFgxME1tU2dtQUtuUnZrQkc5NElyUzNTV1BIZ0NKb3A3K3pxNVlIZE1EcXVpR04zVlJhWHZDRFlRK0dQRHlOWFBnc0NsZE9rdWdPWmdVV0FVNTloOHp2VGJlOGY2ZmdDdmNCcnVsWTFKeWl4WGpESnd2WTJBeXRzMkdncUh1Z25BZWlwWUdldkVnN2hsV2pvQmwwY3ZxR0w0WGtlMFY4WGpXT1Q2MW1EVVdXOVlwNi9HdXlaUUVKdHZQaFBzT2ZTTjNSamdwaXVNWGM0UzdmR2pxbjYzY3Vnenl3cVRYcDVmaEtJUVFMUzBaWEVPUjdMT2V3WTl4WlczQTgyd2k3MnYyR2FNS1JRSy9MNFZNSjl3QWQ3bWtzQ21OZVNKeVlaS0RlQlVwSGoyYS83dHNQNVNDdXNMMitnaHZLbW03Q3RWQnRLbjlDZ1NJUVBkWEU4U1drai9CVkx0SEp0VG1iSlhXSlNYS2tkRjgwVitud2VaR0FTMUl6RHZNWnlmYXY3WTdzTGlHdWJsRHoxenh3WkxZZ3JHMy9ObUVVbWdxM0hXNDF1QzVWK1NKWE10WEdCbk13SUZ2aS95ZVcya1ZPT2htTWFJV1hXSHhYK2p5cVl0NGl0amVZYWZiZW5qUytCMUJOdERBT29JS0ZMck9rbUYzZTRkcEdCaG9wSXZpenIxcG1TRXNZVFVrOGQwTE5TWEt4Mm41QzNNV3krc1JEUmRtOHhTaXBxYmhvbzdqbWRzQ3Fkb3BvNm5qYkl2SU00M2V5eGF3WC9ZT1o0YXhYTTNGTGZ0Q1M0UWZNL0ZNNGhPRUNXdHRtYVlQN0ZUTjIwUzRWMTB0QUNVbzJERHdoOU1nRTBGZlBucUpXOEw2SzJqVmhSQjRiVjhYZEd0OVNKdWt3N3RSZnhjVU1sM0Vmd2YxWU50YzJ6L24xYzhHU0VtSC9SRUNrc3IwaHRDRmhaZXZBZGR1Qm5GTmxlQ3l5MXdaWnR2VGM4K2QzK1hwU3laOUk1Qk5ndzVyMTF5aUlEaENaWlU4WGxjVmtkNkdqaEkveWxrb2swVGpBTFFMYk83MjcyVEsxbXJsb3NLODkxaEN1RTUrS2s5UktYMktIZ3BsZEVDUU1jeWwvWjBXTy9vUGVQejNKUnMzSk9VUkpBV3k4ZlJZWDlCMm0zOE9yenFzaGtDOVkvN2xHQmVKKzU1Zit2NVo3MWpLZmN6SUE1bEN5YjNqdm1sZjhELy9jMCtEYys3QWgyK2hTSnozYnZYWnBZSFVNdGFzdlhSVEVUUlRVT284am04WVpGSTR4UmtPTGdtQ0FBSzdiMFhIUkMwUVMySDBBR2N3WlFJd1ZRczVHT1I3OW5YV044MldXV3F3dnNjY1p1ZmduZlFRbkRrdjg4bEg3V0dFalpva3dTR3QvWDY3YjRBWDRqSk9BakVBOXJRKzNGOWN4M3FiRWFhcHo3TWhIVHpRRFpKdEJQc201dHIzR08yQ293Sk02UnFVYWRRaU5jNGlBTWRXQjVkVCIsImNpcGhlclRleHQiOiJBUUlCQUhnQjRVMDBTK3ErVE51d1gydlFlaGtxQnVneWQ3YnNrb0pWdWQ2NmZjVENVd0UwS2ZNVEVXU0RzdHE5aWRmWk1zSGRBQUFBb2pDQm53WUpLb1pJaHZjTkFRY0dvSUdSTUlHT0FnRUFNSUdJQmdrcWhraUc5dzBCQndFd0hnWUpZSVpJQVdVREJBRXVNQkVFRExMY0xVTXkrSGpuWlB4YTFBSUJFSUJiMFhWc0RjcWovZVN5WVh5QlJKMFdnQ2xmT0RVY2dLWmJSUlVlNkc1OCsxZkNaTDU2TmoydEx3c1U4V0lwUkVqNkxya1hBb0pnRWVtTUZLMnJwcTVpZmJrMHgySjEwYVd2aFZSQWtMS2lFTDZDMmxJRDhvVEhSSko5Qnc9PSIsInN1YiI6ImFybjphd3M6c2FnZW1ha2VyOnVzLWVhc3QtMTo1MjkwODgyODg5OTA6bWxmbG93LWFwcC9hcHAtMlNWNlEzNUhUQ0JPIiwiaWF0IjoxNzczMTkwMTAxLCJleHAiOjE3NzMxOTA0MDF9._WYatAneAubx7fMUElOyogdDg5Ktk_Y_AT-tgBKt1bQ#/experiments/2\u001b\\\u001b[37m🔗 test-finetuned-models-exp\u001b[0m\u001b]8;;\u001b\\\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mJob Status \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;38;5;172mInProgress\u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mSecondary Status \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;33mTraining\u001b[0m\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mElapsed Time \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;91m990.7s\u001b[0m\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;35mStatus Transitions\u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mStep \u001b[0m\u001b[1;35m \u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mDetails \u001b[0m\u001b[1;35m \u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mDuration \u001b[0m\u001b[1;35m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m ─────────────────────────────────────────────────────────────────────────── \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m\u001b[32m✓ \u001b[0m\u001b[32m \u001b[0m \u001b[36m \u001b[0m\u001b[36mStarting \u001b[0m\u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mStarting the training job \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m\u001b[32m4.7s \u001b[0m\u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m\u001b[32m✓ \u001b[0m\u001b[32m \u001b[0m \u001b[36m \u001b[0m\u001b[36mPending \u001b[0m\u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mPreparing the instances for \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m\u001b[32m656.3s \u001b[0m\u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m \u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mtraining \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m\u001b[32m✓ \u001b[0m\u001b[32m \u001b[0m \u001b[36m \u001b[0m\u001b[36mDownloading \u001b[0m\u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mDownloading the training image \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m\u001b[32m253.4s \u001b[0m\u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m\u001b[32m⋯ \u001b[0m\u001b[32m \u001b[0m \u001b[36m \u001b[0m\u001b[36mTraining \u001b[0m\u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mTraining image download completed. \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m\u001b[32mRunning... \u001b[0m\u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m \u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mTraining in progress. \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m╰──────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
-       " in <module>:1                                                                                    \n",
-       "                                                                                                  \n",
-       " 1 training_job = sft_trainer.train(                                                            \n",
-       "   2 wait=True                                                                                \n",
-       "   3 )                                                                                            \n",
-       "   4                                                                                              \n",
-       "                                                                                                  \n",
-       " /Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk/sagemaker-core/src/sagemaker/core/teleme \n",
-       " try/telemetry_logging.py:180 in wrapper                                                          \n",
-       "                                                                                                  \n",
-       "   177 │   │   │   │   │   \"sagemaker_session is not provided or not valid.\",                     \n",
-       "   178 │   │   │   │   │   func_name,                                                             \n",
-       "   179 │   │   │   │   )                                                                          \n",
-       " 180 │   │   │   │   return func(*args, **kwargs)                                               \n",
-       "   181 │   │                                                                                      \n",
-       "   182 │   │   return wrapper                                                                     \n",
-       "   183                                                                                            \n",
-       "                                                                                                  \n",
-       " /Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-train/src/sagemaker/trai \n",
-       " n/sft_trainer.py:280 in train                                                                    \n",
-       "                                                                                                  \n",
-       "   277 │   │   │   from sagemaker.train.common_utils.trainer_wait import wait as _wait            \n",
-       "   278 │   │   │   from sagemaker.core.utils.exceptions import TimeoutExceededError               \n",
-       "   279 │   │   │   try :                                                                          \n",
-       " 280 │   │   │   │   _wait(training_job)                                                        \n",
-       "   281 │   │   │   except TimeoutExceededError as e:                                              \n",
-       "   282 │   │   │   │   logger.error(\"Error: %s\", e)                                               \n",
-       "   283                                                                                            \n",
-       "                                                                                                  \n",
-       " /Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-train/src/sagemaker/trai \n",
-       " n/common_utils/trainer_wait.py:278 in wait                                                       \n",
-       "                                                                                                  \n",
-       "   275 │   │   │   │   iteration = 0                                                              \n",
-       "   276 │   │   │   │   while True:                                                                \n",
-       "   277 │   │   │   │   │   iteration += 1                                                         \n",
-       " 278 │   │   │   │   │   time.sleep(0.5)                                                        \n",
-       "   279 │   │   │   │   │   if iteration >= poll * 2:                                              \n",
-       "   280 │   │   │   │   │   │   training_job.refresh()                                             \n",
-       "   281 │   │   │   │   │   │   iteration = 0                                                      \n",
-       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
-       "KeyboardInterrupt\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[38;2;255;0;0m╭─\u001b[0m\u001b[38;2;255;0;0m──────────────────────────────\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[1;38;2;255;0;0mTraceback \u001b[0m\u001b[1;2;38;2;255;0;0m(most recent call last)\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[38;2;255;0;0m───────────────────────────────\u001b[0m\u001b[38;2;255;0;0m─╮\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m in \u001b[92m\u001b[0m:\u001b[94m1\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m1 training_job = sft_trainer.train( \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m2 \u001b[0m\u001b[2m│ \u001b[0mwait=\u001b[94mTrue\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m3 \u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m4 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk/sagemaker-core/src/sagemaker/core/teleme\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mtry/\u001b[0m\u001b[1;33mtelemetry_logging.py\u001b[0m:\u001b[94m180\u001b[0m in \u001b[92mwrapper\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m177 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[33m\"\u001b[0m\u001b[33msagemaker_session is not provided or not valid.\u001b[0m\u001b[33m\"\u001b[0m, \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m178 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mfunc_name, \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m179 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m180 \u001b[2m│ │ │ │ \u001b[0m\u001b[94mreturn\u001b[0m func(*args, **kwargs) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m181 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m182 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m wrapper \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m183 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-train/src/sagemaker/trai\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mn/\u001b[0m\u001b[1;33msft_trainer.py\u001b[0m:\u001b[94m280\u001b[0m in \u001b[92mtrain\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m277 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mfrom\u001b[0m\u001b[90m \u001b[0m\u001b[4;96msagemaker\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mtrain\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mcommon_utils\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mtrainer_wait\u001b[0m\u001b[90m \u001b[0m\u001b[94mimport\u001b[0m wait \u001b[94mas\u001b[0m _wait \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m278 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mfrom\u001b[0m\u001b[90m \u001b[0m\u001b[4;96msagemaker\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mcore\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mutils\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mexceptions\u001b[0m\u001b[90m \u001b[0m\u001b[94mimport\u001b[0m TimeoutExceededError \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m279 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mtry\u001b[0m : \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m280 \u001b[2m│ │ │ │ \u001b[0m_wait(training_job) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m281 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mexcept\u001b[0m TimeoutExceededError \u001b[94mas\u001b[0m e: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m282 \u001b[0m\u001b[2m│ │ │ │ \u001b[0mlogger.error(\u001b[33m\"\u001b[0m\u001b[33mError: \u001b[0m\u001b[33m%s\u001b[0m\u001b[33m\"\u001b[0m, e) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m283 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/Documents/SageMaker/sagemaker-python-sdk-molly/sagemaker-train/src/sagemaker/trai\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mn/common_utils/\u001b[0m\u001b[1;33mtrainer_wait.py\u001b[0m:\u001b[94m278\u001b[0m in \u001b[92mwait\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m275 \u001b[0m\u001b[2m│ │ │ │ \u001b[0miteration = \u001b[94m0\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m276 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mwhile\u001b[0m \u001b[94mTrue\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m277 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0miteration += \u001b[94m1\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m278 \u001b[2m│ │ │ │ │ \u001b[0mtime.sleep(\u001b[94m0.5\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m279 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m iteration >= poll * \u001b[94m2\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m280 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0mtraining_job.refresh() \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m281 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0miteration = \u001b[94m0\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", - "\u001b[1;91mKeyboardInterrupt\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "training_job = sft_trainer.train(\n", - " wait=True\n", + " wait=True,\n", ")" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "64d68d6f", - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.train import get_mlflow_url\n", - "\n", - "# Get MLflow URL for a training job\n", - "url = get_mlflow_url(\"nova-textgeneration-micro-sft-20260303164831\")\n", - "print(url)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fef3d01d", - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.train import get_studio_url\n", - "\n", - "\n", - "# Still working on getting the specific training job\n", - "url = get_studio_url('nova-textgeneration-micro-sft-20260302144611')\n", - "print(url)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9b50b630", - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.core.resources import TrainingJob\n", - "\n", - "training_job = TrainingJob.get(training_job_name='nova-textgeneration-micro-sft-20260303161941')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8fa20618", - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.train import get_studio_url\n", - "\n", - "# Studio URL (job details)\n", - "studio_url = get_studio_url(training_job)\n", - "print(studio_url)\n", - "\n", - "# CloudWatch Logs URL (view logs directly)\n", - "url = get_studio_url(training_job, direct=True)\n", - "print(url)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "591d5e98", - "metadata": {}, - "outputs": [], - "source": [ - "job.stop()" - ] - }, - { - "cell_type": "markdown", - "id": "da489ad0-36b8-44e7-9f65-2ffd359e5225", - "metadata": {}, - "source": [ - "### View any Training job details\n", - "\n", - "We can get any training job details and its status with TrainingJob.get(...)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -573,10 +236,14 @@ "metadata": {}, "outputs": [], "source": [ + "\n", "\n", "import json\n", "import re\n", "from sagemaker.core.utils.utils import Unassigned\n", + "from sagemaker.core.resources import TrainingJob\n", + "\n", + "response = TrainingJob.get(training_job_name=\"meta-textgeneration-llama-3-2-1b-instruct-sft-20251201114921\")\n", "\n", "def pretty_print(obj):\n", " def parse_unassigned(item):\n", @@ -593,78 +260,62 @@ " return item\n", "\n", " cleaned = parse_unassigned(obj.__dict__ if hasattr(obj, '__dict__') else obj)\n", - " print(json.dumps(cleaned, indent=2, default=str))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6bbe96b4-c8cd-4de3-b4c0-a66fd3086eb2", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "from sagemaker.core.resources import TrainingJob\n", + " print(json.dumps(cleaned, indent=2, default=str))\n", "\n", - "response = TrainingJob.get(training_job_name=\"nova-textgeneration-micro-sft-20260302144611\")\n", "pretty_print(response)" ] }, { - "cell_type": "markdown", - "id": "8d25735e", + "cell_type": "code", + "execution_count": null, + "id": "e9ee7f8e-b26c-4579-9dbc-f08124f2e944", "metadata": {}, + "outputs": [], "source": [ - "## Visualize Training Metrics\n", + "#In order to skip waiting and monitor the training Job later\n", "\n", - "After training completes, you can visualize metrics logged to MLflow:\n", - "\n", - "- **`get_available_metrics(job)`** - List all available metrics\n", - "- **`plot_training_metrics(job)`** - Plot all metrics\n", - "- **`plot_training_metrics(job, metrics=['reduced_train_loss', 'global_step'])`** - Plot specific metrics" + "'''\n", + "training_job = sft_trainer.train(\n", + " wait=False,\n", + ")\n", + "'''" ] }, { "cell_type": "code", "execution_count": null, - "id": "b8a9256c", - "metadata": {}, + "id": "0d99f212-f0bd-43c1-be21-30202fb4a152", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "from sagemaker.train import get_available_metrics\n", - "\n", - "get_available_metrics('nova-textgeneration-micro-sft-20260302144611')" + "pretty_print(training_job)" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "f1a794ba", + "cell_type": "markdown", + "id": "da489ad0-36b8-44e7-9f65-2ffd359e5225", "metadata": {}, - "outputs": [], "source": [ - "from sagemaker.train import plot_training_metrics\n", + "### View any Training job details\n", "\n", - "# Simple - plot all metrics\n", - "plot_training_metrics('nova-textgeneration-micro-sft-20260302144611')" + "We can get any training job details and its status with TrainingJob.get(...)" ] }, { "cell_type": "code", "execution_count": null, - "id": "2f7ed9d7", - "metadata": {}, + "id": "6bbe96b4-c8cd-4de3-b4c0-a66fd3086eb2", + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "from sagemaker.train import plot_training_metrics\n", + "from sagemaker.core.resources import TrainingJob\n", "\n", - "# Advanced - plot specific metrics\n", - "plot_training_metrics(\n", - " training_job='nova-textgeneration-micro-sft-20260302144611',\n", - " metrics=['reduced_train_loss', 'global_step'],\n", - " figsize=(14, 6)\n", - ")" + "response = TrainingJob.get(training_job_name=\"meta-textgeneration-llama-3-2-1b-instruct-sft-20251123162832\")\n", + "pretty_print(response)" ] }, { @@ -813,31 +464,11 @@ " wait=True,\n", ")" ] - }, - { - "cell_type": "markdown", - "id": "be4dc095", - "metadata": {}, - "source": [ - "### Stop Training Job" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb92815b", - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.core.resources import TrainingJob\n", - "job = TrainingJob.get(training_job_name='nova-textgeneration-pro-sft-20260304135114')\n", - "job.stop()" - ] } ], "metadata": { "kernelspec": { - "display_name": "py3.10.14", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -851,7 +482,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.12.10" } }, "nbformat": 4, diff --git a/v3-examples/model-customization-examples/sm-studio-nova-training-job-sample-notebook.ipynb b/v3-examples/model-customization-examples/sm-studio-nova-training-job-sample-notebook.ipynb index a5f83647cc..6645f2b7d2 100644 --- a/v3-examples/model-customization-examples/sm-studio-nova-training-job-sample-notebook.ipynb +++ b/v3-examples/model-customization-examples/sm-studio-nova-training-job-sample-notebook.ipynb @@ -47,33 +47,10 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "234f7398-fd6b-4d02-a406-0491924c461d", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[03/02/26 14:23:42] INFO     Found credentials in shared credentials file: ~/.aws/credentials   credentials.py:1392\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m[03/02/26 14:23:42]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials in shared credentials file: ~\u001b[38;2;225;0;225m/.aws/\u001b[0m\u001b[38;2;225;0;225mcredentials\u001b[0m \u001b]8;id=981430;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=408099;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py#1392\u001b\\\u001b[2m1392\u001b[0m\u001b]8;;\u001b\\\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml\n", - "sagemaker.config INFO - Not applying SDK defaults from location: /Users/mollyhe/Library/Application Support/sagemaker/config.yaml\n", - "Region: us-east-1\n" - ] - } - ], + "outputs": [], "source": [ "\n", "import os\n", @@ -105,110 +82,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "39aaeb1d", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[03/02/26 14:23:45] INFO     Found credentials in shared credentials file: ~/.aws/credentials   credentials.py:1392\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m[03/02/26 14:23:45]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials in shared credentials file: ~\u001b[38;2;225;0;225m/.aws/\u001b[0m\u001b[38;2;225;0;225mcredentials\u001b[0m \u001b]8;id=437588;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=570403;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py#1392\u001b\\\u001b[2m1392\u001b[0m\u001b]8;;\u001b\\\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
[03/02/26 14:23:46] INFO     SageMaker Python SDK will collect telemetry to help us better  telemetry_logging.py:92\n",
-       "                             understand our user's needs, diagnose issues, and deliver                             \n",
-       "                             additional features.                                                                  \n",
-       "                             To opt out of telemetry, please disable via TelemetryOptOut                           \n",
-       "                             parameter in SDK defaults config. For more information, refer                         \n",
-       "                             to                                                                                    \n",
-       "                             https://sagemaker.readthedocs.io/en/stable/overview.html#confi                        \n",
-       "                             guring-and-using-defaults-with-the-sagemaker-python-sdk.                              \n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m[03/02/26 14:23:46]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m SageMaker Python SDK will collect telemetry to help us better \u001b]8;id=719707;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/core/telemetry/telemetry_logging.py\u001b\\\u001b[2mtelemetry_logging.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=805235;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/core/telemetry/telemetry_logging.py#92\u001b\\\u001b[2m92\u001b[0m\u001b]8;;\u001b\\\n", - "\u001b[2;36m \u001b[0m understand our user's needs, diagnose issues, and deliver \u001b[2m \u001b[0m\n", - "\u001b[2;36m \u001b[0m additional features. \u001b[2m \u001b[0m\n", - "\u001b[2;36m \u001b[0m To opt out of telemetry, please disable via TelemetryOptOut \u001b[2m \u001b[0m\n", - "\u001b[2;36m \u001b[0m parameter in SDK defaults config. For more information, refer \u001b[2m \u001b[0m\n", - "\u001b[2;36m \u001b[0m to \u001b[2m \u001b[0m\n", - "\u001b[2;36m \u001b[0m \u001b[4;38;2;0;105;255mhttps://sagemaker.readthedocs.io/en/stable/overview.html#confi\u001b[0m \u001b[2m \u001b[0m\n", - "\u001b[2;36m \u001b[0m \u001b[4;38;2;0;105;255mguring-and-using-defaults-with-the-sagemaker-python-sdk.\u001b[0m \u001b[2m \u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
[03/02/26 14:23:47] INFO     Role not provided. Using default role:                                  defaults.py:75\n",
-       "                             arn:aws:iam::529088288990:role/Admin                                                  \n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m[03/02/26 14:23:47]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Role not provided. Using default role: \u001b]8;id=193859;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/train/defaults.py\u001b\\\u001b[2mdefaults.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=892655;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/train/defaults.py#75\u001b\\\u001b[2m75\u001b[0m\u001b]8;;\u001b\\\n", - "\u001b[2;36m \u001b[0m arn:aws:iam::\u001b[1;36m529088288990\u001b[0m:role/Admin \u001b[2m \u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "521e87aea90849e48e5312e3160c5a23", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
Final Resource Status: Available\n",
-       "
\n" - ], - "text/plain": [ - "Final Resource Status: Available\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "TRAINING_DATASET ARN: arn:aws:sagemaker:us-east-1:529088288990:hub-content/G82247CBEQ6TN0FI3J1SAJU59UURMT3H3EKG48C6VQGDINNRNGU0/DataSet/demo-sft-dataset/3.0.0\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from sagemaker.ai_registry.dataset import DataSet\n",
     "from sagemaker.ai_registry.dataset_utils import CustomizationTechnique\n",
@@ -216,34 +93,34 @@
     "# Register dataset in SageMaker AI Registry. This creates a versioned dataset that can be referenced by ARN\n",
     "dataset = DataSet.create(\n",
     "    name=\"demo-sft-dataset\",\n",
-    "    source=\"s3://sagemaker-us-east-1-529088288990/nova1_SFT.jsonl\", # source can be S3 or local path\n",
+    "    source=\"s3://your-bucket/dataset/training_dataset.jsonl\", # source can be S3 or local path\n",
     "    #customization_technique=CUSTOMIZATION_TECHNIQUE.SFT # or DPO or RLVR\n",
     "        # Optional technique name for minimal dataset format check.\n",
     "    wait=True\n",
     ")\n",
     "\n",
     "print(f\"TRAINING_DATASET ARN: {dataset.arn}\")\n",
-    "TRAINING_DATASET = dataset.arn"
+    "# TRAINING_DATASET = dataset.arn"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "ea22bd22",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Required Configs\n",
-    "BASE_MODEL = \"nova-textgeneration-micro\"\n",
+    "BASE_MODEL = \"\"\n",
     "\n",
     "# MODEL_PACKAGE_GROUP_NAME is same as CUSTOM_MODEL_NAME\n",
-    "MODEL_PACKAGE_GROUP_NAME = \"model-pacakge-group-nova4\"\n",
+    "MODEL_PACKAGE_GROUP_NAME = \"\"\n",
     "\n",
-    "# TRAINING_DATASET = \"\"\n",
+    "TRAINING_DATASET = \"\"\n",
     "\n",
-    "S3_OUTPUT_PATH = \"s3://sagemaker-us-east-1-529088288990/output/\"\n",
+    "S3_OUTPUT_PATH = \"\"\n",
     "\n",
-    "ROLE_ARN = \"arn:aws:iam::529088288990:role/Admin\""
+    "ROLE_ARN = \"\""
    ]
   },
   {
@@ -256,68 +133,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "id": "90a1069d19eeee7",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "
[03/02/26 14:23:59] INFO     Creating model_package_group resource.                              resources.py:25559\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m[03/02/26 14:23:59]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Creating model_package_group resource. \u001b]8;id=924503;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/core/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=629107;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/core/resources.py#25559\u001b\\\u001b[2m25559\u001b[0m\u001b]8;;\u001b\\\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
                    WARNING  No region provided. Using default region.                                 utils.py:340\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;215;175;0mWARNING \u001b[0m No region provided. Using default region. \u001b]8;id=617102;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/core/utils/utils.py\u001b\\\u001b[2mutils.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=566991;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/core/utils/utils.py#340\u001b\\\u001b[2m340\u001b[0m\u001b]8;;\u001b\\\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
                    INFO     Runs on sagemaker prod, region:us-east-1                                  utils.py:354\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Runs on sagemaker prod, region:us-east-\u001b[1;36m1\u001b[0m \u001b]8;id=132575;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/core/utils/utils.py\u001b\\\u001b[2mutils.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=459422;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/core/utils/utils.py#354\u001b\\\u001b[2m354\u001b[0m\u001b]8;;\u001b\\\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
                    INFO     Found credentials in shared credentials file: ~/.aws/credentials   credentials.py:1392\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m Found credentials in shared credentials file: ~\u001b[38;2;225;0;225m/.aws/\u001b[0m\u001b[38;2;225;0;225mcredentials\u001b[0m \u001b]8;id=803081;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py\u001b\\\u001b[2mcredentials.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=146617;file:///Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/credentials.py#1392\u001b\\\u001b[2m1392\u001b[0m\u001b]8;;\u001b\\\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from sagemaker.core.resources import ModelPackageGroup\n", "model_package_group = ModelPackageGroup.create(\n", " model_package_group_name=MODEL_PACKAGE_GROUP_NAME,\n", - " model_package_group_description='test nova textgeneration micro' # Required Description\n", + " model_package_group_description='' # Required Description\n", ")" ] }, @@ -386,7 +210,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "062953d8", "metadata": { "editable": true, @@ -577,84 +401,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "2f6eeb5e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Default Finetuning Options:\n" - ] - }, - { - "data": { - "text/html": [ - "
{\n",
-       "'name': 'my-lora-run-wkxk5',\n",
-       "'global_batch_size': '64',\n",
-       "'max_epochs': '2',\n",
-       "'learning_rate': '1e-05',\n",
-       "'lora_alpha': '128',\n",
-       "'learning_rate_ratio': '16.0',\n",
-       "'max_context_length': '8192'\n",
-       "}\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m{\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'name'\u001b[0m: \u001b[38;2;0;135;0m'my-lora-run-wkxk5'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'global_batch_size'\u001b[0m: \u001b[38;2;0;135;0m'64'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'max_epochs'\u001b[0m: \u001b[38;2;0;135;0m'2'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'learning_rate'\u001b[0m: \u001b[38;2;0;135;0m'1e-05'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'lora_alpha'\u001b[0m: \u001b[38;2;0;135;0m'128'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'learning_rate_ratio'\u001b[0m: \u001b[38;2;0;135;0m'16.0'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'max_context_length'\u001b[0m: \u001b[38;2;0;135;0m'8192'\u001b[0m\n", - "\u001b[1m}\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Modified/User defined Options:\n" - ] - }, - { - "data": { - "text/html": [ - "
{\n",
-       "'name': 'my-lora-run-wkxk5',\n",
-       "'global_batch_size': '64',\n",
-       "'max_epochs': '2',\n",
-       "'learning_rate': '0.0002',\n",
-       "'lora_alpha': '128',\n",
-       "'learning_rate_ratio': '16.0',\n",
-       "'max_context_length': '8192'\n",
-       "}\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m{\u001b[0m\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'name'\u001b[0m: \u001b[38;2;0;135;0m'my-lora-run-wkxk5'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'global_batch_size'\u001b[0m: \u001b[38;2;0;135;0m'64'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'max_epochs'\u001b[0m: \u001b[38;2;0;135;0m'2'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'learning_rate'\u001b[0m: \u001b[38;2;0;135;0m'0.0002'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'lora_alpha'\u001b[0m: \u001b[38;2;0;135;0m'128'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'learning_rate_ratio'\u001b[0m: \u001b[38;2;0;135;0m'16.0'\u001b[0m,\n", - "\u001b[2;32m│ \u001b[0m\u001b[38;2;0;135;0m'max_context_length'\u001b[0m: \u001b[38;2;0;135;0m'8192'\u001b[0m\n", - "\u001b[1m}\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "print(\"Default Finetuning Options:\")\n", "pprint(trainer.hyperparameters.to_dict())\n", @@ -676,355 +426,10 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "31690f41", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
╭────────────────────────────────── Training Job Status ───────────────────────────────────╮\n",
-       "  TrainingJob Name      nova-textgeneration-micro-sft-20260302142433                      \n",
-       "                                                                                          \n",
-       "  Job Status            InProgress                                                        \n",
-       "  Secondary Status      Pending                                                           \n",
-       "  Elapsed Time          814.3s                                                            \n",
-       "                                                                                          \n",
-       " Status Transitions                                                                       \n",
-       "                                                                                          \n",
-       "        Step              Details                               Duration                  \n",
-       "  ───────────────────────────────────────────────────────────────────────────             \n",
-       "    Starting          Starting the training job             2.2s                      \n",
-       "        Pending           Training job waiting for capacity                               \n",
-       "                                                                                          \n",
-       "╰──────────────────────────────────────────────────────────────────────────────────────────╯\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[38;5;172m╭─\u001b[0m\u001b[38;5;172m─────────────────────────────────\u001b[0m\u001b[38;5;172m \u001b[0m\u001b[1;94mTraining Job Status\u001b[0m\u001b[38;5;172m \u001b[0m\u001b[38;5;172m──────────────────────────────────\u001b[0m\u001b[38;5;172m─╮\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mTrainingJob Name \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;32mnova-textgeneration-micro-sft-20260302142433\u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mJob Status \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;38;5;172mInProgress\u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mSecondary Status \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;33mPending\u001b[0m\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;36m \u001b[0m\u001b[1;36mElapsed Time \u001b[0m\u001b[1;36m \u001b[0m\u001b[37m \u001b[0m\u001b[1;91m814.3s\u001b[0m\u001b[37m \u001b[0m\u001b[37m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;35mStatus Transitions\u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m\u001b[1;35m \u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mStep \u001b[0m\u001b[1;35m \u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mDetails \u001b[0m\u001b[1;35m \u001b[0m \u001b[1;35m \u001b[0m\u001b[1;35mDuration \u001b[0m\u001b[1;35m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m ─────────────────────────────────────────────────────────────────────────── \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m\u001b[32m✓ \u001b[0m\u001b[32m \u001b[0m \u001b[36m \u001b[0m\u001b[36mStarting \u001b[0m\u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mStarting the training job \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m\u001b[32m2.2s \u001b[0m\u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[32m \u001b[0m\u001b[32m \u001b[0m\u001b[32m \u001b[0m \u001b[36m \u001b[0m\u001b[36mPending \u001b[0m\u001b[36m \u001b[0m \u001b[38;5;172m \u001b[0m\u001b[38;5;172mTraining job waiting for capacity \u001b[0m\u001b[38;5;172m \u001b[0m \u001b[32m \u001b[0m\u001b[32m \u001b[0m\u001b[32m \u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m│\u001b[0m \u001b[38;5;172m│\u001b[0m\n", - "\u001b[38;5;172m╰──────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
-       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/tra \n",
-       " in/common_utils/trainer_wait.py:197 in wait                                                      \n",
-       "                                                                                                  \n",
-       "   194 │   │   │   │   │   iteration += 1                                                         \n",
-       "   195 │   │   │   │   │   time.sleep(1)                                                          \n",
-       "   196 │   │   │   │   │   if iteration == poll:                                                  \n",
-       " 197 │   │   │   │   │   │   training_job.refresh()                                             \n",
-       "   198 │   │   │   │   │   │   iteration = 0                                                      \n",
-       "   199 │   │   │   │   │   clear_output(wait=True)                                                \n",
-       "   200                                                                                            \n",
-       "                                                                                                  \n",
-       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/cor \n",
-       " e/resources.py:143 in wrapper                                                                    \n",
-       "                                                                                                  \n",
-       "     140 │   │   @functools.wraps(func)                                                           \n",
-       "     141 │   │   def wrapper(*args, **kwargs):                                                    \n",
-       "     142 │   │   │   config = dict(arbitrary_types_allowed=True)                                  \n",
-       "   143 │   │   │   return validate_call(config=config)(func)(*args, **kwargs)                   \n",
-       "     144 │   │                                                                                    \n",
-       "     145 │   │   return wrapper                                                                   \n",
-       "     146                                                                                          \n",
-       "                                                                                                  \n",
-       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/pydantic/_int \n",
-       " ernal/_validate_call.py:39 in wrapper_function                                                   \n",
-       "                                                                                                  \n",
-       "    36 │   │                                                                                      \n",
-       "    37 │   │   @functools.wraps(wrapped)                                                          \n",
-       "    38 │   │   def wrapper_function(*args, **kwargs):                                             \n",
-       "  39 │   │   │   return wrapper(*args, **kwargs)                                                \n",
-       "    40                                                                                        \n",
-       "    41 # We need to manually update this because `partial` object has no `__name__` and `__   \n",
-       "    42 wrapper_function.__name__ = extract_function_name(wrapped)                             \n",
-       "                                                                                                  \n",
-       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/pydantic/_int \n",
-       " ernal/_validate_call.py:136 in __call__                                                          \n",
-       "                                                                                                  \n",
-       "   133 │   │   if not self.__pydantic_complete__:                                                 \n",
-       "   134 │   │   │   self._create_validators()                                                      \n",
-       "   135 │   │                                                                                      \n",
-       " 136 │   │   res = self.__pydantic_validator__.validate_python(pydantic_core.ArgsKwargs(args,   \n",
-       "   137 │   │   if self.__return_pydantic_validator__:                                             \n",
-       "   138 │   │   │   return self.__return_pydantic_validator__(res)                                 \n",
-       "   139 │   │   else:                                                                              \n",
-       "                                                                                                  \n",
-       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/cor \n",
-       " e/resources.py:35682 in refresh                                                                  \n",
-       "                                                                                                  \n",
-       "   35679 │   │   logger.debug(f\"Serialized input request: {operation_input_args}\")                \n",
-       "   35680 │   │                                                                                    \n",
-       "   35681 │   │   client = Base.get_sagemaker_client()                                             \n",
-       " 35682 │   │   response = client.describe_training_job(**operation_input_args)                  \n",
-       "   35683 │   │                                                                                    \n",
-       "   35684 │   │   # deserialize response and update self                                           \n",
-       "   35685 │   │   transform(response, \"DescribeTrainingJobResponse\", self)                         \n",
-       "                                                                                                  \n",
-       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/clie \n",
-       " nt.py:602 in _api_call                                                                           \n",
-       "                                                                                                  \n",
-       "    599 │   │   │   │   │   f\"{py_operation_name}() only accepts keyword arguments.\"              \n",
-       "    600 │   │   │   │   )                                                                         \n",
-       "    601 │   │   │   # The \"self\" in this scope is referring to the BaseClient.                    \n",
-       "  602 │   │   │   return self._make_api_call(operation_name, kwargs)                            \n",
-       "    603 │   │                                                                                     \n",
-       "    604 │   │   _api_call.__name__ = str(py_operation_name)                                       \n",
-       "    605                                                                                           \n",
-       "                                                                                                  \n",
-       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/cont \n",
-       " ext.py:123 in wrapper                                                                            \n",
-       "                                                                                                  \n",
-       "   120 │   │   │   with start_as_current_context():                                               \n",
-       "   121 │   │   │   │   if hook:                                                                   \n",
-       "   122 │   │   │   │   │   hook()                                                                 \n",
-       " 123 │   │   │   │   return func(*args, **kwargs)                                               \n",
-       "   124 │   │                                                                                      \n",
-       "   125 │   │   return wrapper                                                                     \n",
-       "   126                                                                                            \n",
-       "                                                                                                  \n",
-       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/clie \n",
-       " nt.py:1078 in _make_api_call                                                                     \n",
-       "                                                                                                  \n",
-       "   1075 │   │   │   │   'error_code_override'                                                     \n",
-       "   1076 │   │   │   ) or error_info.get(\"Code\")                                                   \n",
-       "   1077 │   │   │   error_class = self.exceptions.from_code(error_code)                           \n",
-       " 1078 │   │   │   raise error_class(parsed_response, operation_name)                            \n",
-       "   1079 │   │   else:                                                                             \n",
-       "   1080 │   │   │   return parsed_response                                                        \n",
-       "   1081                                                                                           \n",
-       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
-       "ClientError: An error occurred (ExpiredTokenException) when calling the DescribeTrainingJob operation: The security\n",
-       "token included in the request is expired\n",
-       "\n",
-       "The above exception was the direct cause of the following exception:\n",
-       "\n",
-       "╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
-       " in <module>:1                                                                                    \n",
-       "                                                                                                  \n",
-       " 1 training_job = trainer.train(wait=True)                                                      \n",
-       "   2                                                                                              \n",
-       "   3 TRAINING_JOB_NAME = training_job.training_job_name                                           \n",
-       "   4                                                                                              \n",
-       "                                                                                                  \n",
-       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/cor \n",
-       " e/telemetry/telemetry_logging.py:168 in wrapper                                                  \n",
-       "                                                                                                  \n",
-       "   165 │   │   │   │   │   caught_ex = e                                                          \n",
-       "   166 │   │   │   │   finally:                                                                   \n",
-       "   167 │   │   │   │   │   if caught_ex:                                                          \n",
-       " 168 │   │   │   │   │   │   raise caught_ex                                                    \n",
-       "   169 │   │   │   │   │   return response  # pylint: disable=W0150                               \n",
-       "   170 │   │   │   else:                                                                          \n",
-       "   171 │   │   │   │   logger.debug(                                                              \n",
-       "                                                                                                  \n",
-       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/cor \n",
-       " e/telemetry/telemetry_logging.py:139 in wrapper                                                  \n",
-       "                                                                                                  \n",
-       "   136 │   │   │   │   start_timer = perf_counter()                                               \n",
-       "   137 │   │   │   │   try:                                                                       \n",
-       "   138 │   │   │   │   │   # Call the original function                                           \n",
-       " 139 │   │   │   │   │   response = func(*args, **kwargs)                                       \n",
-       "   140 │   │   │   │   │   stop_timer = perf_counter()                                            \n",
-       "   141 │   │   │   │   │   elapsed = stop_timer - start_timer                                     \n",
-       "   142 │   │   │   │   │   extra += f\"&x-latency={round(elapsed, 2)}\"                             \n",
-       "                                                                                                  \n",
-       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/tra \n",
-       " in/sft_trainer.py:267 in train                                                                   \n",
-       "                                                                                                  \n",
-       "   264 │   │   │   from sagemaker.train.common_utils.trainer_wait import wait as _wait            \n",
-       "   265 │   │   │   from sagemaker.core.utils.exceptions import TimeoutExceededError               \n",
-       "   266 │   │   │   try :                                                                          \n",
-       " 267 │   │   │   │   _wait(training_job)                                                        \n",
-       "   268 │   │   │   except TimeoutExceededError as e:                                              \n",
-       "   269 │   │   │   │   logger.error(\"Error: %s\", e)                                               \n",
-       "   270                                                                                            \n",
-       "                                                                                                  \n",
-       " /Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/tra \n",
-       " in/common_utils/trainer_wait.py:374 in wait                                                      \n",
-       "                                                                                                  \n",
-       "   371 except (FailedStatusError, TimeoutExceededError):                                      \n",
-       "   372 │   │   raise                                                                              \n",
-       "   373 except Exception as e:                                                                 \n",
-       " 374 │   │   raise RuntimeError(f\"Training job monitoring failed: {e}\") from e                  \n",
-       "   375                                                                                            \n",
-       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
-       "RuntimeError: Training job monitoring failed: An error occurred (ExpiredTokenException) when calling the \n",
-       "DescribeTrainingJob operation: The security token included in the request is expired\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[38;2;255;0;0m╭─\u001b[0m\u001b[38;2;255;0;0m──────────────────────────────\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[1;38;2;255;0;0mTraceback \u001b[0m\u001b[1;2;38;2;255;0;0m(most recent call last)\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[38;2;255;0;0m───────────────────────────────\u001b[0m\u001b[38;2;255;0;0m─╮\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/tra\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33min/common_utils/\u001b[0m\u001b[1;33mtrainer_wait.py\u001b[0m:\u001b[94m197\u001b[0m in \u001b[92mwait\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m194 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0miteration += \u001b[94m1\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m195 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mtime.sleep(\u001b[94m1\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m196 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m iteration == poll: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m197 \u001b[2m│ │ │ │ │ │ \u001b[0mtraining_job.refresh() \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m198 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0miteration = \u001b[94m0\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m199 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mclear_output(wait=\u001b[94mTrue\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m200 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/cor\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33me/\u001b[0m\u001b[1;33mresources.py\u001b[0m:\u001b[94m143\u001b[0m in \u001b[92mwrapper\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 140 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[1;95m@functools\u001b[0m.wraps(func) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 141 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mdef\u001b[0m\u001b[90m \u001b[0m\u001b[92mwrapper\u001b[0m(*args, **kwargs): \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 142 \u001b[0m\u001b[2m│ │ │ \u001b[0mconfig = \u001b[96mdict\u001b[0m(arbitrary_types_allowed=\u001b[94mTrue\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m 143 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m validate_call(config=config)(func)(*args, **kwargs) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 144 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 145 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m wrapper \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 146 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/pydantic/_int\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mernal/\u001b[0m\u001b[1;33m_validate_call.py\u001b[0m:\u001b[94m39\u001b[0m in \u001b[92mwrapper_function\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 36 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 37 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[1;95m@functools\u001b[0m.wraps(wrapped) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 38 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mdef\u001b[0m\u001b[90m \u001b[0m\u001b[92mwrapper_function\u001b[0m(*args, **kwargs): \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m 39 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m wrapper(*args, **kwargs) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 40 \u001b[0m\u001b[2m│ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 41 \u001b[0m\u001b[2m│ \u001b[0m\u001b[2m# We need to manually update this because `partial` object has no `__name__` and `__\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 42 \u001b[0m\u001b[2m│ \u001b[0mwrapper_function.\u001b[91m__name__\u001b[0m = extract_function_name(wrapped) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/pydantic/_int\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33mernal/\u001b[0m\u001b[1;33m_validate_call.py\u001b[0m:\u001b[94m136\u001b[0m in \u001b[92m__call__\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m133 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[95mnot\u001b[0m \u001b[96mself\u001b[0m.__pydantic_complete__: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m134 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[96mself\u001b[0m._create_validators() \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m135 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m136 \u001b[2m│ │ \u001b[0mres = \u001b[96mself\u001b[0m.__pydantic_validator__.validate_python(pydantic_core.ArgsKwargs(args, \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m137 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mif\u001b[0m \u001b[96mself\u001b[0m.__return_pydantic_validator__: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m138 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[96mself\u001b[0m.__return_pydantic_validator__(res) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m139 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/cor\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33me/\u001b[0m\u001b[1;33mresources.py\u001b[0m:\u001b[94m35682\u001b[0m in \u001b[92mrefresh\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m35679 \u001b[0m\u001b[2m│ │ \u001b[0mlogger.debug(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mSerialized input request: \u001b[0m\u001b[33m{\u001b[0moperation_input_args\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m35680 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m35681 \u001b[0m\u001b[2m│ │ \u001b[0mclient = Base.get_sagemaker_client() \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m35682 \u001b[2m│ │ \u001b[0mresponse = client.describe_training_job(**operation_input_args) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m35683 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m35684 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# deserialize response and update self\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m35685 \u001b[0m\u001b[2m│ │ \u001b[0mtransform(response, \u001b[33m\"\u001b[0m\u001b[33mDescribeTrainingJobResponse\u001b[0m\u001b[33m\"\u001b[0m, \u001b[96mself\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/\u001b[0m\u001b[1;33mclie\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[1;33mnt.py\u001b[0m:\u001b[94m602\u001b[0m in \u001b[92m_api_call\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 599 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33m{\u001b[0mpy_operation_name\u001b[33m}\u001b[0m\u001b[33m() only accepts keyword arguments.\u001b[0m\u001b[33m\"\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 600 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 601 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[2m# The \"self\" in this scope is referring to the BaseClient.\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m 602 \u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[96mself\u001b[0m._make_api_call(operation_name, kwargs) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 603 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 604 \u001b[0m\u001b[2m│ │ \u001b[0m_api_call.\u001b[91m__name__\u001b[0m = \u001b[96mstr\u001b[0m(py_operation_name) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 605 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/\u001b[0m\u001b[1;33mcont\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[1;33mext.py\u001b[0m:\u001b[94m123\u001b[0m in \u001b[92mwrapper\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m120 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mwith\u001b[0m start_as_current_context(): \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m121 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mif\u001b[0m hook: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m122 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mhook() \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m123 \u001b[2m│ │ │ │ \u001b[0m\u001b[94mreturn\u001b[0m func(*args, **kwargs) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m124 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m125 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mreturn\u001b[0m wrapper \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m126 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/botocore/\u001b[0m\u001b[1;33mclie\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[1;33mnt.py\u001b[0m:\u001b[94m1078\u001b[0m in \u001b[92m_make_api_call\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1075 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[33m'\u001b[0m\u001b[33merror_code_override\u001b[0m\u001b[33m'\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1076 \u001b[0m\u001b[2m│ │ │ \u001b[0m) \u001b[95mor\u001b[0m error_info.get(\u001b[33m\"\u001b[0m\u001b[33mCode\u001b[0m\u001b[33m\"\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1077 \u001b[0m\u001b[2m│ │ │ \u001b[0merror_class = \u001b[96mself\u001b[0m.exceptions.from_code(error_code) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m1078 \u001b[2m│ │ │ \u001b[0m\u001b[94mraise\u001b[0m error_class(parsed_response, operation_name) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1079 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1080 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mreturn\u001b[0m parsed_response \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m1081 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", - "\u001b[1;91mClientError: \u001b[0mAn error occurred \u001b[1m(\u001b[0mExpiredTokenException\u001b[1m)\u001b[0m when calling the DescribeTrainingJob operation: The security\n", - "token included in the request is expired\n", - "\n", - "\u001b[3mThe above exception was the direct cause of the following exception:\u001b[0m\n", - "\n", - "\u001b[38;2;255;0;0m╭─\u001b[0m\u001b[38;2;255;0;0m──────────────────────────────\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[1;38;2;255;0;0mTraceback \u001b[0m\u001b[1;2;38;2;255;0;0m(most recent call last)\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[38;2;255;0;0m───────────────────────────────\u001b[0m\u001b[38;2;255;0;0m─╮\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m in \u001b[92m\u001b[0m:\u001b[94m1\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m1 training_job = trainer.train(wait=\u001b[94mTrue\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m2 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m3 \u001b[0mTRAINING_JOB_NAME = training_job.training_job_name \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m4 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/cor\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33me/telemetry/\u001b[0m\u001b[1;33mtelemetry_logging.py\u001b[0m:\u001b[94m168\u001b[0m in \u001b[92mwrapper\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m165 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mcaught_ex = e \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m166 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mfinally\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m167 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mif\u001b[0m caught_ex: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m168 \u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m caught_ex \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m169 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94mreturn\u001b[0m response \u001b[2m# pylint: disable=W0150\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m170 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m171 \u001b[0m\u001b[2m│ │ │ │ \u001b[0mlogger.debug( \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/cor\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33me/telemetry/\u001b[0m\u001b[1;33mtelemetry_logging.py\u001b[0m:\u001b[94m139\u001b[0m in \u001b[92mwrapper\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m136 \u001b[0m\u001b[2m│ │ │ │ \u001b[0mstart_timer = perf_counter() \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m137 \u001b[0m\u001b[2m│ │ │ │ \u001b[0m\u001b[94mtry\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m138 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[2m# Call the original function\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m139 \u001b[2m│ │ │ │ │ \u001b[0mresponse = func(*args, **kwargs) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m140 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mstop_timer = perf_counter() \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m141 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0melapsed = stop_timer - start_timer \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m142 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0mextra += \u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33m&x-latency=\u001b[0m\u001b[33m{\u001b[0m\u001b[96mround\u001b[0m(elapsed,\u001b[90m \u001b[0m\u001b[94m2\u001b[0m)\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/tra\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33min/\u001b[0m\u001b[1;33msft_trainer.py\u001b[0m:\u001b[94m267\u001b[0m in \u001b[92mtrain\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m264 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mfrom\u001b[0m\u001b[90m \u001b[0m\u001b[4;96msagemaker\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mtrain\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mcommon_utils\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mtrainer_wait\u001b[0m\u001b[90m \u001b[0m\u001b[94mimport\u001b[0m wait \u001b[94mas\u001b[0m _wait \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m265 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mfrom\u001b[0m\u001b[90m \u001b[0m\u001b[4;96msagemaker\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mcore\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mutils\u001b[0m\u001b[4;96m.\u001b[0m\u001b[4;96mexceptions\u001b[0m\u001b[90m \u001b[0m\u001b[94mimport\u001b[0m TimeoutExceededError \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m266 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mtry\u001b[0m : \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m267 \u001b[2m│ │ │ │ \u001b[0m_wait(training_job) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m268 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mexcept\u001b[0m TimeoutExceededError \u001b[94mas\u001b[0m e: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m269 \u001b[0m\u001b[2m│ │ │ │ \u001b[0mlogger.error(\u001b[33m\"\u001b[0m\u001b[33mError: \u001b[0m\u001b[33m%s\u001b[0m\u001b[33m\"\u001b[0m, e) \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m270 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/Users/mollyhe/.pyenv/versions/3.10.14/envs/py3.10.14/lib/python3.10/site-packages/sagemaker/tra\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33min/common_utils/\u001b[0m\u001b[1;33mtrainer_wait.py\u001b[0m:\u001b[94m374\u001b[0m in \u001b[92mwait\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m371 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mexcept\u001b[0m (FailedStatusError, TimeoutExceededError): \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m372 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m373 \u001b[0m\u001b[2m│ \u001b[0m\u001b[94mexcept\u001b[0m \u001b[96mException\u001b[0m \u001b[94mas\u001b[0m e: \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m374 \u001b[2m│ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mRuntimeError\u001b[0m(\u001b[33mf\u001b[0m\u001b[33m\"\u001b[0m\u001b[33mTraining job monitoring failed: \u001b[0m\u001b[33m{\u001b[0me\u001b[33m}\u001b[0m\u001b[33m\"\u001b[0m) \u001b[94mfrom\u001b[0m\u001b[90m \u001b[0m\u001b[4;96me\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m375 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", - "\u001b[38;2;255;0;0m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", - "\u001b[1;91mRuntimeError: \u001b[0mTraining job monitoring failed: An error occurred \u001b[1m(\u001b[0mExpiredTokenException\u001b[1m)\u001b[0m when calling the \n", - "DescribeTrainingJob operation: The security token included in the request is expired\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "training_job = trainer.train(wait=True)\n", "\n", @@ -1650,7 +1055,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "py3.10.14", "language": "python", "name": "python3" }, From 41ccc9cb6637834698a0cedbb82123ec5022d433 Mon Sep 17 00:00:00 2001 From: Molly He Date: Fri, 20 Mar 2026 17:06:39 -0700 Subject: [PATCH 10/11] Address PR readiness --- sagemaker-train/pyproject.toml | 8 +- .../train/common_utils/trainer_wait.py | 8 +- .../src/sagemaker/train/evaluate/execution.py | 18 +-- .../common_utils/test_metrics_visualizer.py | 121 ++++++++++++++++++ 4 files changed, 140 insertions(+), 15 deletions(-) create mode 100644 sagemaker-train/tests/unit/train/common_utils/test_metrics_visualizer.py diff --git a/sagemaker-train/pyproject.toml b/sagemaker-train/pyproject.toml index e85eb97b1e..648994b5eb 100644 --- a/sagemaker-train/pyproject.toml +++ b/sagemaker-train/pyproject.toml @@ -43,9 +43,6 @@ dependencies = [ "sagemaker-mlflow>=0.0.1,<1.0.0", "mlflow>=3.0.0,<4.0.0", "nest_asyncio>=1.5.0", - "ipywidgets>=8.0.0", - "rich>=13.0.0", - "matplotlib>=3.5.0", ] [project.urls] @@ -64,6 +61,11 @@ test = [ "graphene", "IPython" ] +notebook = [ + "ipywidgets>=8.0.0", + "rich>=13.0.0", + "matplotlib>=3.5.0", +] [tool.setuptools.packages.find] where = ["src/"] diff --git a/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py b/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py index 72537874f5..59adcdfbfc 100644 --- a/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/trainer_wait.py @@ -182,16 +182,16 @@ def get_mlflow_url(training_job) -> str: if not hasattr(training_job, 'mlflow_config') or _is_unassigned_attribute(training_job.mlflow_config): raise ValueError("Training job does not have MLflow configured") - import boto3 import os from mlflow.tracking import MlflowClient import mlflow - + from sagemaker.core.utils.utils import SageMakerClient + mlflow_arn = training_job.mlflow_config.mlflow_resource_arn exp_name = training_job.mlflow_config.mlflow_experiment_name - + # Get presigned base URL - sm_client = boto3.client('sagemaker') + sm_client = SageMakerClient().sagemaker_client response = sm_client.create_presigned_mlflow_app_url(Arn=mlflow_arn) base_url = response.get('AuthorizedUrl') diff --git a/sagemaker-train/src/sagemaker/train/evaluate/execution.py b/sagemaker-train/src/sagemaker/train/evaluate/execution.py index 722461bb6c..d5e50f86b5 100644 --- a/sagemaker-train/src/sagemaker/train/evaluate/execution.py +++ b/sagemaker-train/src/sagemaker/train/evaluate/execution.py @@ -931,16 +931,14 @@ def wait( header_table.add_column("Property", style="cyan bold", width=20) header_table.add_column("Value", style="dim", overflow="fold") - # Extract pipeline name and region from execution ARN + # Extract pipeline name and exec_id from execution ARN pipeline_name = None exec_id = '' - region = None if self.arn: arn_parts = self.arn.split('/') if len(arn_parts) >= 4: pipeline_name = arn_parts[-3] exec_id = arn_parts[-1] - region = self.arn.split(":")[3] if len(self.arn.split(":")) > 3 else None # Use execution display name if available, fall back to self.name display_name = self.name if self._pipeline_execution: @@ -952,8 +950,10 @@ def wait( # Build links row links = [] try: + from sagemaker.core.utils.utils import SageMakerClient from sagemaker.train.common_utils.metrics_visualizer import _is_in_studio, _get_studio_base_url - if region and pipeline_name and _is_in_studio(): + if pipeline_name and _is_in_studio(): + region = SageMakerClient().region_name base = _get_studio_base_url(region) if base: pipeline_url = f"{base}/jobs/evaluation/detail?pipeline_name={pipeline_name}&execution_id={exec_id}" @@ -1052,12 +1052,13 @@ def wait( links_table = Table(show_header=True, header_style="bold magenta", box=None, padding=(0, 1)) links_table.add_column("Step", style="cyan", width=20) links_table.add_column("Console", style="dim") + from sagemaker.core.utils.utils import SageMakerClient from sagemaker.train.common_utils.metrics_visualizer import ( _is_in_studio, _parse_job_arn, _get_studio_base_url, get_console_job_url, get_cloudwatch_logs_url, ) in_studio = _is_in_studio() - studio_base = _get_studio_base_url(region) if in_studio else "" + studio_base = _get_studio_base_url(SageMakerClient().region_name) if in_studio else "" if in_studio: links_table.add_column("Studio", style="dim") links_table.add_column("Logs", style="dim") @@ -1296,15 +1297,16 @@ def _convert_to_subclass(self, eval_type: EvalType) -> 'EvaluationPipelineExecut @staticmethod def _extract_job_arn_from_metadata(step) -> Optional[str]: """Extract the underlying job ARN from a pipeline step's metadata.""" + from sagemaker.train.common_utils.trainer_wait import _is_unassigned_attribute metadata = getattr(step, 'metadata', None) - if metadata is None or 'Unassigned' in metadata.__class__.__name__: + if metadata is None or _is_unassigned_attribute(metadata): return None for attr in ('training_job', 'processing_job', 'transform_job', 'tuning_job', 'auto_ml_job', 'compilation_job'): job_meta = getattr(metadata, attr, None) - if job_meta is not None and not ('Unassigned' in job_meta.__class__.__name__): + if job_meta is not None and not _is_unassigned_attribute(job_meta): arn = getattr(job_meta, 'arn', None) - if arn and not ('Unassigned' in arn.__class__.__name__): + if arn and not _is_unassigned_attribute(arn): return str(arn) return None diff --git a/sagemaker-train/tests/unit/train/common_utils/test_metrics_visualizer.py b/sagemaker-train/tests/unit/train/common_utils/test_metrics_visualizer.py new file mode 100644 index 0000000000..9b7b804055 --- /dev/null +++ b/sagemaker-train/tests/unit/train/common_utils/test_metrics_visualizer.py @@ -0,0 +1,121 @@ +"""Unit tests for metrics_visualizer module.""" +import pytest +from unittest.mock import Mock, patch, MagicMock + + +class TestParseJobArn: + def test_training_job_arn(self): + from sagemaker.train.common_utils.metrics_visualizer import _parse_job_arn + result = _parse_job_arn("arn:aws:sagemaker:us-west-2:123456789012:training-job/my-job") + assert result == ("us-west-2", "training-job/my-job") + + def test_processing_job_arn(self): + from sagemaker.train.common_utils.metrics_visualizer import _parse_job_arn + result = _parse_job_arn("arn:aws:sagemaker:us-east-1:123456789012:processing-job/my-job") + assert result == ("us-east-1", "processing-job/my-job") + + def test_invalid_arn_returns_none(self): + from sagemaker.train.common_utils.metrics_visualizer import _parse_job_arn + assert _parse_job_arn("not-an-arn") is None + + +class TestGetConsoleJobUrl: + def test_training_job(self): + from sagemaker.train.common_utils.metrics_visualizer import get_console_job_url + url = get_console_job_url("arn:aws:sagemaker:us-west-2:123456789012:training-job/my-job") + assert url == "https://us-west-2.console.aws.amazon.com/sagemaker/home?region=us-west-2#/jobs/my-job" + + def test_invalid_arn_returns_empty(self): + from sagemaker.train.common_utils.metrics_visualizer import get_console_job_url + assert get_console_job_url("not-an-arn") == "" + + def test_unknown_job_type_returns_empty(self): + from sagemaker.train.common_utils.metrics_visualizer import get_console_job_url + assert get_console_job_url("arn:aws:sagemaker:us-west-2:123456789012:unknown-job/my-job") == "" + + +class TestGetCloudwatchLogsUrl: + def test_training_job(self): + from sagemaker.train.common_utils.metrics_visualizer import get_cloudwatch_logs_url + url = get_cloudwatch_logs_url("arn:aws:sagemaker:us-west-2:123456789012:training-job/my-job") + assert "us-west-2" in url + assert "TrainingJobs" in url + assert "my-job" in url + + def test_invalid_arn_returns_empty(self): + from sagemaker.train.common_utils.metrics_visualizer import get_cloudwatch_logs_url + assert get_cloudwatch_logs_url("not-an-arn") == "" + + +class TestGetStudioUrl: + @patch("sagemaker.train.common_utils.metrics_visualizer._get_studio_base_url") + @patch("sagemaker.core.utils.utils.SageMakerClient") + def test_with_training_job_object(self, mock_client_cls, mock_base_url): + from sagemaker.train.common_utils.metrics_visualizer import get_studio_url + mock_client_cls.return_value.region_name = "us-west-2" + mock_base_url.return_value = "https://studio-d-abc.studio.us-west-2.sagemaker.aws" + + mock_job = Mock() + mock_job.training_job_name = "my-job" + + url = get_studio_url(mock_job) + assert url == "https://studio-d-abc.studio.us-west-2.sagemaker.aws/jobs/train/my-job" + mock_base_url.assert_called_once_with("us-west-2") + + @patch("sagemaker.train.common_utils.metrics_visualizer._get_studio_base_url") + def test_with_arn_string(self, mock_base_url): + from sagemaker.train.common_utils.metrics_visualizer import get_studio_url + mock_base_url.return_value = "https://studio-d-abc.studio.us-west-2.sagemaker.aws" + + url = get_studio_url("arn:aws:sagemaker:us-west-2:123456789012:training-job/my-job") + assert url == "https://studio-d-abc.studio.us-west-2.sagemaker.aws/jobs/train/my-job" + mock_base_url.assert_called_once_with("us-west-2") + + @patch("sagemaker.train.common_utils.metrics_visualizer._get_studio_base_url") + @patch("sagemaker.core.utils.utils.SageMakerClient") + @patch("sagemaker.train.common_utils.metrics_visualizer.TrainingJob") + def test_with_job_name_string(self, mock_tj_cls, mock_client_cls, mock_base_url): + from sagemaker.train.common_utils.metrics_visualizer import get_studio_url + mock_client_cls.return_value.region_name = "us-west-2" + mock_base_url.return_value = "https://studio-d-abc.studio.us-west-2.sagemaker.aws" + mock_tj_cls.get.return_value.training_job_name = "my-job" + + url = get_studio_url("my-job") + assert url == "https://studio-d-abc.studio.us-west-2.sagemaker.aws/jobs/train/my-job" + + @patch("sagemaker.train.common_utils.metrics_visualizer._get_studio_base_url") + @patch("sagemaker.core.utils.utils.SageMakerClient") + def test_returns_empty_when_no_domain(self, mock_client_cls, mock_base_url): + from sagemaker.train.common_utils.metrics_visualizer import get_studio_url + mock_client_cls.return_value.region_name = "us-west-2" + mock_base_url.return_value = "" + + url = get_studio_url(Mock(training_job_name="my-job")) + assert url == "" + + +class TestGetAvailableMetrics: + @patch("sagemaker.train.common_utils.metrics_visualizer.TrainingJob") + def test_returns_empty_when_no_mlflow_config(self, _): + from sagemaker.train.common_utils.metrics_visualizer import get_available_metrics + mock_job = Mock(spec=[]) # no mlflow_config attribute + assert get_available_metrics(mock_job) == [] + + @patch("sagemaker.train.common_utils.metrics_visualizer.TrainingJob") + def test_returns_empty_when_mlflow_config_falsy(self, _): + from sagemaker.train.common_utils.metrics_visualizer import get_available_metrics + mock_job = Mock() + mock_job.mlflow_config = None + assert get_available_metrics(mock_job) == [] + + @patch("mlflow.get_run") + @patch("mlflow.set_tracking_uri") + def test_returns_metric_names(self, mock_set_uri, mock_get_run): + from sagemaker.train.common_utils.metrics_visualizer import get_available_metrics + mock_job = Mock() + mock_job.mlflow_config.mlflow_resource_arn = "arn:aws:sagemaker:us-west-2:123:mlflow-tracking/abc" + mock_job.mlflow_details.mlflow_run_id = "run-123" + mock_get_run.return_value.data.metrics = {"loss": 0.5, "accuracy": 0.9} + + result = get_available_metrics(mock_job) + assert set(result) == {"loss", "accuracy"} From d03de045d7ae072e096bb2d9408cd8245c5d28a0 Mon Sep 17 00:00:00 2001 From: Molly He Date: Mon, 23 Mar 2026 10:43:09 -0700 Subject: [PATCH 11/11] Fix sagemaker-train unit tst --- .../tests/unit/train/evaluate/test_pipeline_templates.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sagemaker-train/tests/unit/train/evaluate/test_pipeline_templates.py b/sagemaker-train/tests/unit/train/evaluate/test_pipeline_templates.py index 136ac022cd..36092fbeb3 100644 --- a/sagemaker-train/tests/unit/train/evaluate/test_pipeline_templates.py +++ b/sagemaker-train/tests/unit/train/evaluate/test_pipeline_templates.py @@ -121,7 +121,6 @@ def test_deterministic_template_with_optional_mlflow_params(self): pipeline_def = json.loads(rendered) assert pipeline_def["MlflowConfig"]["MlflowExperimentName"] == "test-experiment" - assert pipeline_def["MlflowConfig"]["MlflowRunName"] == "test-run" def test_deterministic_template_with_all_hyperparameters(self): """Test DETERMINISTIC_TEMPLATE with all optional hyperparameters."""