diff --git a/dockerfiles/aws/test.dockerfile b/dockerfiles/aws/test.dockerfile index 3f4c151..81d6fd7 100644 --- a/dockerfiles/aws/test.dockerfile +++ b/dockerfiles/aws/test.dockerfile @@ -10,8 +10,12 @@ RUN pip install --no-cache-dir boto3 && \ WORKDIR /app -# Copy scripts -COPY src/kernel_ci_cloud_labs/launch_vm.py /app/launch_vm.py +# Copy scripts (with minimal package skeleton so launch_vm.py's +# `from kernel_ci_cloud_labs.core.log_scrub import scrub_text` resolves) +COPY src/kernel_ci_cloud_labs/__init__.py /app/kernel_ci_cloud_labs/__init__.py +COPY src/kernel_ci_cloud_labs/core/__init__.py /app/kernel_ci_cloud_labs/core/__init__.py +COPY src/kernel_ci_cloud_labs/core/log_scrub.py /app/kernel_ci_cloud_labs/core/log_scrub.py +COPY src/kernel_ci_cloud_labs/launch_vm.py /app/launch_vm.py COPY src/kernel_ci_cloud_labs/debug_aws_setup.py /app/debug_aws_setup.py # Run debug check (ignore exit code), then launch VMs diff --git a/src/kernel_ci_cloud_labs/core/pipeline.py b/src/kernel_ci_cloud_labs/core/pipeline.py index 96203f3..23a9ecb 100644 --- a/src/kernel_ci_cloud_labs/core/pipeline.py +++ b/src/kernel_ci_cloud_labs/core/pipeline.py @@ -177,7 +177,14 @@ def _log_vm_output_excerpts(s3_client, bucket, run_prefix, failed_tests): pass -def create_summary(run_dir, start_time, task_arn, expected_vm_count=None, s3_context=None): +def create_summary( + run_dir, + start_time, + task_arn, + expected_vm_count=None, + s3_context=None, + container_failure_log_url=None, +): """Create summary.json with VM statistics. Args: @@ -186,6 +193,10 @@ def create_summary(run_dir, start_time, task_arn, expected_vm_count=None, s3_con task_arn: ARN of the ECS task that ran the tests. expected_vm_count: Number of VMs expected to spawn (None if unknown). s3_context: Optional dict with 'bucket', 'run_prefix', 'region', 's3_client' for debug output. + container_failure_log_url: Public URL of the ECS container's own log when + it exited non-zero before launching any VM. The KCIDB submitter + uses this as ``tests[*].log_url`` for the synthetic Infrastructure + row so users have a link to the actual failure reason. """ end_time = time.time() total_runtime = end_time - start_time @@ -230,6 +241,7 @@ def create_summary(run_dir, start_time, task_arn, expected_vm_count=None, s3_con # each tests[*] row. "instances": vm_stats["instances"], }, + "container_failure_log_url": container_failure_log_url, } summary_file = Path(run_dir) / "summary.json" @@ -459,6 +471,15 @@ def run_pipeline( logger.error("Task did not complete successfully: %s", e) raise + # A non-zero container exit means launch_vm.py died before SSM ever + # ran on a VM — the /ec2/.../ log group will never appear, + # so we shorten the VM-log wait below and surface container.log as + # the failure URL instead of an absent kernel log. + container_failed = bool(final_status) and any( + (c.get("exit_code") or 0) != 0 + for c in (final_status.get("containers") or []) + ) + logger.info("-" * 60) # Refresh CloudWatch client — credentials may have expired during the wait @@ -496,7 +517,15 @@ def run_pipeline( # Wait for VM logs to appear (CloudWatch agent ships in batches # after the VM shuts down — give it up to 5 min to surface). - max_retries = 10 + # When the container itself failed, no VM was ever launched, so + # the log group can't appear — skip straight to the single probe. + if container_failed: + logger.info( + "Container exited non-zero; skipping extended VM-log wait" + ) + max_retries = 1 + else: + max_retries = 10 retry_delay = 30 for attempt in range(max_retries): @@ -581,6 +610,7 @@ def run_pipeline( # and emit artifacts.json — the manifest the KCIDB submitter # consumes to populate tests[*].log_url. Failures here are # non-fatal: the test results in S3 remain the source of truth. + container_failure_log_url = None try: logger.info("\n=== Collecting boot logs & artifacts manifest ===") s3_client = provider.auth.get_client("s3") @@ -596,6 +626,32 @@ def run_pipeline( run_prefix=run_prefix, origin=origin, ) + + # Container died before any VM booted -> there is no kernel log. + # Publish the container's own log as the failure URL so KCIDB + # users land on something actionable instead of a dead link. + if container_failed and container_log_file.exists(): + from kernel_ci_cloud_labs.core.artifacts import s3_public_url + + failure_key = f"{run_prefix}/container-failure.log" + try: + s3_client.upload_file( + str(container_log_file), + storage.bucket, + failure_key, + ExtraArgs={"ContentType": "text/plain; charset=utf-8"}, + ) + container_failure_log_url = s3_public_url( + storage.bucket, region, failure_key + ) + logger.info( + "✓ Uploaded container failure log to %s", + container_failure_log_url, + ) + except Exception as upload_err: # pylint: disable=broad-exception-caught + logger.warning( + "Could not upload container failure log: %s", upload_err + ) except Exception as e: # pylint: disable=broad-exception-caught logger.warning("Could not collect artifacts manifest: %s", e) @@ -686,4 +742,9 @@ def run_pipeline( task_arn if "task_arn" in locals() else None, expected_vm_count if "expected_vm_count" in locals() else None, s3_context=s3_context, + container_failure_log_url=( + container_failure_log_url + if "container_failure_log_url" in locals() + else None + ), ) diff --git a/src/kernel_ci_cloud_labs/pull_labs_poller.py b/src/kernel_ci_cloud_labs/pull_labs_poller.py index 29d61c4..392f252 100644 --- a/src/kernel_ci_cloud_labs/pull_labs_poller.py +++ b/src/kernel_ci_cloud_labs/pull_labs_poller.py @@ -433,12 +433,22 @@ def _extract_test_results(summary: Dict[str, Any]) -> Tuple[List[Dict[str, Any]] when ``summary["vms"]["instances"]`` is absent — keeps older in-flight summary files and unit tests using the old shape working. - The second tuple element (legacy ``log_url`` slot) is always ``None``; - log URLs are now per-row and live in ``row["log_url"]``. + The second tuple element (legacy job-level ``log_url`` slot) is normally + ``None`` — per-row URLs live in ``row["log_url"]`` — but is set to + ``summary["container_failure_log_url"]`` when the ECS container died + before any VM booted, so the fallback Infrastructure row downstream still + carries a clickable failure log. """ vms = summary.get("vms", {}) or {} instances = vms.get("instances") + # When the ECS container itself failed before launching any VM, there is + # no kernel log to publish. The pipeline uploads the container's own log + # to S3 and records its URL here so the synthetic Infrastructure row that + # the caller falls back to (build_test_row(..., log_url=log_url, ...)) at + # least links the user to the actual failure reason. + container_failure_log_url = summary.get("container_failure_log_url") + # Legacy path: no per-instance breakdown -> one row per test name, no URLs. if not instances: rows: List[Dict[str, Any]] = [] @@ -447,7 +457,7 @@ def _extract_test_results(summary: Dict[str, Any]) -> Tuple[List[Dict[str, Any]] for name in test_names: status = "FAIL" if failed_by_test.get(name) else "PASS" rows.append({"name": _test_name_to_path(name), "status": status}) - return rows, None + return rows, container_failure_log_url url_by_pair = _load_artifact_log_urls(summary.get("run_directory")) rows = []