Adding namespace docker fix#1060
Open
RaunakJalan wants to merge 61 commits into
Open
Conversation
Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>
Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>
Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>
Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>
Comment on lines
+91
to
+975
| name: Pre-clean -> E2E | ||
| runs-on: [self-hosted] | ||
| timeout-minutes: 240 | ||
|
|
||
| env: | ||
| # Cluster/lab env | ||
| STORAGE_PRIVATE_IPS: ${{ inputs.STORAGE_PRIVATE_IPS }} | ||
| API_INVOKE_URL: ${{ inputs.API_INVOKE_URL }} | ||
| API_BASE_URL: ${{ inputs.API_INVOKE_URL }} | ||
| BASTION_IP: ${{ inputs.BASTION_IP }} | ||
| BASTION_SERVER: ${{ inputs.BASTION_IP }} | ||
| MNODES: ${{ inputs.MNODES }} | ||
| SBCLI_CMD: "sbctl" | ||
| SBCLI_BRANCH: ${{ inputs.SBCLI_BRANCH }} | ||
|
|
||
| # SSH/client env | ||
| SSH_USER: ${{ inputs.SSH_USER }} | ||
| KEY_PATH: ${{ inputs.KEY_PATH }} | ||
| CLIENTNODES: ${{ inputs.CLIENTNODES }} | ||
| CLIENT_IP: ${{ inputs.CLIENTNODES }} | ||
|
|
||
| # Cleanup | ||
| NFS_MOUNTPOINT: ${{ inputs.NFS_MOUNTPOINT }} | ||
|
|
||
| # E2E derived from chunks | ||
| NDCS: ${{ inputs.BOOTSTRAP_DATA_CHUNKS }} | ||
| NPCS: ${{ inputs.BOOTSTRAP_PARITY_CHUNKS }} | ||
| TEST_CLASS: ${{ inputs.TEST_CLASS }} | ||
|
|
||
| # Secrets | ||
| SSH_PASSWORD: ${{ secrets.SSH_PASSWORD }} | ||
| SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} | ||
| MINIO_ACCESS_KEY: ${{ secrets.MINIO_ACCESS_KEY }} | ||
| MINIO_SECRET_KEY: ${{ secrets.MINIO_SECRET_KEY }} | ||
| SUPABASE_ANON_KEY: ${{ secrets.SUPABASE_ANON_KEY }} | ||
|
|
||
| # Pre-existing cluster | ||
| CLUSTER_ID: ${{ inputs.CLUSTER_ID }} | ||
| CLUSTER_SECRET: ${{ inputs.CLUSTER_SECRET }} | ||
|
|
||
| steps: | ||
| - name: Runner diagnostics | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
| uname -a | ||
| whoami | ||
| pwd | ||
| python3 --version || true | ||
| git --version | ||
|
|
||
| - name: Install prereqs (sshpass) | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
| if command -v sshpass >/dev/null 2>&1; then | ||
| exit 0 | ||
| fi | ||
| if command -v apt-get >/dev/null 2>&1; then | ||
| sudo apt-get update -y | ||
| sudo apt-get install -y sshpass | ||
| elif command -v yum >/dev/null 2>&1; then | ||
| sudo yum install -y epel-release || true | ||
| sudo yum install -y sshpass | ||
| elif command -v dnf >/dev/null 2>&1; then | ||
| sudo dnf install -y sshpass | ||
| else | ||
| echo "ERROR: Cannot install sshpass (unknown package manager)." | ||
| exit 1 | ||
| fi | ||
|
|
||
- name: Resolve KEY_PATH (handles .ssh/, ~/.ssh/, quoted ~) and validate key exists
  shell: bash
  run: |
    set -euxo pipefail

    # KEY_PATH comes from the job-level env as a plain string, so a leading "~"
    # is a literal character here and has to be expanded by hand.
    kp="${KEY_PATH}"

    # Strip wrapping quotes if user typed "~/.ssh/..." with quotes
    kp="${kp%\"}"; kp="${kp#\"}"
    kp="${kp%\'}"; kp="${kp#\'}"

    # Normalize ".ssh/..." -> "$HOME/.ssh/..."
    if [[ "$kp" == .ssh/* ]]; then
      kp="${HOME}/${kp}"
    fi

    # Normalize "~/" -> "$HOME/"
    # The tilde is quoted on purpose: bash performs tilde expansion on unquoted
    # words inside [[ ]], so an unquoted ~/* pattern is compared as "$HOME/*"
    # and a literal "~/..." value never matches it.
    if [[ "$kp" == "~/"* ]]; then
      # Drop the two literal characters "~/" by offset (no pattern involved).
      kp="${HOME}/${kp:2}"
    fi

    # Also handle "~.ssh/.." (unlikely, but safe)
    if [[ "$kp" == "~.ssh/"* ]]; then
      # "~.ssh/x" -> "$HOME/.ssh/x": drop "~." and re-add the dot after $HOME/.
      kp="${HOME}/.${kp:2}"
    fi

    echo "Resolved KEY_PATH=$kp"
    # Persist the resolved path so later steps see it as KEY_PATH.
    echo "KEY_PATH=$kp" >> "$GITHUB_ENV"

    test -f "$kp" || (echo "ERROR: SSH key not found at $kp" && exit 1)
    chmod 600 "$kp" || true
|
|
||
| - name: Export KEY_NAME from KEY_PATH | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
| # KEY_PATH is already resolved and exported to $GITHUB_ENV in previous step | ||
| key_name="$(basename "${KEY_PATH}")" | ||
| echo "KEY_NAME=${key_name}" >> "$GITHUB_ENV" | ||
| echo "Exported KEY_NAME=${key_name}" | ||
|
|
||
| - name: Validate required secrets exist | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
| [[ -n "${SSH_PASSWORD}" ]] || (echo "ERROR: secrets.SSH_PASSWORD required" && exit 1) | ||
|
|
||
| # ============================================================ | ||
| # PRE-RUN CLEANUP (remote ops only) | ||
| # Targets = MNODES + STORAGE_PRIVATE_IPS + CLIENTNODES | ||
| # ============================================================ | ||
| - name: Pre-clean kill fio/tmux and unmount NFS on MNODES + storage + clients | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
|
|
||
| run_remote() { | ||
| local ip="$1" | ||
| local script="$2" | ||
| sshpass -p "${SSH_PASSWORD}" ssh \ | ||
| -o StrictHostKeyChecking=no \ | ||
| -o UserKnownHostsFile=/dev/null \ | ||
| "${SSH_USER}@${ip}" "bash -s" <<< "$script" | ||
| } | ||
|
|
||
| run_remote_with_retry() { | ||
| local ip="$1" | ||
| local script="$2" | ||
| local max=5 | ||
| for attempt in $(seq 1 $max); do | ||
| run_remote "$ip" "$script" && return 0 | ||
| echo "Attempt $attempt/$max failed for $ip, retrying in 5s..." | ||
| sleep 5 | ||
| done | ||
| echo "All $max attempts failed for $ip, continuing..." | ||
| return 0 | ||
| } | ||
|
|
||
| targets="$MNODES $STORAGE_PRIVATE_IPS $CLIENTNODES" | ||
| uniq_targets="$(echo "$targets" | tr ' ' '\n' | sed '/^$/d' | sort -u | tr '\n' ' ')" | ||
|
|
||
| for ip in $uniq_targets; do | ||
| echo "---- $ip: kill fio/tmux + umount ${NFS_MOUNTPOINT} ----" | ||
| run_remote_with_retry "$ip" "set -euxo pipefail; | ||
| pkill -9 fio || true; | ||
| pkill -9 tmux || true; | ||
| mp='${NFS_MOUNTPOINT}'; | ||
| if mountpoint -q \"\$mp\"; then umount -f \"\$mp\" || umount \"\$mp\"; else | ||
| if mount | grep -q \" \$mp \"; then umount -f \"\$mp\" || umount \"\$mp\" || true; fi | ||
| fi" | ||
| done | ||
|
|
||
| - name: Client cleanup disconnect lvols; ensure NFS not mounted anywhere; unmount all /mnt; remove /mnt dirs | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
|
|
||
| run_remote() { | ||
| local ip="$1" | ||
| local script="$2" | ||
| sshpass -p "${SSH_PASSWORD}" ssh \ | ||
| -o StrictHostKeyChecking=no \ | ||
| -o UserKnownHostsFile=/dev/null \ | ||
| "${SSH_USER}@${ip}" "bash -s" <<< "$script" | ||
| } | ||
|
|
||
| # disconnect lvol subsystems on clients | ||
| for ip in $CLIENTNODES; do | ||
| echo "---- client disconnect lvols: $ip ----" | ||
| run_remote "$ip" "set -euxo pipefail; | ||
| subsystems=\$(nvme list-subsys | grep -i lvol | awk '{print \$3}' | cut -d '=' -f 2 || true); | ||
| for s in \$subsystems; do nvme disconnect -n \"\$s\" || true; done" | ||
| done | ||
|
|
||
| # fail if NFS still mounted anywhere | ||
| targets="$MNODES $STORAGE_PRIVATE_IPS $CLIENTNODES" | ||
| uniq_targets="$(echo "$targets" | tr ' ' '\n' | sed '/^$/d' | sort -u | tr '\n' ' ')" | ||
|
|
||
| still=0 | ||
| for ip in $uniq_targets; do | ||
| if sshpass -p "${SSH_PASSWORD}" ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "${SSH_USER}@${ip}" \ | ||
| "mount | grep -q \" ${NFS_MOUNTPOINT} \""; then | ||
| echo "ERROR: ${NFS_MOUNTPOINT} still mounted on $ip" | ||
| still=1 | ||
| fi | ||
| done | ||
| [[ "$still" -eq 0 ]] || exit 1 | ||
|
|
||
| # unmount all /mnt and remove dirs on clients | ||
| for ip in $CLIENTNODES; do | ||
| echo "---- client unmount all /mnt and remove dirs: $ip ----" | ||
| run_remote "$ip" "set -euxo pipefail; | ||
| mps=\$(mount | grep ' /mnt' | awk '{print \$3}' || true); | ||
| for mp in \$mps; do umount -f \"\$mp\" || umount \"\$mp\" || true; done; | ||
| dirs=\$(find /mnt -mindepth 1 -type d 2>/dev/null || true); | ||
| for d in \$dirs; do rm -rf \"\$d\" || true; done" | ||
| done | ||
|
|
||
| # ============================================================ | ||
| # E2E TESTS (runner only) | ||
| # ============================================================ | ||
- name: Clone sbcli repo (prefer same branch as workflow; fallback to SBCLI_BRANCH)
  shell: bash
  run: |
    set -euxo pipefail
    rm -rf sbcli

    # Read the ref name from the runner-provided GITHUB_REF_NAME variable
    # instead of interpolating an expression into the script text: a branch
    # name is user-controlled and would otherwise be parsed by the shell.
    wf_branch="${GITHUB_REF_NAME}"
    fallback_branch="${SBCLI_BRANCH}"
    repo_url="https://github.com/simplyblock-io/sbcli.git"

    echo "Workflow branch: $wf_branch"
    echo "Fallback sbcli branch: $fallback_branch"

    # Try workflow branch first.
    # --exit-code makes ls-remote return non-zero when no ref matches, and the
    # full refs/heads/ path avoids accepting a different branch whose name
    # merely contains the workflow branch name.
    if git ls-remote --exit-code --heads "$repo_url" "refs/heads/${wf_branch}" >/dev/null; then
      echo "Cloning sbcli on workflow branch: $wf_branch"
      git clone --branch "$wf_branch" --single-branch "$repo_url" sbcli
    else
      echo "Branch '$wf_branch' not found in sbcli; cloning fallback branch: $fallback_branch"
      git clone --branch "$fallback_branch" --single-branch "$repo_url" sbcli
    fi

    # Fail early if the checkout lacks the entry points used by later steps.
    test -f sbcli/e2e/e2e.py
    test -f sbcli/e2e/logs/cleanup.py
|
|
||
| - name: Install Python deps (best-effort) | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
| python3 -m pip install --upgrade pip | ||
| if [[ -f "sbcli/e2e/requirements.txt" ]]; then | ||
| pip install -r sbcli/e2e/requirements.txt | ||
| fi | ||
|
|
||
| - name: Cleanup logs before e2e | ||
| shell: bash | ||
| working-directory: sbcli/e2e | ||
| run: | | ||
| set -euxo pipefail | ||
| python3 logs/cleanup.py | ||
|
|
||
| - name: Set RUN_BASE_DIR | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
| RUN_TIMESTAMP="$(date +%Y%m%d-%H%M%S)" | ||
| RUN_BASE_DIR="${NFS_MOUNTPOINT}/e2e-run-${RUN_TIMESTAMP}-${GITHUB_RUN_ID}" | ||
| echo "RUN_BASE_DIR=${RUN_BASE_DIR}" >> "$GITHUB_ENV" | ||
| mkdir -p "${RUN_BASE_DIR}" | ||
|
|
||
| - name: Record test start time | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
| echo "TEST_START_EPOCH=$(date +%s)" >> "$GITHUB_ENV" | ||
| echo "TEST_START_HUMAN=$(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV" | ||
|
|
||
| - name: Run e2e tests | ||
| shell: bash | ||
| working-directory: sbcli/e2e | ||
| run: | | ||
| set -euxo pipefail | ||
| TESTNAME_ARGS=() | ||
| if [[ -n "${TEST_CLASS:-}" ]]; then | ||
| TESTNAME_ARGS=(--testname "${TEST_CLASS}") | ||
| fi | ||
| python3 -u e2e.py \ | ||
| "${TESTNAME_ARGS[@]}" \ | ||
| --ndcs "${NDCS}" \ | ||
| --npcs "${NPCS}" \ | ||
| 2>&1 | tee output.log | ||
|
|
||
- name: Post-test cleanup (kill tmux on mgmt/storage; kill fio on clients)
  if: always()
  shell: bash
  run: |
    set -euxo pipefail

    # Run a script on a remote host via "bash -s" over password-authenticated
    # ssh and return ssh's exit status. The status must not be masked here
    # (no "|| true"): run_remote_with_retry relies on it to decide whether to
    # retry.
    run_remote() {
      local ip="$1"
      local script="$2"
      sshpass -p "${SSH_PASSWORD}" ssh \
        -o StrictHostKeyChecking=no \
        -o UserKnownHostsFile=/dev/null \
        "${SSH_USER}@${ip}" "bash -s" <<< "$script"
    }

    # Try run_remote up to 5 times, 5s apart. Always returns 0 so that an
    # unreachable node cannot fail this best-effort cleanup step.
    run_remote_with_retry() {
      local ip="$1"
      local script="$2"
      local max=5
      local attempt
      for attempt in $(seq 1 "$max"); do
        run_remote "$ip" "$script" && return 0
        echo "Attempt $attempt/$max failed for $ip, retrying in 5s..."
        sleep 5
      done
      echo "All $max attempts failed for $ip, continuing..."
      return 0
    }

    for ip in $MNODES $STORAGE_PRIVATE_IPS; do
      echo "---- $ip: kill tmux ----"
      run_remote_with_retry "$ip" "pkill -9 tmux || true"
    done

    for ip in $CLIENTNODES; do
      echo "---- $ip: kill fio and tmux----"
      run_remote_with_retry "$ip" "pkill -9 fio || true"
      run_remote_with_retry "$ip" "pkill -9 tmux || true"
    done
|
|
||
| - name: Mark test end time (always) | ||
| if: always() | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
| echo "TEST_END_EPOCH=$(date +%s)" >> "$GITHUB_ENV" | ||
| echo "TEST_END_HUMAN=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV" | ||
|
|
||
| - name: Collect mgmt snapshots into RUN_BASE_DIR (always) | ||
| if: always() | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
|
|
||
| python3 - <<'PY' | ||
| import os, subprocess, json | ||
|
|
||
| mgmt_ip = os.environ["MNODES"].split()[0] | ||
| key = os.environ["KEY_PATH"] | ||
| user = os.environ["SSH_USER"] | ||
| sbcli = os.environ["SBCLI_CMD"] | ||
| cluster_id = os.environ["CLUSTER_ID"] | ||
| run_base = os.environ["RUN_BASE_DIR"].rstrip("/") | ||
| outdir = f"{run_base}/{mgmt_ip}/mgmt_details" | ||
|
|
||
| os.makedirs(f"{outdir}/mgmt", exist_ok=True) | ||
| os.makedirs(f"{outdir}/subtasks", exist_ok=True) | ||
| os.makedirs(f"{outdir}/storage_nodes", exist_ok=True) | ||
|
|
||
| ssh_base = [ | ||
| "ssh", "-i", key, | ||
| "-o", "StrictHostKeyChecking=no", | ||
| "-o", "UserKnownHostsFile=/dev/null", | ||
| "-o", "ConnectTimeout=10", | ||
| f"{user}@{mgmt_ip}", | ||
| ] | ||
|
|
||
| def run_cmd(cmd, out_file): | ||
| print(f" {cmd} -> {out_file}", flush=True) | ||
| try: | ||
| with open(out_file, "w") as f: | ||
| subprocess.run(ssh_base + [cmd], stdout=f, stderr=subprocess.STDOUT, check=False, timeout=60) | ||
| except Exception as e: | ||
| print(f" WARN: failed: {e}", flush=True) | ||
|
|
||
| run_cmd(f"{sbcli} cluster list", f"{outdir}/mgmt/cluster_list.txt") | ||
| run_cmd(f"{sbcli} cluster status {cluster_id}", f"{outdir}/mgmt/cluster_status.txt") | ||
| run_cmd(f"{sbcli} cluster show {cluster_id}", f"{outdir}/mgmt/cluster_show.txt") | ||
| run_cmd(f"{sbcli} cluster get-capacity {cluster_id}", f"{outdir}/mgmt/cluster_capacity.txt") | ||
| run_cmd(f"{sbcli} cluster get-logs {cluster_id} --limit 0", f"{outdir}/mgmt/cluster_get_logs.txt") | ||
|
|
||
| run_cmd(f"{sbcli} pool list", f"{outdir}/mgmt/pool_list.txt") | ||
| run_cmd(f"{sbcli} lvol list", f"{outdir}/mgmt/lvol_list.txt") | ||
| run_cmd(f"{sbcli} snapshot list", f"{outdir}/mgmt/snapshot_list.txt") | ||
|
|
||
| run_cmd(f"{sbcli} sn list", f"{outdir}/mgmt/sn_list.txt") | ||
| run_cmd(f"{sbcli} sn list --json", f"{outdir}/mgmt/sn_list.json") | ||
|
|
||
# Parse SN UUIDs
# Best-effort: sn_list.json is written by run_cmd with stdout and stderr merged,
# so it may not be valid JSON. A failure here only skips the per-node commands,
# but it is reported instead of being silently ignored.
sn_uuids = []
try:
    with open(f"{outdir}/mgmt/sn_list.json") as f:
        data = json.load(f)
    for item in (data if isinstance(data, list) else []):
        # Accept any of the key spellings for the node identifier.
        uid = item.get("UUID") or item.get("uuid") or item.get("Id") or item.get("id")
        if uid:
            sn_uuids.append(uid)
except Exception as e:
    print(f"  WARN: could not parse SN UUIDs from sn_list.json: {e}", flush=True)
|
|
||
| for idx, uuid in enumerate(sn_uuids, 1): | ||
| run_cmd(f"{sbcli} sn list-devices {uuid}", f"{outdir}/storage_nodes/node{idx}_list_devices.txt") | ||
| run_cmd(f"{sbcli} sn check {uuid}", f"{outdir}/storage_nodes/node{idx}_check.txt") | ||
| run_cmd(f"{sbcli} sn get {uuid}", f"{outdir}/storage_nodes/node{idx}_get.txt") | ||
|
|
||
| run_cmd(f"{sbcli} cluster list-tasks {cluster_id} --limit 0", f"{outdir}/mgmt/cluster_list_tasks.txt") | ||
|
|
||
| # Parse balancing task IDs and get subtasks | ||
| bal_ids = [] | ||
| try: | ||
| with open(f"{outdir}/mgmt/cluster_list_tasks.txt") as f: | ||
| for line in f: | ||
| if line.startswith("+") or "Task ID" in line or "|" not in line: | ||
| continue | ||
| cols = [c.strip() for c in line.split("|")] | ||
| # cols[1] = task_id, cols[3] = function name | ||
| if len(cols) >= 5 and cols[3] == "balancing_on_restart" and cols[1]: | ||
| bal_ids.append(cols[1]) | ||
| except Exception: | ||
| pass | ||
|
|
||
| for tid in bal_ids: | ||
| run_cmd(f"{sbcli} cluster get-subtasks {tid}", f"{outdir}/subtasks/{tid}_subtasks.txt") | ||
| PY | ||
|
|
||
- name: Collect docker logs into RUN_BASE_DIR (always)
  if: always()
  shell: bash
  run: |
    set -euxo pipefail

    TAG="containers-final-$(date +%Y%m%d_%H%M%S)"
    # -n stops ssh from reading stdin. Without it, the ssh calls inside the
    # "while read" loop below consume the rest of the container list that the
    # loop is reading, and only the first container gets dumped.
    SSH_OPTS=(-n -i "${KEY_PATH}" -o BatchMode=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=10 -C)

    # De-duplicated list of management + storage nodes.
    NODES="$(echo "${MNODES} ${STORAGE_PRIVATE_IPS}" | tr ' ' '\n' | sed '/^$/d' | sort -u | tr '\n' ' ')"

    for NODE in ${NODES}; do
      echo ">>> Node: ${NODE}"
      LOCAL_NODE_DIR="${RUN_BASE_DIR}/${NODE}/${TAG}"
      mkdir -p "${LOCAL_NODE_DIR}"

      ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" "docker ps -a 2>&1 || true" \
        > "${LOCAL_NODE_DIR}/docker_ps_a_${NODE}.txt" || true

      CONTAINERS="$(ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" \
        "docker ps -a --format '{{.Names}}' 2>/dev/null || true" 2>/dev/null || true)"

      if [[ -z "${CONTAINERS}" ]]; then
        echo "No containers found on ${NODE}" > "${LOCAL_NODE_DIR}/_NO_CONTAINERS_${NODE}.txt"
        continue
      fi

      # Per-container dumps are best-effort; one failing container must not
      # abort the collection for the remaining ones.
      set +e
      while IFS= read -r C; do
        C="${C%%$'\r'}"   # drop a trailing carriage return, if any
        [[ -z "${C}" ]] && continue
        echo "  dumping: ${C}"
        ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" \
          "docker logs --timestamps --details '${C}' 2>&1 || true" \
          > "${LOCAL_NODE_DIR}/${C}.txt" || true
        ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" \
          "docker inspect '${C}' 2>&1 || true" \
          > "${LOCAL_NODE_DIR}/${C}_inspect.json" || true
      done <<< "${CONTAINERS}"
      set -e
    done
|
|
||
| - name: Collect distrib debug dumps into RUN_BASE_DIR (always) | ||
| if: always() | ||
| timeout-minutes: 35 | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
|
|
||
| python3 - <<'PY' | ||
| import os, subprocess, sys, textwrap | ||
|
|
||
| ssh_user = os.environ["SSH_USER"] | ||
| key = os.environ["KEY_PATH"] | ||
| run_base = os.environ["RUN_BASE_DIR"].rstrip("/") | ||
| tag = "finaldistrib_bdev_logs" | ||
|
|
||
| storage_ips = os.environ["STORAGE_PRIVATE_IPS"].split() | ||
|
|
||
| ssh_base = [ | ||
| "ssh", | ||
| "-i", key, | ||
| "-o", "StrictHostKeyChecking=no", | ||
| "-o", "UserKnownHostsFile=/dev/null", | ||
| "-o", "ServerAliveInterval=15", | ||
| "-o", "ServerAliveCountMax=4", | ||
| "-o", "ConnectTimeout=10", | ||
| "-C", | ||
| ] | ||
|
|
||
| scp_base = [ | ||
| "scp", | ||
| "-i", key, | ||
| "-o", "StrictHostKeyChecking=no", | ||
| "-o", "UserKnownHostsFile=/dev/null", | ||
| "-o", "ConnectTimeout=10", | ||
| "-C", | ||
| ] | ||
|
|
||
| remote_script = """\ | ||
| set -euo pipefail | ||
| TS="$(date +%Y%m%d_%H%M%S)" | ||
| HOST="$(hostname -s 2>/dev/null || hostname)" | ||
| STAGING="/tmp/distrib_host_collect_${TS}" | ||
| mkdir -p "$STAGING" | ||
|
|
||
| CN="$(sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$' | head -n1 || true)" | ||
| if [[ -z "$CN" ]]; then echo "NO_SPDK_CONTAINER"; exit 0; fi | ||
| SOCK="/mnt/ramdisk/${CN}/spdk.sock" | ||
|
|
||
| BDEV_JSON="$(sudo docker exec "$CN" bash -lc "sudo python spdk/scripts/rpc.py -s '$SOCK' bdev_get_bdevs" 2>/dev/null || true)" | ||
| if [[ -z "$BDEV_JSON" ]]; then echo "BDEV_EMPTY"; exit 0; fi | ||
|
|
||
| if command -v jq >/dev/null 2>&1; then | ||
| mapfile -t DISTRIBS < <(printf '%s' "$BDEV_JSON" | jq -r '.[] | select(.name|startswith("distrib_")) | .name' | sort -u) | ||
| else | ||
| mapfile -t DISTRIBS < <(printf '%s\\n' "$BDEV_JSON" | grep -oE '"name"\\s*:\\s*"distrib_[^"]+"' | sed -E 's/.*"name"\\s*:\\s*"([^"]+)".*/\\1/' | sort -u) | ||
| fi | ||
|
|
||
| if [[ ${#DISTRIBS[@]} -eq 0 ]]; then echo "NO_DISTRIBS"; exit 0; fi | ||
|
|
||
| for d in "${DISTRIBS[@]}"; do | ||
| JF="/tmp/stack_${d}.json" | ||
|
|
||
| python3 - "$d" "$JF" <<'PYIN' | ||
| import json, sys | ||
| d = sys.argv[1] | ||
| jf = sys.argv[2] | ||
| obj = {"subsystems":[{"subsystem":"distr","config":[{"method":"distr_debug_placement_map_dump","params":{"name":d}}]}]} | ||
| with open(jf, "w") as f: | ||
| f.write(json.dumps(obj)) | ||
| PYIN | ||
|
|
||
| sudo docker cp "$JF" "$CN:$JF" || true | ||
| sudo docker exec "$CN" bash -lc "sudo python scripts/rpc_sock.py '$JF' '$SOCK' > /tmp/rpc_${d}.log 2>&1 || true" || true | ||
| sudo docker cp "$CN:/tmp/rpc_${d}.log" "$STAGING/rpc_${d}.log" 2>/dev/null || true | ||
|
|
||
| for f in $(sudo docker exec "$CN" bash -lc "sudo ls /tmp 2>/dev/null | grep -F \\\"$d\\\" || true"); do | ||
| sudo docker cp "$CN:/tmp/$f" "$STAGING/${CN}__$f" 2>/dev/null || true | ||
| done | ||
|
|
||
| sudo docker exec "$CN" bash -lc "sudo rm -f '$JF' '/tmp/rpc_${d}.log'" || true | ||
| rm -f "$JF" || true | ||
| done | ||
|
|
||
| cat /proc/meminfo | grep -i huge > "$STAGING/hugepage_meminfo.txt" 2>/dev/null || true | ||
|
|
||
| TAR="/tmp/${HOST}_distrib_dumps_${TS}.tgz" | ||
| tar -C "$STAGING" -czf "$TAR" . 2>/dev/null || true | ||
| echo "$TAR" | ||
| """ | ||
|
|
||
| # NOTE: the only heredoc above (<<'PYIN') is inside the REMOTE SCRIPT, | ||
| # not inside YAML. YAML never sees it (we send script via stdin). | ||
|
|
||
# For each storage node: run the remote collection script, then copy the
# resulting tarball into <run_base>/<ip>/<tag>/.
for ip in storage_ips:
    print(f"=== {ip} ===", flush=True)

    # Run remote script via stdin (no YAML heredoc)
    cmd = ssh_base + [f"{ssh_user}@{ip}", "bash", "-s"]
    p = subprocess.run(cmd, input=remote_script.encode(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=False)
    out = p.stdout.decode(errors="replace").strip().splitlines()
    # The remote script prints either a sentinel word or the tarball path as
    # its final line of output.
    last = out[-1].strip() if out else ""
    last = last.replace("\r", "")

    if last in ("NO_SPDK_CONTAINER", "BDEV_EMPTY", "NO_DISTRIBS") or not last.startswith("/tmp/"):
        print(f"[{ip}] WARN: distrib collection skipped/failed: {last or '(no output)'}", flush=True)
        continue

    dest_dir = f"{run_base}/{ip}/{tag}"
    os.makedirs(dest_dir, exist_ok=True)

    scp_cmd = scp_base + [f"{ssh_user}@{ip}:{last}", dest_dir + "/"]
    scp_result = subprocess.run(scp_cmd, check=False)
    # Only report success when scp actually succeeded; the copy stays
    # best-effort and never fails the step.
    if scp_result.returncode != 0:
        print(f"[{ip}] WARN: scp of {last} failed (exit {scp_result.returncode})", flush=True)
        continue

    print(f"[{ip}] Saved -> {dest_dir}/{os.path.basename(last)}", flush=True)
| PY | ||
|
|
||
| # ========================= | ||
| # SUMMARY (always) | ||
| # ========================= | ||
| - name: Write Job Summary | ||
| if: always() | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
|
|
||
| mgmt_ip="$(echo "${MNODES}" | awk '{print $1}')" | ||
| out_log="sbcli/e2e/output.log" | ||
|
|
||
| # --- Timing --- | ||
| start="${TEST_START_EPOCH:-0}" | ||
| end="${TEST_END_EPOCH:-0}" | ||
| dur_sec=0 | ||
| if [[ "$start" =~ ^[0-9]+$ && "$end" =~ ^[0-9]+$ && "$end" -ge "$start" ]]; then | ||
| dur_sec=$((end-start)) | ||
| fi | ||
| dur_h=$((dur_sec/3600)); dur_m=$(((dur_sec%3600)/60)); dur_s=$((dur_sec%60)) | ||
| dur_fmt="${dur_h}h ${dur_m}m ${dur_s}s" | ||
|
|
||
| # --- Parse test counts from output.log --- | ||
| total_cases=0; passed_cases=0; failed_cases=0; skipped_cases=0 | ||
| if [[ -f "${out_log}" ]]; then | ||
| v="$(grep -m1 'Number of Total Cases:' "${out_log}" | grep -oE '[0-9]+$' 2>/dev/null || true)"; [[ "${v}" =~ ^[0-9]+$ ]] && total_cases="${v}" | ||
| v="$(grep -m1 'Number of Passed Cases:' "${out_log}" | grep -oE '[0-9]+$' 2>/dev/null || true)"; [[ "${v}" =~ ^[0-9]+$ ]] && passed_cases="${v}" | ||
| v="$(grep -m1 'Number of Failed Cases:' "${out_log}" | grep -oE '[0-9]+$' 2>/dev/null || true)"; [[ "${v}" =~ ^[0-9]+$ ]] && failed_cases="${v}" | ||
| v="$(grep -m1 'Number of Skipped Cases:' "${out_log}" | grep -oE '[0-9]+$' 2>/dev/null || true)"; [[ "${v}" =~ ^[0-9]+$ ]] && skipped_cases="${v}" | ||
| fi | ||
|
|
||
| pass_pct=0; fail_pct=0; skip_pct=0 | ||
| if [[ "${total_cases}" -gt 0 ]]; then | ||
| pass_pct=$(( (passed_cases * 100) / total_cases )) | ||
| fail_pct=$(( (failed_cases * 100) / total_cases )) | ||
| skip_pct=$(( (skipped_cases * 100) / total_cases )) | ||
| fi | ||
|
|
||
| # --- Parse NFS log paths in run order (1 per test, sequential execution) --- | ||
| mapfile -t log_path_arr < <(grep 'Logs Path:' "${out_log}" 2>/dev/null \ | ||
| | sed 's/.*Logs Path: *//' | sed 's/\x1b\[[0-9;]*m//g' | sed 's/[[:space:]]*$//' || true) | ||
|
|
||
# --- Parse per-test status lines in run order ---
# Fills two parallel arrays: test_names[i] and test_statuses[i].
# The name pattern allows underscores so that a name such as Test_Foo_Bar is
# not cut at the first "_"; this matches the pattern the Slack notification
# step applies to the same log.
test_names=(); test_statuses=()
if [[ -f "${out_log}" ]]; then
  while IFS= read -r line; do
    # Strip ANSI colour sequences before matching.
    clean="$(printf '%s' "${line}" | sed 's/\x1b\[[0-9;]*m//g')"
    test_name="$(printf '%s' "${clean}" | grep -oE 'Test[A-Za-z0-9_]+' | head -n1 || true)"
    [[ -z "${test_name}" ]] && continue
    case "${clean}" in
      *'PASSED CASE'*)  test_names+=("${test_name}"); test_statuses+=("PASSED") ;;
      *'FAILED CASE'*)  test_names+=("${test_name}"); test_statuses+=("FAILED") ;;
      *'SKIPPED CASE'*) test_names+=("${test_name}"); test_statuses+=("SKIPPED") ;;
    esac
  done < <(grep -E ' (PASSED|FAILED|SKIPPED) CASE\.' "${out_log}" 2>/dev/null || true)
fi
|
|
||
| # --- Build per-test details table: associate by position --- | ||
| # Tests run sequentially → log_path_arr[i] belongs to test_names[i] | ||
| test_details_table="" | ||
| for i in "${!test_names[@]}"; do | ||
| name="${test_names[$i]}" | ||
| status="${test_statuses[$i]}" | ||
| path="${log_path_arr[$i]:-N/A}" | ||
| case "${status}" in | ||
| PASSED) icon="✅" ;; | ||
| FAILED) icon="❌" ;; | ||
| SKIPPED) icon="⏭" ;; | ||
| *) icon="❓" ;; | ||
| esac | ||
| test_details_table+="| \`${name}\` | ${icon} ${status} | \`${path}\` |"$'\n' | ||
| done | ||
|
|
||
| # --- Failure reason: prefer MultipleExceptions summary line --- | ||
| failure_reason="" | ||
| if [[ -f "${out_log}" ]]; then | ||
| multi="$(grep 'MultipleExceptions:' "${out_log}" | sed 's/\x1b\[[0-9;]*m//g' || true)" | ||
| if [[ -n "${multi}" ]]; then | ||
| failure_reason="${multi}" | ||
| elif grep -Eqi 'Traceback \(most recent call last\)|Exception:|AssertionError|Input/output error' "${out_log}"; then | ||
| failure_reason="$(grep -Ei 'Traceback \(most recent call last\)|Exception:|AssertionError|Input/output error' "${out_log}" | tail -n 3 | sed 's/\x1b\[[0-9;]*m//g' || true)" | ||
| fi | ||
| fi | ||
|
|
||
| # --- Mgmt artifacts list (collapsed) --- | ||
| mgmt_dir="${RUN_BASE_DIR:-}/${mgmt_ip}/mgmt_details/mgmt" | ||
| mgmt_files="(not found)" | ||
| if [[ -n "${RUN_BASE_DIR:-}" && -d "${mgmt_dir}" ]]; then | ||
| mgmt_files="$(find "${mgmt_dir}" -maxdepth 1 -type f -printf '%f (%s bytes)\n' 2>/dev/null | sort || true)" | ||
| [[ -n "${mgmt_files}" ]] || mgmt_files="(empty)" | ||
| fi | ||
|
|
||
| # --- Overall result --- | ||
| conclusion="✅ SUCCESS" | ||
| if [[ "${{ job.status }}" != "success" ]]; then | ||
| conclusion="❌ FAILED" | ||
| fi | ||
|
|
||
| { | ||
| echo "## SimplyBlock E2E Run Summary" | ||
| echo "" | ||
| echo "**Result:** ${conclusion} | **Duration:** ${dur_fmt}" | ||
| echo "" | ||
|
|
||
| echo "### Test Results" | ||
| echo "| | Count | % |" | ||
| echo "|---|---|---|" | ||
| echo "| ✅ Passed | ${passed_cases} | ${pass_pct}% |" | ||
| echo "| ❌ Failed | ${failed_cases} | ${fail_pct}% |" | ||
| echo "| ⏭ Skipped | ${skipped_cases} | ${skip_pct}% |" | ||
| echo "| **Total** | **${total_cases}** | |" | ||
| echo "" | ||
|
|
||
| if [[ -n "${test_details_table}" ]]; then | ||
| echo "### Test Case Details" | ||
| echo "| Test | Result | Log Path |" | ||
| echo "|---|---|---|" | ||
| printf '%s' "${test_details_table}" | ||
| echo "" | ||
| fi | ||
|
|
||
| echo "### Run Info" | ||
| echo "- **Test class:** \`${TEST_CLASS:-all}\`" | ||
| echo "- **Cluster ID:** \`${CLUSTER_ID}\`" | ||
| echo "- **Mgmt node:** \`${mgmt_ip}\`" | ||
| echo "- **NDCS/NPCS:** \`${NDCS}/${NPCS}\`" | ||
| echo "- **Start (UTC):** ${TEST_START_HUMAN:-unknown}" | ||
| echo "- **End (UTC):** ${TEST_END_HUMAN:-unknown}" | ||
| echo "" | ||
|
|
||
| if [[ -n "${failure_reason}" ]]; then | ||
| echo "### Failure Reason" | ||
| echo '```' | ||
| printf '%s\n' "${failure_reason}" | ||
| echo '```' | ||
| echo "" | ||
| fi | ||
|
|
||
| if [[ -n "${RUN_BASE_DIR:-}" ]]; then | ||
| echo "<details><summary>Run Artifacts (NFS)</summary>" | ||
| echo "" | ||
| echo "- **Run dir:** \`${RUN_BASE_DIR}/\`" | ||
| echo "- Mgmt details: \`${RUN_BASE_DIR}/${mgmt_ip}/mgmt_details/\`" | ||
| echo "- Docker logs: \`${RUN_BASE_DIR}/<node_ip>/containers-final-*/\`" | ||
| echo "- Distrib dumps: \`${RUN_BASE_DIR}/<storage_ip>/finaldistrib_bdev_logs/\`" | ||
| echo "" | ||
| echo "</details>" | ||
| echo "" | ||
| fi | ||
|
|
||
| echo "<details><summary>Mgmt Artifacts (cluster state at end of run)</summary>" | ||
| echo "" | ||
| echo "Path: \`${mgmt_dir}\`" | ||
| echo "" | ||
| echo '```' | ||
| printf '%s\n' "${mgmt_files}" | ||
| echo '```' | ||
| echo "" | ||
| echo "</details>" | ||
| } >> "$GITHUB_STEP_SUMMARY" | ||
|
|
||
| - name: Send Slack Notification | ||
| if: always() | ||
| shell: bash | ||
| env: | ||
| SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} | ||
| JOB_STATUS: ${{ job.status }} | ||
| SLACK_RUN_URL: "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" | ||
| GITHUB_REF_NAME: ${{ github.ref_name }} | ||
| SLACK_WF_NAME: "E2E (No Bootstrap)" | ||
| SBCLI_BRANCH: ${{ env.SBCLI_BRANCH }} | ||
| run: | | ||
| python3 - <<'PYEOF' | ||
| import json, os, re, sys, urllib.request, urllib.error | ||
|
|
||
| webhook = os.environ.get("SLACK_WEBHOOK_URL", "") | ||
| if not webhook: | ||
| print("No SLACK_WEBHOOK_URL set, skipping.") | ||
| sys.exit(0) | ||
|
|
||
out_log = "sbcli/e2e/output.log"
# A missing or unreadable log is treated as empty so the notification is still
# sent. errors="replace" keeps undecodable bytes in the captured output from
# raising, and the with-block closes the file handle.
try:
    with open(out_log, encoding="utf-8", errors="replace") as log_file:
        content = log_file.read()
except OSError:
    content = ""
|
|
||
| # --- Counts --- | ||
def px(pat):
    """Return capture group 1 of the first match of `pat` in the log text as an int.

    Returns 0 when the pattern does not occur in `content`.
    """
    found = re.search(pat, content)
    if not found:
        return 0
    return int(found.group(1))
| total = px(r'Number of Total Cases:\s*(\d+)') | ||
| passed = px(r'Number of Passed Cases:\s*(\d+)') | ||
| failed = px(r'Number of Failed Cases:\s*(\d+)') | ||
| skipped = px(r'Number of Skipped Cases:\s*(\d+)') | ||
| pass_pct = (passed * 100 // total) if total > 0 else 0 | ||
|
|
||
| # --- Per-test results --- | ||
| ansi = re.compile(r'\x1b\[[0-9;]*m') | ||
| test_results = [] # list of (status, name) | ||
| for line in content.splitlines(): | ||
| clean = ansi.sub('', line) | ||
| if not re.search(r'(PASSED|FAILED|SKIPPED) CASE', clean): | ||
| continue | ||
| m = re.search(r'Test[A-Za-z0-9_]+', clean) | ||
| if not m: | ||
| continue | ||
| name = m.group(0) | ||
| if 'PASSED CASE' in clean: test_results.append(('PASSED', name)) | ||
| elif 'FAILED CASE' in clean: test_results.append(('FAILED', name)) | ||
| elif 'SKIPPED CASE' in clean: test_results.append(('SKIPPED', name)) | ||
|
|
||
| # --- Failure reason --- | ||
| failure_reason = "" | ||
| multi = [ansi.sub('', l) for l in content.splitlines() if 'MultipleExceptions:' in l] | ||
| if multi: | ||
| failure_reason = multi[0][:2000] | ||
| elif content: | ||
| exc_lines = [ansi.sub('', l) for l in content.splitlines() | ||
| if re.search(r'(Exception:|AssertionError|Input/output error)', l)] | ||
| if exc_lines: | ||
| failure_reason = '\n'.join(exc_lines[-5:])[:2000] | ||
|
|
||
| # --- Env --- | ||
| s = int(os.environ.get("TEST_START_EPOCH", "0") or "0") | ||
| e = int(os.environ.get("TEST_END_EPOCH", "0") or "0") | ||
| secs = max(0, e - s) if e >= s > 0 else 0 | ||
| dur = f"{secs//3600}h {(secs%3600)//60}m {secs%60}s" | ||
| run_url = os.environ.get("SLACK_RUN_URL", "") | ||
| log_dir = os.environ.get("RUN_BASE_DIR", "N/A") | ||
| ndcs = os.environ.get("NDCS", "?") | ||
| npcs = os.environ.get("NPCS", "?") | ||
| test_cls = os.environ.get("TEST_CLASS", "") or "all" | ||
| branch = os.environ.get("GITHUB_REF_NAME", "?") | ||
| sbcli_branch= os.environ.get("SBCLI_BRANCH", "?") | ||
| wf_name = os.environ.get("SLACK_WF_NAME", "Run") | ||
| ok = os.environ.get("JOB_STATUS", "") == "success" | ||
|
|
||
| icon = ":white_check_mark:" if ok else ":x:" | ||
| status = "SUCCESS" if ok else "FAILURE" | ||
| mention = "" if ok else " <!channel>" | ||
|
|
||
| lines = [ | ||
| f"{icon} *SimplyBlock {wf_name}*{mention}", | ||
| f"*Status:* {status} | *Duration:* {dur}", | ||
| f"*Branch:* `{branch}` | *NDCS/NPCS:* `{ndcs}/{npcs}` | *Test class:* `{test_cls}`", | ||
| f"*SBCLI Branch:* `{sbcli_branch}`", | ||
| "", | ||
| ] | ||
|
|
||
| if total > 0: | ||
| lines += [ | ||
| f":white_check_mark: *Passed:* {passed}/{total} ({pass_pct}%)", | ||
| f":x: *Failed:* {failed}", | ||
| f":fast_forward: *Skipped:* {skipped}", | ||
| ] | ||
| else: | ||
| lines.append("_(test counts not found in log)_") | ||
|
|
||
| if test_results: | ||
| lines.append("") | ||
| lines.append("*Test Results:*") | ||
| icons = {'PASSED': ':white_check_mark:', 'FAILED': ':x:', 'SKIPPED': ':fast_forward:'} | ||
| for st, nm in test_results: | ||
| lines.append(f"{icons.get(st, ':grey_question:')} `{nm}`") | ||
|
|
||
| if failure_reason: | ||
| lines += ["", "*Failure:*", f"```{failure_reason}```"] | ||
|
|
||
| lines += [ | ||
| "", | ||
| f":link: *Run:* <{run_url}|View on GitHub>", | ||
| f":file_folder: *Final Logs:* `{log_dir}`", | ||
| ] | ||
|
|
||
| payload = {"text": "\n".join(lines)} | ||
| req = urllib.request.Request( | ||
| webhook, | ||
| data=json.dumps(payload).encode(), | ||
| headers={"Content-Type": "application/json"}, | ||
| ) | ||
| try: | ||
| urllib.request.urlopen(req, timeout=15) | ||
| print("Slack notification sent.") | ||
| except Exception as exc: | ||
| print(f"WARN: Slack notification failed: {exc}", file=sys.stderr) | ||
| PYEOF | ||
|
|
||
| - name: Upload logs (always) | ||
| if: always() | ||
| uses: actions/upload-artifact@v4 | ||
| with: | ||
| name: simplyblock-e2e-logs-${{ github.run_id }} | ||
| path: | | ||
| sbcli/e2e/output.log | ||
| sbcli/e2e/logs/** | ||
| if-no-files-found: warn | ||
|
|
||
| - name: Export MGMT_IP (first MNODES) | ||
| if: always() | ||
| shell: bash | ||
| run: | | ||
| echo "MGMT_IP=$(echo "${MNODES}" | awk '{print $1}')" >> "$GITHUB_ENV" | ||
|
|
||
| - name: Upload small artifacts (always) | ||
| if: always() && env.RUN_BASE_DIR != '' && env.MGMT_IP != '' | ||
| uses: actions/upload-artifact@v4 | ||
| with: | ||
| name: simplyblock-e2e-small-logs-${{ github.run_id }} | ||
| path: | | ||
| sbcli/e2e/output.log | ||
| ${{ env.RUN_BASE_DIR }}/${{ env.MGMT_IP }}/mgmt_details/mgmt/*.txt | ||
| ${{ env.RUN_BASE_DIR }}/${{ env.MGMT_IP }}/mgmt_details/subtasks/*.txt | ||
| ${{ env.RUN_BASE_DIR }}/${{ env.MGMT_IP }}/mgmt_details/storage_nodes/*.txt | ||
| if-no-files-found: warn |
Comment on lines
+86
to
+885
| name: Pre-clean -> Bootstrap -> Stress | ||
| runs-on: [self-hosted] | ||
| timeout-minutes: 4320 | ||
|
|
||
| env: | ||
| # Cluster/lab env | ||
| STORAGE_PRIVATE_IPS: ${{ inputs.STORAGE_PRIVATE_IPS }} | ||
| API_INVOKE_URL: ${{ inputs.API_INVOKE_URL }} | ||
| API_BASE_URL: ${{ inputs.API_INVOKE_URL }} | ||
| BASTION_IP: ${{ inputs.BASTION_IP }} | ||
| BASTION_SERVER: ${{ inputs.BASTION_IP }} | ||
| MNODES: ${{ inputs.MNODES }} | ||
| SBCLI_CMD: 'sbctl' | ||
|
|
||
| # SSH/client env | ||
| SSH_USER: ${{ inputs.SSH_USER }} | ||
| KEY_PATH: ${{ inputs.KEY_PATH }} | ||
| CLIENTNODES: ${{ inputs.CLIENTNODES }} | ||
| CLIENT_IP: ${{ inputs.CLIENTNODES }} | ||
|
|
||
| # Cleanup | ||
| NFS_MOUNTPOINT: ${{ inputs.NFS_MOUNTPOINT }} | ||
|
|
||
| # Bootstrap params | ||
| BOOTSTRAP_DATA_CHUNKS: ${{ inputs.BOOTSTRAP_DATA_CHUNKS }} | ||
| BOOTSTRAP_PARITY_CHUNKS: ${{ inputs.BOOTSTRAP_PARITY_CHUNKS }} | ||
|
|
||
| # Stress derived from bootstrap chunks | ||
| NDCS: ${{ inputs.BOOTSTRAP_DATA_CHUNKS }} | ||
| NPCS: ${{ inputs.BOOTSTRAP_PARITY_CHUNKS }} | ||
| TEST_CLASS: ${{ inputs.TEST_CLASS }} | ||
|
|
||
| # Secrets | ||
| SSH_PASSWORD: ${{ secrets.SSH_PASSWORD }} | ||
| SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} | ||
| MINIO_ACCESS_KEY: ${{ secrets.MINIO_ACCESS_KEY }} | ||
| MINIO_SECRET_KEY: ${{ secrets.MINIO_SECRET_KEY }} | ||
| SUPABASE_ANON_KEY: ${{ secrets.SUPABASE_ANON_KEY }} | ||
|
|
||
| # Filled after bootstrap parsing | ||
| CLUSTER_ID: ${{ inputs.CLUSTER_ID }} | ||
| CLUSTER_SECRET: ${{ inputs.CLUSTER_SECRET }} | ||
|
|
||
| steps: | ||
| - name: Runner diagnostics | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
| uname -a | ||
| whoami | ||
| pwd | ||
| python3 --version || true | ||
| git --version | ||
|
|
||
| - name: Install prereqs (sshpass) | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
| if command -v sshpass >/dev/null 2>&1; then | ||
| exit 0 | ||
| fi | ||
| if command -v apt-get >/dev/null 2>&1; then | ||
| sudo apt-get update -y | ||
| sudo apt-get install -y sshpass | ||
| elif command -v yum >/dev/null 2>&1; then | ||
| sudo yum install -y epel-release || true | ||
| sudo yum install -y sshpass | ||
| elif command -v dnf >/dev/null 2>&1; then | ||
| sudo dnf install -y sshpass | ||
| else | ||
| echo "ERROR: Cannot install sshpass (unknown package manager)." | ||
| exit 1 | ||
| fi | ||
|
|
||
# Normalizes the user-supplied KEY_PATH to an absolute path, re-exports it via
# GITHUB_ENV for later steps, and fails the job early if the key file is missing.
- name: Resolve KEY_PATH (handles .ssh/, ~/.ssh/, quoted ~) and validate key exists
  shell: bash
  run: |
    set -euxo pipefail

    kp="${KEY_PATH}"

    # Strip wrapping quotes if user typed "~/.ssh/..." with quotes
    kp="${kp%\"}"; kp="${kp#\"}"
    kp="${kp%\'}"; kp="${kp#\'}"

    # Normalize ".ssh/..." -> "$HOME/.ssh/..."
    if [[ "$kp" == .ssh/* ]]; then
      kp="${HOME}/${kp}"
    fi

    # Normalize a literal "~/" prefix -> "$HOME/".
    # The tilde MUST be quoted here: an unquoted ~ inside [[ ]] (and inside
    # a parameter-expansion pattern) undergoes tilde expansion, so the
    # pattern would become "$HOME/*" and never match the literal "~/..."
    # string that arrives through the workflow input.
    if [[ "$kp" == "~/"* ]]; then
      kp="${HOME}/${kp:2}"
    fi

    # Also handle "~.ssh/.." (unlikely, but safe): drop only the "~"
    if [[ "$kp" == "~.ssh/"* ]]; then
      kp="${HOME}/${kp:1}"
    fi

    echo "Resolved KEY_PATH=$kp"
    echo "KEY_PATH=$kp" >> "$GITHUB_ENV"

    test -f "$kp" || { echo "ERROR: SSH key not found at $kp" >&2; exit 1; }
    # Best-effort: the key may live on a read-only or foreign-owned mount
    chmod 600 "$kp" || true
|
|
||
| - name: Export KEY_NAME from KEY_PATH | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
| # KEY_PATH is already resolved and exported to $GITHUB_ENV in previous step | ||
| key_name="$(basename "${KEY_PATH}")" | ||
| echo "KEY_NAME=${key_name}" >> "$GITHUB_ENV" | ||
| echo "Exported KEY_NAME=${key_name}" | ||
|
|
||
| - name: Validate required secrets exist | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
| [[ -n "${SSH_PASSWORD}" ]] || (echo "ERROR: secrets.SSH_PASSWORD required" && exit 1) | ||
|
|
||
| # ============================================================ | ||
| # PRE-BOOTSTRAP CLEANUP (remote ops only) | ||
| # Targets = MNODES + STORAGE_PRIVATE_IPS + CLIENTNODES | ||
| # ============================================================ | ||
| - name: Pre-clean kill fio/tmux and unmount NFS on MNODES + storage + clients | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
|
|
||
| run_remote() { | ||
| local ip="$1" | ||
| local script="$2" | ||
| sshpass -p "${SSH_PASSWORD}" ssh \ | ||
| -o StrictHostKeyChecking=no \ | ||
| -o UserKnownHostsFile=/dev/null \ | ||
| "${SSH_USER}@${ip}" "bash -s" <<< "$script" | ||
| } | ||
|
|
||
| run_remote_with_retry() { | ||
| local ip="$1" | ||
| local script="$2" | ||
| local max=5 | ||
| for attempt in $(seq 1 $max); do | ||
| run_remote "$ip" "$script" && return 0 | ||
| echo "Attempt $attempt/$max failed for $ip, retrying in 5s..." | ||
| sleep 5 | ||
| done | ||
| echo "All $max attempts failed for $ip, continuing..." | ||
| return 0 | ||
| } | ||
|
|
||
| targets="$MNODES $STORAGE_PRIVATE_IPS $CLIENTNODES" | ||
| uniq_targets="$(echo "$targets" | tr ' ' '\n' | sed '/^$/d' | sort -u | tr '\n' ' ')" | ||
|
|
||
| for ip in $uniq_targets; do | ||
| echo "---- $ip: kill fio/tmux + umount ${NFS_MOUNTPOINT} ----" | ||
| run_remote_with_retry "$ip" "set -euxo pipefail; | ||
| pkill -9 fio || true; | ||
| pkill -9 tmux || true; | ||
| mp='${NFS_MOUNTPOINT}'; | ||
| if mountpoint -q \"\$mp\"; then umount -f \"\$mp\" || umount \"\$mp\"; else | ||
| if mount | grep -q \" \$mp \"; then umount -f \"\$mp\" || umount \"\$mp\" || true; fi | ||
| fi" | ||
| done | ||
|
|
||
|
|
||
| - name: Client cleanup disconnect lvols; ensure NFS not mounted anywhere; unmount all /mnt; remove /mnt dirs | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
|
|
||
| run_remote() { | ||
| local ip="$1" | ||
| local script="$2" | ||
| sshpass -p "${SSH_PASSWORD}" ssh \ | ||
| -o StrictHostKeyChecking=no \ | ||
| -o UserKnownHostsFile=/dev/null \ | ||
| "${SSH_USER}@${ip}" "bash -s" <<< "$script" | ||
| } | ||
|
|
||
| # disconnect lvol subsystems on clients | ||
| for ip in $CLIENTNODES; do | ||
| echo "---- client disconnect lvols: $ip ----" | ||
| run_remote "$ip" "set -euxo pipefail; | ||
| subsystems=\$(nvme list-subsys | grep -i lvol | awk '{print \$3}' | cut -d '=' -f 2 || true); | ||
| for s in \$subsystems; do nvme disconnect -n \"\$s\" || true; done" | ||
| done | ||
|
|
||
| # fail if NFS still mounted anywhere | ||
| targets="$MNODES $STORAGE_PRIVATE_IPS $CLIENTNODES" | ||
| uniq_targets="$(echo "$targets" | tr ' ' '\n' | sed '/^$/d' | sort -u | tr '\n' ' ')" | ||
|
|
||
| still=0 | ||
| for ip in $uniq_targets; do | ||
| if sshpass -p "${SSH_PASSWORD}" ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "${SSH_USER}@${ip}" \ | ||
| "mount | grep -q \" ${NFS_MOUNTPOINT} \""; then | ||
| echo "ERROR: ${NFS_MOUNTPOINT} still mounted on $ip" | ||
| still=1 | ||
| fi | ||
| done | ||
| [[ "$still" -eq 0 ]] || exit 1 | ||
|
|
||
| # unmount all /mnt and remove dirs on clients | ||
| for ip in $CLIENTNODES; do | ||
| echo "---- client unmount all /mnt and remove dirs: $ip ----" | ||
| run_remote "$ip" "set -euxo pipefail; | ||
| mps=\$(mount | grep ' /mnt' | awk '{print \$3}' || true); | ||
| for mp in \$mps; do umount -f \"\$mp\" || umount \"\$mp\" || true; done; | ||
| dirs=\$(find /mnt -mindepth 1 -type d 2>/dev/null || true); | ||
| for d in \$dirs; do rm -rf \"\$d\" || true; done" | ||
| done | ||
|
|
||
| # ============================================================ | ||
| # STRESS (runner only) | ||
| # ============================================================ | ||
# Clones sbcli next to the workspace, preferring a branch with the same name as
# the branch this workflow runs on and falling back to SBCLI_BRANCH otherwise.
- name: Clone sbcli repo (prefer same branch as workflow; fallback to SBCLI_BRANCH)
  shell: bash
  env:
    # Hand the ref name to the script through the environment rather than
    # inlining the expression into the script text, so a crafted branch name
    # cannot inject shell commands.
    WF_BRANCH: ${{ github.ref_name }}
  run: |
    set -euxo pipefail
    rm -rf sbcli

    repo_url="https://github.com/simplyblock-io/sbcli.git"
    wf_branch="${WF_BRANCH}"
    # SBCLI_BRANCH may be absent from this job's env; default to empty so
    # "set -u" does not abort before we know whether the fallback is needed.
    fallback_branch="${SBCLI_BRANCH:-}"

    echo "Workflow branch: $wf_branch"
    echo "Fallback sbcli branch: $fallback_branch"

    # Try workflow branch first. Query the exact ref and rely on --exit-code
    # (non-zero when nothing matches) instead of grepping the output, which
    # would also match branches that merely contain the name.
    if git ls-remote --exit-code --heads "$repo_url" "refs/heads/${wf_branch}" >/dev/null; then
      echo "Cloning sbcli on workflow branch: $wf_branch"
      git clone --branch "$wf_branch" --single-branch "$repo_url" sbcli
    else
      echo "Branch '$wf_branch' not found in sbcli; cloning fallback branch: $fallback_branch"
      [[ -n "$fallback_branch" ]] || { echo "ERROR: SBCLI_BRANCH is empty; no fallback branch to clone" >&2; exit 1; }
      git clone --branch "$fallback_branch" --single-branch "$repo_url" sbcli
    fi

    # Sanity-check that the checkout contains the entry points used by later steps
    test -f sbcli/e2e/stress.py
    test -f sbcli/e2e/logs/cleanup.py
|
|
||
# Installs the e2e requirements (when the file exists) for the python3
# interpreter that later runs stress.py.
- name: Install Python deps (best-effort)
  shell: bash
  run: |
    set -euxo pipefail
    python3 -m pip install --upgrade pip
    if [[ -f "sbcli/e2e/requirements.txt" ]]; then
      # Use "python3 -m pip" rather than a bare "pip": on a self-hosted
      # runner "pip" may belong to a different interpreter than python3.
      python3 -m pip install -r sbcli/e2e/requirements.txt
    fi
|
|
||
| - name: Cleanup logs before stress | ||
| shell: bash | ||
| working-directory: sbcli/e2e | ||
| run: | | ||
| set -euxo pipefail | ||
| python3 logs/cleanup.py | ||
|
|
||
| - name: Set RUN_DIR_FILE | ||
| shell: bash | ||
| run: | | ||
| echo "RUN_DIR_FILE=/tmp/sb_run_dir_${GITHUB_RUN_ID}_${GITHUB_RUN_ATTEMPT}.txt" >> "$GITHUB_ENV" | ||
|
|
||
| - name: Record test start time | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
| echo "TEST_START_EPOCH=$(date +%s)" >> "$GITHUB_ENV" | ||
| echo "TEST_START_HUMAN=$(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV" | ||
|
|
||
| - name: Run stress (foreground; runs until failure) | ||
| shell: bash | ||
| working-directory: sbcli/e2e | ||
| run: | | ||
| set -euxo pipefail | ||
| python3 -u stress.py \ | ||
| --testname "${TEST_CLASS}" \ | ||
| --ndcs "${NDCS}" \ | ||
| --npcs "${NPCS}" \ | ||
| --send_debug_notification false \ | ||
| 2>&1 | tee output.log | ||
|
|
||
# Kills leftover tmux sessions on mgmt/storage nodes and fio/tmux on clients.
# Runs regardless of the test outcome and never fails the job on its own.
- name: Post-test cleanup (kill tmux on mgmt/storage; kill fio on clients)
  if: always()
  shell: bash
  run: |
    set -euxo pipefail

    # Run a script on a remote host via password SSH and return ssh's exit
    # status. It must NOT swallow failures (no "|| true" here), otherwise
    # run_remote_with_retry below can never observe a failed attempt.
    run_remote() {
      local ip="$1"
      local script="$2"
      sshpass -p "${SSH_PASSWORD}" ssh \
        -o StrictHostKeyChecking=no \
        -o UserKnownHostsFile=/dev/null \
        -o ConnectTimeout=10 \
        "${SSH_USER}@${ip}" "bash -s" <<< "$script"
    }

    # Try up to 5 times, then give up quietly: cleanup is best-effort and
    # must not turn a finished run into a failed job.
    run_remote_with_retry() {
      local ip="$1"
      local script="$2"
      local max=5
      local attempt
      for attempt in $(seq 1 "$max"); do
        run_remote "$ip" "$script" && return 0
        echo "Attempt $attempt/$max failed for $ip, retrying in 5s..."
        sleep 5
      done
      echo "All $max attempts failed for $ip, continuing..."
      return 0
    }

    for ip in $MNODES $STORAGE_PRIVATE_IPS; do
      echo "---- $ip: kill tmux ----"
      run_remote_with_retry "$ip" "pkill -9 tmux || true"
    done

    for ip in $CLIENTNODES; do
      echo "---- $ip: kill fio and tmux----"
      run_remote_with_retry "$ip" "pkill -9 fio || true"
      run_remote_with_retry "$ip" "pkill -9 tmux || true"
    done
|
|
||
| - name: Mark test end time (always) | ||
| if: always() | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
| echo "TEST_END_EPOCH=$(date +%s)" >> "$GITHUB_ENV" | ||
| echo "TEST_END_HUMAN=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV" | ||
|
|
||
# Runs a set of sbctl "list/status/show" commands on the first management node
# and stores their output under <RUN_BASE_DIR>/<mgmt_ip>/mgmt_details.
# The step runs on every outcome (if: always()), hence the "(always)" name.
- name: Collect mgmt snapshots into RUN_BASE_DIR (always)
  if: always()
  shell: bash
  run: |
    set -euxo pipefail

    python3 - <<'PY'
    import os, subprocess, sys

    mnodes     = os.environ.get("MNODES", "").split()
    run_base   = os.environ.get("RUN_BASE_DIR", "").rstrip("/")
    # Diagnostics must not fail the job: when the run directory was never
    # detected (or no mgmt node is configured) there is nowhere to write to,
    # so skip instead of dying with a KeyError/IndexError.
    if not mnodes or not run_base:
        print("WARN: MNODES or RUN_BASE_DIR is not set; skipping mgmt snapshot collection.",
              file=sys.stderr)
        sys.exit(0)

    mgmt_ip    = mnodes[0]
    key        = os.environ["KEY_PATH"]
    user       = os.environ["SSH_USER"]
    sbcli      = os.environ["SBCLI_CMD"]
    cluster_id = os.environ.get("CLUSTER_ID", "")
    outdir     = f"{run_base}/{mgmt_ip}/mgmt_details"

    os.makedirs(f"{outdir}/mgmt", exist_ok=True)
    os.makedirs(f"{outdir}/subtasks", exist_ok=True)
    os.makedirs(f"{outdir}/storage_nodes", exist_ok=True)

    # Remote collector. The __PLACEHOLDER__ tokens are substituted below; the
    # script is fed to "bash -s" on the mgmt node through ssh's stdin.
    script = r"""set -euo pipefail
    SBCLI="__SBCLI__"
    CLUSTER_ID="__CLUSTER_ID__"
    OUTDIR="__OUTDIR__"

    mkdir -p "$OUTDIR"/{mgmt,subtasks,storage_nodes}

    log(){ printf '[%(%F %T)T] %s\n' -1 "$*" >&2; }
    run_local(){ local cmd="$1" out="$2"; log "LOCAL: $cmd -> $out"; bash -lc "$cmd" > "$out" 2>&1 || true; }

    run_local "$SBCLI cluster list" "$OUTDIR/mgmt/cluster_list.txt"
    run_local "$SBCLI cluster status $CLUSTER_ID" "$OUTDIR/mgmt/cluster_status.txt"
    run_local "$SBCLI cluster show $CLUSTER_ID" "$OUTDIR/mgmt/cluster_show.txt"
    run_local "$SBCLI cluster get-capacity $CLUSTER_ID" "$OUTDIR/mgmt/cluster_capacity.txt"
    run_local "$SBCLI cluster get-logs $CLUSTER_ID --limit 0" "$OUTDIR/mgmt/cluster_get_logs.txt"

    run_local "$SBCLI pool list" "$OUTDIR/mgmt/pool_list.txt"
    run_local "$SBCLI lvol list" "$OUTDIR/mgmt/lvol_list.txt"
    run_local "$SBCLI snapshot list" "$OUTDIR/mgmt/snapshot_list.txt"

    run_local "$SBCLI sn list" "$OUTDIR/mgmt/sn_list.txt"
    run_local "$SBCLI sn list --json" "$OUTDIR/mgmt/sn_list.json"

    # Storage-node UUIDs: prefer the JSON listing (needs jq), otherwise scrape
    # the third column of the ASCII table.
    SN_UUIDS=()
    if [[ -s "$OUTDIR/mgmt/sn_list.json" ]] && command -v jq >/dev/null 2>&1; then
      mapfile -t SN_UUIDS < <(jq -r '.[]?|.UUID // .uuid // .Id // .id // empty' "$OUTDIR/mgmt/sn_list.json" | awk 'NF')
    else
      mapfile -t SN_UUIDS < <(
        awk 'BEGIN{FS="|"} /^\|/ && !/UUID/ && !/^\+-/ {
          if (NF>=3) { gsub(/^[ \t]+|[ \t]+$/, "", $3); if ($3 ~ /[0-9a-f-]{8,}/) print $3; }
        }' "$OUTDIR/mgmt/sn_list.txt" | sort -u
      )
    fi

    idx=1
    # Guard the expansion: an empty array trips "set -u" on older bash.
    if (( ${#SN_UUIDS[@]} > 0 )); then
      for uuid in "${SN_UUIDS[@]}"; do
        run_local "$SBCLI sn list-devices $uuid" "$OUTDIR/storage_nodes/node${idx}_list_devices.txt"
        run_local "$SBCLI sn check $uuid" "$OUTDIR/storage_nodes/node${idx}_check.txt"
        run_local "$SBCLI sn get $uuid" "$OUTDIR/storage_nodes/node${idx}_get.txt"
        idx=$((idx+1))
      done
    fi

    run_local "$SBCLI cluster list-tasks $CLUSTER_ID --limit 0" "$OUTDIR/mgmt/cluster_list_tasks.txt"

    # Pick the task ids (column 2) of rows whose function (column 4) is
    # balancing_on_restart, skipping table borders and the header row.
    BAL_IDS=()
    while IFS= read -r line; do
      [[ "$line" =~ ^\+ ]] && continue
      [[ "$line" =~ Task\ ID ]] && continue
      [[ "$line" =~ ^\|[[:space:]]*$ ]] && continue

      func=$(awk -F'|' '{gsub(/^[ \t]+|[ \t]+$/, "", $4); print $4}' <<<"$line")
      if [[ "$func" == "balancing_on_restart" ]]; then
        task_id=$(awk -F'|' '{gsub(/^[ \t]+|[ \t]+$/, "", $2); print $2}' <<<"$line")
        [[ -n "$task_id" ]] && BAL_IDS+=("$task_id")
      fi
    done < "$OUTDIR/mgmt/cluster_list_tasks.txt"

    # Do not use "${BAL_IDS[@]:-}": with an empty array it yields ONE empty
    # word, which would run get-subtasks without an id and leave a stray
    # "_subtasks.txt" file behind.
    if (( ${#BAL_IDS[@]} > 0 )); then
      for tid in "${BAL_IDS[@]}"; do
        run_local "$SBCLI cluster get-subtasks $tid" "$OUTDIR/subtasks/${tid}_subtasks.txt"
      done
    fi
    """

    script = (script
        .replace("__SBCLI__", sbcli)
        .replace("__CLUSTER_ID__", cluster_id)
        .replace("__OUTDIR__", outdir))

    ssh_cmd = [
        "ssh",
        "-i", key,
        "-o", "StrictHostKeyChecking=no",
        "-o", "UserKnownHostsFile=/dev/null",
        "-o", "ConnectTimeout=10",
        f"{user}@{mgmt_ip}",
        "bash", "-s",
    ]

    # check=False: collection is best-effort and must not fail the job
    subprocess.run(ssh_cmd, input=script.encode(), check=False)
    PY
|
|
||
# For every mgmt/storage node: writes "docker ps -a", plus "docker logs" and
# "docker inspect" of each container, into <RUN_BASE_DIR>/<node>/containers-final-<ts>
# on that node. Runs on every outcome (if: always()), hence the "(always)" name.
- name: Collect docker logs into RUN_BASE_DIR (always)
  if: always()
  shell: bash
  run: |
    set -euxo pipefail

    # Without a run directory there is nowhere to write to; skip quietly
    # instead of aborting on an unbound variable under "set -u".
    if [[ -z "${RUN_BASE_DIR:-}" ]]; then
      echo "WARN: RUN_BASE_DIR is not set; skipping docker log collection."
      exit 0
    fi

    TAG="containers-final-$(date +%Y%m%d_%H%M%S)"
    # -n keeps ssh from reading stdin. This is essential inside the
    # "while read" loop below: without it the first ssh call swallows the
    # rest of the container list and only one container gets dumped.
    SSH_OPTS=(-n -i "${KEY_PATH}" -o BatchMode=yes -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=10 -C)

    NODES="$(echo "${MNODES} ${STORAGE_PRIVATE_IPS}" | tr ' ' '\n' | sed '/^$/d' | sort -u | tr '\n' ' ')"

    for NODE in ${NODES}; do
      echo ">>> Node: ${NODE}"
      REMOTE_DIR="${RUN_BASE_DIR}/${NODE}/${TAG}"

      ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" "sudo mkdir -p '${REMOTE_DIR}' && sudo chmod -R 0777 '${RUN_BASE_DIR}/${NODE}'" || true
      ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" "bash -lc \"docker ps -a > '${REMOTE_DIR}/docker_ps_a_${NODE}.txt' 2>&1 || true\"" || true

      CONTAINERS="$(ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" "bash -lc \"docker ps -a --format '{{.Names}}' || true\"" || true)"

      if [[ -z "${CONTAINERS}" ]]; then
        ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" "bash -lc \"echo 'No containers found' > '${REMOTE_DIR}/_NO_CONTAINERS_${NODE}.txt'\"" || true
        continue
      fi

      # Per-container failures are expected (removed or restarting
      # containers); keep going through the rest of the list.
      set +e
      while IFS= read -r C; do
        C="${C%%$'\r'}"
        [[ -z "${C}" ]] && continue
        echo "  dumping: ${C}"
        LOG_FILE="${REMOTE_DIR}/${C}.txt"
        INSPECT_FILE="${REMOTE_DIR}/${C}_inspect.json"

        ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" "bash -lc \"docker logs --timestamps --details '${C}' > '${LOG_FILE}' 2>&1 || true\"" || true
        ssh "${SSH_OPTS[@]}" "${SSH_USER}@${NODE}" "bash -lc \"docker inspect '${C}' > '${INSPECT_FILE}' 2>&1 || true\"" || true
      done <<< "${CONTAINERS}"
      set -e
    done
|
|
||
|
|
||
| - name: Collect distrib debug dumps into RUN_BASE_DIR | ||
| if: always() | ||
| timeout-minutes: 35 | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
|
|
||
| python3 - <<'PY' | ||
| import os, subprocess, sys, textwrap | ||
|
|
||
| ssh_user = os.environ["SSH_USER"] | ||
| key = os.environ["KEY_PATH"] | ||
| run_base = os.environ["RUN_BASE_DIR"].rstrip("/") | ||
| tag = "finaldistrib_bdev_logs" | ||
|
|
||
| storage_ips = os.environ["STORAGE_PRIVATE_IPS"].split() | ||
|
|
||
| ssh_base = [ | ||
| "ssh", | ||
| "-i", key, | ||
| "-o", "StrictHostKeyChecking=no", | ||
| "-o", "UserKnownHostsFile=/dev/null", | ||
| "-o", "ServerAliveInterval=15", | ||
| "-o", "ServerAliveCountMax=4", | ||
| "-o", "ConnectTimeout=10", | ||
| "-C", | ||
| ] | ||
|
|
||
| scp_base = [ | ||
| "scp", | ||
| "-i", key, | ||
| "-o", "StrictHostKeyChecking=no", | ||
| "-o", "UserKnownHostsFile=/dev/null", | ||
| "-o", "ConnectTimeout=10", | ||
| "-C", | ||
| ] | ||
|
|
||
| remote_script = """\ | ||
| set -euo pipefail | ||
| TS="$(date +%Y%m%d_%H%M%S)" | ||
| HOST="$(hostname -s 2>/dev/null || hostname)" | ||
| STAGING="/tmp/distrib_host_collect_${TS}" | ||
| mkdir -p "$STAGING" | ||
|
|
||
| CN="$(sudo docker ps --format '{{.Names}}' | grep -E '^spdk_[0-9]+$' | head -n1 || true)" | ||
| if [[ -z "$CN" ]]; then echo "NO_SPDK_CONTAINER"; exit 0; fi | ||
| SOCK="/mnt/ramdisk/${CN}/spdk.sock" | ||
|
|
||
| BDEV_JSON="$(sudo docker exec "$CN" bash -lc "sudo python spdk/scripts/rpc.py -s '$SOCK' bdev_get_bdevs" 2>/dev/null || true)" | ||
| if [[ -z "$BDEV_JSON" ]]; then echo "BDEV_EMPTY"; exit 0; fi | ||
|
|
||
| if command -v jq >/dev/null 2>&1; then | ||
| mapfile -t DISTRIBS < <(printf '%s' "$BDEV_JSON" | jq -r '.[] | select(.name|startswith("distrib_")) | .name' | sort -u) | ||
| else | ||
| mapfile -t DISTRIBS < <(printf '%s\n' "$BDEV_JSON" | grep -oE '"name"\\s*:\\s*"distrib_[^"]+"' | sed -E 's/.*"name"\\s*:\\s*"([^"]+)".*/\\1/' | sort -u) | ||
| fi | ||
|
|
||
| if [[ ${#DISTRIBS[@]} -eq 0 ]]; then echo "NO_DISTRIBS"; exit 0; fi | ||
|
|
||
| for d in "${DISTRIBS[@]}"; do | ||
| JF="/tmp/stack_${d}.json" | ||
|
|
||
| python3 - "$d" "$JF" <<'PYIN' | ||
| import json, sys | ||
| d = sys.argv[1] | ||
| jf = sys.argv[2] | ||
| obj = {"subsystems":[{"subsystem":"distr","config":[{"method":"distr_debug_placement_map_dump","params":{"name":d}}]}]} | ||
| with open(jf, "w") as f: | ||
| f.write(json.dumps(obj)) | ||
| PYIN | ||
|
|
||
| sudo docker cp "$JF" "$CN:$JF" || true | ||
| sudo docker exec "$CN" bash -lc "sudo python scripts/rpc_sock.py '$JF' '$SOCK' > /tmp/rpc_${d}.log 2>&1 || true" || true | ||
| sudo docker cp "$CN:/tmp/rpc_${d}.log" "$STAGING/rpc_${d}.log" 2>/dev/null || true | ||
|
|
||
| for f in $(sudo docker exec "$CN" bash -lc "sudo ls /tmp 2>/dev/null | grep -F \\"$d\\" || true"); do | ||
| sudo docker cp "$CN:/tmp/$f" "$STAGING/${CN}__$f" 2>/dev/null || true | ||
| done | ||
|
|
||
| sudo docker exec "$CN" bash -lc "sudo rm -f '$JF' '/tmp/rpc_${d}.log'" || true | ||
| rm -f "$JF" || true | ||
| done | ||
|
|
||
| cat /proc/meminfo | grep -i huge > "$STAGING/hugepage_meminfo.txt" 2>/dev/null || true | ||
|
|
||
| TAR="/tmp/${HOST}_distrib_dumps_${TS}.tgz" | ||
| tar -C "$STAGING" -czf "$TAR" . 2>/dev/null || true | ||
| echo "$TAR" | ||
| """ | ||
|
|
||
| # NOTE: the only heredoc above (<<'PYIN') is inside the REMOTE SCRIPT, | ||
| # not inside YAML. YAML never sees it (we send script via stdin). | ||
|
|
||
| for ip in storage_ips: | ||
| print(f"=== {ip} ===", flush=True) | ||
|
|
||
| # Run remote script via stdin (no YAML heredoc) | ||
| cmd = ssh_base + [f"{ssh_user}@{ip}", "bash", "-s"] | ||
| p = subprocess.run(cmd, input=remote_script.encode(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=False) | ||
| out = p.stdout.decode(errors="replace").strip().splitlines() | ||
| last = out[-1].strip() if out else "" | ||
| last = last.replace("\r", "") | ||
|
|
||
| if last in ("NO_SPDK_CONTAINER", "BDEV_EMPTY", "NO_DISTRIBS") or not last.startswith("/tmp/"): | ||
| print(f"[{ip}] WARN: distrib collection skipped/failed: {last or '(no output)'}", flush=True) | ||
| continue | ||
|
|
||
| dest_dir = f"{run_base}/{ip}/{tag}" | ||
| os.makedirs(dest_dir, exist_ok=True) | ||
|
|
||
| scp_cmd = scp_base + [f"{ssh_user}@{ip}:{last}", dest_dir + "/"] | ||
| subprocess.run(scp_cmd, check=False) | ||
|
|
||
| print(f"[{ip}] ✓ Saved → {dest_dir}/{os.path.basename(last)}", flush=True) | ||
| PY | ||
|
|
||
| # ========================= | ||
| # SUMMARY (always): test run, outages, failure reason, mgmt files, duration | ||
| # Assumes outages are in sbcli/e2e/logs/outage* | ||
| # ========================= | ||
| - name: Write Job Summary (test/outages/failure/mgmt/duration) | ||
| if: always() | ||
| shell: bash | ||
| run: | | ||
| set -euxo pipefail | ||
|
|
||
| mgmt_ip="$(echo "${MNODES}" | awk '{print $1}')" | ||
| out_log="sbcli/e2e/output.log" | ||
|
|
||
| start="${TEST_START_EPOCH:-0}" | ||
| end="${TEST_END_EPOCH:-0}" | ||
| dur_sec=0 | ||
| if [[ "$start" =~ ^[0-9]+$ && "$end" =~ ^[0-9]+$ && "$end" -ge "$start" ]]; then | ||
| dur_sec=$((end-start)) | ||
| fi | ||
| dur_h=$((dur_sec/3600)) | ||
| dur_m=$(((dur_sec%3600)/60)) | ||
| dur_s=$((dur_sec%60)) | ||
| dur_fmt="${dur_h}h ${dur_m}m ${dur_s}s" | ||
|
|
||
| outage_dir="sbcli/e2e/logs" | ||
| outage_count=0 | ||
| outage_latest="" | ||
| outage_lines=0 | ||
| outage_tail="" | ||
| if compgen -G "${outage_dir}/outage*" > /dev/null; then | ||
| outage_count="$(ls -1 ${outage_dir}/outage* 2>/dev/null | wc -l | awk '{print $1}')" | ||
| outage_latest="$(ls -1t ${outage_dir}/outage* 2>/dev/null | head -n 1 || true)" | ||
| outage_lines="$(cat ${outage_dir}/outage* 2>/dev/null | wc -l | awk '{print $1}')" | ||
| [[ -n "${outage_latest}" && -f "${outage_latest}" ]] && outage_tail="$(tail -n 20 "${outage_latest}" 2>/dev/null || true)" | ||
| fi | ||
|
|
||
| failure_reason="(unknown)" | ||
| if [[ -f "${out_log}" ]]; then | ||
| if grep -Eqi 'Traceback \(most recent call last\)|Exception:|ERROR|AssertionError|Input/output error|FAILED|FATAL' "${out_log}"; then | ||
| failure_reason="$(grep -Ein 'Traceback \(most recent call last\)|Exception:|ERROR|AssertionError|Input/output error|FAILED|FATAL' "${out_log}" | tail -n 3 | sed 's/\x1b\[[0-9;]*m//g' || true)" | ||
| failure_reason="$(printf '%s' "${failure_reason}" | tr '\r' ' ' | sed 's/[[:space:]]\+/ /g')" | ||
| else | ||
| failure_reason="$(tail -n 20 "${out_log}" | tr '\r' ' ' | sed 's/[[:space:]]\+/ /g' | tail -n 3)" | ||
| fi | ||
| fi | ||
|
|
||
| mgmt_dir="${RUN_BASE_DIR:-}/$(echo "${MNODES}" | awk '{print $1}')/mgmt_details/mgmt" | ||
| mgmt_files="(not found)" | ||
| if [[ -n "${RUN_BASE_DIR:-}" && -d "${mgmt_dir}" ]]; then | ||
| mgmt_files="$(find "${mgmt_dir}" -maxdepth 1 -type f -printf '%f (%s bytes)\n' 2>/dev/null | sort || true)" | ||
| [[ -n "${mgmt_files}" ]] || mgmt_files="(empty)" | ||
| fi | ||
|
|
||
| conclusion="SUCCESS" | ||
| if [[ "${{ job.status }}" != "success" ]]; then | ||
| conclusion="FAILED" | ||
| fi | ||
|
|
||
| { | ||
| echo "## SimplyBlock Stress Run Summary" | ||
| echo "" | ||
| echo "**Result:** ${conclusion}" | ||
| echo "" | ||
| echo "### Run Info" | ||
| echo "- **Test class:** \`${TEST_CLASS}\`" | ||
| echo "- **Cluster ID:** \`${CLUSTER_ID}\`" | ||
| echo "- **Mgmt node:** \`${mgmt_ip}\`" | ||
| echo "- **NDCS/NPCS:** \`${NDCS}/${NPCS}\`" | ||
| echo "- **Start (UTC):** ${TEST_START_HUMAN:-unknown}" | ||
| echo "- **End (UTC):** ${TEST_END_HUMAN:-unknown}" | ||
| echo "- **Duration:** ${dur_fmt}" | ||
| echo "" | ||
| echo "### NFS Run Folder" | ||
| echo "- **RUN_BASE_DIR:** \`${RUN_BASE_DIR:-not detected}\`" | ||
| echo "- Client logs: \`${RUN_BASE_DIR:-<run_dir>}/ClientLogs\`" | ||
| echo "- Mgmt details: \`${RUN_BASE_DIR:-<run_dir>}/${mgmt_ip}/mgmt_details\`" | ||
| echo "" | ||
| echo "### Outages" | ||
| echo "- Outage files matched: \`${outage_dir}/outage*\`" | ||
| echo "- **Outage file count:** ${outage_count}" | ||
| echo "- **Total outage log lines:** ${outage_lines}" | ||
| [[ -n "${outage_latest}" ]] && echo "- **Latest outage file:** \`${outage_latest}\`" | ||
| if [[ -n "${outage_tail}" ]]; then | ||
| echo "" | ||
| echo "<details><summary>Latest outage file (last 20 lines)</summary>" | ||
| echo "" | ||
| echo '```' | ||
| printf '%s\n' "${outage_tail}" | ||
| echo '```' | ||
| echo "</details>" | ||
| fi | ||
| echo "" | ||
| echo "### Failure Reason (best-effort)" | ||
| echo '```' | ||
| printf '%s\n' "${failure_reason}" | ||
| echo '```' | ||
| echo "" | ||
| echo "### Mgmt Artifacts (mgmt_details/mgmt)" | ||
| echo "- Path: \`${mgmt_dir}\`" | ||
| echo "<details><summary>Files</summary>" | ||
| echo "" | ||
| echo '```' | ||
| printf '%s\n' "${mgmt_files}" | ||
| echo '```' | ||
| echo "</details>" | ||
| echo "" | ||
| echo "### Key Logs" | ||
| echo "- Runner stress log: \`${out_log}\`" | ||
| echo "- Outage logs: \`${outage_dir}/outage*\`" | ||
| echo "- Docker logs: \`${RUN_BASE_DIR:-<run_dir>}/<node_ip>/containers-final-*\`" | ||
| echo "- Distrib dumps: \`${RUN_BASE_DIR:-<run_dir>}/<storage_ip>/finaldistrib_bdev_logs/\`" | ||
| } >> "$GITHUB_STEP_SUMMARY" | ||
|
|
||
# Posts a run summary (status, duration, pass/fail counts parsed from
# output.log, links) to the Slack webhook. Never fails the job: a missing
# webhook is skipped and a failed POST is only logged as a warning.
- name: Send Slack Notification
  if: always()
  shell: bash
  env:
    SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
    JOB_STATUS: ${{ job.status }}
    SLACK_RUN_URL: "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
    GITHUB_REF_NAME: ${{ github.ref_name }}
    SLACK_WF_NAME: "Stress (No Bootstrap)"
  run: |
    python3 - <<'PYEOF'
    import json, os, re, sys, urllib.request, urllib.error

    webhook = os.environ.get("SLACK_WEBHOOK_URL", "")
    if not webhook:
        print("No SLACK_WEBHOOK_URL set, skipping.")
        sys.exit(0)

    # --- Test counts parsed from the runner log (stay 0 when not found) ---
    out_log = "sbcli/e2e/output.log"
    total = passed = failed = skipped = 0
    if os.path.isfile(out_log):
        # errors="replace": a long stress log can contain stray non-UTF-8
        # bytes; a decode error here must not suppress the notification.
        with open(out_log, encoding="utf-8", errors="replace") as fh:
            content = fh.read()

        def px(pat):
            m = re.search(pat, content)
            return int(m.group(1)) if m else 0

        total   = px(r'Number of Total Cases:\s*(\d+)')
        passed  = px(r'Number of Passed Cases:\s*(\d+)')
        failed  = px(r'Number of Failed Cases:\s*(\d+)')
        skipped = px(r'Number of Skipped Cases:\s*(\d+)')
    pass_pct = (passed * 100 // total) if total > 0 else 0

    # --- Env ---
    s = int(os.environ.get("TEST_START_EPOCH", "0") or "0")
    e = int(os.environ.get("TEST_END_EPOCH", "0") or "0")
    # Duration is reported only when both timestamps exist and are ordered
    secs = max(0, e - s) if e >= s > 0 else 0
    dur  = f"{secs//3600}h {(secs%3600)//60}m {secs%60}s"
    run_url  = os.environ.get("SLACK_RUN_URL", "")
    log_dir  = os.environ.get("RUN_BASE_DIR", "N/A")
    ndcs     = os.environ.get("NDCS", "?")
    npcs     = os.environ.get("NPCS", "?")
    test_cls = os.environ.get("TEST_CLASS", "") or "all"
    branch   = os.environ.get("GITHUB_REF_NAME", "?")
    wf_name  = os.environ.get("SLACK_WF_NAME", "Run")
    ok       = os.environ.get("JOB_STATUS", "") == "success"

    icon    = ":white_check_mark:" if ok else ":x:"
    status  = "SUCCESS" if ok else "FAILURE"
    mention = "" if ok else " <!channel>"

    lines = [
        f"{icon} *SimplyBlock {wf_name}*{mention}",
        f"*Status:* {status} | *Duration:* {dur}",
        f"*Branch:* `{branch}` | *NDCS/NPCS:* `{ndcs}/{npcs}` | *Test class:* `{test_cls}`",
        "",
    ]

    if total > 0:
        lines += [
            f":white_check_mark: *Passed:* {passed}/{total} ({pass_pct}%)",
            f":x: *Failed:* {failed}",
            f":fast_forward: *Skipped:* {skipped}",
        ]
    else:
        lines.append("_(test counts not found in log)_")

    lines += [
        "",
        f":link: *Run:* <{run_url}|View on GitHub>",
        f":file_folder: *Final Logs:* `{log_dir}`",
    ]

    payload = {"text": "\n".join(lines)}
    req = urllib.request.Request(
        webhook,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        # Context manager closes the HTTP response/socket deterministically
        with urllib.request.urlopen(req, timeout=15):
            pass
        print("Slack notification sent.")
    except Exception as exc:
        # Top-level best-effort boundary: report, but never fail the job
        print(f"WARN: Slack notification failed: {exc}", file=sys.stderr)
    PYEOF
|
|
||
| - name: Upload logs (always) | ||
| if: always() | ||
| uses: actions/upload-artifact@v4 | ||
| with: | ||
| name: simplyblock-stress-logs-${{ github.run_id }} | ||
| path: | | ||
| simplyBlockDeploy/bare-metal/bootstrap.log | ||
| sbcli/e2e/output.log | ||
| sbcli/e2e/logs/** | ||
| if-no-files-found: warn | ||
|
|
||
| - name: Export MGMT_IP (first MNODES) | ||
| if: always() | ||
| shell: bash | ||
| run: | | ||
| echo "MGMT_IP=$(echo "${MNODES}" | awk '{print $1}')" >> "$GITHUB_ENV" | ||
|
|
||
| - name: Upload small artifacts (always) | ||
| if: always() && env.RUN_BASE_DIR != '' && env.MGMT_IP != '' | ||
| uses: actions/upload-artifact@v4 | ||
| with: | ||
| name: simplyblock-small-logs-${{ github.run_id }} | ||
| path: | | ||
| sbcli/e2e/output.log | ||
| ${{ env.RUN_BASE_DIR }}/${{ env.MGMT_IP }}/mgmt_details/mgmt/*.txt | ||
| ${{ env.RUN_BASE_DIR }}/${{ env.MGMT_IP }}/mgmt_details/subtasks/*.txt | ||
| ${{ env.RUN_BASE_DIR }}/${{ env.MGMT_IP }}/mgmt_details/storage_nodes/*.txt | ||
| if-no-files-found: warn |
| f"[bulk_wait_pvc] {len(bound)}/{len(target)} Bound " | ||
| f"({remaining} remaining)" | ||
| ) | ||
| if bound == target: |
| data = json.loads(json_out) | ||
| if isinstance(data, dict) and data.get("filename"): | ||
| container_txt = data["filename"] | ||
| except Exception: |
| f"[wait_fio] Job {job_name}: {status}" | ||
| ) | ||
| still_running.discard(job_name) | ||
| except Exception: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
No description provided.