diff --git a/flows/flow-14-live-obol-base-sepolia.sh b/flows/flow-14-live-obol-base-sepolia.sh index c7f2898a..6e8ccb4b 100755 --- a/flows/flow-14-live-obol-base-sepolia.sh +++ b/flows/flow-14-live-obol-base-sepolia.sh @@ -745,12 +745,40 @@ if [ "$import_rc" -ne 0 ]; then fi pass "Alice remote-signer seeded with seller wallet" +# Guard: confirm the remote-signer pod was actually rolled by the wallet +# import. Helm does NOT re-roll a Deployment when only a Secret's data +# changed, so a regression that drops the explicit kubectl rollout-restart +# would leave the pod running with the chart's bootstrap keystore-password +# Secret in env. The pod would then sign with the throwaway address and +# `obol sell register` would fail 5 minutes later with "gas required +# exceeds allowance (0)" — a confusing, slow failure. This step fails +# fast with a clear diagnostic instead. +step "Alice: remote-signer pod rolled by wallet import (age < 120s)" +set +e +pod_start=$(alice kubectl get pods -n hermes-obol-agent \ + -l app.kubernetes.io/name=remote-signer \ + -o jsonpath='{.items[0].status.startTime}' 2>/dev/null) +set -e +if [ -z "$pod_start" ]; then + fail "remote-signer pod not found (label app.kubernetes.io/name=remote-signer)" + emit_metrics; exit 1 +fi +pod_epoch=$(date -u -d "$pod_start" +%s 2>/dev/null || python3 -c "import datetime,sys; print(int(datetime.datetime.fromisoformat(sys.argv[1].replace('Z','+00:00')).timestamp()))" "$pod_start") +now_epoch=$(date -u +%s) +pod_age=$((now_epoch - pod_epoch)) +if [ "$pod_age" -gt 120 ]; then + fail "remote-signer pod is ${pod_age}s old — wallet import did not roll the deployment (likely stale keystore-password Secret). Run 'obol kubectl -n hermes-obol-agent rollout restart deployment/remote-signer' and retry." + emit_metrics; exit 1 +fi +pass "remote-signer pod is ${pod_age}s old (rolled by wallet import)" + step "Alice: drive ERC-8004 registration (obol sell register)" # 5-minute hard timeout: the on-chain tx + WaitForAgent + SetMetadata # should complete in ~30-60s; anything beyond that is a hang we want # to surface, not silently block the run. `timeout` is an external # program and cannot see the `alice()` bash function, so call the # binary directly with the same env the function exports. +set +e register_out=$(timeout 300 \ env OBOL_DEVELOPMENT=true OBOL_NONINTERACTIVE=true \ OBOL_CONFIG_DIR="$ALICE_DIR/config" \ @@ -761,6 +789,7 @@ register_out=$(timeout 300 \ --endpoint "$TUNNEL_URL" \ --name "Live OBOL Base Sepolia Test Inference" 2>&1) register_rc=$? +set -e printf '%s\n' "$register_out" | tail -10 if [ "$register_rc" -ne 0 ]; then fail "obol sell register failed (exit $register_rc) — offer will stay AwaitingExternalRegistration" diff --git a/internal/agentruntime/charts.go b/internal/agentruntime/charts.go new file mode 100644 index 00000000..316a1bec --- /dev/null +++ b/internal/agentruntime/charts.go @@ -0,0 +1,16 @@ +package agentruntime + +// RemoteSignerChartVersion is the single source of truth for the +// `remote-signer` Helm chart pin used by both Hermes and OpenClaw +// deployments. It MUST be updated as a single edit; bumping it here +// updates every consumer in lockstep. +// +// Chart 0.3.1 ships remote-signer image `v0.2.0`, which accepts the +// canonical-string signer contract (chain_id, value, etc. serialized +// as JSON strings) introduced by PR #359. Chart 0.3.0 ships `v0.1.0`, +// which only accepts the legacy u64 contract and breaks `obol sell +// register` for current obol-stack with HTTP 422 "chain_id: invalid +// type: string \"84532\", expected u64". +// +// renovate: datasource=helm depName=remote-signer registryUrl=https://obolnetwork.github.io/helm-charts/ +const RemoteSignerChartVersion = "0.3.1" diff --git a/internal/hermes/hermes.go b/internal/hermes/hermes.go index 876af18d..28cefe68 100644 --- a/internal/hermes/hermes.go +++ b/internal/hermes/hermes.go @@ -30,8 +30,6 @@ const ( gatewayTokenFileName = ".gateway-token" obolSkillsDirName = "obol-skills" - // renovate: datasource=helm depName=remote-signer registryUrl=https://obolnetwork.github.io/helm-charts/ - remoteSignerChartVersion = "0.3.0" // renovate: datasource=helm depName=raw registryUrl=https://bedag.github.io/helm-charts/ rawChartVersion = "2.0.2" @@ -625,7 +623,7 @@ releases: version: %s values: - values-remote-signer.yaml -`, namespace, rawChartVersion, valuesFileName, namespace, remoteSignerChartVersion) +`, namespace, rawChartVersion, valuesFileName, namespace, agentruntime.RemoteSignerChartVersion) } func dashboardHostname(id string) string { diff --git a/internal/hermes/wallet_import.go b/internal/hermes/wallet_import.go index 5a770571..e72cce3c 100644 --- a/internal/hermes/wallet_import.go +++ b/internal/hermes/wallet_import.go @@ -11,6 +11,7 @@ import ( "github.com/ObolNetwork/obol-stack/internal/agentruntime" "github.com/ObolNetwork/obol-stack/internal/config" + "github.com/ObolNetwork/obol-stack/internal/kubectl" "github.com/ObolNetwork/obol-stack/internal/ui" ethcrypto "github.com/ethereum/go-ethereum/crypto" ) @@ -23,6 +24,16 @@ type ImportPrivateKeyWalletOptions struct { ApplyCluster bool } +// Indirection seams so tests can spy on / replace the cluster-side calls +// without standing up a real k3d cluster. Production wires them to the real +// helmfile-sync + kubectl rollout helpers. +var ( + syncFn = Sync + restartHermesRemoteSignerFn = restartHermesRemoteSigner + ensureVolumeWritableFn = ensureVolumeWritable + fixRuntimeVolumeOwnershipFn = fixRuntimeVolumeOwnership +) + // ImportPrivateKeyWalletCmd imports an existing private key as the // remote-signer wallet for a Hermes instance. Mirror of the OpenClaw path. func ImportPrivateKeyWalletCmd(cfg *config.Config, id string, opts ImportPrivateKeyWalletOptions, u *ui.UI) error { @@ -68,9 +79,49 @@ func ImportPrivateKeyWalletCmd(cfg *config.Config, id string, opts ImportPrivate u.Detail("Address", wallet.Address) u.Detail("Instance", id) + // When the cluster is live, helmfile-sync so the new keystore password + // Secret reaches the pod, then explicitly roll the deployment — helm + // does not roll on Secret-data-only changes, so the pod would keep + // decrypting with the old chart-bootstrap password and remote-signer + // calls would sign with the throwaway address. + if opts.ApplyCluster { + u.Blank() + u.Info("Applying changes to cluster (helmfile sync)...") + if err := syncFn(cfg, id, u); err != nil { + u.Warnf("helmfile sync failed: %v", err) + u.Printf("Run 'obol hermes sync %s' manually before issuing remote-signer calls.", id) + } else { + restartHermesRemoteSignerFn(cfg, id, u) + } + } + return nil } +// restartHermesRemoteSigner kicks a rollout-restart on the remote-signer +// deployment so the pod re-reads the freshly-applied keystore-password Secret. +// Best-effort: a still-coming-up cluster will surface the error to the caller +// as a warning, not a hard failure, since wallet metadata + values files have +// already been written and a later `obol hermes sync` can finish the job. +func restartHermesRemoteSigner(cfg *config.Config, id string, u *ui.UI) { + namespace := agentruntime.Namespace(agentruntime.Hermes, id) + kubectlBin, kubeconfig := kubectl.Paths(cfg) + if err := kubectl.RunSilent(kubectlBin, kubeconfig, + "rollout", "restart", "deployment/remote-signer", "-n", namespace, + ); err != nil { + u.Warnf("Could not restart remote-signer (cluster may not be running): %v", err) + u.Printf("Run 'obol kubectl -n %s rollout restart deployment/remote-signer' to apply the new keystore.", namespace) + return + } + if err := kubectl.RunSilent(kubectlBin, kubeconfig, + "rollout", "status", "deployment/remote-signer", "-n", namespace, "--timeout=120s", + ); err != nil { + u.Warnf("remote-signer rollout did not complete in 120s: %v", err) + return + } + u.Success("Remote-signer restarted") +} + // ImportWalletFromPrivateKey provisions an existing Ethereum private key as // the remote-signer wallet for a Hermes instance. func ImportWalletFromPrivateKey(cfg *config.Config, id, privateKeyHex string, u *ui.UI) (*WalletInfo, error) { @@ -130,6 +181,14 @@ func archiveReplacedHermesKeystore(cfg *config.Config, id string, existingWallet } dir := agentruntime.KeystoreVolumePath(cfg, agentruntime.Hermes, id) + + // The keystores volume is normally container-owned (uid 10000, mode 700) + // after provisionKeystoreToVolume's fixRuntimeVolumeOwnership. Bookend the + // stat/mkdir/rename with the same ownership flip provision uses, otherwise + // the host process can't even traverse the directory. + ensureVolumeWritableFn(cfg, dir, u) + defer fixRuntimeVolumeOwnershipFn(cfg, dir, u) + oldPath := filepath.Join(dir, existingWallet.KeystoreUUID+".json") if _, err := os.Stat(oldPath); err != nil { if errors.Is(err, os.ErrNotExist) { diff --git a/internal/hermes/wallet_import_test.go b/internal/hermes/wallet_import_test.go new file mode 100644 index 00000000..2d190176 --- /dev/null +++ b/internal/hermes/wallet_import_test.go @@ -0,0 +1,249 @@ +package hermes + +import ( + "os" + "path/filepath" + "testing" + + "github.com/ObolNetwork/obol-stack/internal/agentruntime" + "github.com/ObolNetwork/obol-stack/internal/config" + "github.com/ObolNetwork/obol-stack/internal/ui" +) + +// Anvil account #1 — well-known Hardhat test mnemonic. Public on every +// install, never used on mainnet, safe to commit. +const testPrivateKeyHex = "ac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80" + +func newTestUI() *ui.UI { return ui.New(false) } + +func walletImportTestConfig(t *testing.T, id string) (*config.Config, string) { + t.Helper() + dir := t.TempDir() + cfg := &config.Config{ConfigDir: dir, DataDir: dir, BinDir: dir} + deployDir := DeploymentPath(cfg, id) + if err := os.MkdirAll(deployDir, 0o755); err != nil { + t.Fatalf("mkdir deploy: %v", err) + } + keystoreDir := agentruntime.KeystoreVolumePath(cfg, agentruntime.Hermes, id) + if err := os.MkdirAll(keystoreDir, 0o700); err != nil { + t.Fatalf("mkdir keystore: %v", err) + } + return cfg, deployDir +} + +func writeKeyFile(t *testing.T) string { + t.Helper() + f := filepath.Join(t.TempDir(), "key.hex") + if err := os.WriteFile(f, []byte(testPrivateKeyHex), 0o600); err != nil { + t.Fatalf("write key: %v", err) + } + return f +} + +// stubVolumeOwnership replaces the k3d-node-exec helpers with no-op spies that +// record call order. Tests use this to assert the bookend ordering inside +// archiveReplacedHermesKeystore without needing a real cluster. +func stubVolumeOwnership(t *testing.T) *[]string { + t.Helper() + calls := []string{} + origEnsure := ensureVolumeWritableFn + origFix := fixRuntimeVolumeOwnershipFn + t.Cleanup(func() { + ensureVolumeWritableFn = origEnsure + fixRuntimeVolumeOwnershipFn = origFix + }) + ensureVolumeWritableFn = func(_ *config.Config, _ string, _ *ui.UI) { + calls = append(calls, "ensureVolumeWritable") + } + fixRuntimeVolumeOwnershipFn = func(_ *config.Config, _ string, _ *ui.UI) { + calls = append(calls, "fixRuntimeVolumeOwnership") + } + return &calls +} + +// stubClusterApply replaces Sync + restartHermesRemoteSigner with spies that +// record invocation. Sync's return value is configurable so tests can drive +// the success and failure branches independently. +func stubClusterApply(t *testing.T, syncErr error) (syncCalled, restartCalled *bool) { + t.Helper() + sc, rc := false, false + origSync := syncFn + origRestart := restartHermesRemoteSignerFn + t.Cleanup(func() { + syncFn = origSync + restartHermesRemoteSignerFn = origRestart + }) + syncFn = func(_ *config.Config, _ string, _ *ui.UI) error { + sc = true + return syncErr + } + restartHermesRemoteSignerFn = func(_ *config.Config, _ string, _ *ui.UI) { + rc = true + } + return &sc, &rc +} + +// TestArchiveReplacedHermesKeystore_NilExisting guards the happy path: no +// previous wallet on disk → archive is a no-op and must not call the +// k3d-node-exec helpers. +func TestArchiveReplacedHermesKeystore_NilExisting(t *testing.T) { + cfg, _ := walletImportTestConfig(t, "obol-agent") + calls := stubVolumeOwnership(t) + if err := archiveReplacedHermesKeystore(cfg, "obol-agent", nil, "new-uuid", newTestUI()); err != nil { + t.Fatalf("archive: %v", err) + } + if len(*calls) != 0 { + t.Fatalf("expected zero ownership calls, got %v", *calls) + } +} + +// TestArchiveReplacedHermesKeystore_SameUUID covers the case where the import +// re-uses the existing keystore UUID (a re-import idempotency case). Archive +// must short-circuit before any filesystem mutation. +func TestArchiveReplacedHermesKeystore_SameUUID(t *testing.T) { + cfg, _ := walletImportTestConfig(t, "obol-agent") + calls := stubVolumeOwnership(t) + existing := &WalletInfo{KeystoreUUID: "same-uuid"} + if err := archiveReplacedHermesKeystore(cfg, "obol-agent", existing, "same-uuid", newTestUI()); err != nil { + t.Fatalf("archive: %v", err) + } + if len(*calls) != 0 { + t.Fatalf("expected zero ownership calls, got %v", *calls) + } +} + +// TestArchiveReplacedHermesKeystore_BookendOrder is the regression guard for +// the fix in commit 6c5106a: the bookend pair MUST run in the order +// (ensure → … → fix), and the deferred fix MUST run even when the function +// returns early because the old keystore file is missing. +func TestArchiveReplacedHermesKeystore_BookendOrder(t *testing.T) { + cfg, _ := walletImportTestConfig(t, "obol-agent") + calls := stubVolumeOwnership(t) + // existingWallet points at a UUID whose file does not exist on disk → + // archive returns early after the os.Stat ENOENT branch. Even on that + // early-return path the deferred fix call must still fire. + existing := &WalletInfo{KeystoreUUID: "missing-uuid"} + if err := archiveReplacedHermesKeystore(cfg, "obol-agent", existing, "new-uuid", newTestUI()); err != nil { + t.Fatalf("archive: %v", err) + } + if len(*calls) != 2 || (*calls)[0] != "ensureVolumeWritable" || (*calls)[1] != "fixRuntimeVolumeOwnership" { + t.Fatalf("expected [ensureVolumeWritable, fixRuntimeVolumeOwnership], got %v", *calls) + } +} + +// TestArchiveReplacedHermesKeystore_RenamesToReplaced exercises the happy +// path: a real keystore file exists in the volume, archive renames it under +// the `replaced/` subdir with a timestamp suffix, and the original path no +// longer exists. +func TestArchiveReplacedHermesKeystore_RenamesToReplaced(t *testing.T) { + cfg, _ := walletImportTestConfig(t, "obol-agent") + stubVolumeOwnership(t) + dir := agentruntime.KeystoreVolumePath(cfg, agentruntime.Hermes, "obol-agent") + oldUUID := "old-uuid-aaaa" + oldPath := filepath.Join(dir, oldUUID+".json") + if err := os.WriteFile(oldPath, []byte(`{"placeholder":"keystore"}`), 0o600); err != nil { + t.Fatalf("write old keystore: %v", err) + } + + existing := &WalletInfo{KeystoreUUID: oldUUID} + if err := archiveReplacedHermesKeystore(cfg, "obol-agent", existing, "new-uuid", newTestUI()); err != nil { + t.Fatalf("archive: %v", err) + } + + if _, err := os.Stat(oldPath); !os.IsNotExist(err) { + t.Fatalf("old keystore should be gone, got err=%v", err) + } + entries, err := os.ReadDir(filepath.Join(dir, "replaced")) + if err != nil { + t.Fatalf("read replaced dir: %v", err) + } + if len(entries) != 1 { + t.Fatalf("expected exactly 1 archived keystore, got %d", len(entries)) + } +} + +// TestImportPrivateKeyWalletCmd_ApplyClusterFalseSkipsCluster guards the +// inverse direction of the ApplyCluster wiring (a214050): when the caller +// did not opt in to cluster-side apply (e.g. pre-cluster-up bootstrap), we +// must NOT call helmfile sync or rollout-restart. +func TestImportPrivateKeyWalletCmd_ApplyClusterFalseSkipsCluster(t *testing.T) { + cfg, _ := walletImportTestConfig(t, "obol-agent") + stubVolumeOwnership(t) + syncCalled, restartCalled := stubClusterApply(t, nil) + + keyFile := writeKeyFile(t) + if err := ImportPrivateKeyWalletCmd(cfg, "obol-agent", ImportPrivateKeyWalletOptions{ + PrivateKeyFile: keyFile, + Force: true, + ApplyCluster: false, + }, newTestUI()); err != nil { + t.Fatalf("import: %v", err) + } + if *syncCalled { + t.Fatal("Sync was called even though ApplyCluster=false") + } + if *restartCalled { + t.Fatal("restartHermesRemoteSigner was called even though ApplyCluster=false") + } +} + +// TestImportPrivateKeyWalletCmd_ApplyClusterTrueRollsPod is the regression +// guard for the pair of fixes in PR #397: ApplyCluster=true MUST trigger +// helmfile-sync AND the explicit rollout-restart that follows on success +// (helm doesn't roll on Secret-data-only changes, so without the restart +// the pod keeps signing with the chart's bootstrap address). +func TestImportPrivateKeyWalletCmd_ApplyClusterTrueRollsPod(t *testing.T) { + cfg, _ := walletImportTestConfig(t, "obol-agent") + stubVolumeOwnership(t) + syncCalled, restartCalled := stubClusterApply(t, nil) + + keyFile := writeKeyFile(t) + if err := ImportPrivateKeyWalletCmd(cfg, "obol-agent", ImportPrivateKeyWalletOptions{ + PrivateKeyFile: keyFile, + Force: true, + ApplyCluster: true, + }, newTestUI()); err != nil { + t.Fatalf("import: %v", err) + } + if !*syncCalled { + t.Fatal("Sync was NOT called with ApplyCluster=true") + } + if !*restartCalled { + t.Fatal("restartHermesRemoteSigner was NOT called after successful Sync") + } +} + +// TestImportPrivateKeyWalletCmd_SyncFailureSkipsRestart documents the +// best-effort contract: if helmfile-sync errored, do NOT proceed to +// rollout-restart (it would just roll the pod against a stale Secret). +// The wallet import as a whole still succeeds — the on-disk artifacts +// (wallet.json, keystore, values-remote-signer.yaml) are written, so a +// later `obol hermes sync` can finish the job. +func TestImportPrivateKeyWalletCmd_SyncFailureSkipsRestart(t *testing.T) { + cfg, _ := walletImportTestConfig(t, "obol-agent") + stubVolumeOwnership(t) + syncCalled, restartCalled := stubClusterApply(t, errSyncFailed) + + keyFile := writeKeyFile(t) + if err := ImportPrivateKeyWalletCmd(cfg, "obol-agent", ImportPrivateKeyWalletOptions{ + PrivateKeyFile: keyFile, + Force: true, + ApplyCluster: true, + }, newTestUI()); err != nil { + t.Fatalf("import should be best-effort, got hard error: %v", err) + } + if !*syncCalled { + t.Fatal("Sync was NOT called") + } + if *restartCalled { + t.Fatal("restartHermesRemoteSigner ran even though Sync failed") + } +} + +// errSyncFailed is a sentinel error used by tests to drive the +// helmfile-sync-failed branch of ImportPrivateKeyWalletCmd. +var errSyncFailed = &syncErr{msg: "test: sync failed"} + +type syncErr struct{ msg string } + +func (e *syncErr) Error() string { return e.msg } diff --git a/internal/openclaw/openclaw.go b/internal/openclaw/openclaw.go index eb8774e7..0ed58079 100644 --- a/internal/openclaw/openclaw.go +++ b/internal/openclaw/openclaw.go @@ -48,10 +48,6 @@ const ( // chartVersion pins the openclaw Helm chart version from the obol repo. // renovate: datasource=helm depName=openclaw registryUrl=https://obolnetwork.github.io/helm-charts/ chartVersion = "0.4.0" - - // remoteSignerChartVersion pins the remote-signer Helm chart version. - // renovate: datasource=helm depName=remote-signer registryUrl=https://obolnetwork.github.io/helm-charts/ - remoteSignerChartVersion = "0.3.1" ) // openclawVersionRaw is the single source of truth for the upstream OpenClaw @@ -2802,5 +2798,5 @@ releases: version: %s values: - values-remote-signer.yaml -`, id, namespace, chartVersion, namespace, remoteSignerChartVersion) +`, id, namespace, chartVersion, namespace, agentruntime.RemoteSignerChartVersion) }