185 changes: 182 additions & 3 deletions pkg/controllers/kube-apiserver.go
@@ -23,6 +23,7 @@ import (
"io"
"net"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
@@ -54,7 +55,18 @@ import (
)

const (
kubeAPIStartupTimeout = 60
kubeAPIStartupTimeout = 60 * time.Second
// rbacHookDeadlockTimeout is the time to wait for the RBAC bootstrap hook
// before declaring a deadlock. This is shorter than kubeAPIStartupTimeout
// to allow for faster recovery.
rbacHookDeadlockTimeout = 15 * time.Second
// rbacHookPollDelayStart is the initial delay before deadlock detection begins,
// giving the API server time to start up normally.
rbacHookPollDelayStart = 5 * time.Second
// rbacHookCheckInterval is how often to check the RBAC hook status.
rbacHookCheckInterval = 2 * time.Second
// rbacHookMaxWaitDuration is the absolute maximum time to wait for the RBAC hook
// regardless of etcd health state changes. This prevents flapping from extending
// detection indefinitely.
rbacHookMaxWaitDuration = 30 * time.Second
)

var (
@@ -348,9 +360,15 @@ func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped
return err
}

// run readiness check
// Channel to signal RBAC hook deadlock detection
rbacDeadlockDetected := make(chan struct{})
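	// The detector closes this channel rather than sending on it, so the select
	// below observes the signal regardless of when it starts waiting.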

// Run RBAC hook deadlock detector
go s.detectRBACHookDeadlock(ctx, restClient, rbacDeadlockDetected)

// Run standard readiness check
go func() {
err := wait.PollUntilContextTimeout(ctx, time.Second, kubeAPIStartupTimeout*time.Second, true, func(ctx context.Context) (bool, error) {
err := wait.PollUntilContextTimeout(ctx, time.Second, kubeAPIStartupTimeout, true, func(ctx context.Context) (bool, error) {
var status int
if err := restClient.Get().AbsPath("/readyz").Do(ctx).StatusCode(&status).Error(); err != nil {
klog.Infof("%q not yet ready: %v", s.Name(), err)
@@ -420,7 +438,168 @@ func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped
return err
case perr := <-panicChannel:
panic(perr)
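	// Recovery path (a sketch of the intent, not guaranteed behavior): stopping the
	// etcd scope below and returning an error makes Run fail fast; assuming the
	// MicroShift service is configured to restart on failure, the whole startup
	// sequence is retried with a freshly started etcd, breaking the circular wait.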
case <-rbacDeadlockDetected:
klog.Error("RBAC bootstrap hook deadlock detected - restarting microshift-etcd.scope to recover")
if err := restartMicroshiftEtcdScope(ctx); err != nil {
klog.Errorf("Failed to restart microshift-etcd.scope: %v", err)
}
return fmt.Errorf("RBAC bootstrap hook deadlock detected after %v", rbacHookDeadlockTimeout)
}
}

// detectRBACHookDeadlock monitors the RBAC bootstrap hook status and detects deadlock conditions.
// A deadlock is detected when:
// 1. The RBAC hook is not completing (stuck in "not finished" state)
// 2. etcd is healthy and responsive
// This indicates the circular dependency where the hook waits for API server
// while API server waits for the hook.
//
// Closed upstream Kubernetes issues:
// https://github.com/kubernetes/kubernetes/issues/86715
Member:
What about the last comment in that issue? kubernetes/kubernetes#86715 (comment)
If this problem comes from a not-fast-enough system, then we're just putting on a band-aid that might blow up in a different, unexpected place.

Contributor Author (@copejon, Apr 28, 2026):
Maybe, but it's impossible to predict what that will be. Regardless, the bug is expressed when the system becomes resource-starved, which can happen no matter how much hardware you have.

// https://github.com/kubernetes/kubernetes/issues/97119
Member:
This ticket suggests that if we restart MicroShift, we just start from the beginning. How robust is this solution? If the API server has trouble reconciling RBACs, we'll just restart endlessly. Or is it a best-effort fix?

Contributor Author:
Right. I'm rethinking the approach on this, but the solution pattern is still the same in that we should restart the KAS internally.

Contributor Author:
Rebooting the whole system is a crap-shoot because the resource starvation is given another opportunity to occur. This way, MicroShift service start-up pauses until this passes or the timeout is reached. That means the affected service has more headroom to boot up, hopefully reducing the race window significantly.

func (s *KubeAPIServer) detectRBACHookDeadlock(ctx context.Context, restClient rest.Interface, deadlockDetected chan<- struct{}) {
// Wait a few seconds before starting detection to allow normal startup
select {
case <-ctx.Done():
return
case <-time.After(rbacHookPollDelayStart):
}

checkCount := 0
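	// With the defaults above: (15s - 5s) / 2s = 5 consecutive confirmations are required.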
maxChecks := int((rbacHookDeadlockTimeout - rbacHookPollDelayStart) / rbacHookCheckInterval) // Account for initial delay
// Track wall-clock deadline to prevent flapping from extending detection indefinitely
startTime := time.Now()

for {
// Check absolute deadline first - this cannot be reset by etcd state changes
if time.Since(startTime) >= rbacHookMaxWaitDuration {
klog.Errorf("RBAC bootstrap hook exceeded maximum wait duration of %v", rbacHookMaxWaitDuration)
// Only trigger deadlock recovery if we've confirmed the predicate enough times
if checkCount >= maxChecks {
break // Fall through to close(deadlockDetected)
}
// Timeout but not confirmed deadlock - exit without triggering recovery
return
}

// Check if we've confirmed deadlock enough times
if checkCount >= maxChecks {
break // Fall through to close(deadlockDetected)
}

select {
case <-ctx.Done():
return
case <-time.After(rbacHookCheckInterval):
}

// Check RBAC hook status
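// kube-apiserver exposes each post-start hook as its own readiness check at
// /readyz/poststarthook/<hook-name>; this path returns 200 only once the
// rbac/bootstrap-roles hook has completed.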
probeCtx, cancel := context.WithTimeout(ctx, time.Second)
var status int
result := restClient.Get().
AbsPath("/readyz/poststarthook/rbac/bootstrap-roles").
Do(probeCtx).
StatusCode(&status)
err := result.Error()
cancel()

// If hook is ready, no deadlock
if err == nil && status == 200 {
klog.V(4).Info("RBAC bootstrap hook completed successfully")
return
}

// If no HTTP status was received, skip this iteration (transport/timeout).
if err != nil && status == 0 {
klog.V(4).Infof("RBAC probe error (not counting toward deadlock): %v", err)
continue
}

// Hook not ready (status != 200) - check if etcd is healthy
etcdHealthy, etcdErr := isEtcdHealthy(ctx)
if etcdErr != nil {
klog.V(4).Infof("Could not check etcd health (not counting toward deadlock): %v", etcdErr)
continue
}

if etcdHealthy {
// Only increment when BOTH conditions are met:
// RBAC probe returned not-ready AND etcd is healthy
checkCount++
klog.Warningf("RBAC bootstrap hook not ready (check %d/%d, elapsed %v), but etcd is healthy - potential deadlock",
checkCount, maxChecks, time.Since(startTime).Round(time.Second))
} else {
// etcd not healthy - not a deadlock, just waiting for etcd
klog.V(4).Infof("RBAC hook waiting, etcd not yet healthy (check %d/%d)", checkCount, maxChecks)
// Reset counter since this isn't a deadlock condition
// Note: wall-clock deadline (startTime) is NOT reset - flapping cannot extend indefinitely
checkCount = 0
}
}

// Only reached when checkCount >= maxChecks (deadlock confirmed)
klog.Errorf("RBAC bootstrap hook deadlock confirmed after %v: etcd healthy but hook not completing",
time.Since(startTime).Round(time.Second))
close(deadlockDetected)
}

// isEtcdHealthy checks if etcd is responsive by attempting to connect and get status.
func isEtcdHealthy(ctx context.Context) (bool, error) {
certsDir := cryptomaterial.CertsDirectory(config.DataDir)
etcdAPIServerClientCertDir := cryptomaterial.EtcdAPIServerClientCertDir(certsDir)

tlsInfo := transport.TLSInfo{
CertFile: cryptomaterial.ClientCertPath(etcdAPIServerClientCertDir),
KeyFile: cryptomaterial.ClientKeyPath(etcdAPIServerClientCertDir),
TrustedCAFile: cryptomaterial.CACertPath(cryptomaterial.EtcdSignerDir(certsDir)),
}
tlsConfig, err := tlsInfo.ClientConfig()
if err != nil {
return false, fmt.Errorf("failed to create TLS config: %w", err)
}

// Use a short timeout for health check
checkCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
defer cancel()

client, err := clientv3.New(clientv3.Config{
Endpoints: []string{"https://localhost:2379"},
DialTimeout: 1 * time.Second,
TLS: tlsConfig,
Context: checkCtx,
})
if err != nil {
return false, fmt.Errorf("failed to create etcd client: %w", err)
}
defer func() { _ = client.Close() }()

_, err = client.Status(checkCtx, "https://localhost:2379")
if err != nil {
return false, nil // etcd not healthy, but not an error condition
}

return true, nil
}

// restartMicroshiftEtcdScope stops the microshift-etcd.scope unit so that etcd
// comes back with a clean start when MicroShift restarts, which can help break
// the circular dependency.
Member:
Restarting MicroShift from within doesn't cause that?

Contributor Author:
It would, but it would take longer.


func restartMicroshiftEtcdScope(ctx context.Context) error {
klog.Info("Stopping microshift-etcd.scope for recovery")

// Set a timeout so that a stalled systemd or DBus call cannot hang the fail-fast
// recovery path and leave Run blocked indefinitely.
cmdCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()

stopCmd := exec.CommandContext(cmdCtx, "systemctl", "stop", "microshift-etcd.scope")
if out, err := stopCmd.CombinedOutput(); err != nil {
return fmt.Errorf("failed to stop microshift-etcd.scope: %w, output: %s", err, string(out))
}

// Wait briefly for cleanup
time.Sleep(1 * time.Second)

klog.Info("microshift-etcd.scope stopped - MicroShift will restart")
return nil
}

func discoverEtcdServers(ctx context.Context, kubeconfigPath string) ([]string, error) {