diff --git a/framework/.changeset/v0.15.16.md b/framework/.changeset/v0.15.16.md new file mode 100644 index 000000000..e23a9f230 --- /dev/null +++ b/framework/.changeset/v0.15.16.md @@ -0,0 +1,2 @@ +- Dump two memory profiles: inuse and alloc +- Dump LOOPs profiles via admin command diff --git a/framework/components/clnode/default.go b/framework/components/clnode/default.go index ffb8a4f6f..66aed9b0c 100644 --- a/framework/components/clnode/default.go +++ b/framework/components/clnode/default.go @@ -19,13 +19,8 @@ const defaultConfigTmpl = ` [Log] Level = 'debug' -[Pyroscope] -ServerAddress = 'http://pyroscope:4040' -Environment = 'local' -LinkTracesToProfiles = true - [WebServer] -HTTPWriteTimeout = '30s' +HTTPWriteTimeout = '360s' SecureCookies = false HTTPPort = {{.HTTPPort}} diff --git a/framework/docker.go b/framework/docker.go index 784b50d63..37b63e122 100644 --- a/framework/docker.go +++ b/framework/docker.go @@ -186,6 +186,47 @@ func (dc *DockerClient) CopyFile(containerName, sourceFile, targetPath string) e return dc.copyToContainer(containerID, sourceFile, targetPath) } +// CopyFromContainer copies files from a container path and returns a tar archive stream. +func (dc *DockerClient) CopyFromContainer(containerName, sourcePath string) (io.ReadCloser, container.PathStat, error) { + return dc.CopyFromContainerWithContext(context.Background(), containerName, sourcePath) +} + +// CopyFromContainerWithContext copies files from a container path and returns a tar archive stream. +func (dc *DockerClient) CopyFromContainerWithContext(ctx context.Context, containerName, sourcePath string) (io.ReadCloser, container.PathStat, error) { + containerID, err := dc.findContainerIDByName(ctx, containerName) + if err != nil { + return nil, container.PathStat{}, fmt.Errorf("failed to find container ID by name: %s", containerName) + } + reader, stat, err := dc.cli.CopyFromContainer(ctx, containerID, sourcePath) + if err != nil { + return nil, container.PathStat{}, fmt.Errorf("could not copy from container %s path %s: %w", containerName, sourcePath, err) + } + return reader, stat, nil +} + +// CopyFromContainerToTarWithContext writes the Docker copy tar stream to targetTarPath. +func (dc *DockerClient) CopyFromContainerToTarWithContext(ctx context.Context, containerName, sourcePath, targetTarPath string) error { + reader, _, err := dc.CopyFromContainerWithContext(ctx, containerName, sourcePath) + if err != nil { + return err + } + defer reader.Close() + + if err := os.MkdirAll(filepath.Dir(targetTarPath), 0o755); err != nil { + return fmt.Errorf("failed to create destination directory for %s: %w", targetTarPath, err) + } + file, err := os.Create(targetTarPath) + if err != nil { + return fmt.Errorf("failed to create destination archive %s: %w", targetTarPath, err) + } + defer file.Close() + + if _, err := io.Copy(file, reader); err != nil { + return fmt.Errorf("failed to write archive %s: %w", targetTarPath, err) + } + return nil +} + // findContainerIDByName finds a container ID by its name func (dc *DockerClient) findContainerIDByName(ctx context.Context, containerName string) (string, error) { containers, err := dc.cli.ContainerList(ctx, container.ListOptions{ diff --git a/framework/leak/detector_cl_node.go b/framework/leak/detector_cl_node.go index 423ca3ef3..5d8abef9c 100644 --- a/framework/leak/detector_cl_node.go +++ b/framework/leak/detector_cl_node.go @@ -1,6 +1,7 @@ package leak import ( + "context" "errors" "fmt" "strconv" @@ -43,7 +44,9 @@ type CLNodesLeakDetector struct { ContainerAliveQuery string c *ResourceLeakChecker - nodesetName string + nodesetName string + dumpPyroscopeProfiles bool + dumpAdminProfiles bool } // WithCPUQuery allows to override CPU leak query (Prometheus) @@ -70,6 +73,22 @@ func WithNodesetName(name string) func(*CLNodesLeakDetector) { } } +// WithDumpPyroscopeProfiles allows to dump Pyroscope profiles for each node at the end of the test. +// Dumped profiles are aggragate (cumulative) profiles from the whole test duration. +func WithDumpPyroscopeProfiles(dump bool) func(*CLNodesLeakDetector) { + return func(cd *CLNodesLeakDetector) { + cd.dumpPyroscopeProfiles = dump + } +} + +// WithDumpAdminProfiles allows to dump admin profiles for each node at the end of the test. +// Uses CL node's debug endpoint to fetch pprof snapshots. +func WithDumpAdminProfiles(dump bool) func(*CLNodesLeakDetector) { + return func(cd *CLNodesLeakDetector) { + cd.dumpAdminProfiles = dump + } +} + // sanitizeNodesetName escapes characters that would corrupt fmt.Sprintf format strings // or invalidate PromQL double-quoted label literals. func sanitizeNodesetName(name string) string { @@ -108,6 +127,15 @@ func NewCLNodesLeakDetector(c *ResourceLeakChecker, opts ...func(*CLNodesLeakDet cd.MemoryQueryAbsolute = replaceNodeset(cd.MemoryQueryAbsolute) } + if cd.dumpPyroscopeProfiles == true && cd.dumpAdminProfiles == true { + return nil, fmt.Errorf("both Pyroscope and admin profile dumping enabled, please choose only one. Dumping admin profiles will fail if Pyroscope is enabled.") + } + + if cd.dumpAdminProfiles == false && cd.dumpPyroscopeProfiles == false { + // default to dumping admin profiles since that's what engineers prefer + cd.dumpAdminProfiles = true + } + return cd, nil } @@ -243,15 +271,42 @@ func (cd *CLNodesLeakDetector) Check(t *CLNodesCheck) error { Str("TestDuration", t.End.Sub(t.Start).String()). Float64("TestDurationSec", t.End.Sub(t.Start).Seconds()). Msg("Leaks info") - framework.L.Info().Msg("Downloading pprof profile..") - dumper := NewProfileDumper(framework.LocalPyroscopeBaseURL) - profilePath, err := dumper.MemoryProfile(&ProfileDumperConfig{ - ServiceName: "chainlink-node", - }) - if err != nil { - errs = append(errs, fmt.Errorf("failed to download Pyroscopt profile: %w", err)) - return errors.Join(errs...) + + if cd.dumpPyroscopeProfiles { + profilesToDump := []string{DefaultProfileType, "memory:inuse_space:bytes:space:bytes"} + framework.L.Info().Msgf("Downloading %d pprof profiles..", len(profilesToDump)) + dumper := NewProfileDumper(framework.LocalPyroscopeBaseURL) + + for _, profileType := range profilesToDump { + profileSplit := strings.Split(profileType, ":") + outputPath := DefaultOutputPath + if len(profileSplit) > 1 { + // e.g. for "memory:inuse_space:bytes:space:bytes" we want to have output file "memory-inuse_space.pprof" + outputPath = fmt.Sprintf("%s-%s.pprof", profileSplit[0], profileSplit[1]) + } + profilePath, err := dumper.MemoryProfile(&ProfileDumperConfig{ + ServiceName: "chainlink-node", + ProfileType: profileType, + OutputPath: outputPath, + }) + if err != nil { + errs = append(errs, fmt.Errorf("failed to download Pyroscope profile %s: %w", profileType, err)) + return errors.Join(errs...) + } + framework.L.Info().Str("Path", profilePath).Str("ProfileType", profileType).Msg("Saved pprof profile") + } + } + + if cd.dumpAdminProfiles { + framework.L.Info().Msg("Dumping admin profiles..") + ctx, cancel := context.WithTimeout(context.Background(), DefaultNodeProfileDumpTimeout) + defer cancel() + if err := DumpNodeProfiles(ctx, cd.nodesetName+"-node", DefaultAdminProfilesDir); err != nil { + framework.L.Error().Err(err).Msg("Failed to dump node profiles") + errs = append(errs, fmt.Errorf("failed to dump node profiles: %w", err)) + } + framework.L.Info().Str("Path", DefaultAdminProfilesDir).Msg("Admin profiles dumped successfully") } - framework.L.Info().Str("Path", profilePath).Msg("Saved pprof profile") + return errors.Join(errs...) } diff --git a/framework/leak/node_dumper.go b/framework/leak/node_dumper.go new file mode 100644 index 000000000..285cc3307 --- /dev/null +++ b/framework/leak/node_dumper.go @@ -0,0 +1,234 @@ +package leak + +import ( + "context" + "errors" + "fmt" + "io" + "os" + "path" + "path/filepath" + "regexp" + "strings" + "sync" + "time" + + "github.com/docker/docker/api/types/container" + "github.com/docker/docker/client" + f "github.com/smartcontractkit/chainlink-testing-framework/framework" + "github.com/smartcontractkit/chainlink-testing-framework/framework/components/clnode" + "golang.org/x/sync/errgroup" +) + +var containerNameSanitizer = regexp.MustCompile(`[^a-zA-Z0-9._-]`) + +const ( + DefaultAdminProfilesDir = "admin-profiles" + DefaultNodeProfileDumpTimeout = 5 * time.Minute + // maxConcurrentNodeProfileDumps limits parallel Docker exec/copy work so the daemon is not flooded. + maxConcurrentNodeProfileDumps = 4 +) + +// DumpNodeProfiles runs chainlink profile collection in each running container +// with a name containing namePattern and saves ./profiles as dst/profile-.tar. +func DumpNodeProfiles(ctx context.Context, namePattern, dst string) error { + f.L.Info(). + Str("NamePattern", namePattern). + Str("DestinationDir", dst). + Msg("Dumping node profiles by container name pattern") + + if strings.TrimSpace(namePattern) == "" { + return fmt.Errorf("container name pattern must not be empty") + } + if strings.TrimSpace(dst) == "" { + return fmt.Errorf("destination path must not be empty") + } + + if err := os.MkdirAll(dst, 0o755); err != nil { + return fmt.Errorf("failed to create destination directory %q: %w", dst, err) + } + + cli, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation()) + if err != nil { + return fmt.Errorf("failed to create Docker client: %w", err) + } + defer cli.Close() + dc, err := f.NewDockerClient() + if err != nil { + return fmt.Errorf("failed to create framework docker client: %w", err) + } + + containers, err := runningContainers(ctx, cli) + if err != nil { + return err + } + + targets := make([]runningContainer, 0) + for _, c := range containers { + if strings.Contains(c.name, namePattern) { + targets = append(targets, c) + } + } + + var ( + mu sync.Mutex + errs []error + ) + g := new(errgroup.Group) + g.SetLimit(maxConcurrentNodeProfileDumps) + + for _, c := range targets { + c := c + g.Go(func() error { + safeName := containerNameSanitizer.ReplaceAllString(c.name, "_") + targetArchivePath := filepath.Join(dst, fmt.Sprintf("profile-%s.tar", safeName)) + + if err := loginCLINodeAdmin(ctx, cli, c); err != nil { + mu.Lock() + errs = append(errs, err) + mu.Unlock() + return nil + } + + f.L.Info().Str("ContainerName", c.name).Msg("Collecting node profile") + + out, execErr := dc.ExecContainerWithContext( + ctx, + c.name, + []string{"chainlink", "admin", "profile", "-seconds", "1", "-output_dir", "./profiles"}, + ) + if execErr != nil { + mu.Lock() + errs = append(errs, fmt.Errorf("failed to execute profile command in container %s: %w, output: %s", c.name, execErr, strings.TrimSpace(out))) + mu.Unlock() + return nil + } + + profilesPath := path.Clean(path.Join(c.workingDir, "profiles")) + if copyErr := dc.CopyFromContainerToTarWithContext(ctx, c.name, profilesPath, targetArchivePath); copyErr != nil { + mu.Lock() + errs = append(errs, fmt.Errorf("failed to copy profiles archive from container %s to %s: %w", c.name, targetArchivePath, copyErr)) + mu.Unlock() + return nil + } + + f.L.Info().Str("ContainerName", c.name).Str("Destination", targetArchivePath).Msg("Profiles copied as archive") + return nil + }) + } + + if err := g.Wait(); err != nil { + return err + } + return errors.Join(errs...) +} + +type runningContainer struct { + id string + name string + workingDir string +} + +func runningContainers(ctx context.Context, cli *client.Client) ([]runningContainer, error) { + containers, err := cli.ContainerList(ctx, container.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to list running Docker containers: %w", err) + } + + res := make([]runningContainer, 0, len(containers)) + for _, c := range containers { + name := firstContainerName(c.Names) + if name == "" { + continue + } + + inspect, inspectErr := cli.ContainerInspect(ctx, c.ID) + if inspectErr != nil { + return nil, fmt.Errorf("failed to inspect container %s: %w", name, inspectErr) + } + workingDir := "/" + if inspect.Config != nil && inspect.Config.WorkingDir != "" { + workingDir = inspect.Config.WorkingDir + } + res = append(res, runningContainer{ + id: c.ID, + name: name, + workingDir: workingDir, + }) + } + return res, nil +} + +func firstContainerName(names []string) string { + for _, n := range names { + if n == "" { + continue + } + return strings.TrimPrefix(n, "/") + } + return "" +} + +func loginCLINodeAdmin(ctx context.Context, cli *client.Client, c runningContainer) error { + credsPath := path.Clean(path.Join(c.workingDir, "creds.txt")) + createCredsCmd := []string{ + "sh", + "-lc", + fmt.Sprintf( + "printf '%%s\\n%%s\\n' %s %s > %s", + shellQuote(clnode.DefaultAPIUser), + shellQuote(clnode.DefaultAPIPassword), + shellQuote(credsPath), + ), + } + createOut, createExit, createErr := execContainerWithExitCode(ctx, cli, c.id, createCredsCmd) + if createErr != nil { + return fmt.Errorf("failed to create creds.txt in container %s: %w, output: %s", c.name, createErr, strings.TrimSpace(createOut)) + } + if createExit != 0 { + return fmt.Errorf("failed to create creds.txt in container %s: exit code %d, output: %s", c.name, createExit, strings.TrimSpace(createOut)) + } + + loginCmd := []string{"chainlink", "admin", "login", "-f", credsPath, "--bypass-version-check"} + loginOut, loginExit, loginErr := execContainerWithExitCode(ctx, cli, c.id, loginCmd) + if loginErr != nil { + return fmt.Errorf("failed to login admin via CLI for container %s: %w, output: %s", c.name, loginErr, strings.TrimSpace(loginOut)) + } + if loginExit != 0 { + return fmt.Errorf("failed to login admin via CLI for container %s: exit code %d, output: %s", c.name, loginExit, strings.TrimSpace(loginOut)) + } + return nil +} + +func execContainerWithExitCode(ctx context.Context, cli *client.Client, containerID string, cmd []string) (string, int, error) { + execResp, err := cli.ContainerExecCreate(ctx, containerID, container.ExecOptions{ + Cmd: cmd, + AttachStdout: true, + AttachStderr: true, + }) + if err != nil { + return "", -1, fmt.Errorf("failed to create exec in container %s: %w", containerID, err) + } + + attachResp, err := cli.ContainerExecAttach(ctx, execResp.ID, container.ExecStartOptions{}) + if err != nil { + return "", -1, fmt.Errorf("failed to attach to exec in container %s: %w", containerID, err) + } + defer attachResp.Close() + + outBytes, err := io.ReadAll(attachResp.Reader) + if err != nil { + return "", -1, fmt.Errorf("failed to read exec output in container %s: %w", containerID, err) + } + output := string(outBytes) + + execInspect, err := cli.ContainerExecInspect(ctx, execResp.ID) + if err != nil { + return output, -1, fmt.Errorf("failed to inspect exec in container %s: %w", containerID, err) + } + return output, execInspect.ExitCode, nil +} + +func shellQuote(s string) string { + return "'" + strings.ReplaceAll(s, "'", `'"'"'`) + "'" +}