Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .changeset/named-compaction-truncation.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
"@moonshot-ai/agent-core": patch
"@moonshot-ai/kosong": patch
"@moonshot-ai/kimi-code": patch
---

Report truncated compaction summaries clearly and apply valid completion token budgets across supported providers.
31 changes: 26 additions & 5 deletions packages/agent-core/src/agent/compaction/full.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,18 @@ import {
estimateTokens,
estimateTokensForMessages,
} from '../../utils/tokens';
import {
applyCompletionBudget,
resolveCompletionBudget,
} from '../../utils/completion-budget';
import compactionInstructionTemplate from './compaction-instruction.md';
import { renderMessagesToText } from './render-messages';
import type { CompactionBeginData, CompactionResult } from './types';
import { DEFAULT_COMPACTION_CONFIG, DefaultCompactionStrategy, type CompactionStrategy } from './strategy';
import {
DEFAULT_COMPACTION_CONFIG,
DefaultCompactionStrategy,
type CompactionStrategy,
} from './strategy';

type CompactionTelemetryTrigger = CompactionBeginData['source'] | 'manual-with-prompt' | 'unknown';

Expand All @@ -38,6 +46,13 @@ export interface CompactedHistory {

export const MAX_COMPACTION_RETRY_ATTEMPTS = 5;

/**
 * Error thrown when a compaction response ends with the `truncated` finish
 * reason, meaning the returned text is not a complete summary.
 */
class CompactionTruncatedError extends Error {
  constructor() {
    const message = 'Compaction response was truncated before producing a complete summary.';
    super(message);
    // Give the error its own name so it is not rendered as a generic "Error".
    this.name = 'CompactionTruncatedError';
  }
}

export class FullCompaction {
protected compactionCountInTurn = 0;
protected compacting: {
Expand Down Expand Up @@ -225,6 +240,13 @@ export class FullCompaction {
await this.triggerPreCompactHook(data, tokensBefore, signal);

const model = this.agent.config.model;
const provider = applyCompletionBudget({
provider: this.agent.config.provider,
budget: resolveCompletionBudget({
reservedContextSize: this.agent.kimiConfig?.loopControl?.reservedContextSize,
}),
capability: this.agent.config.modelCapabilities,
});

const delays = retryBackoffDelays(MAX_COMPACTION_RETRY_ATTEMPTS);
let usage: TokenUsage | null;
Expand All @@ -244,24 +266,23 @@ export class FullCompaction {
toolCalls: [],
} satisfies Message,
];
class TruncatedError extends Error {}
try {
const response = await this.agent.generate(
this.agent.config.provider,
provider,
this.agent.config.systemPrompt,
[...this.agent.tools.loopTools],
messages,
undefined,
{ signal },
);
if (response.finishReason === 'truncated') {
throw new TruncatedError();
throw new CompactionTruncatedError();
}
usage = response.usage;
summary = extractCompactionSummary(response);
break;
} catch (error) {
if (error instanceof APIContextOverflowError || error instanceof TruncatedError) {
if (error instanceof APIContextOverflowError || error instanceof CompactionTruncatedError) {
compactedCount = this.strategy.reduceCompactOnOverflow(messagesToCompact);
}
else if (!isRetryableGenerateError(error)) {
Expand Down
117 changes: 116 additions & 1 deletion packages/agent-core/test/agent/compaction/full.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -731,6 +731,42 @@ describe('FullCompaction', () => {
await ctx.expectResumeMatches();
});

// Every generate call reports a truncated finish reason, so compaction can
// never produce a summary; the turn must end as failed with an error message
// that names CompactionTruncatedError.
it('names truncated compaction responses when retries are exhausted', async () => {
  vi.useFakeTimers();
  let generateCalls = 0;
  const alwaysTruncated: GenerateFn = async () => {
    generateCalls += 1;
    const partial = textResult('Partial summary.');
    return { ...partial, finishReason: 'truncated', rawFinishReason: 'length' };
  };
  const ctx = testAgent({ generate: alwaysTruncated, compactionStrategy: alwaysCompactOnce });
  ctx.configure();

  await ctx.rpc.prompt({ input: [{ type: 'text', text: 'Trigger truncated auto compaction' }] });
  // NOTE(review): 60s of fake time is presumably enough to cover all retry
  // back-off delays — confirm against retryBackoffDelays.
  await vi.advanceTimersByTimeAsync(60_000);
  const events = await ctx.untilTurnEnd();

  const expectedError = expect.objectContaining({
    code: 'compaction.failed',
    message:
      'CompactionTruncatedError: Compaction response was truncated before producing a complete summary.',
  });
  const failedTurn = expect.objectContaining({
    event: 'turn.ended',
    args: { turnId: 0, reason: 'failed', error: expectedError },
  });

  // One generate call per attempt, five attempts in total.
  expect(generateCalls).toBe(5);
  expect(events).toContainEqual(failedTurn);
  await ctx.expectResumeMatches();
});

it('reports compaction retry_count when retryable generation failures are exhausted', async () => {
vi.useFakeTimers();
const records: TelemetryRecord[] = [];
Expand Down Expand Up @@ -1382,12 +1418,14 @@ describe('FullCompaction', () => {

it('compacts provider overflow when model context size is unknown', async () => {
let callCount = 0;
const generate: GenerateFn = async (_provider, _system, _tools, _history, callbacks) => {
const compactionMaxCompletionTokens: unknown[] = [];
const generate: GenerateFn = async (provider, _system, _tools, _history, callbacks) => {
callCount += 1;
if (callCount === 1) {
throw new APIContextOverflowError(400, 'Context length exceeded', 'req-unknown-context');
}
if (callCount === 2) {
compactionMaxCompletionTokens.push(providerMaxCompletionTokens(provider));
return textResult('Unknown window compacted summary.');
}
if (callCount === 3) {
Expand Down Expand Up @@ -1419,6 +1457,7 @@ describe('FullCompaction', () => {
const events = await ctx.untilTurnEnd();

expect(callCount).toBe(3);
expect(compactionMaxCompletionTokens).toEqual([32000]);
expect(events).toContainEqual(
expect.objectContaining({
event: 'compaction.started',
Expand All @@ -1442,6 +1481,74 @@ describe('FullCompaction', () => {
);
});

// With KIMI_MODEL_MAX_COMPLETION_TOKENS set to 8192, the provider handed to the
// second generate call (the one that returns the compacted summary) must carry
// max_completion_tokens = 8192.
it('honors completion budget env hard caps during compaction', async () => {
  vi.stubEnv('KIMI_MODEL_MAX_COMPLETION_TOKENS', '8192');
  let generateCalls = 0;
  const observedCaps: unknown[] = [];
  const generate: GenerateFn = async (provider, _system, _tools, _history, callbacks) => {
    generateCalls += 1;
    switch (generateCalls) {
      case 1:
        // First call: simulate the provider rejecting the request for context overflow.
        throw new APIContextOverflowError(400, 'Context length exceeded', 'req-hard-cap');
      case 2:
        // Second call: record the completion cap applied to the provider.
        observedCaps.push(providerMaxCompletionTokens(provider));
        return textResult('Hard cap compacted summary.');
      default: {
        // Later calls: stream a text part and return the same text.
        const recoveredText = 'Recovered with hard cap.';
        await callbacks?.onMessagePart?.({ type: 'text', text: recoveredText });
        return textResult(recoveredText);
      }
    }
  };
  const ctx = testAgent({ generate });
  ctx.configure({
    provider: CATALOGUED_PROVIDER,
    modelCapabilities: CATALOGUED_MODEL_CAPABILITIES,
  });
  ctx.appendExchange(1, 'old user one', 'old assistant one', 20);
  ctx.newEvents();

  await ctx.rpc.prompt({ input: [{ type: 'text', text: 'Retry with hard cap' }] });
  await ctx.untilTurnEnd();

  expect(generateCalls).toBe(3);
  expect(observedCaps).toEqual([8192]);
});

// With KIMI_MODEL_MAX_COMPLETION_TOKENS set to '0', the provider handed to the
// second generate call (the one that returns the compacted summary) must carry
// no max_completion_tokens at all.
it('honors completion budget env opt-out during compaction', async () => {
  vi.stubEnv('KIMI_MODEL_MAX_COMPLETION_TOKENS', '0');
  let generateCalls = 0;
  const observedCaps: unknown[] = [];
  const generate: GenerateFn = async (provider, _system, _tools, _history, callbacks) => {
    generateCalls += 1;
    switch (generateCalls) {
      case 1:
        // First call: simulate the provider rejecting the request for context overflow.
        throw new APIContextOverflowError(400, 'Context length exceeded', 'req-opt-out');
      case 2:
        // Second call: record the completion cap applied to the provider.
        observedCaps.push(providerMaxCompletionTokens(provider));
        return textResult('Opt-out compacted summary.');
      default: {
        // Later calls: stream a text part and return the same text.
        const recoveredText = 'Recovered with opt-out.';
        await callbacks?.onMessagePart?.({ type: 'text', text: recoveredText });
        return textResult(recoveredText);
      }
    }
  };
  const ctx = testAgent({ generate });
  ctx.configure({
    provider: CATALOGUED_PROVIDER,
    modelCapabilities: CATALOGUED_MODEL_CAPABILITIES,
  });
  ctx.appendExchange(1, 'old user one', 'old assistant one', 20);
  ctx.newEvents();

  await ctx.rpc.prompt({ input: [{ type: 'text', text: 'Retry with opt-out' }] });
  await ctx.untilTurnEnd();

  expect(generateCalls).toBe(3);
  expect(observedCaps).toEqual([undefined]);
});

it('ignores filtered assistant placeholders when checking the retained overflow suffix', async () => {
let callCount = 0;
const generate: GenerateFn = async (_provider, _system, _tools, _history, callbacks) => {
Expand Down Expand Up @@ -1625,6 +1732,14 @@ function oauthTestAgentOptions(
};
}

/**
 * Reads the `max_completion_tokens` entry from a provider's `modelParameters`.
 *
 * @param provider - The provider argument received by a `GenerateFn`.
 * @returns The stored value, or `undefined` when `modelParameters` or the
 *   entry is absent.
 */
function providerMaxCompletionTokens(provider: Parameters<GenerateFn>[0]): unknown {
  // The provider type does not expose modelParameters, so view it structurally.
  const { modelParameters } = provider as {
    readonly modelParameters?: Record<string, unknown>;
  };
  return modelParameters?.['max_completion_tokens'];
}

function textResult(text: string): Awaited<ReturnType<GenerateFn>> {
return {
id: 'mock-compaction-oauth-retry',
Expand Down
18 changes: 18 additions & 0 deletions packages/kosong/src/providers/anthropic.ts
Original file line number Diff line number Diff line change
Expand Up @@ -815,6 +815,7 @@ export class AnthropicChatProvider implements ChatProvider {
private _defaultHeaders: Record<string, string> | undefined;
private _clientFactory: ((auth: ProviderRequestAuth) => Anthropic) | undefined;
private _adaptiveThinking: boolean | undefined;
private _explicitMaxTokens: boolean;

constructor(options: AnthropicOptions) {
this._model = options.model;
Expand All @@ -827,6 +828,7 @@ export class AnthropicChatProvider implements ChatProvider {
this._defaultHeaders = options.defaultHeaders;
this._clientFactory = options.clientFactory;
this._client = this._apiKey === undefined ? undefined : this._buildClient(this._apiKey);
this._explicitMaxTokens = options.defaultMaxTokens !== undefined;
this._generationKwargs = {
max_tokens: resolveDefaultMaxTokens(options.model, options.defaultMaxTokens),
betaFeatures: options.betaFeatures ?? [INTERLEAVED_THINKING_BETA],
Expand Down Expand Up @@ -1082,9 +1084,25 @@ export class AnthropicChatProvider implements ChatProvider {
return this._withGenerationKwargs(kwargs);
}

withMaxCompletionTokens(maxCompletionTokens: number): AnthropicChatProvider {
const requestedCap = resolveDefaultMaxTokens(this._model, maxCompletionTokens);
const existingCap = this._generationKwargs.max_tokens;
const clone = this._withGenerationKwargs({
max_tokens:
existingCap === undefined || this._explicitMaxTokens
? existingCap ?? requestedCap
: Math.min(existingCap, requestedCap),
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Keep Anthropic thinking budgets below max_tokens

For fixed-budget Anthropic thinking (for example withThinking('high') on pre-adaptive Claude models) plus a lower completion cap such as KIMI_MODEL_MAX_COMPLETION_TOKENS=8192, this branch lowers max_tokens to the cap but leaves the existing thinking.budget_tokens at 32000. Anthropic's extended-thinking docs require budget_tokens to be less than max_tokens outside the interleaved-tools exception (https://platform.claude.com/docs/en/build-with-claude/extended-thinking), so simple Anthropic requests in that configuration become invalid; please lower/disable the thinking budget or avoid applying an incompatible cap.

Useful? React with 👍 / 👎.

});
Comment on lines +1087 to +1095
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve lower Anthropic max token caps

When an Anthropic model alias sets maxOutputSize, provider construction passes it as defaultMaxTokens, but after this method exists the agent's per-request completion-budget path calls withMaxCompletionTokens(maxContextSize) for normal and compaction requests. Because this assignment recomputes max_tokens from the larger context-window cap, a user-configured lower output limit is raised back to the model ceiling, so aliases intended to limit cost or force shorter responses no longer do so. Please keep the existing lower max_tokens when it is below maxCompletionTokens.

Useful? React with 👍 / 👎.

clone._explicitMaxTokens = this._explicitMaxTokens;
return clone;
}

/**
 * Produces a clone of this provider whose generation kwargs are the current
 * ones overlaid with `kwargs`.
 *
 * If `kwargs` contains a `max_tokens` key, the clone's explicit-max-tokens flag
 * is recomputed: true for a defined value, false when the key is `undefined`.
 * Otherwise the flag is left as `_clone()` produced it.
 */
private _withGenerationKwargs(kwargs: Partial<AnthropicGenerationKwargs>): AnthropicChatProvider {
  const copy = this._clone();
  const merged = { ...copy._generationKwargs, ...kwargs };
  copy._generationKwargs = merged;

  // Key presence matters here: `{ max_tokens: undefined }` must clear the flag.
  const touchesMaxTokens = 'max_tokens' in kwargs;
  if (touchesMaxTokens) {
    copy._explicitMaxTokens = kwargs.max_tokens !== undefined;
  }
  return copy;
}

Expand Down
4 changes: 4 additions & 0 deletions packages/kosong/src/providers/google-genai.ts
Original file line number Diff line number Diff line change
Expand Up @@ -888,6 +888,10 @@ export class GoogleGenAIChatProvider implements ChatProvider {
return clone;
}

withMaxCompletionTokens(maxCompletionTokens: number): GoogleGenAIChatProvider {
return this.withGenerationKwargs({ max_output_tokens: maxCompletionTokens });
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Clamp Gemini budgets to output limits

When a google-genai/vertexai alias has maxContextSize above Gemini's output limit (the env-model default context is 262144), applyCompletionBudget passes that context-sized cap into this hook, and this line serializes the same value as max_output_tokens. Google documents Gemini 2.5 Pro's output limit as 65,536 tokens, so the default 256K-context configuration can make every normal turn or compaction request invalid instead of merely allowing a large completion; please clamp this to the provider/model output ceiling or preserve a lower existing cap.

Useful? React with 👍 / 👎.

}

private _clone(): GoogleGenAIChatProvider {
const clone = Object.assign(
Object.create(Object.getPrototypeOf(this) as object) as GoogleGenAIChatProvider,
Expand Down
4 changes: 4 additions & 0 deletions packages/kosong/src/providers/openai-legacy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,10 @@ export class OpenAILegacyChatProvider implements ChatProvider {
return clone;
}

withMaxCompletionTokens(maxCompletionTokens: number): OpenAILegacyChatProvider {
return this.withGenerationKwargs({ max_tokens: maxCompletionTokens });
Comment on lines +479 to +480
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve explicit Chat Completions token caps

When an OpenAI Chat Completions provider is constructed with maxTokens (or has max_tokens set via withGenerationKwargs), ordinary agent turns now route through applyCompletionBudget, so this method replaces that explicit limit with the generic completion budget derived from the context window or fallback. In direct SDK/provider usage that turns a deliberately low cap into a much larger request budget, which can increase cost or defeat tests/configurations that rely on truncation; keep an existing lower max_tokens instead of overwriting it unconditionally.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Use max_completion_tokens for o-series chat

When this provider is used with an o-series Chat Completions model such as o1/o3 (the local capability catalog explicitly matches those models), ordinary agent turns now always call this hook through applyCompletionBudget, so every request includes max_tokens. OpenAI's Chat Completions docs state that max_tokens is not compatible with o-series models and is superseded by max_completion_tokens (https://platform.openai.com/docs/api-reference/chat/create), so these configurations will start failing before generation; please use the compatible field for o-series models or skip the hook there.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Leave room for Chat Completions prompts

For non-o-series Chat Completions aliases, the default budget path passes the full configured context window into this hook (for env models that is 262144 unless overridden), so even a tiny prompt is sent with max_tokens equal to the whole context. OpenAI-compatible chat endpoints count prompt plus max_tokens against the model context, so these requests can be rejected immediately instead of letting the model use the remaining space; please subtract the serialized prompt size or clamp to a safe output ceiling before setting max_tokens.

Useful? React with 👍 / 👎.

}

private _clone(): OpenAILegacyChatProvider {
const clone = Object.assign(
Object.create(Object.getPrototypeOf(this) as object) as OpenAILegacyChatProvider,
Expand Down
4 changes: 4 additions & 0 deletions packages/kosong/src/providers/openai-responses.ts
Original file line number Diff line number Diff line change
Expand Up @@ -975,6 +975,10 @@ export class OpenAIResponsesChatProvider implements ChatProvider {
return clone;
}

withMaxCompletionTokens(maxCompletionTokens: number): OpenAIResponsesChatProvider {
return this.withGenerationKwargs({ max_output_tokens: maxCompletionTokens });
Comment on lines +978 to +979
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Clamp OpenAI output caps below context size

For OpenAI Responses aliases whose max_context_size is larger than the model's maximum output size, adding this method activates the generic completion-budget path and sends that context window as max_output_tokens on every turn. The repo already treats upstream messages such as max_output_tokens must not exceed 8192 as plain APIStatusErrors rather than context-overflow retries, so a correctly configured large-context OpenAI model can start failing before generation instead of just allowing a large completion. Please clamp to the provider/model output ceiling or preserve an existing lower cap before serializing this value.

Useful? React with 👍 / 👎.

}
Comment on lines +978 to +980
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Preserve explicit Responses output caps

When an OpenAI Responses provider is constructed with maxOutputTokens (or already has max_output_tokens via withGenerationKwargs), normal agent turns now always pass through applyCompletionBudget, so this method overwrites that explicit cap with the generic context-sized budget (often 32000 or the model context window). In those configurations a user/operator limit meant to bound cost or avoid provider max-output errors is silently raised; this hook should keep the existing lower max_output_tokens instead of replacing it unconditionally.

Useful? React with 👍 / 👎.


private _clone(): OpenAIResponsesChatProvider {
const clone = Object.assign(
Object.create(Object.getPrototypeOf(this) as object) as OpenAIResponsesChatProvider,
Expand Down
76 changes: 76 additions & 0 deletions packages/kosong/test/anthropic.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2138,4 +2138,80 @@ describe('AnthropicChatProvider constructor max_tokens', () => {
// A defaultMaxTokens of 999999 for claude-opus-4-7 must resolve to 128000.
it('clamps defaultMaxTokens above the documented ceiling for known models', async () => {
  const resolved = await maxTokensFor('claude-opus-4-7', { defaultMaxTokens: 999999 });
  expect(resolved).toBe(128000);
});

// After max_tokens is cleared via withGenerationKwargs, the requested cap is
// what ends up in the request body, and the call returns a new provider.
it('withMaxCompletionTokens sets max_tokens when no existing cap is present', async () => {
  const base = new AnthropicChatProvider({
    model: 'claude-opus-4-7',
    apiKey: 'test-key',
    stream: false,
  });
  const uncapped = base.withGenerationKwargs({ max_tokens: undefined });
  const capped = uncapped.withMaxCompletionTokens(2048);
  const messages: Message[] = [
    { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] },
  ];
  const requestBody = await captureRequestBody(capped, '', [], messages);

  expect(capped).not.toBe(base);
  expect(requestBody['max_tokens']).toBe(2048);
});

// No defaultMaxTokens is passed to the constructor, so a requested cap of 8192
// must be the max_tokens value sent in the request.
it('withMaxCompletionTokens lowers the inferred model default cap', async () => {
  const base = new AnthropicChatProvider({
    model: 'claude-opus-4-7',
    apiKey: 'test-key',
    stream: false,
  });
  const capped = base.withMaxCompletionTokens(8192);
  const messages: Message[] = [
    { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] },
  ];
  const requestBody = await captureRequestBody(capped, '', [], messages);

  expect(requestBody['max_tokens']).toBe(8192);
});

// A constructor-supplied defaultMaxTokens of 1024 must survive a larger
// requested completion cap of 128000.
it('withMaxCompletionTokens preserves an existing lower max_tokens cap', async () => {
  const base = new AnthropicChatProvider({
    model: 'claude-opus-4-7',
    apiKey: 'test-key',
    stream: false,
    defaultMaxTokens: 1024,
  });
  const capped = base.withMaxCompletionTokens(128000);
  const messages: Message[] = [
    { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] },
  ];
  const requestBody = await captureRequestBody(capped, '', [], messages);

  expect(requestBody['max_tokens']).toBe(1024);
});

// A constructor-supplied defaultMaxTokens of 128000 must not be lowered by a
// smaller requested completion cap of 1024.
it('withMaxCompletionTokens preserves an existing higher max_tokens cap', async () => {
  const base = new AnthropicChatProvider({
    model: 'unknown-model',
    apiKey: 'test-key',
    stream: false,
    defaultMaxTokens: 128000,
  });
  const capped = base.withMaxCompletionTokens(1024);
  const messages: Message[] = [
    { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] },
  ];
  const requestBody = await captureRequestBody(capped, '', [], messages);

  expect(requestBody['max_tokens']).toBe(128000);
});

// A requested cap of 999999 for claude-opus-4-7 must be sent as 128000.
it('withMaxCompletionTokens clamps above the documented ceiling for known models', async () => {
  const base = new AnthropicChatProvider({
    model: 'claude-opus-4-7',
    apiKey: 'test-key',
    stream: false,
  });
  const capped = base.withMaxCompletionTokens(999999);
  const messages: Message[] = [
    { role: 'user', content: [{ type: 'text', text: 'hi' }], toolCalls: [] },
  ];
  const requestBody = await captureRequestBody(capped, '', [], messages);

  expect(requestBody['max_tokens']).toBe(128000);
});
});
Loading
Loading