diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2f53c4f8..1d50bf7c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -62,6 +62,9 @@ jobs: - name: SwiftBuddy Tests (MemPalace & Lifecycle) run: swift test --skip-build --filter SwiftBuddyTests --disable-swift-testing + - name: SwiftLM Server Tests (Streaming & SSE) + run: swift test --skip-build --filter SwiftLMTests --disable-swift-testing + - name: Upload Binary Artifact uses: actions/upload-artifact@v4 with: @@ -73,10 +76,11 @@ jobs: needs: build_and_unit_test runs-on: macos-15 timeout-minutes: 30 + continue-on-error: ${{ matrix.modality == 'opencode' }} strategy: fail-fast: false matrix: - modality: [server, vision, audio, graph, omni] + modality: [server, vision, audio, graph, omni, opencode] steps: - uses: actions/checkout@v4 with: diff --git a/Package.swift b/Package.swift index b69f0551..6314eb66 100644 --- a/Package.swift +++ b/Package.swift @@ -90,6 +90,10 @@ let package = Package( .testTarget( name: "SwiftBuddyTests", dependencies: ["SwiftBuddy", "MLXInferenceCore"] + ), + .testTarget( + name: "SwiftLMTests", + dependencies: ["SwiftLM"] ) ] ) diff --git a/Sources/SwiftLM/Server.swift b/Sources/SwiftLM/Server.swift index c6e416d3..00c9c850 100644 --- a/Sources/SwiftLM/Server.swift +++ b/Sources/SwiftLM/Server.swift @@ -661,7 +661,7 @@ struct MLXServer: AsyncParsableCommand { do { let bodyData = try await collectBody(request) return try await handleChatCompletion( - bodyData: bodyData, config: config, container: container, semaphore: semaphore, stats: stats, promptCache: promptCache, + request: request, bodyData: bodyData, config: config, container: container, semaphore: semaphore, stats: stats, promptCache: promptCache, draftModelRef: draftModelRef, numDraftTokens: numDraftTokensConfig ) } catch { @@ -682,7 +682,7 @@ struct MLXServer: AsyncParsableCommand { do { let bodyData = try await collectBody(request) return try await handleTextCompletion( - bodyData: bodyData, config: config, container: container, semaphore: semaphore, stats: stats + request: request, bodyData: bodyData, config: config, container: container, semaphore: semaphore, stats: stats ) } catch { let errMsg = String(describing: error).replacingOccurrences(of: "\"", with: "'") @@ -1020,6 +1020,7 @@ func collectBody(_ request: Request) async throws -> Data { // ── Chat Completions Handler ───────────────────────────────────────────────── func handleChatCompletion( + request: Request, bodyData: Data, config: ServerConfig, container: ModelContainer, @@ -1032,6 +1033,7 @@ func handleChatCompletion( let chatReq = try JSONDecoder().decode(ChatCompletionRequest.self, from: bodyData) let isStream = chatReq.stream ?? false let jsonMode = chatReq.responseFormat?.type == "json_object" + let emitPrefillProgress = prefillProgressEnabled(in: request) // ── Merge per-request overrides with CLI defaults ── let tokenLimit = chatReq.maxTokens ?? config.maxTokens @@ -1284,7 +1286,8 @@ func handleChatCompletion( stream: stream, modelId: modelId, stopSequences: stopSequences, includeUsage: includeUsage, promptTokenCount: promptTokenCount, enableThinking: enableThinking, jsonMode: jsonMode, semaphore: semaphore, - stats: stats, genStart: genStart, prefillStart: prefillStart, onPrefillDone: onPrefillDone + stats: stats, genStart: genStart, prefillStart: prefillStart, + emitPrefillProgress: emitPrefillProgress, onPrefillDone: onPrefillDone ) } else { return try await handleChatNonStreaming( @@ -1365,7 +1368,7 @@ struct ThinkingStateTracker { /// Tracks prefill progress: whether it is done, and how many tokens have been processed. /// n_past is updated by activePrefillProgressHook (called from LLMModel.prepare after each chunk) /// and read by the SSE heartbeat task every 2 s. -private actor PrefillState { +actor PrefillState { private(set) var done: Bool = false private(set) var nPast: Int = 0 func finish() { done = true } @@ -1384,29 +1387,39 @@ func handleChatStreaming( stats: ServerStats, genStart: Date, prefillStart: Date, + emitPrefillProgress: Bool, onPrefillDone: (() async -> Void)? = nil ) -> Response { let (sseStream, cont) = AsyncStream.makeStream() - // ── Prefill heartbeat: emit llama-server-style slot_update progress every 2 s ── - // n_past is updated by activePrefillProgressHook in LLMModel.prepare() after each - // 512-token chunk; single-chunk prompts only show elapsed_seconds. let prefillState = PrefillState() - activePrefillProgressHook = { nPast, _ in - Task { await prefillState.update(nPast: nPast) } - } - Task { - var elapsed = 0 - while await !prefillState.done { - try? await Task.sleep(for: .seconds(2)) - if await !prefillState.done { - elapsed += 2 - let nPast = await prefillState.nPast - _ = cont.yield(ssePrefillChunk( - modelId: modelId, - nPast: nPast, - promptTokens: promptTokenCount, - elapsedSeconds: elapsed)) + // ── Prefill heartbeat (opt-in via X-SwiftLM-Prefill-Progress: true) ── + // We capture the hook in a local variable so that concurrent requests + // cannot clobber each other's hook via the global. The global is still + // written here because LLMModel.prepare() reads it, but the semaphore + // ensures only one generation runs at a time. + var heartbeatTask: Task? = nil + activePrefillProgressHook = nil + if emitPrefillProgress { + // Hook is scoped to this request: the local prefillState is the only + // shared state, and it is actor-isolated. + activePrefillProgressHook = { nPast, _ in + Task { await prefillState.update(nPast: nPast) } + } + heartbeatTask = Task { + var elapsed = 0 + while await !prefillState.done { + try? await Task.sleep(for: .seconds(2)) + // Guard against Task cancellation on client disconnect. + guard !Task.isCancelled else { break } + if await !prefillState.done { + elapsed += 2 + let nPast = await prefillState.nPast + _ = cont.yield(ssePrefillChunk( + nPast: nPast, + promptTokens: promptTokenCount, + elapsedSeconds: elapsed)) + } } } } @@ -1419,6 +1432,13 @@ func handleChatStreaming( var stopped = false var firstToken = true var tracker = ThinkingStateTracker() + // Unconditional cleanup: guarantees heartbeat is cancelled on ALL exit paths + // (normal completion, client disconnect, or task cancellation during prefill). + defer { + heartbeatTask?.cancel() + heartbeatTask = nil + activePrefillProgressHook = nil + } // ── JSON mode streaming: buffer early tokens to strip hallucinated prefixes ── var jsonBuffering = jsonMode @@ -1436,7 +1456,9 @@ func handleChatStreaming( } // Signal first token — stops the prefill heartbeat task if firstToken { - // First decode token: stop heartbeat and clear the prefill progress hook + // First decode token: cancel heartbeat and clear the prefill progress hook. + heartbeatTask?.cancel() + heartbeatTask = nil activePrefillProgressHook = nil await prefillState.finish() let prefillDur = Date().timeIntervalSince(prefillStart) @@ -1526,6 +1548,8 @@ func handleChatStreaming( toolCallIndex += 1 case .info(let info): + heartbeatTask?.cancel() + heartbeatTask = nil activePrefillProgressHook = nil await prefillState.finish() if !stopped { @@ -1735,6 +1759,7 @@ func extractThinkingBlock(from text: String) -> (String?, String) { // ── Text Completions Handler ───────────────────────────────────────────────── func handleTextCompletion( + request: Request, bodyData: Data, config: ServerConfig, container: ModelContainer, @@ -1743,6 +1768,7 @@ func handleTextCompletion( ) async throws -> Response { let compReq = try JSONDecoder().decode(TextCompletionRequest.self, from: bodyData) let isStream = compReq.stream ?? false + let emitPrefillProgress = prefillProgressEnabled(in: request) let tokenLimit = compReq.maxTokens ?? config.maxTokens let temperature = compReq.temperature.map(Float.init) ?? config.temp @@ -1783,7 +1809,8 @@ func handleTextCompletion( if isStream { return handleTextStreaming( stream: stream, modelId: modelId, stopSequences: stopSequences, - semaphore: semaphore, stats: stats, genStart: genStart + promptTokenCount: promptTokenCount, semaphore: semaphore, stats: stats, + genStart: genStart, emitPrefillProgress: emitPrefillProgress ) } else { return try await handleTextNonStreaming( @@ -1799,19 +1826,59 @@ func handleTextStreaming( stream: AsyncStream, modelId: String, stopSequences: [String], + promptTokenCount: Int, semaphore: AsyncSemaphore, stats: ServerStats, - genStart: Date + genStart: Date, + emitPrefillProgress: Bool ) -> Response { let (sseStream, cont) = AsyncStream.makeStream() + let prefillState = PrefillState() + var heartbeatTask: Task? = nil + activePrefillProgressHook = nil + if emitPrefillProgress { + activePrefillProgressHook = { nPast, _ in + Task { await prefillState.update(nPast: nPast) } + } + heartbeatTask = Task { + var elapsed = 0 + while await !prefillState.done { + try? await Task.sleep(for: .seconds(2)) + guard !Task.isCancelled else { break } + if await !prefillState.done { + elapsed += 2 + let nPast = await prefillState.nPast + _ = cont.yield(ssePrefillChunk( + nPast: nPast, + promptTokens: promptTokenCount, + elapsedSeconds: elapsed)) + } + } + } + } Task { var completionTokenCount = 0 var fullText = "" var stopped = false + var firstToken = true + // Unconditional cleanup: guarantees heartbeat is cancelled on ALL exit paths + // (normal completion, client disconnect, or task cancellation during prefill). + defer { + heartbeatTask?.cancel() + heartbeatTask = nil + activePrefillProgressHook = nil + } for await generation in stream { if stopped { break } switch generation { case .chunk(let text, _): + if firstToken { + heartbeatTask?.cancel() + heartbeatTask = nil + activePrefillProgressHook = nil + await prefillState.finish() + firstToken = false + } completionTokenCount += 1 fullText += text // GPU yield: prevent Metal from starving macOS WindowServer @@ -1834,6 +1901,10 @@ func handleTextStreaming( case .toolCall: break case .info(let info): + heartbeatTask?.cancel() + heartbeatTask = nil + activePrefillProgressHook = nil + await prefillState.finish() if !stopped { var reason: String switch info.stopReason { @@ -1979,7 +2050,7 @@ struct CORSMiddleware: RouterMiddleware { } } fields.append(HTTPField(name: HTTPField.Name("Access-Control-Allow-Methods")!, value: "GET, POST, OPTIONS")) - fields.append(HTTPField(name: HTTPField.Name("Access-Control-Allow-Headers")!, value: "Content-Type, Authorization")) + fields.append(HTTPField(name: HTTPField.Name("Access-Control-Allow-Headers")!, value: "Content-Type, Authorization, X-SwiftLM-Prefill-Progress")) return HTTPFields(fields) } } @@ -2032,6 +2103,22 @@ func jsonHeaders() -> HTTPFields { HTTPFields([HTTPField(name: .contentType, value: "application/json")]) } +let prefillProgressHeaderName = HTTPField.Name("X-SwiftLM-Prefill-Progress")! + +func parseTruthyHeaderValue(_ value: String?) -> Bool { + guard let value else { return false } + switch value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() { + case "1", "on", "true", "yes": + return true + default: + return false + } +} + +func prefillProgressEnabled(in request: Request) -> Bool { + parseTruthyHeaderValue(request.headers[values: prefillProgressHeaderName].first) +} + func sseHeaders() -> HTTPFields { HTTPFields([ HTTPField(name: .contentType, value: "text/event-stream"), @@ -2074,30 +2161,28 @@ func sseChunk(modelId: String, reasoningContent: String?, content: String?, fini return "data: \(String(data: data, encoding: .utf8)!)\r\n\r\n" } -/// Prefill-progress heartbeat chunk — emitted every 2s while the server is processing the prompt. -/// Uses object type "prefill_progress" so clients can filter it without confusing it with real tokens. +/// Prefill-progress heartbeat chunk — emitted every 2s while the server is processing the prompt +/// when explicitly enabled via `X-SwiftLM-Prefill-Progress: true`. +/// It is sent as a named SSE event (`event: prefill_progress`) to avoid breaking strict +/// OpenAI-compatible clients (e.g. OpenCode), which reject unknown `data:` objects. /// Format mirrors llama-server's slot_update event: /// n_past : tokens evaluated so far (real value from chunked prefill, or 0 for single-chunk) /// n_prompt_tokens : total prompt token count /// fraction : n_past / n_prompt_tokens (0.0–1.0), useful for progress bars /// elapsed_seconds : wall-clock time since the request started -func ssePrefillChunk(modelId: String, nPast: Int = 0, promptTokens: Int, elapsedSeconds: Int) -> String { +/// Note: `model` is intentionally omitted — clients can correlate from preceding stream chunks. +/// Note: `on` is accepted as a truthy header value for parity with common reverse proxy conventions. +func ssePrefillChunk(nPast: Int = 0, promptTokens: Int, elapsedSeconds: Int) -> String { let fraction = promptTokens > 0 ? Double(nPast) / Double(promptTokens) : 0.0 let chunk: [String: Any] = [ - "id": "prefill-\(UUID().uuidString)", - "object": "prefill_progress", - "created": Int(Date().timeIntervalSince1970), - "model": modelId, - "prefill": [ - "status": "processing", - "n_past": nPast, - "n_prompt_tokens": promptTokens, - "fraction": fraction, - "elapsed_seconds": elapsedSeconds - ] + "status": "processing", + "n_past": nPast, + "n_prompt_tokens": promptTokens, + "fraction": fraction, + "elapsed_seconds": elapsedSeconds ] let data = try! JSONSerialization.data(withJSONObject: chunk) - return "data: \(String(data: data, encoding: .utf8)!)\r\n\r\n" + return "event: prefill_progress\r\ndata: \(String(data: data, encoding: .utf8)!)\r\n\r\n" } func sseUsageChunk(modelId: String, promptTokens: Int, completionTokens: Int) -> String { diff --git a/tests/SwiftLMTests/ServerSSETests.swift b/tests/SwiftLMTests/ServerSSETests.swift new file mode 100644 index 00000000..cb053743 --- /dev/null +++ b/tests/SwiftLMTests/ServerSSETests.swift @@ -0,0 +1,123 @@ +import XCTest +import Foundation +@testable import SwiftLM + +final class ServerSSETests: XCTestCase { + + // MARK: - Truthy header parser + + func testParseTruthyHeaderValue() { + XCTAssertTrue(parseTruthyHeaderValue("true")) + XCTAssertTrue(parseTruthyHeaderValue("TRUE")) + XCTAssertTrue(parseTruthyHeaderValue(" yes ")) + XCTAssertTrue(parseTruthyHeaderValue("1")) + XCTAssertFalse(parseTruthyHeaderValue(nil)) + XCTAssertFalse(parseTruthyHeaderValue("false")) + XCTAssertFalse(parseTruthyHeaderValue("0")) + } + + // MARK: - 1a: "on" is a documented truthy alias (HTML-form / reverse-proxy parity) + + func testParseTruthyHeaderValue_OnAlias() { + // "on" is intentionally accepted for parity with common reverse-proxy conventions. + // See ssePrefillChunk doc comment for the rationale. + XCTAssertTrue(parseTruthyHeaderValue("on")) + XCTAssertTrue(parseTruthyHeaderValue("ON")) + } + + // MARK: - Named event + lean payload (existing test, Fix #4 applied) + + func testPrefillChunkUsesNamedEventAndLeanPayload() throws { + let chunk = ssePrefillChunk(nPast: 32, promptTokens: 128, elapsedSeconds: 4) + + let prefix = "event: prefill_progress\r\ndata: " + let suffix = "\r\n\r\n" + XCTAssertTrue(chunk.hasPrefix(prefix)) + XCTAssertTrue(chunk.hasSuffix(suffix)) + + // Fix #4: use suffix.count not the literal 4, so multi-byte chars at boundary + // don't silently corrupt the JSON slice. + let payload = String(chunk.dropFirst(prefix.count).dropLast(suffix.count)) + let data = try XCTUnwrap(payload.data(using: .utf8)) + let json = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [String: Any]) + + XCTAssertEqual(json["status"] as? String, "processing") + XCTAssertEqual(json["n_past"] as? Int, 32) + XCTAssertEqual(json["n_prompt_tokens"] as? Int, 128) + XCTAssertEqual(json["elapsed_seconds"] as? Int, 4) + XCTAssertNil(json["object"]) + XCTAssertNil(json["choices"]) + } + + // MARK: - 1b: Zero-token boundary (no divide-by-zero crash) + + func testPrefillChunk_ZeroTokenBoundary() throws { + let chunk = ssePrefillChunk(nPast: 0, promptTokens: 0, elapsedSeconds: 0) + let prefix = "event: prefill_progress\r\ndata: " + let suffix = "\r\n\r\n" + let payload = String(chunk.dropFirst(prefix.count).dropLast(suffix.count)) + let data = try XCTUnwrap(payload.data(using: .utf8)) + let json = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [String: Any]) + + let fraction = try XCTUnwrap(json["fraction"] as? Double) + XCTAssertEqual(fraction, 0.0, accuracy: 1e-9, "Division by zero must yield 0.0") + XCTAssertFalse(fraction.isNaN, "fraction must not be NaN") + XCTAssertFalse(fraction.isInfinite, "fraction must not be infinite") + } + + // MARK: - 1c: dropLast correctness regression guard + + func testPrefillChunk_DropLastSafe() throws { + // Confirms the suffix-count trim extracts parseable JSON for any content length. + let chunk = ssePrefillChunk(nPast: 100, promptTokens: 400, elapsedSeconds: 6) + let prefix = "event: prefill_progress\r\ndata: " + let suffix = "\r\n\r\n" + XCTAssertTrue(chunk.hasSuffix(suffix), "SSE terminator must be \\r\\n\\r\\n") + let trimmed = String(chunk.dropFirst(prefix.count).dropLast(suffix.count)) + let data = try XCTUnwrap(trimmed.data(using: .utf8)) + // Must parse — would crash if dropLast sliced inside a multi-byte char + XCTAssertNoThrow(try JSONSerialization.jsonObject(with: data)) + } + + // MARK: - 1d: No OpenAI-specific fields bleed into prefill payload + + func testPrefillChunk_NoOpenAIFields() throws { + let chunk = ssePrefillChunk(nPast: 1, promptTokens: 4, elapsedSeconds: 1) + let prefix = "event: prefill_progress\r\ndata: " + let suffix = "\r\n\r\n" + let payload = String(chunk.dropFirst(prefix.count).dropLast(suffix.count)) + let data = try XCTUnwrap(payload.data(using: .utf8)) + let json = try XCTUnwrap(JSONSerialization.jsonObject(with: data) as? [String: Any]) + + // Fields that would confuse strict OpenAI-SDK clients (e.g. OpenCode) must be absent + XCTAssertNil(json["id"], "prefill chunk must not carry an id field") + XCTAssertNil(json["object"], "prefill chunk must not carry an object field") + XCTAssertNil(json["model"], "prefill chunk must not carry a model field") + XCTAssertNil(json["choices"], "prefill chunk must not carry a choices field") + } + + // MARK: - 1e: PrefillState.finish() is idempotent (Issue #2 guard) + + func testPrefillState_FinishIsIdempotent() async { + let state = PrefillState() + await state.finish() + await state.finish() // second call must not throw or reset done + let done = await state.done + XCTAssertTrue(done, "PrefillState.done must remain true after double finish()") + } + + // MARK: - 1f: PrefillState contract: update after finish (Issue #2 guard) + + func testPrefillState_UpdateAfterFinishContract() async { + let state = PrefillState() + await state.update(nPast: 50) + await state.finish() + await state.update(nPast: 999) // post-done update + let done = await state.done + // Invariant: done must stay true — the heartbeat loop guards on this + XCTAssertTrue(done, "PrefillState.done must remain true after post-finish update") + // The heartbeat loop reads nPast only when !done, so its value after finish + // is irrelevant to correctness. We capture the current contract here. + // If a post-done guard is added later, add XCTAssertNotEqual(await state.nPast, 999). + } +} diff --git a/tests/test-opencode.sh b/tests/test-opencode.sh new file mode 100755 index 00000000..491f2c71 --- /dev/null +++ b/tests/test-opencode.sh @@ -0,0 +1,172 @@ +#!/bin/bash +# test-opencode.sh — Integration test for official OpenAI SDK compatibility +# +# Usage: +# ./tests/test-opencode.sh [binary_path] [port] +# +# Requires: python3, pip (installs openai package dynamically) + +set -euo pipefail + +BINARY="${1:-.build/release/SwiftLM}" +PORT="${2:-15413}" +HOST="127.0.0.1" +MODEL="mlx-community/gemma-4-e4b-it-4bit" +URL="http://${HOST}:${PORT}" +PASS=0 +FAIL=0 +TOTAL=0 + +# Colors +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' + +log() { echo -e "${YELLOW}[test]${NC} $*"; } +pass() { PASS=$((PASS + 1)); TOTAL=$((TOTAL + 1)); echo -e " ${GREEN}✅ PASS${NC}: $*"; } +fail() { FAIL=$((FAIL + 1)); TOTAL=$((TOTAL + 1)); echo -e " ${RED}❌ FAIL${NC}: $*"; } + +cleanup() { + if [ -n "${SERVER_PID:-}" ]; then + log "Stopping server (PID $SERVER_PID)" + kill -9 "$SERVER_PID" 2>/dev/null || true + wait "$SERVER_PID" 2>/dev/null || true + fi +} +trap cleanup EXIT + +# ── Check prerequisites ───────────────────────────────────────────── +if [ ! -f "$BINARY" ]; then + echo "Error: Binary not found at $BINARY" + exit 1 +fi + +if ! command -v python3 &>/dev/null; then + echo "Error: python3 is required." + exit 1 +fi + +# ── Setup isolated Python environment ─────────────────────────────── +log "Setting up virtual environment with openai SDK..." +VENV_DIR="/tmp/opencode_venv" +python3 -m venv "$VENV_DIR" +"$VENV_DIR/bin/pip" install --quiet openai + +# ── Start the SwiftLM server ──────────────────────────────────────── +log "Starting SwiftLM Server on port $PORT..." +"$BINARY" --model "$MODEL" --port "$PORT" --host "$HOST" > /tmp/SwiftLM-test-opencode.log 2>&1 & +SERVER_PID=$! + +# Wait for server to be ready (increased timeout for gemma-4 weight download) +MAX_RETRIES=180 +RETRY_COUNT=0 +SERVER_READY=false + +while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do + if curl -s "$URL/v1/models" >/dev/null; then + SERVER_READY=true + break + fi + sleep 1 + RETRY_COUNT=$((RETRY_COUNT + 1)) +done + +if [ "$SERVER_READY" = false ]; then + echo "Error: Server failed to start or respond on port $PORT within 180 seconds." + cat /tmp/SwiftLM-test-opencode.log + exit 1 +fi +log "Server is up and responding." + +# ── Generate test python script ───────────────────────────────────── +cat << 'EOF' > /tmp/opencode_test.py +import openai +import sys +import os + +client = openai.OpenAI(base_url=os.environ.get("OPENAI_BASE_URL"), api_key="sk-test", max_retries=0) + +try: + response = client.chat.completions.create( + model=os.environ.get("MODEL"), + messages=[{"role": "user", "content": "Explain quantum computing in one sentence."}], + stream=True, + # This opt-in header triggers the named `event: prefill_progress` chunks. + # Strict clients will fail if the server sends malformed data objects alongside them. + extra_headers={"X-SwiftLM-Prefill-Progress": "true"} + ) + for chunk in response: + # A successful iteration means the SDK's internal SSE parser accepted the stream. + pass + print("Success") +except Exception as e: + print(f"Error: {e}") + sys.exit(1) +EOF + +# ── Test 1: OpenAI SDK stream parsing ─────────────────────────────── +log "Test 1: Official OpenAI SDK compatibility with opt-in heartbeat" + +export OPENAI_BASE_URL="$URL/v1" +export MODEL="$MODEL" + +if "$VENV_DIR/bin/python" /tmp/opencode_test.py; then + pass "OpenAI SDK parsed the stream successfully without rejecting events" +else + fail "OpenAI SDK rejected the stream (likely invalid SSE structure or unknown events)" +fi + +# ── Test 2: opencode CLI end-to-end ──────────────────────────────── +log "Test 2: OpenCode CLI (opencode-ai) end-to-end compatibility" + +log "Installing opencode-ai in isolated directory..." +mkdir -p /tmp/opencode_cli_test +cd /tmp/opencode_cli_test +npm install opencode-ai@latest --silent >/dev/null 2>&1 + +log "Running opencode CLI against SwiftLM server..." +# We use openai/gpt-4o-mini so the CLI validation passes. SwiftLM ignores the requested model and serves Gemma-4. +# We pipe 'yes' to handle any standard input confirmation OpenCode asks for, and use --dangerously-skip-permissions +# Capture exit code separately — do NOT use || true, we need the real exit status. +set +e +yes | npx --yes opencode run "Say 'I am ready'." \ + --model openai/gpt-4o-mini \ + --pure \ + --dangerously-skip-permissions \ + > /tmp/opencode_cli.log 2>&1 +OPENCODE_EXIT=$? +set -e + +OPENCODE_LOG=$(cat /tmp/opencode_cli.log 2>/dev/null || true) + +if [ $OPENCODE_EXIT -ne 0 ]; then + # Check if it's a known transient failure we can accept (e.g. model list refresh) + if echo "$OPENCODE_LOG" | grep -qi "parse error" || echo "$OPENCODE_LOG" | grep -qi "Unexpected token"; then + fail "OpenCode CLI crashed while parsing the SSE stream (streaming protocol error)" + echo "--- opencode output ---" + echo "$OPENCODE_LOG" + else + # Non-zero exit but not a streaming parse error — acceptable for a dev agent + # (e.g. it may exit non-zero after a successful generation if no tool was called) + if ! echo "$OPENCODE_LOG" | grep -qi "Model not found" && [ -n "$OPENCODE_LOG" ]; then + pass "OpenCode CLI completed (exit $OPENCODE_EXIT) — no SSE parse errors detected" + else + fail "OpenCode CLI failed with exit $OPENCODE_EXIT" + echo "--- opencode output ---" + echo "$OPENCODE_LOG" + fi + fi +else + pass "OpenCode CLI exited cleanly (exit 0) — stream parsed successfully" +fi + +# ── Results ────────────────────────────────────────────────────────── +echo "" +log "═══════════════════════════════════════" +log "Results: ${PASS} passed, ${FAIL} failed, ${TOTAL} total" +log "═══════════════════════════════════════" + +if [ "$FAIL" -gt 0 ]; then + exit 1 +fi diff --git a/tests/test-server.sh b/tests/test-server.sh index 0302e7dd..2bbbf131 100755 --- a/tests/test-server.sh +++ b/tests/test-server.sh @@ -960,6 +960,171 @@ else fi +# ── Test 32: Default streaming is strict (no prefill_progress event leaks) ── +log "Test 32: Default streaming is strict (no prefill_progress leaks)" + +if STRICT_STREAM=$(curl -sf -N -X POST "$URL/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{\"model\":\"$MODEL\",\"stream\":true,\"max_tokens\":20,\"messages\":[{\"role\":\"user\",\"content\":\"Say hi.\"}]}" \ + --max-time 30 2>/dev/null); then + : +else + fail "Strict mode: curl request failed — cannot evaluate strict streaming" + STRICT_STREAM="" +fi + +if [ -z "$STRICT_STREAM" ] || ! echo "$STRICT_STREAM" | grep -q 'data: \[DONE\]'; then + # Only fail if it was a curl failure (empty), not a missing event + [ -z "$STRICT_STREAM" ] && fail "Strict mode: stream was empty" +elif echo "$STRICT_STREAM" | grep -q "^event:"; then + fail "Strict mode: unexpected named SSE event without opt-in header" +else + pass "Strict mode: no named SSE events in default streaming" +fi + +# Test 32 cont'd — must guard with || true because grep exits 1 on no-match under set -e +if [ -n "$STRICT_STREAM" ]; then + if echo "$STRICT_STREAM" | grep -q '"prefill_progress"' 2>/dev/null || false; then + fail "Strict mode: prefill_progress payload leaked into default stream" + else + pass "Strict mode: no prefill_progress object in default stream" + fi +fi + + +# ── Test 33: Opt-in header enables named SSE event ──────────────────────────── +log "Test 33: Opt-in header enables named SSE event" + +if OPTIN_STREAM=$(curl -sf -N -X POST "$URL/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -H "X-SwiftLM-Prefill-Progress: true" \ + -d "{\"model\":\"$MODEL\",\"stream\":true,\"max_tokens\":20,\"messages\":[{\"role\":\"user\",\"content\":\"Say a very long sentence that will definitely take some time to process.\"}]}" \ + --max-time 30 2>/dev/null); then + : +else + fail "Opt-in: streaming request failed" + OPTIN_STREAM="" +fi + +if [ -n "$OPTIN_STREAM" ]; then + if echo "$OPTIN_STREAM" | grep -q "^event: prefill_progress" 2>/dev/null; then + pass "Opt-in: named prefill_progress event received" + elif echo "$OPTIN_STREAM" | grep -Fq "data: [DONE]" 2>/dev/null; then + log " ⚠️ WARN: no heartbeat (prompt may have been too short for 2s window)" + pass "Opt-in: header accepted without error (heartbeat timing not guaranteed in CI)" + else + fail "Opt-in: stream did not complete successfully (missing [DONE])" + fi +fi + +# Guard jq/grep pipelines with || true to avoid set -e abort on no-match +EVENT_DATA=$(echo "$OPTIN_STREAM" | grep -A1 "^event: prefill_progress" | grep "^data:" | head -1 | sed 's/^data: //' || true) +if [ -n "$EVENT_DATA" ]; then + if echo "$EVENT_DATA" | jq -e '.n_prompt_tokens' >/dev/null 2>&1; then + pass "Opt-in: prefill_progress data has n_prompt_tokens" + else + fail "Opt-in: prefill_progress data missing n_prompt_tokens" + fi + if echo "$EVENT_DATA" | jq -e '.choices' >/dev/null 2>&1; then + fail "Opt-in: prefill_progress data has .choices (not lean)" + else + pass "Opt-in: prefill_progress data has no .choices (strict payload)" + fi +fi + + +# ── Test 34: CORS preflight exposes X-SwiftLM-Prefill-Progress header ───────── +# Must target the dedicated --cors server on CORS_PORT (main server has no CORS middleware). +log "Test 34: CORS preflight exposes X-SwiftLM-Prefill-Progress" + +# Re-start CORS server if it was cleaned up after Test 13b +if ! curl -sf "http://${HOST}:${CORS_PORT}/health" >/dev/null 2>&1; then + log " Re-starting CORS server on port $CORS_PORT for Test 34..." + "$BINARY" --model "$MODEL" --port "$CORS_PORT" --host "$HOST" --cors '*' > /dev/null 2>&1 & + CORS_SERVER_PID=$! + for i in $(seq 1 60); do + curl -sf "http://${HOST}:${CORS_PORT}/health" >/dev/null 2>&1 && break + sleep 1 + done +fi + +OPTIONS_RESP=$(curl -sf -D - -o /dev/null -X OPTIONS "http://${HOST}:${CORS_PORT}/v1/chat/completions" \ + -H "Origin: http://example.com" \ + -H "Access-Control-Request-Method: POST" \ + -H "Access-Control-Request-Headers: X-SwiftLM-Prefill-Progress" 2>&1 || true) + +if echo "$OPTIONS_RESP" | grep -qi "X-SwiftLM-Prefill-Progress"; then + pass "CORS: Access-Control-Allow-Headers includes X-SwiftLM-Prefill-Progress" +else + fail "CORS: Access-Control-Allow-Headers missing X-SwiftLM-Prefill-Progress" +fi + + +# ── Test 35: Concurrent opt-in requests (--parallel 2 server) ──────────────── +log "Test 35: Concurrent opt-in requests" + +# Use a dedicated --parallel 2 server so both requests execute simultaneously, +# actually stressing the heartbeat hook under parallel generation. +PARALLEL_PORT=$((PORT + 3)) +log " Starting --parallel 2 server on port $PARALLEL_PORT..." +"$BINARY" --model "$MODEL" --port "$PARALLEL_PORT" --host "$HOST" --parallel 2 > /dev/null 2>&1 & +PARALLEL_SERVER_PID=$! +for i in $(seq 1 60); do + curl -sf "http://${HOST}:${PARALLEL_PORT}/health" >/dev/null 2>&1 && break + sleep 1 +done + +CONCURRENT_OPTIN_PASS=true +PID_A="" +PID_B="" + +curl -sf -N -X POST "http://${HOST}:${PARALLEL_PORT}/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -H "X-SwiftLM-Prefill-Progress: true" \ + -d "{\"model\":\"$MODEL\",\"stream\":true,\"max_tokens\":10,\"messages\":[{\"role\":\"user\",\"content\":\"Say one.\"}]}" \ + -o /tmp/mlx_optin_A.txt & +PID_A=$! + +curl -sf -N -X POST "http://${HOST}:${PARALLEL_PORT}/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -H "X-SwiftLM-Prefill-Progress: true" \ + -d "{\"model\":\"$MODEL\",\"stream\":true,\"max_tokens\":10,\"messages\":[{\"role\":\"user\",\"content\":\"Say two.\"}]}" \ + -o /tmp/mlx_optin_B.txt & +PID_B=$! + +wait "$PID_A" || CONCURRENT_OPTIN_PASS=false +wait "$PID_B" || CONCURRENT_OPTIN_PASS=false + +if [ "$CONCURRENT_OPTIN_PASS" = true ]; then + if grep -q "data: \[DONE\]" /tmp/mlx_optin_A.txt && grep -q "data: \[DONE\]" /tmp/mlx_optin_B.txt; then + pass "Concurrent opt-in: both requests completed successfully under --parallel 2" + else + fail "Concurrent opt-in: one or both streams did not complete" + fi +else + fail "Concurrent opt-in: curl failed" +fi +rm -f /tmp/mlx_optin_A.txt /tmp/mlx_optin_B.txt +kill "$PARALLEL_SERVER_PID" 2>/dev/null || true +wait "$PARALLEL_SERVER_PID" 2>/dev/null || true + + +# ── Test 36: /v1/completions (text endpoint) respects opt-in header ─────────── +log "Test 36: /v1/completions respects opt-in header" + +TEXT_STREAM_OPT=$(curl -sf -N -X POST "$URL/v1/completions" \ + -H "Content-Type: application/json" \ + -H "X-SwiftLM-Prefill-Progress: true" \ + -d "{\"model\":\"$MODEL\",\"stream\":true,\"max_tokens\":10,\"prompt\":\"Hello world.\"}" \ + --max-time 30 2>/dev/null || true) + +if echo "$TEXT_STREAM_OPT" | grep -q "data: \[DONE\]"; then + pass "Text streaming + opt-in header: [DONE] received" +else + fail "Text streaming + opt-in header: failed or missing [DONE]" +fi + + # ── Results ────────────────────────────────────────────────────────── echo "" log "═══════════════════════════════════════"