From 7cc8e112a5b9e92c92500ec4bb6c7e9c8536c679 Mon Sep 17 00:00:00 2001 From: Fabrizio Ferrai Date: Fri, 29 May 2026 00:56:20 +0300 Subject: [PATCH 1/5] Only set sqlite WAL on file creation to allow for concurrent runs on Windows --- CHANGELOG.md | 3 +++ src/Spago/Db.js | 19 ++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 65c9f8549..7d0400514 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +Bugfixes: +* Fix flaky `SQLITE_IOERR_TRUNCATE` on Windows when multiple spago processes connect concurrently to the cache DB, by only setting `PRAGMA journal_mode = WAL` on first creation (WAL mode is persistent in the DB file header) + ## [1.0.4] - 2026-03-30 Bugfixes: diff --git a/src/Spago/Db.js b/src/Spago/Db.js index a21242237..fe031c2ee 100644 --- a/src/Spago/Db.js +++ b/src/Spago/Db.js @@ -9,12 +9,29 @@ export const connectImpl = (databasePath, logger) => { const dir = path.dirname(databasePath); fs.mkdirSync(dir, { recursive: true }); + // WAL journal mode is persistent in the DB file header (bytes 18-19), so + // once set it sticks across connections and reopens. We only run the PRAGMA + // when creating a fresh DB; on subsequent connects the file header already + // says WAL and SQLite picks it up automatically. + // + // Why we go out of our way to skip it: we get SQLITE_IOERR_TRUNCATE (errcode 1546) + // on Windows when running multiple spago processes. + // I think this is because `PRAGMA journal_mode = WAL` inits the wal-index (.shm) + // init path, which calls winTruncate on Windows. 
+ // + // See: + // https://sqlite.org/pragma.html (journal_mode persistence) + // https://sqlite.org/fileformat.html (header bytes 18-19 = WAL marker) + const isNewDatabase = !fs.existsSync(databasePath); + const db = new DatabaseSync(databasePath, { enableForeignKeyConstraints: true, timeout: 5000, // Wait up to 5s if database is locked (matches better-sqlite3 default) }); - db.exec("PRAGMA journal_mode = WAL"); + if (isNewDatabase) { + db.exec("PRAGMA journal_mode = WAL"); + } db.prepare(`CREATE TABLE IF NOT EXISTS package_sets ( version TEXT PRIMARY KEY NOT NULL From ab0d5fb10e57bf21f5b4e8de70748f05a647d3de Mon Sep 17 00:00:00 2001 From: Fabrizio Ferrai Date: Sat, 30 May 2026 17:24:50 +0300 Subject: [PATCH 2/5] Fix races --- CHANGELOG.md | 2 +- src/Spago/Db.js | 33 +++++++++++++++++---------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d0400514..c2742938f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] Bugfixes: -* Fix flaky `SQLITE_IOERR_TRUNCATE` on Windows when multiple spago processes connect concurrently to the cache DB, by only setting `PRAGMA journal_mode = WAL` on first creation (WAL mode is persistent in the DB file header) +* Fix flaky `SQLITE_IOERR_TRUNCATE` on Windows when multiple spago processes connect concurrently to the cache DB, by skipping `PRAGMA journal_mode = WAL` when it's already enabled (WAL mode is persistent in the DB file header) and tolerating the race on the initial set ## [1.0.4] - 2026-03-30 diff --git a/src/Spago/Db.js b/src/Spago/Db.js index fe031c2ee..db9d0c88b 100644 --- a/src/Spago/Db.js +++ b/src/Spago/Db.js @@ -9,28 +9,29 @@ export const connectImpl = (databasePath, logger) => { const dir = path.dirname(databasePath); fs.mkdirSync(dir, { recursive: true }); + const db = new DatabaseSync(databasePath, { + enableForeignKeyConstraints: true, + timeout: 5000, 
// Wait up to 5s if database is locked + }); + // WAL journal mode is persistent in the DB file header (bytes 18-19), so - // once set it sticks across connections and reopens. We only run the PRAGMA - // when creating a fresh DB; on subsequent connects the file header already - // says WAL and SQLite picks it up automatically. + // once set it sticks across connections and reopens. We skip the PRAGMA when + // it's already set to avoid hitting winTruncate on the wal-index (.shm), + // which races between concurrent spago processes on Windows and surfaces as + // SQLITE_IOERR_TRUNCATE (errcode 1546). // - // Why we go out of our way to skip it: we get SQLITE_IOERR_TRUNCATE (errcode 1546) - // on Windows when running multiple spago processes. - // I think this is because `PRAGMA journal_mode = WAL` inits the wal-index (.shm) - // init path, which calls winTruncate on Windows. + // When two fresh processes race the initial set, the loser's exec throws, + // but the winner has already written WAL to the header — so we only re-throw + // if WAL didn't actually end up enabled (i.e. the error wasn't the benign + // race we expect). 
// // See: // https://sqlite.org/pragma.html (journal_mode persistence) // https://sqlite.org/fileformat.html (header bytes 18-19 = WAL marker) - const isNewDatabase = !fs.existsSync(databasePath); - - const db = new DatabaseSync(databasePath, { - enableForeignKeyConstraints: true, - timeout: 5000, // Wait up to 5s if database is locked (matches better-sqlite3 default) - }); - - if (isNewDatabase) { - db.exec("PRAGMA journal_mode = WAL"); + const inWal = () => db.prepare("PRAGMA journal_mode").get()?.journal_mode === "wal"; + if (!inWal()) { + try { db.exec("PRAGMA journal_mode = WAL"); } + catch (e) { if (!inWal()) throw e; } } db.prepare(`CREATE TABLE IF NOT EXISTS package_sets From 5f2f538b3e221cfde3c1dd0cc19ac8f91db07992 Mon Sep 17 00:00:00 2001 From: Fabrizio Ferrai Date: Sat, 30 May 2026 18:39:00 +0300 Subject: [PATCH 3/5] Fix transient http failures --- CHANGELOG.md | 1 + src/Spago/Command/Fetch.purs | 8 ++++++-- src/Spago/Registry.purs | 21 ++++++++++++++------- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c2742938f..67a30651d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Bugfixes: * Fix flaky `SQLITE_IOERR_TRUNCATE` on Windows when multiple spago processes connect concurrently to the cache DB, by skipping `PRAGMA journal_mode = WAL` when it's already enabled (WAL mode is persistent in the DB file header) and tolerating the race on the initial set +* Retry transient network failures (connection errors and 5xx responses) when fetching package tarballs and calling the registry API, instead of failing immediately ## [1.0.4] - 2026-03-30 diff --git a/src/Spago/Command/Fetch.purs b/src/Spago/Command/Fetch.purs index c439bea25..976dd9a3e 100644 --- a/src/Spago/Command/Fetch.purs +++ b/src/Spago/Command/Fetch.purs @@ -319,9 +319,13 @@ fetchPackagesToLocalCache packages = do , url = packageUrl } ) - -- If we get a 503, 
we want the backoff to kick in, so we wait here and we'll eventually be retried + -- If the request failed (connection error) or got a 5xx, we want the backoff + -- to kick in. withBackoff' only retries on its own timeout, so we delay here + -- to lose the race against runTimeout and trigger a retry. case res of - Right { status } | status == StatusCode 503 -> Aff.delay (Aff.Milliseconds 30_000.0) + Left _ -> Aff.delay (Aff.Milliseconds 30_000.0) + Right { status } | status >= StatusCode 500 && status < StatusCode 600 -> + Aff.delay (Aff.Milliseconds 30_000.0) _ -> pure unit pure res case response of diff --git a/src/Spago/Registry.purs b/src/Spago/Registry.purs index 2ede68d87..3f85f4bb0 100644 --- a/src/Spago/Registry.purs +++ b/src/Spago/Registry.purs @@ -431,13 +431,20 @@ submitRegistryOperation payload = do callRegistry :: forall env a b. String -> CJ.Codec b -> Maybe { codec :: CJ.Codec a, data :: a } -> Spago (GitEnv env) b callRegistry url outputCodec maybeInput = handleError do logDebug $ "Calling registry at " <> url - response <- liftAff $ withBackoff' $ try case maybeInput of - Just { codec: inputCodec, data: input } -> Http.fetch url - { method: Http.POST - , headers: { "Content-Type": "application/json" } - , body: Json.stringifyJson inputCodec input - } - Nothing -> Http.fetch url { method: Http.GET } + response <- liftAff $ withBackoff' do + res <- try case maybeInput of + Just { codec: inputCodec, data: input } -> Http.fetch url + { method: Http.POST + , headers: { "Content-Type": "application/json" } + , body: Json.stringifyJson inputCodec input + } + Nothing -> Http.fetch url { method: Http.GET } + case res of + Left _ -> Aff.delay (Aff.Milliseconds 30_000.0) + Right { status } | status >= 500 && status < 600 -> + Aff.delay (Aff.Milliseconds 30_000.0) + _ -> pure unit + pure res case response of Nothing -> pure $ Left $ "Could not reach the registry at " <> url Just (Left err) -> pure $ Left $ "Error while calling the registry:\n " <> 
Exception.message err From 3ee6d541c81590b5e4a0b4afc19b9010fb162092 Mon Sep 17 00:00:00 2001 From: Fabrizio Ferrai Date: Sun, 31 May 2026 01:48:50 +0300 Subject: [PATCH 4/5] Fix backoff timing --- src/Spago/Prelude.purs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Spago/Prelude.purs b/src/Spago/Prelude.purs index 6f051baef..8e5de4e81 100644 --- a/src/Spago/Prelude.purs +++ b/src/Spago/Prelude.purs @@ -162,7 +162,7 @@ withBackoff { delay: Aff.Milliseconds timeout, action, shouldCancel, shouldRetry case maybeRetry of Maybe.Nothing -> pure Maybe.Nothing Maybe.Just newAction -> do - let newTimeout = Int.floor timeout `Int.pow` (attempt + 1) + let newTimeout = Int.floor timeout * (2 `Int.pow` attempt) maybeResult <- runAction attempt newAction newTimeout loop (attempt + 1) maybeResult Maybe.Just result -> From fcb2c21e60931bddd50b46c7c5b0a7301ec3ae71 Mon Sep 17 00:00:00 2001 From: Fabrizio Ferrai Date: Sun, 31 May 2026 02:27:24 +0300 Subject: [PATCH 5/5] Fix flaky test --- .../errors/expected-stderr.txt | 4 ++-- test/Prelude.purs | 13 +++++++++++++ test/Spago/Build.purs | 2 +- test/Spago/Publish.purs | 12 +----------- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/test-fixtures/build/1148-warnings-diff-errors/errors/expected-stderr.txt b/test-fixtures/build/1148-warnings-diff-errors/errors/expected-stderr.txt index a3ffb4e6f..39f17c106 100644 --- a/test-fixtures/build/1148-warnings-diff-errors/errors/expected-stderr.txt +++ b/test-fixtures/build/1148-warnings-diff-errors/errors/expected-stderr.txt @@ -4,8 +4,8 @@ Reading Spago workspace configuration... Downloading dependencies... Building... 
-[1 of 2] Compiling Foo -[2 of 2] Compiling Main +[x of 2] Compiling module-name +[x of 2] Compiling module-name [ERROR 1/2 TypesDoNotUnify] src/Foo.purs:4:5 4 x = "nope" diff --git a/test/Prelude.purs b/test/Prelude.purs index 2263ddc5d..868e0cd65 100644 --- a/test/Prelude.purs +++ b/test/Prelude.purs @@ -9,6 +9,8 @@ import Data.Array as Array import Data.Map as Map import Data.String (Pattern(..), Replacement(..)) import Data.String as String +import Data.String.Regex as Regex +import Data.String.Regex.Flags as RF import Effect.Aff as Aff import Effect.Aff.AVar (AVar) import Effect.Aff.AVar as AVar @@ -176,6 +178,17 @@ sanitizePlatformOutput = >>> String.replaceAll (Pattern "\\") (Replacement "/") >>> String.replaceAll (Pattern "\r\n") (Replacement "\n") +-- | Normalize `[N of TOTAL] Compiling MODULE` lines. purs schedules +-- | independent modules in whatever order system resources allow, so fixture +-- | comparison has to ignore the order. Pass the total module count expected. +normalizeCompileOrder :: Int -> String -> String +normalizeCompileOrder total = + Regex.replace regex ("[x of " <> show total <> "] Compiling module-name") + where + regex = unsafeFromRight $ Regex.regex + ("\\[\\d+ of " <> show total <> "\\] Compiling [^\n]+") + RF.global + checkFixture :: ∀ path. IsPath path => path -> FixturePath -> Aff Unit checkFixture filepath fixturePath = checkFixture' filepath fixturePath identity (shouldEqualStr `on` String.trim) diff --git a/test/Spago/Build.purs b/test/Spago/Build.purs index 240d5e32b..db0104b81 100644 --- a/test/Spago/Build.purs +++ b/test/Spago/Build.purs @@ -212,7 +212,7 @@ spec sem = Spec.parallel $ Spec.around (withBuildLock sem) do { stdoutFile: Nothing , stderrFile: Just $ fixture expectedFixture , result , sanitize: sanitizePlatformOutput >>> normalizeCompileOrder 2 } FS.copyTree { src: fixture "build/1148-warnings-diff-errors", dst: testCwd "." 
} diff --git a/test/Spago/Publish.purs b/test/Spago/Publish.purs index 054a0411a..bed67ea7f 100644 --- a/test/Spago/Publish.purs +++ b/test/Spago/Publish.purs @@ -5,8 +5,6 @@ module Test.Spago.Publish import Test.Prelude -import Data.String.Regex as Regex -import Data.String.Regex.Flags as RF import Node.Platform as Platform import Node.Process as Process import Spago.FS as FS @@ -94,17 +92,9 @@ spec = Spec.around withTempDir do { stdoutFile: Nothing , stderrFile: Just file , result: isLeft - , sanitize: sanitizePlatformOutput >>> Regex.replace buildOrderRegex "[x of 3] Compiling module-name" + , sanitize: sanitizePlatformOutput >>> normalizeCompileOrder 3 } - -- We have to ignore lines like "[1 of 3] Compiling Effect.Console" when - -- comparing output, because the compiler will always compile in - -- different order, depending on how the system resources happened to - -- align at the moment of the test run. - buildOrderRegex = unsafeFromRight $ Regex.regex - "\\[\\d of 3\\] Compiling (Effect\\.Console|Effect\\.Class\\.Console|Lib)" - RF.global - FS.copyTree { src: fixture "publish/1110-solver-different-version", dst: testCwd } spago [ "build" ] >>= shouldBeSuccess doTheGitThing testCwd