From db0e52fbaab461740161a1e20cb828ac8daa9d2d Mon Sep 17 00:00:00 2001 From: Akash Kumar Date: Wed, 29 Apr 2026 18:30:32 +0530 Subject: [PATCH 01/10] add(django-permission-cohort-postgres): minimal sample for keploy/integrations e2e lane MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This sample exists to support the keploy/integrations e2e lane that regresses the postgres-v3 session-fallback lifetime-gate fix (keploy/enterprise#1952). Each request to /lookup/<app_label>/<model>/ clears Django's in-process ContentType cache and forces the same `SELECT django_content_type WHERE app_label=$1 AND model=$2` query whose `class: APP` → `LifetimePerTest` tagging surfaced the bug end-to-end. The sample is intentionally tiny: - No admin, no DRF, no token auth — the bug shape doesn't need them. - No migrations beyond Django's stock contenttypes/auth tables — those are exactly what the lookup hits. - CONN_MAX_AGE=0 (a fresh connection per request) keeps the per- request DB call sequence deterministic across record/replay. Endpoints: GET /health/ → wait-for-app gate GET /lookup/<app_label>/<model>/ → ContentType.objects.get(...) + clear_cache to force the SQL on every request The keploy/integrations lane that drives this sample lives at .woodpecker/django-permission-cohort-postgres.yml — clones this repo on the corresponding branch, copies the sample dir into the run workspace, overlays the agent-side Dockerfile, and runs keploy record + replay across the three-binary matrix (record-build-replay-build / record-latest-replay-build / record-build-replay-latest). 
Refs: - keploy/enterprise#1952 RCA + reproduction - keploy/integrations#167 fix PR Signed-off-by: Akash Kumar --- django-permission-cohort-postgres/Dockerfile | 18 +++++ django-permission-cohort-postgres/README.md | 44 +++++++++++++ .../docker-compose.yml | 40 +++++++++++ .../entrypoint.sh | 25 +++++++ django-permission-cohort-postgres/manage.py | 14 ++++ .../myproj/__init__.py | 0 .../myproj/settings.py | 66 +++++++++++++++++++ .../myproj/urls.py | 8 +++ .../myproj/views.py | 50 ++++++++++++++ .../myproj/wsgi.py | 7 ++ .../requirements.txt | 3 + 11 files changed, 275 insertions(+) create mode 100644 django-permission-cohort-postgres/Dockerfile create mode 100644 django-permission-cohort-postgres/README.md create mode 100644 django-permission-cohort-postgres/docker-compose.yml create mode 100755 django-permission-cohort-postgres/entrypoint.sh create mode 100755 django-permission-cohort-postgres/manage.py create mode 100644 django-permission-cohort-postgres/myproj/__init__.py create mode 100644 django-permission-cohort-postgres/myproj/settings.py create mode 100644 django-permission-cohort-postgres/myproj/urls.py create mode 100644 django-permission-cohort-postgres/myproj/views.py create mode 100644 django-permission-cohort-postgres/myproj/wsgi.py create mode 100644 django-permission-cohort-postgres/requirements.txt diff --git a/django-permission-cohort-postgres/Dockerfile b/django-permission-cohort-postgres/Dockerfile new file mode 100644 index 0000000..ce0db1a --- /dev/null +++ b/django-permission-cohort-postgres/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.12-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . 
/app + +RUN chmod +x /app/entrypoint.sh + +EXPOSE 8080 + +CMD ["/app/entrypoint.sh"] diff --git a/django-permission-cohort-postgres/README.md b/django-permission-cohort-postgres/README.md new file mode 100644 index 0000000..92016c5 --- /dev/null +++ b/django-permission-cohort-postgres/README.md @@ -0,0 +1,44 @@ +# django-permission-cohort-postgres + +Minimal Django + PostgreSQL sample that reproduces the `django_content_type` permission-lookup pattern. Each request to `/lookup/<app_label>/<model>/` clears Django's in-process `ContentType` cache and forces the SQL: + +```sql +SELECT "django_content_type"."id", + "django_content_type"."app_label", + "django_content_type"."model" + FROM "django_content_type" + WHERE "django_content_type"."app_label" = $1 + AND "django_content_type"."model" = $2 + LIMIT 21 +``` + +That query is what surfaced the postgres v3 session-fallback lifetime-gate bug in [keploy/enterprise#1952](https://github.com/keploy/enterprise/issues/1952). The sample exists so the keploy/integrations e2e lane (`.woodpecker/django-permission-cohort-postgres.yml`) can drive it end-to-end through record + replay. + +## Endpoints + +| Path | Effect | +|---|---| +| `GET /health/` | `{"status":"ok"}` (used as wait-for-app gate) | +| `GET /lookup/<app_label>/<model>/` | Looks up the matching `ContentType` row, returns `{id, app_label, model}` | + +## Run standalone + +```bash +docker compose up +curl http://localhost:8080/health/ +curl http://localhost:8080/lookup/auth/user/ +curl http://localhost:8080/lookup/auth/group/ +``` + +## What it intentionally does NOT have + +- No admin +- No DRF / JWT / token auth — the bug shape doesn't need them +- No static files, no templates, no signal handlers, no celery +- No migrations beyond Django's stock contenttypes/auth tables — those are exactly what the lookup hits + +The sample is small on purpose so the failing query is the only DB traffic that matters at replay time. 
+ +## Why CONN_MAX_AGE=0 + +Default `CONN_MAX_AGE=0` (a fresh connection per request) makes Django redo any first-connect work on every request. That keeps the per-request DB call sequence deterministic across record/replay — important for a regression sample. diff --git a/django-permission-cohort-postgres/docker-compose.yml b/django-permission-cohort-postgres/docker-compose.yml new file mode 100644 index 0000000..3361ebe --- /dev/null +++ b/django-permission-cohort-postgres/docker-compose.yml @@ -0,0 +1,40 @@ +services: + api: + build: + context: . + dockerfile: Dockerfile + container_name: api + ports: + - "8080:8080" + networks: + - djnet + environment: + DB_HOST: db + DB_PORT: 5432 + DB_USER: djpcohort + DB_PASSWORD: djpcohort + DB_NAME: djpcohort + depends_on: + db: + condition: service_healthy + restart: on-failure + + db: + image: postgres:16-alpine + container_name: djpcohort_db + networks: + - djnet + environment: + POSTGRES_USER: djpcohort + POSTGRES_PASSWORD: djpcohort + POSTGRES_DB: djpcohort + ports: + - "127.0.0.1:55437:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -U djpcohort -d djpcohort"] + interval: 5s + timeout: 5s + retries: 12 + +networks: + djnet: diff --git a/django-permission-cohort-postgres/entrypoint.sh b/django-permission-cohort-postgres/entrypoint.sh new file mode 100755 index 0000000..27bc853 --- /dev/null +++ b/django-permission-cohort-postgres/entrypoint.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# entrypoint.sh — run Django migrations once, then start gunicorn. +# +# The migrations populate django_content_type and the auth_* +# permission tables; without this the /lookup/ endpoint would have +# nothing to find. We migrate inline at container start (rather than +# in a separate init container) so the recorder captures both the +# migration sequence and the runtime queries on the same connection +# pool — keeps the record/replay traffic shape minimal. +set -Eeuo pipefail + +echo "[entrypoint] running migrations..." 
+python /app/manage.py migrate --noinput + +echo "[entrypoint] starting gunicorn on :8080..." +exec gunicorn \ + --bind 0.0.0.0:8080 \ + --workers 2 \ + --threads 1 \ + --timeout 60 \ + --access-logfile - \ + --error-logfile - \ + --capture-output \ + --log-level info \ + myproj.wsgi:application diff --git a/django-permission-cohort-postgres/manage.py b/django-permission-cohort-postgres/manage.py new file mode 100755 index 0000000..15df926 --- /dev/null +++ b/django-permission-cohort-postgres/manage.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python +import os +import sys + + +def main(): + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "myproj.settings") + from django.core.management import execute_from_command_line + + execute_from_command_line(sys.argv) + + +if __name__ == "__main__": + main() diff --git a/django-permission-cohort-postgres/myproj/__init__.py b/django-permission-cohort-postgres/myproj/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/django-permission-cohort-postgres/myproj/settings.py b/django-permission-cohort-postgres/myproj/settings.py new file mode 100644 index 0000000..14c6934 --- /dev/null +++ b/django-permission-cohort-postgres/myproj/settings.py @@ -0,0 +1,66 @@ +""" +Minimal Django settings for the django-permission-cohort-postgres +e2e lane (keploy/integrations). + +The app exists only to exercise the ContentType permission-lookup +shape that surfaced the postgres-v3 lifetime-gate bug fixed in this +PR. It is intentionally as small as possible: no admin, no static +files, no DRF, no migrations beyond the contenttypes/auth defaults +Django ships out of the box. Anything that doesn't help reproduce +the recorder→lax-promotion→session-fallback path is omitted. 
+""" +import os +from pathlib import Path + +BASE_DIR = Path(__file__).resolve().parent.parent + +SECRET_KEY = "ci-only-not-secret" +DEBUG = False +ALLOWED_HOSTS = ["*"] + +INSTALLED_APPS = [ + # contenttypes is the load-bearing piece — every request to the + # /lookup/ view fires SELECT django_content_type WHERE app_label=$1 + # AND model=$2, the exact query the lifetime-gate bug surfaces on. + "django.contrib.contenttypes", + "django.contrib.auth", +] + +MIDDLEWARE = [] + +ROOT_URLCONF = "myproj.urls" + +TEMPLATES = [] + +WSGI_APPLICATION = "myproj.wsgi.application" + +DATABASES = { + "default": { + "ENGINE": "django.db.backends.postgresql", + "NAME": os.environ.get("DB_NAME", "djpcohort"), + "USER": os.environ.get("DB_USER", "djpcohort"), + "PASSWORD": os.environ.get("DB_PASSWORD", "djpcohort"), + "HOST": os.environ.get("DB_HOST", "db"), + "PORT": os.environ.get("DB_PORT", "5432"), + # CONN_MAX_AGE=0 (default) means a fresh connection per request, + # which makes Django redo any first-connect work each time. We + # keep that default so the per-request DB call sequence is + # deterministic across record/replay. + } +} + +DEFAULT_AUTO_FIELD = "django.db.models.AutoField" + +USE_TZ = True + +LOGGING = { + "version": 1, + "disable_existing_loggers": False, + "handlers": { + "console": {"class": "logging.StreamHandler"}, + }, + "root": {"handlers": ["console"], "level": "INFO"}, + "loggers": { + "django.db.backends": {"handlers": ["console"], "level": "INFO"}, + }, +} diff --git a/django-permission-cohort-postgres/myproj/urls.py b/django-permission-cohort-postgres/myproj/urls.py new file mode 100644 index 0000000..611fc7a --- /dev/null +++ b/django-permission-cohort-postgres/myproj/urls.py @@ -0,0 +1,8 @@ +from django.urls import path + +from . 
import views + +urlpatterns = [ + path("health/", views.health), + path("lookup/<str:app_label>/<str:model>/", views.lookup), +] diff --git a/django-permission-cohort-postgres/myproj/views.py b/django-permission-cohort-postgres/myproj/views.py new file mode 100644 index 0000000..472e19d --- /dev/null +++ b/django-permission-cohort-postgres/myproj/views.py @@ -0,0 +1,50 @@ +""" +Two endpoints, both deliberately minimal: + + GET /health/ -> 200 with {"status":"ok"} + GET /lookup/<app_label>/<model>/ -> 200 with the ContentType row + +The lookup view is the load-bearing one. It calls +ContentType.objects.clear_cache() before each lookup so the in-process +cache never short-circuits the query. Every request therefore fires: + + SELECT "django_content_type"."id", + "django_content_type"."app_label", + "django_content_type"."model" + FROM "django_content_type" + WHERE ("django_content_type"."app_label" = $1 + AND "django_content_type"."model" = $2) + LIMIT 21 + +That query is `class: APP` per pgmatch.Classify (a SELECT from a +user-schema table, not pg_catalog), so DeriveLifetime tags every +captured invocation `LifetimePerTest`. With the lifetime gate at +pickSessionFallback (pre-fix), once a per-test cohort is empty for the +SQL hash but the agent has lax-promoted the same hash into the +session pool, the matcher misses with `candidates: 0` and +`sessionFallbackCandidates: N>0`. That's the doccano-shape failure +this lane regresses against. +""" +from django.contrib.contenttypes.models import ContentType +from django.http import JsonResponse, HttpResponseNotFound + + +def health(_request): + return JsonResponse({"status": "ok"}) + + +def lookup(_request, app_label, model): + # Clear the in-process ContentType cache so each call is forced to + # hit the DB. Without this, only the very first call per worker + # would emit the SQL we want under the recorder. 
+ ContentType.objects.clear_cache() + try: + ct = ContentType.objects.get(app_label=app_label, model=model) + except ContentType.DoesNotExist: + return HttpResponseNotFound( + JsonResponse({"error": "not found"}).content, + content_type="application/json", + ) + return JsonResponse( + {"id": ct.id, "app_label": ct.app_label, "model": ct.model} + ) diff --git a/django-permission-cohort-postgres/myproj/wsgi.py b/django-permission-cohort-postgres/myproj/wsgi.py new file mode 100644 index 0000000..2419403 --- /dev/null +++ b/django-permission-cohort-postgres/myproj/wsgi.py @@ -0,0 +1,7 @@ +import os + +from django.core.wsgi import get_wsgi_application + +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "myproj.settings") + +application = get_wsgi_application() diff --git a/django-permission-cohort-postgres/requirements.txt b/django-permission-cohort-postgres/requirements.txt new file mode 100644 index 0000000..e1baa50 --- /dev/null +++ b/django-permission-cohort-postgres/requirements.txt @@ -0,0 +1,3 @@ +Django==4.2.16 +gunicorn==21.2.0 +psycopg2-binary==2.9.9 From ab534f8187466ebe6e1792f7989392ed8fe4230f Mon Sep 17 00:00:00 2001 From: Akash Kumar Date: Wed, 29 Apr 2026 18:38:23 +0530 Subject: [PATCH 02/10] fix(django-permission-cohort-postgres): bump gunicorn timeout + workers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pipeline runs of the keploy/integrations e2e lane that consumes this sample showed transient hangs during replay: gunicorn's default worker timeout (--timeout 60 in the prior version) was triggering SIGKILL on workers mid-request when keploy proxy roundtrips spiked under CI load, and the test that landed on the killed worker would time out client-side with `context deadline exceeded`. Subsequent tests after the worker recycle ran fine — classic shape of "matcher fast enough on average, but a single slow burst breaches the worker timeout". Two changes: 1. --timeout 300 (was 60). 
Keploy's matcher under CI agent load can take several seconds per query in pathological cases (cold cohort build, Track Y session-fallback probe, etc.); 300s gives the proxy headroom without tampering with the test gate (--api-timeout in the driver script remains the client-side cap on per-test wall time). 2. --workers 4 / --threads 2 (was 2/1). Two workers with one thread each meant only two concurrent requests; a single slow request blocked the next one in queue and the queue itself hit timeout. 4×2 gives 8 concurrent slots — enough that the lane's single-request-at-a-time replay never queues against itself. Also adds --graceful-timeout 30 so worker recycle on shutdown stays quick. Refs: - keploy/integrations#167 - keploy/integrations pipeline #839 (build-build cell, test-8/9 hang correlated with `[CRITICAL] WORKER TIMEOUT (pid:39)` gunicorn log) Signed-off-by: Akash Kumar --- django-permission-cohort-postgres/entrypoint.sh | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/django-permission-cohort-postgres/entrypoint.sh b/django-permission-cohort-postgres/entrypoint.sh index 27bc853..d3a4f1d 100755 --- a/django-permission-cohort-postgres/entrypoint.sh +++ b/django-permission-cohort-postgres/entrypoint.sh @@ -13,11 +13,18 @@ echo "[entrypoint] running migrations..." python /app/manage.py migrate --noinput echo "[entrypoint] starting gunicorn on :8080..." +# --timeout 300: matcher round-trips through the keploy proxy can spike +# under CI load; gunicorn's default 30s SIGKILLs the worker mid-request +# and the test that landed on that worker times out client-side. 300s +# gives the proxy comfortable headroom. +# --workers 4 / --threads 2: enough concurrency that a single slow +# matcher response doesn't queue subsequent requests behind it. 
exec gunicorn \ --bind 0.0.0.0:8080 \ - --workers 2 \ - --threads 1 \ - --timeout 60 \ + --workers 4 \ + --threads 2 \ + --timeout 300 \ + --graceful-timeout 30 \ --access-logfile - \ --error-logfile - \ --capture-output \ From e7a98ff8ba94ff528dd6e7173691267244c8dd90 Mon Sep 17 00:00:00 2001 From: Akash Kumar Date: Wed, 29 Apr 2026 20:43:32 +0530 Subject: [PATCH 03/10] fix(django-permission-cohort-postgres): set sslmode=disable to skip SSLRequest preamble MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pipeline runs of the keploy/integrations e2e lane that consumes this sample showed a deterministic shape: every FIRST request to a new endpoint hung 120s, then timed out client-side. Subsequent requests to the same endpoint passed in <40ms. Across multiple new endpoints in one replay, that compounds — three new endpoints means three 120s timeouts (#846 build-build cell: 6.54 min total runtime, 3 of 10 tests timed out). The recorder log gave the lead: v3 recorder V2: unexpected SSL response response_byte: 82 (ASCII 'R') psycopg2 (libpq under the hood) sends the SSLRequest preamble on every fresh connection by default — even for clear-text postgres. Postgres responds with 'R' (auth request) instead of 'S'/'N', skipping SSL negotiation entirely. The keploy v3 recorder logs the mismatch but accepts the recording. At replay, the proxy on the first fresh connection stalls waiting for the SSL-handshake bytes that aren't coming, the client request hangs, the test times out. Setting sslmode=disable in DATABASES.OPTIONS makes psycopg2 skip the preamble entirely. Connection flow becomes a clean cleartext path the proxy already handles correctly across record + replay. 
Refs: - keploy/integrations#167 - keploy/integrations pipeline #846 (build-build cell, three sequential 120s hangs all on first-request-to-new-endpoint) Signed-off-by: Akash Kumar --- .../myproj/settings.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/django-permission-cohort-postgres/myproj/settings.py b/django-permission-cohort-postgres/myproj/settings.py index 14c6934..40d34eb 100644 --- a/django-permission-cohort-postgres/myproj/settings.py +++ b/django-permission-cohort-postgres/myproj/settings.py @@ -46,6 +46,23 @@ # which makes Django redo any first-connect work each time. We # keep that default so the per-request DB call sequence is # deterministic across record/replay. + "OPTIONS": { + # Skip libpq's SSLRequest preamble. The compose stack runs + # cleartext postgres; without sslmode=disable, psycopg2 + # sends the SSLRequest byte sequence and waits for an 'S' + # or 'N' response. Postgres replies with 'R' (auth + # request, skipping SSL negotiation entirely), keploy's + # v3 recorder/proxy logs this as "unexpected SSL + # response", and on the *first* fresh connection at + # replay time the proxy stalls waiting for an SSL + # handshake the server isn't going to complete — every + # first request to a new endpoint then hangs until the + # client-side timeout fires. Setting sslmode=disable + # makes psycopg2 skip the preamble entirely so every + # request flows on the clean cleartext path the proxy + # already handles. 
+ "sslmode": "disable", + }, } } From 305e3cd0006f3d4eae2106910b59c5c18d0af85c Mon Sep 17 00:00:00 2001 From: Akash Kumar Date: Thu, 30 Apr 2026 00:32:37 +0530 Subject: [PATCH 04/10] feat(django-permission-cohort-postgres): background ContentType-lookup thread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The keploy/integrations e2e lane that consumes this sample runs clean across record + replay but never exercises the postgres-v3 session-fallback path the lane is meant to regress against — 0 'session-tier read-only fallback served' lines in pipeline #850's green run. Cause: every DB query in the previous shape was fired inside an HTTP request, so every capture's request-timestamp landed inside its test's window and got attributed to that test's perTest cohort. At replay the perTest path served everything; the lane never reached the gate the lifetime fix is for. To produce the bug shape end-to-end the recording must capture *some* `SELECT django_content_type` invocations *between* HTTP test windows so the agent's lax-promotion path (FilterPerTestAndLaxPromotedTierAware in keploy/keploy:pkg/util.go) routes them into the session pool — that's the on-disk shape post-fix replay must serve via session-fallback. Anything fired from inside an HTTP request can't satisfy that constraint by construction. Solution: a Django AppConfig.ready() that spawns one daemon thread per gunicorn worker. The thread loops every BACKGROUND_LOOKUPS_INTERVAL_S seconds (default 3) firing ContentType.clear_cache() + ContentType.objects.get(...) against a rotating set of (app_label, model) targets — same SQL hash as the HTTP-driven lookups, different binds. The cadence is slow enough not to saturate gunicorn workers and fast enough that the exerciser's 6s inter-round pause captures multiple thread-fired lookups *between* HTTP test windows. Gating: BACKGROUND_LOOKUPS=1 in docker-compose.yml. 
Off by default so the sample stays usable for ad-hoc local Django testing. Refs: - keploy/integrations#167 - keploy/integrations pipeline #850 (lane green but 0 session-fallback served — lane was no longer falsifying) Signed-off-by: Akash Kumar --- .../docker-compose.yml | 7 ++ .../myproj/apps.py | 91 +++++++++++++++++++ .../myproj/settings.py | 7 ++ 3 files changed, 105 insertions(+) create mode 100644 django-permission-cohort-postgres/myproj/apps.py diff --git a/django-permission-cohort-postgres/docker-compose.yml b/django-permission-cohort-postgres/docker-compose.yml index 3361ebe..61d0139 100644 --- a/django-permission-cohort-postgres/docker-compose.yml +++ b/django-permission-cohort-postgres/docker-compose.yml @@ -14,6 +14,13 @@ services: DB_USER: djpcohort DB_PASSWORD: djpcohort DB_NAME: djpcohort + # Spawn the background ContentType-lookup thread (see + # myproj/apps.py). The keploy/integrations e2e lane needs this + # to record `SELECT django_content_type` invocations *between* + # HTTP test windows — the precondition for the session-fallback + # path the lifetime-gate fix unblocks. + BACKGROUND_LOOKUPS: "1" + BACKGROUND_LOOKUPS_INTERVAL_S: "3" depends_on: db: condition: service_healthy diff --git a/django-permission-cohort-postgres/myproj/apps.py b/django-permission-cohort-postgres/myproj/apps.py new file mode 100644 index 0000000..f4e84cd --- /dev/null +++ b/django-permission-cohort-postgres/myproj/apps.py @@ -0,0 +1,91 @@ +""" +AppConfig that spawns a background ContentType-lookup thread inside +each gunicorn worker. + +Why this exists +--------------- +The keploy/integrations django-permission-cohort-postgres lane is a +falsifying e2e lane for the postgres-v3 session-fallback lifetime-gate +fix (keploy/enterprise#1952). To produce the bug shape end-to-end, +the recording must capture some `SELECT django_content_type` +invocations whose request-timestamp falls *between* the surrounding +HTTP test windows. 
Those out-of-window captures are routed by the +keploy agent's lax-promotion path +(FilterPerTestAndLaxPromotedTierAware in keploy/keploy:pkg/util.go) +into the session pool — preserving their on-disk +`lifetime: perTest` tag. Replay-side, the live request that fires +during a test whose perTest cohort is empty for that SQL hash falls +through to the session-fallback path; pre-fix the lifetime gate +silently rejected the lax-promoted invocation, post-fix the +mutation-aware gate accepts it. + +The lane's HTTP-driven exerciser alone can't produce that shape: any +query fired inside an HTTP request lands inside that test's window +and ends up in its perTest cohort, never out-of-window. We need +DB activity that fires *outside* the HTTP request path. + +This thread provides that: each gunicorn worker spawns one daemon +thread that periodically clears Django's in-process ContentType +cache, runs a fresh `ContentType.objects.get(...)` (which forces the +SQL the bug surfaces on), and sleeps. The cadence (default 3s) is +slow enough not to swamp the worker pool but fast enough that the +exerciser's 6s inter-round pause captures multiple thread-fired +queries between HTTP test windows. + +The thread is gated behind the BACKGROUND_LOOKUPS env var so the +sample stays usable as a plain Django app for ad-hoc local testing. +The CI lane sets BACKGROUND_LOOKUPS=1. +""" +import logging +import os +import threading +import time + +from django.apps import AppConfig + +logger = logging.getLogger(__name__) + + +class MyProjConfig(AppConfig): + name = "myproj" + default = True + + def ready(self): + # `ready()` runs at most once per worker (Django guarantees idempotency + # within a process), so the thread spawn is safe here. Daemon=True + # ensures the thread doesn't block worker shutdown. 
+ if os.environ.get("BACKGROUND_LOOKUPS", "0") != "1": + return + # Lookup-target cycle — different (app_label, model) pairs to widen + # the hash spectrum so the recording's session-fallback cohort isn't + # a single-shape singleton. All target the same SQL hash (different + # binds), which is the shape the lifetime-gate fix unblocks. + targets = [ + ("auth", "user"), + ("auth", "group"), + ("auth", "permission"), + ("contenttypes", "contenttype"), + ] + interval_s = float(os.environ.get("BACKGROUND_LOOKUPS_INTERVAL_S", "3")) + + def loop(): + # Lazy import — apps aren't all loaded at module-import time. + from django.contrib.contenttypes.models import ContentType + + i = 0 + while True: + try: + time.sleep(interval_s) + app_label, model = targets[i % len(targets)] + ContentType.objects.clear_cache() + ContentType.objects.get(app_label=app_label, model=model) + i += 1 + except Exception as exc: # noqa: BLE001 — keep the thread alive + logger.warning("background lookup failed: %s", exc) + + threading.Thread(target=loop, daemon=True, name="bg-content-type-lookups").start() + logger.info( + "background ContentType-lookup thread started (interval=%ss, targets=%s)", + interval_s, + len(targets), + ) diff --git a/django-permission-cohort-postgres/myproj/settings.py b/django-permission-cohort-postgres/myproj/settings.py index 40d34eb..a586181 100644 --- a/django-permission-cohort-postgres/myproj/settings.py +++ b/django-permission-cohort-postgres/myproj/settings.py @@ -24,6 +24,13 @@ # AND model=$2, the exact query the lifetime-gate bug surfaces on. "django.contrib.contenttypes", "django.contrib.auth", + # myproj.apps.MyProjConfig spawns a background ContentType-lookup + # thread when BACKGROUND_LOOKUPS=1 — the keploy/integrations e2e + # lane uses this to drive DB activity *outside* the HTTP request + # path, so the recording captures session-pool-bound invocations + # the lifetime-gate fix unblocks. Off by default; ad-hoc local + # use of this sample doesn't need it. 
+ "myproj.apps.MyProjConfig", ] MIDDLEWARE = [] From f63630c05d26f68dc5e7d24219c4aa294a7d2d96 Mon Sep 17 00:00:00 2001 From: Akash Kumar Date: Thu, 30 Apr 2026 00:50:21 +0530 Subject: [PATCH 05/10] ci(django-permission-cohort-postgres): bump thread cadence to 0.3s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pipeline #863's same-binary cells passed all 10 tests and surfaced no matcher misses (fix's behavior is correct), but the assertion that the lane exercises the lifetime-gate fix's specific PerTest→SessionFallback path was zero-hit. Cause: - keploy `test` replays all captured HTTP cases in ~1 second. - The previous BACKGROUND_LOOKUPS_INTERVAL_S=3 means at most ONE thread-fire lands during the replay-window phase. - Even when that lone fire lands inside a test window, the dispatcher routes it via the engine's WindowSnapshot tier — and most thread fires hit between-test or post-test phases, where SessionTransactional serves directly from the session pool without needing the PerTest engine's SessionFallback path. Bumping the cadence to 0.3s gives ~3-5 thread-fires per replay that land inside an HTTP test window, increasing the probability that at least one of them hits a perTest cohort empty for its hash and falls through to the SessionFallback gate the fix unblocks. The faster cadence is fine for the gunicorn worker pool — ContentType.objects.get() is a single-row read on a four-row table, sub-millisecond. Recording side: thread captures still land between HTTP test windows during the exerciser's 6s pause, so the lax-promotion shape that got captures into the session pool is unaffected. 
Refs: - keploy/integrations#167 - keploy/integrations pipeline #863 (build-build cell green on tests but check_lane_exercises_session_fallback fires) Signed-off-by: Akash Kumar --- django-permission-cohort-postgres/docker-compose.yml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/django-permission-cohort-postgres/docker-compose.yml b/django-permission-cohort-postgres/docker-compose.yml index 61d0139..e8144b5 100644 --- a/django-permission-cohort-postgres/docker-compose.yml +++ b/django-permission-cohort-postgres/docker-compose.yml @@ -20,7 +20,16 @@ services: # HTTP test windows — the precondition for the session-fallback # path the lifetime-gate fix unblocks. BACKGROUND_LOOKUPS: "1" - BACKGROUND_LOOKUPS_INTERVAL_S: "3" + # Fire the background thread every 0.3s. The keploy `test` driver + # replays all captured HTTP cases in ~1 second, so a 3s cadence + # means at most one thread-fire lands during the replay-window + # phase — and the dispatcher routes most of them to + # SessionTransactional directly (bypassing the PerTest → + # SessionFallback path the lifetime-gate fix actually unblocks). + # 0.3s gives ~3-5 thread-fires per replay run that fall *inside* + # an HTTP test window, which is the precondition for hitting + # the SessionFallback gate. + BACKGROUND_LOOKUPS_INTERVAL_S: "0.3" depends_on: db: condition: service_healthy From 3dbe4dd2009ec729cb137e7845ebd3e42a39c47e Mon Sep 17 00:00:00 2001 From: Akash Kumar Date: Thu, 30 Apr 2026 02:57:17 +0530 Subject: [PATCH 06/10] fix(django-permission-cohort-postgres): run migrations in sibling container Move `python manage.py migrate` out of the api container's entrypoint into a one-shot `migrator` service. The api service now starts with the schema and ContentType rows already present, so the only DB traffic captured by keploy is the runtime /lookup/ query path. 
Why: Django's `post_migrate` signal fires `create_contenttypes`, which bulk-inserts ContentType rows in a single simple-query INSERT whose row-tuple ordering depends on model-registration timing during app boot. When the keploy proxy captures that INSERT, the recording is fragile: small boot-time shifts between record and replay (the BACKGROUND_LOOKUPS thread firing during migrate, GIL scheduling differences, etc) produce a hash-equal but value-divergent INSERT at replay time, and the matcher correctly rejects it. The keploy/ integrations build-build cell was halting at boot with this exact shape (`bind values diverged from every recorded invocation`, candidates=2, sessionFallback probed-empty). Migrations only need to populate static schema; running them in a sibling container that bypasses the keploy proxy (keploy intercepts api's traffic only) keeps the boot path out of mocks.yaml entirely. The `service_completed_successfully` dependency guarantees api doesn't start until migrate is done. Signed-off-by: Akash Kumar --- .../docker-compose.yml | 36 +++++++++++++++++++ .../entrypoint.sh | 19 +++++----- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/django-permission-cohort-postgres/docker-compose.yml b/django-permission-cohort-postgres/docker-compose.yml index e8144b5..247fdae 100644 --- a/django-permission-cohort-postgres/docker-compose.yml +++ b/django-permission-cohort-postgres/docker-compose.yml @@ -1,4 +1,38 @@ services: + # One-shot migrator. Runs Django's `migrate` (which fires the + # `post_migrate` signal that bulk-inserts ContentType rows) against + # the db and exits. The api service waits for it to complete cleanly + # before starting gunicorn. + # + # Why this is a separate container rather than inline in the api + # entrypoint: when this stack runs under keploy record/replay, keploy + # only intercepts traffic for the named application container (api). 
+ # Migrations fired from a sibling container bypass the keploy proxy + # entirely, so the migration SQL never lands in mocks.yaml. That + # matters because Django's `post_migrate` ContentType bulk_create + # batches all installed-apps' ContentType rows into a single simple- + # query INSERT whose row-tuple ordering is sensitive to model- + # registration timing — capturing it makes recordings non-replayable + # if any boot-time ordering shifts between record and replay. + migrator: + build: + context: . + dockerfile: Dockerfile + container_name: djpcohort_migrator + networks: + - djnet + environment: + DB_HOST: db + DB_PORT: 5432 + DB_USER: djpcohort + DB_PASSWORD: djpcohort + DB_NAME: djpcohort + command: ["python", "/app/manage.py", "migrate", "--noinput"] + depends_on: + db: + condition: service_healthy + restart: "no" + api: build: context: . @@ -33,6 +67,8 @@ services: depends_on: db: condition: service_healthy + migrator: + condition: service_completed_successfully restart: on-failure db: diff --git a/django-permission-cohort-postgres/entrypoint.sh b/django-permission-cohort-postgres/entrypoint.sh index d3a4f1d..cb07a43 100755 --- a/django-permission-cohort-postgres/entrypoint.sh +++ b/django-permission-cohort-postgres/entrypoint.sh @@ -1,17 +1,16 @@ #!/usr/bin/env bash -# entrypoint.sh — run Django migrations once, then start gunicorn. +# entrypoint.sh — start gunicorn. # -# The migrations populate django_content_type and the auth_* -# permission tables; without this the /lookup/ endpoint would have -# nothing to find. We migrate inline at container start (rather than -# in a separate init container) so the recorder captures both the -# migration sequence and the runtime queries on the same connection -# pool — keeps the record/replay traffic shape minimal. +# Migrations are applied by a sibling `migrator` service in +# docker-compose.yml that runs to completion before this container +# starts. 
Keeping the migration step out of the api container is what +# lets the keploy/integrations record-replay lane stay deterministic: +# Django's `post_migrate` signal bulk-inserts ContentType rows in a +# single simple-query INSERT whose row ordering depends on model- +# registration timing, and capturing that under the keploy proxy +# would make recordings non-replayable across runs. set -Eeuo pipefail -echo "[entrypoint] running migrations..." -python /app/manage.py migrate --noinput - echo "[entrypoint] starting gunicorn on :8080..." # --timeout 300: matcher round-trips through the keploy proxy can spike # under CI load; gunicorn's default 30s SIGKILLs the worker mid-request From be7f62ff725de5547ad6ebc29f115e7699573393 Mon Sep 17 00:00:00 2001 From: Akash Kumar Date: Thu, 30 Apr 2026 03:23:19 +0530 Subject: [PATCH 07/10] Revert "fix(django-permission-cohort-postgres): run migrations in sibling container" This reverts commit 3dbe4dd2009ec729cb137e7845ebd3e42a39c47e. --- .../docker-compose.yml | 36 ------------------- .../entrypoint.sh | 19 +++++----- 2 files changed, 10 insertions(+), 45 deletions(-) diff --git a/django-permission-cohort-postgres/docker-compose.yml b/django-permission-cohort-postgres/docker-compose.yml index 247fdae..e8144b5 100644 --- a/django-permission-cohort-postgres/docker-compose.yml +++ b/django-permission-cohort-postgres/docker-compose.yml @@ -1,38 +1,4 @@ services: - # One-shot migrator. Runs Django's `migrate` (which fires the - # `post_migrate` signal that bulk-inserts ContentType rows) against - # the db and exits. The api service waits for it to complete cleanly - # before starting gunicorn. - # - # Why this is a separate container rather than inline in the api - # entrypoint: when this stack runs under keploy record/replay, keploy - # only intercepts traffic for the named application container (api). - # Migrations fired from a sibling container bypass the keploy proxy - # entirely, so the migration SQL never lands in mocks.yaml. 
That - # matters because Django's `post_migrate` ContentType bulk_create - # batches all installed-apps' ContentType rows into a single simple- - # query INSERT whose row-tuple ordering is sensitive to model- - # registration timing — capturing it makes recordings non-replayable - # if any boot-time ordering shifts between record and replay. - migrator: - build: - context: . - dockerfile: Dockerfile - container_name: djpcohort_migrator - networks: - - djnet - environment: - DB_HOST: db - DB_PORT: 5432 - DB_USER: djpcohort - DB_PASSWORD: djpcohort - DB_NAME: djpcohort - command: ["python", "/app/manage.py", "migrate", "--noinput"] - depends_on: - db: - condition: service_healthy - restart: "no" - api: build: context: . @@ -67,8 +33,6 @@ services: depends_on: db: condition: service_healthy - migrator: - condition: service_completed_successfully restart: on-failure db: diff --git a/django-permission-cohort-postgres/entrypoint.sh b/django-permission-cohort-postgres/entrypoint.sh index cb07a43..d3a4f1d 100755 --- a/django-permission-cohort-postgres/entrypoint.sh +++ b/django-permission-cohort-postgres/entrypoint.sh @@ -1,16 +1,17 @@ #!/usr/bin/env bash -# entrypoint.sh — start gunicorn. +# entrypoint.sh — run Django migrations once, then start gunicorn. # -# Migrations are applied by a sibling `migrator` service in -# docker-compose.yml that runs to completion before this container -# starts. Keeping the migration step out of the api container is what -# lets the keploy/integrations record-replay lane stay deterministic: -# Django's `post_migrate` signal bulk-inserts ContentType rows in a -# single simple-query INSERT whose row ordering depends on model- -# registration timing, and capturing that under the keploy proxy -# would make recordings non-replayable across runs. +# The migrations populate django_content_type and the auth_* +# permission tables; without this the /lookup/ endpoint would have +# nothing to find. 
We migrate inline at container start (rather than +# in a separate init container) so the recorder captures both the +# migration sequence and the runtime queries on the same connection +# pool — keeps the record/replay traffic shape minimal. set -Eeuo pipefail +echo "[entrypoint] running migrations..." +python /app/manage.py migrate --noinput + echo "[entrypoint] starting gunicorn on :8080..." # --timeout 300: matcher round-trips through the keploy proxy can spike # under CI load; gunicorn's default 30s SIGKILLs the worker mid-request From 32faee74d51c0359df54b2b74473ca63e032c86c Mon Sep 17 00:00:00 2001 From: Akash Kumar Date: Thu, 30 Apr 2026 03:24:25 +0530 Subject: [PATCH 08/10] fix(django-permission-cohort-postgres): pin PYTHONHASHSEED for deterministic migration order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Django's `post_migrate` signal fires `create_contenttypes`, which bulk-inserts ContentType rows from a model collection iterated in hash-seed-dependent order on some Django versions. Python's per- process random hash seed (default behavior since 3.3) means the INSERT row tuples can land in different orders across two runs of the same code, even on identical machines and identical container images. Under keploy record/replay this surfaces as: the recorder captures one row order; the replay-time live driver fires a different row order; SQL skeleton hashes match (same column count, same value arity) but the inline value tuples diverge, and the matcher correctly rejects the candidates as `bind values diverged from every recorded invocation`. The keploy/integrations build-build cell halted at boot with that exact shape. PYTHONHASHSEED=0 fixes the seed across runs so iteration order is deterministic. Affects only this sample app — the lookup endpoints themselves are already deterministic. 
Signed-off-by: Akash Kumar --- django-permission-cohort-postgres/docker-compose.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/django-permission-cohort-postgres/docker-compose.yml b/django-permission-cohort-postgres/docker-compose.yml index e8144b5..320cb25 100644 --- a/django-permission-cohort-postgres/docker-compose.yml +++ b/django-permission-cohort-postgres/docker-compose.yml @@ -14,6 +14,17 @@ services: DB_USER: djpcohort DB_PASSWORD: djpcohort DB_NAME: djpcohort + # Pin Python's per-process hash seed so set/dict iteration order + # is identical across record and replay runs. Django's + # `post_migrate` ContentType bulk_create populates VALUES rows + # from a model collection that, in some Django versions, is + # iterated in a hash-seed-dependent order; without a fixed seed + # the recorder captures one row order at record time and the + # live driver fires a different row order at replay time. The + # SQL hashes match (same skeleton) but the inline value tuples + # diverge — exactly the `bind values diverged from every + # recorded invocation` failure shape the build-build cell hit. + PYTHONHASHSEED: "0" # Spawn the background ContentType-lookup thread (see # myproj/apps.py). The keploy/integrations e2e lane needs this # to record `SELECT django_content_type` invocations *between* # HTTP test windows — the precondition for the session-fallback # path the lifetime-gate fix unblocks. BACKGROUND_LOOKUPS: "1" # Fire the background thread every 0.3s. The keploy `test` driver # replays all captured HTTP cases in ~1 second, so a 3s cadence # means at most one thread-fire lands during the replay-window # phase — and the dispatcher routes most of them to # SessionTransactional directly (bypassing the PerTest → # SessionFallback path the lifetime-gate fix actually unblocks). # 0.3s gives ~3-5 thread-fires per replay run that fall *inside* # an HTTP test window, which is the precondition for hitting # the SessionFallback gate. BACKGROUND_LOOKUPS_INTERVAL_S: "0.3" depends_on: db: condition: service_healthy From a86976ff1e51d420bd8836d80365186f70cfd8c5 Mon Sep 17 00:00:00 2001 From: Akash Kumar Date: Thu, 30 Apr 2026 04:29:43 +0530 Subject: [PATCH 09/10] fix(django-permission-cohort-postgres): drop the background ContentType-lookup thread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The thread was added to scatter `SELECT django_content_type` captures across HTTP test windows so the keploy/integrations regression- coverage lane would hit the postgres-v3 session-fallback path.
In practice the path's per-test cohort vs session-pool coverage was timing-dependent: a thread-fired capture for, say, (auth, user) may or may not land in the session pool on any given run. When the lane *did* hit session-fallback for a test whose required bind wasn't in the pool, pickSessionFallback's FIFO fallback served a wrong-bind candidate and the test failed on body diff — a separate matcher bug unrelated to the lifetime-gate fix this sample is paired with. The deterministic falsifying e2e for the lifetime-gate bug is the doccano lane in keploy/enterprise; this sample's job is straight Django+Postgres regression coverage. Drop the thread to keep the recordings deterministic. apps.py already gates the spawn on BACKGROUND_LOOKUPS=1 so removing the env var is a clean disable; the code stays in case the thread is ever needed again. Signed-off-by: Akash Kumar --- .../docker-compose.yml | 29 ++----------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/django-permission-cohort-postgres/docker-compose.yml b/django-permission-cohort-postgres/docker-compose.yml index 320cb25..e9e0d5d 100644 --- a/django-permission-cohort-postgres/docker-compose.yml +++ b/django-permission-cohort-postgres/docker-compose.yml @@ -14,33 +14,10 @@ services: DB_USER: djpcohort DB_PASSWORD: djpcohort DB_NAME: djpcohort - # Pin Python's per-process hash seed so set/dict iteration order - # is identical across record and replay runs. Django's - # `post_migrate` ContentType bulk_create populates VALUES rows - # from a model collection that, in some Django versions, is - # iterated in a hash-seed-dependent order; without a fixed seed - # the recorder captures one row order at record time and the - # live driver fires a different row order at replay time. The - # SQL hashes match (same skeleton) but the inline value tuples - # diverge — exactly the `bind values diverged from every - # recorded invocation` failure shape the build-build cell hit.
+ # Defence-in-depth: pin Python's per-process hash seed so any + # set/dict iteration anywhere in the Django stack is stable + # across record and replay runs. PYTHONHASHSEED: "0" - # Spawn the background ContentType-lookup thread (see - # myproj/apps.py). The keploy/integrations e2e lane needs this - # to record `SELECT django_content_type` invocations *between* - # HTTP test windows — the precondition for the session-fallback - # path the lifetime-gate fix unblocks. - BACKGROUND_LOOKUPS: "1" - # Fire the background thread every 0.3s. The keploy `test` driver - # replays all captured HTTP cases in ~1 second, so a 3s cadence - # means at most one thread-fire lands during the replay-window - # phase — and the dispatcher routes most of them to - # SessionTransactional directly (bypassing the PerTest → - # SessionFallback path the lifetime-gate fix actually unblocks). - # 0.3s gives ~3-5 thread-fires per replay run that fall *inside* - # an HTTP test window, which is the precondition for hitting - # the SessionFallback gate. - BACKGROUND_LOOKUPS_INTERVAL_S: "0.3" depends_on: db: condition: service_healthy From a3bdd541956ee6188ede56da745fa36cef3cbeab Mon Sep 17 00:00:00 2001 From: Akash Kumar Date: Thu, 30 Apr 2026 12:15:37 +0530 Subject: [PATCH 10/10] feat(django-permission-cohort-postgres): deterministic lifetime-gate falsifier via post-response side query MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a daemon thread to the /lookup/ view that fires `SELECT django_content_type WHERE app_label='auth' AND model='user'` 100ms after the response is sent. 
This deliberately creates a record-vs- replay timing asymmetry: at record (exerciser pacing ~1s) the call lands between HTTP test windows and is captured into the session pool with LifetimePerTest (lax-promoted); at replay (keploy `test` compresses pacing to tens of ms) the same 100ms-delayed call lands INSIDE a later test's window where the perTest cohort holds a different bind, so the dispatcher routes PerTest -> SessionFallback. This is the path the lifetime-gate fix in keploy/integrations#167 unblocks. With the fix in the matcher, the gate accepts the lax- promoted candidate and the response is served. Without it, the gate rejects every PerTest-tagged candidate and the matcher logs `transactional: no invocation matched`. Used by the keploy/integrations django-permission-cohort-postgres lane to falsify the bug end-to-end on the cross-version build-latest cell — without this mechanism the lane is regression coverage but not falsifying. Signed-off-by: Akash Kumar --- .../myproj/views.py | 103 ++++++++++++++++-- 1 file changed, 92 insertions(+), 11 deletions(-) diff --git a/django-permission-cohort-postgres/myproj/views.py b/django-permission-cohort-postgres/myproj/views.py index 472e19d..462f59c 100644 --- a/django-permission-cohort-postgres/myproj/views.py +++ b/django-permission-cohort-postgres/myproj/views.py @@ -4,9 +4,7 @@ GET /health/ -> 200 with {"status":"ok"} GET /lookup/// -> 200 with the ContentType row -The lookup view is the load-bearing one. It calls -ContentType.objects.clear_cache() before each lookup so the in-process -cache never short-circuits the query.
Every request therefore fires: +The lookup view fires the load-bearing query: SELECT "django_content_type"."id", "django_content_type"."app_label", @@ -16,23 +14,103 @@ AND "django_content_type"."model" = $2) LIMIT 21 -That query is `class: APP` per pgmatch.Classify (a SELECT from a -user-schema table, not pg_catalog), so DeriveLifetime tags every -captured invocation `LifetimePerTest`. With the lifetime gate at -pickSessionFallback (pre-fix), once a per-test cohort is empty for the -SQL hash but the agent has lax-promoted the same hash into the -session pool, the matcher misses with `candidates: 0` and -`sessionFallbackCandidates: N>0`. That's the doccano-shape failure -this lane regresses against. +That query is `class: APP` per pgmatch.Classify, so DeriveLifetime +tags every captured invocation `LifetimePerTest`. With the lifetime +gate at pickSessionFallback (pre-fix), once a per-test cohort is +empty for the SQL hash but the agent has lax-promoted the same hash +into the session pool, the matcher misses with `candidates: 0` and +`sessionFallbackCandidates: N>0`. + +The view ALSO spawns a delayed side-query thread (see +_fire_delayed_side_query) on every request. This is the deterministic +mechanism that surfaces the lifetime-gate bug shape end-to-end — +without it the lane is regression coverage but not falsifying. """ +import threading +import time + from django.contrib.contenttypes.models import ContentType +from django.db import close_old_connections from django.http import JsonResponse, HttpResponseNotFound +# Fixed bind values for the side query. Deliberately one of the +# (app_label, model) pairs the exerciser also calls directly via +# /lookup/, so the response body is well-known and the recorded mock +# is stable. 
The exerciser fires /lookup/auth/user/ as one of its +# round members, which means at record time there's at least one +# perTest capture of this SQL+bind in some test's window — but the +# delayed-fire copies populated by every other request land between +# HTTP test windows, where the agent's lax-promotion path routes +# them into the session pool with their LifetimePerTest tag intact. +SIDE_QUERY_APP_LABEL = "auth" +SIDE_QUERY_MODEL = "user" + def health(_request): return JsonResponse({"status": "ok"}) +def _fire_delayed_side_query(): + """ + Fire `SELECT django_content_type WHERE app_label='auth' AND + model='user'` 100 ms after the parent request's response is sent, + on a fresh DB connection (daemon thread → Django gives us a new + thread-local connection). + + Why 100 ms, why fixed binds, why daemon thread: + -------------------------------------------------------------- + The lifetime-gate bug surfaces only when, at replay time, a live + DB call lands *inside an active HTTP test window* for a SQL hash + whose perTest cohort is empty for that bind, *and* the session + pool has a lax-promoted PerTest invocation matching the bind. + + Engineering this asymmetry deterministically requires that the + same code path produces a different test-window attribution at + record vs. replay. That's exactly what a fixed-delay + post-response thread does: + + AT RECORD (exerciser fires HTTP requests ~1 s apart): + the 100 ms post-response delay puts the side query's + timestamp comfortably between test N and test N+1's + windows. Captured invocation lands in the session pool + with its LifetimePerTest tag preserved (lax-promotion). + + AT REPLAY (keploy `test` compresses pacing to ~tens of ms): + the same 100 ms delay puts the side query's timestamp + *inside* a later test's window. 
That test's perTest + cohort holds a capture for *its own* HTTP-driven bind, + not (auth, user) — so for this hash+bind the cohort is + empty, the dispatcher routes through PerTest → + SessionFallback, and the matcher's gate is consulted. + + Pre-fix matcher: lifetime-tag gate rejects every PerTest-tagged + candidate in the session pool → `transactional: no invocation + matched` ERROR → check_for_errors fails the lane. + + Post-fix matcher: mutation-eligibility gate accepts non-mutating + shape-equal candidates regardless of lifetime → serves the + recorded response → green. + + Daemon thread because gunicorn shouldn't wait on it during + worker shutdown; close_old_connections to keep Django's + thread-local connection registry clean. + """ + time.sleep(0.1) + try: + close_old_connections() + ContentType.objects.get( + app_label=SIDE_QUERY_APP_LABEL, model=SIDE_QUERY_MODEL + ) + except Exception: + # Replay-time mock-miss raises here; the parent response is + # already on the wire so we can't surface it to the client. + # The matcher logs the miss as an ERROR which the lane's + # check_for_errors assertion picks up. + pass + finally: + close_old_connections() + + def lookup(_request, app_label, model): # Clear the in-process ContentType cache so each call is forced to # hit the DB. Without this, only the very first call per worker @@ -45,6 +123,9 @@ def lookup(_request, app_label, model): JsonResponse({"error": "not found"}).content, content_type="application/json", ) + # Kick off the delayed side query before returning. See + # _fire_delayed_side_query for the timing rationale. + threading.Thread(target=_fire_delayed_side_query, daemon=True).start() return JsonResponse( {"id": ct.id, "app_label": ct.app_label, "model": ct.model} )