diff --git a/django-permission-cohort-postgres/Dockerfile b/django-permission-cohort-postgres/Dockerfile new file mode 100644 index 0000000..ce0db1a --- /dev/null +++ b/django-permission-cohort-postgres/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.12-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . /app + +RUN chmod +x /app/entrypoint.sh + +EXPOSE 8080 + +CMD ["/app/entrypoint.sh"] diff --git a/django-permission-cohort-postgres/README.md b/django-permission-cohort-postgres/README.md new file mode 100644 index 0000000..92016c5 --- /dev/null +++ b/django-permission-cohort-postgres/README.md @@ -0,0 +1,44 @@ +# django-permission-cohort-postgres + +Minimal Django + PostgreSQL sample that reproduces the `django_content_type` permission-lookup pattern. Each request to `/lookup/<app_label>/<model>/` clears Django's in-process `ContentType` cache and forces the SQL: + +```sql +SELECT "django_content_type"."id", + "django_content_type"."app_label", + "django_content_type"."model" + FROM "django_content_type" + WHERE "django_content_type"."app_label" = $1 + AND "django_content_type"."model" = $2 + LIMIT 21 +``` + +That query is what surfaced the postgres v3 session-fallback lifetime-gate bug in [keploy/enterprise#1952](https://github.com/keploy/enterprise/issues/1952). The sample exists so the keploy/integrations e2e lane (`.woodpecker/django-permission-cohort-postgres.yml`) can drive it end-to-end through record + replay. 
+ +## Endpoints + +| Path | Effect | +|---|---| +| `GET /health/` | `{"status":"ok"}` (used as wait-for-app gate) | +| `GET /lookup/<app_label>/<model>/` | Looks up the matching `ContentType` row, returns `{id, app_label, model}` | + +## Run standalone + +```bash +docker compose up +curl http://localhost:8080/health/ +curl http://localhost:8080/lookup/auth/user/ +curl http://localhost:8080/lookup/auth/group/ +``` + +## What it intentionally does NOT have + +- No admin +- No DRF / JWT / token auth — the bug shape doesn't need them +- No static files, no templates, no signal handlers, no celery +- No migrations beyond Django's stock contenttypes/auth tables — those are exactly what the lookup hits + +The sample is small on purpose so the failing query is the only DB traffic that matters at replay time. + +## Why CONN_MAX_AGE=0 + +Default `CONN_MAX_AGE=0` (a fresh connection per request) makes Django redo any first-connect work on every request. That keeps the per-request DB call sequence deterministic across record/replay — important for a regression sample. diff --git a/django-permission-cohort-postgres/docker-compose.yml b/django-permission-cohort-postgres/docker-compose.yml new file mode 100644 index 0000000..e9e0d5d --- /dev/null +++ b/django-permission-cohort-postgres/docker-compose.yml @@ -0,0 +1,44 @@ +services: + api: + build: + context: . + dockerfile: Dockerfile + container_name: api + ports: + - "8080:8080" + networks: + - djnet + environment: + DB_HOST: db + DB_PORT: 5432 + DB_USER: djpcohort + DB_PASSWORD: djpcohort + DB_NAME: djpcohort + # Defence-in-depth: pin Python's per-process hash seed so any + # set/dict iteration anywhere in the Django stack is stable + # across record and replay runs. 
+ PYTHONHASHSEED: "0" + depends_on: + db: + condition: service_healthy + restart: on-failure + + db: + image: postgres:16-alpine + container_name: djpcohort_db + networks: + - djnet + environment: + POSTGRES_USER: djpcohort + POSTGRES_PASSWORD: djpcohort + POSTGRES_DB: djpcohort + ports: + - "127.0.0.1:55437:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -U djpcohort -d djpcohort"] + interval: 5s + timeout: 5s + retries: 12 + +networks: + djnet: diff --git a/django-permission-cohort-postgres/entrypoint.sh b/django-permission-cohort-postgres/entrypoint.sh new file mode 100755 index 0000000..d3a4f1d --- /dev/null +++ b/django-permission-cohort-postgres/entrypoint.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# entrypoint.sh — run Django migrations once, then start gunicorn. +# +# The migrations populate django_content_type and the auth_* +# permission tables; without this the /lookup/ endpoint would have +# nothing to find. We migrate inline at container start (rather than +# in a separate init container) so the recorder captures both the +# migration sequence and the runtime queries on the same connection +# pool — keeps the record/replay traffic shape minimal. +set -Eeuo pipefail + +echo "[entrypoint] running migrations..." +python /app/manage.py migrate --noinput + +echo "[entrypoint] starting gunicorn on :8080..." +# --timeout 300: matcher round-trips through the keploy proxy can spike +# under CI load; gunicorn's default 30s SIGKILLs the worker mid-request +# and the test that landed on that worker times out client-side. 300s +# gives the proxy comfortable headroom. +# --workers 4 / --threads 2: enough concurrency that a single slow +# matcher response doesn't queue subsequent requests behind it. 
+exec gunicorn \ + --bind 0.0.0.0:8080 \ + --workers 4 \ + --threads 2 \ + --timeout 300 \ + --graceful-timeout 30 \ + --access-logfile - \ + --error-logfile - \ + --capture-output \ + --log-level info \ + myproj.wsgi:application diff --git a/django-permission-cohort-postgres/manage.py b/django-permission-cohort-postgres/manage.py new file mode 100755 index 0000000..15df926 --- /dev/null +++ b/django-permission-cohort-postgres/manage.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python +import os +import sys + + +def main(): + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "myproj.settings") + from django.core.management import execute_from_command_line + + execute_from_command_line(sys.argv) + + +if __name__ == "__main__": + main() diff --git a/django-permission-cohort-postgres/myproj/__init__.py b/django-permission-cohort-postgres/myproj/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/django-permission-cohort-postgres/myproj/apps.py b/django-permission-cohort-postgres/myproj/apps.py new file mode 100644 index 0000000..f4e84cd --- /dev/null +++ b/django-permission-cohort-postgres/myproj/apps.py @@ -0,0 +1,91 @@ +""" +AppConfig that spawns a background ContentType-lookup thread inside +each gunicorn worker. + +Why this exists +--------------- +The keploy/integrations django-permission-cohort-postgres lane is a +falsifying e2e lane for the postgres-v3 session-fallback lifetime-gate +fix (keploy/enterprise#1952). To produce the bug shape end-to-end, +the recording must capture some `SELECT django_content_type` +invocations whose request-timestamp falls *between* the surrounding +HTTP test windows. Those out-of-window captures are routed by the +keploy agent's lax-promotion path +(FilterPerTestAndLaxPromotedTierAware in keploy/keploy:pkg/util.go) +into the session pool — preserving their on-disk +`lifetime: perTest` tag. 
Replay-side, the live request that fires +during a test whose perTest cohort is empty for that SQL hash falls +through to the session-fallback path; pre-fix the lifetime gate +silently rejected the lax-promoted invocation, post-fix the +mutation-aware gate accepts it. + +The lane's HTTP-driven exerciser alone can't produce that shape: any +query fired inside an HTTP request lands inside that test's window +and ends up in its perTest cohort, never out-of-window. We need +DB activity that fires *outside* the HTTP request path. + +This thread provides that: each gunicorn worker spawns one daemon +thread that periodically clears Django's in-process ContentType +cache, runs a fresh `ContentType.objects.get(...)` (which forces the +SQL the bug surfaces on), and sleeps. The cadence (default 3s) is +slow enough not to swamp the worker pool but fast enough that the +exerciser's 6s inter-round pause captures multiple thread-fired +queries between HTTP test windows. + +The thread is gated behind the BACKGROUND_LOOKUPS env var so the +sample stays usable as a plain Django app for ad-hoc local testing. +The CI lane sets BACKGROUND_LOOKUPS=1. +""" +import logging +import os +import threading +import time + +from django.apps import AppConfig + +logger = logging.getLogger(__name__) + + +class MyProjConfig(AppConfig): + name = "myproj" + default = True + + def ready(self): + # `ready()` runs at most once per worker (Django guarantees idempotency + # within a process), so the thread spawn is safe here. Daemon=True + # ensures the thread doesn't block worker shutdown. + if os.environ.get("BACKGROUND_LOOKUPS", "0") != "1": + return + # Lookup-target cycle — different (app_label, model) pairs to widen + # the hash spectrum so the recording's session-fallback cohort isn't + # a single-shape singleton. All target the same SQL hash (different + # binds), which is the shape the lifetime-gate fix unblocks. 
+ targets = [ + ("auth", "user"), + ("auth", "group"), + ("auth", "permission"), + ("contenttypes", "contenttype"), + ] + interval_s = float(os.environ.get("BACKGROUND_LOOKUPS_INTERVAL_S", "3")) + + def loop(): + # Lazy import — apps aren't all loaded at module-import time. + from django.contrib.contenttypes.models import ContentType + + i = 0 + while True: + try: + time.sleep(interval_s) + app_label, model = targets[i % len(targets)] + ContentType.objects.clear_cache() + ContentType.objects.get(app_label=app_label, model=model) + i += 1 + except Exception as exc: # noqa: BLE001 — keep the thread alive + logger.warning("background lookup failed: %s", exc) + + threading.Thread(target=loop, daemon=True, name="bg-content-type-lookups").start() + logger.info( + "background ContentType-lookup thread started (interval=%ss, targets=%s)", + interval_s, + len(targets), + ) diff --git a/django-permission-cohort-postgres/myproj/settings.py b/django-permission-cohort-postgres/myproj/settings.py new file mode 100644 index 0000000..a586181 --- /dev/null +++ b/django-permission-cohort-postgres/myproj/settings.py @@ -0,0 +1,90 @@ +""" +Minimal Django settings for the django-permission-cohort-postgres +e2e lane (keploy/integrations). + +The app exists only to exercise the ContentType permission-lookup +shape that surfaced the postgres-v3 lifetime-gate bug fixed in this +PR. It is intentionally as small as possible: no admin, no static +files, no DRF, no migrations beyond the contenttypes/auth defaults +Django ships out of the box. Anything that doesn't help reproduce +the recorder→lax-promotion→session-fallback path is omitted. 
+""" +import os +from pathlib import Path + +BASE_DIR = Path(__file__).resolve().parent.parent + +SECRET_KEY = "ci-only-not-secret" +DEBUG = False +ALLOWED_HOSTS = ["*"] + +INSTALLED_APPS = [ + # contenttypes is the load-bearing piece — every request to the + # /lookup/ view fires SELECT django_content_type WHERE app_label=$1 + # AND model=$2, the exact query the lifetime-gate bug surfaces on. + "django.contrib.contenttypes", + "django.contrib.auth", + # myproj.apps.MyProjConfig spawns a background ContentType-lookup + # thread when BACKGROUND_LOOKUPS=1 — the keploy/integrations e2e + # lane uses this to drive DB activity *outside* the HTTP request + # path, so the recording captures session-pool-bound invocations + # the lifetime-gate fix unblocks. Off by default; ad-hoc local + # use of this sample doesn't need it. + "myproj.apps.MyProjConfig", +] + +MIDDLEWARE = [] + +ROOT_URLCONF = "myproj.urls" + +TEMPLATES = [] + +WSGI_APPLICATION = "myproj.wsgi.application" + +DATABASES = { + "default": { + "ENGINE": "django.db.backends.postgresql", + "NAME": os.environ.get("DB_NAME", "djpcohort"), + "USER": os.environ.get("DB_USER", "djpcohort"), + "PASSWORD": os.environ.get("DB_PASSWORD", "djpcohort"), + "HOST": os.environ.get("DB_HOST", "db"), + "PORT": os.environ.get("DB_PORT", "5432"), + # CONN_MAX_AGE=0 (default) means a fresh connection per request, + # which makes Django redo any first-connect work each time. We + # keep that default so the per-request DB call sequence is + # deterministic across record/replay. + "OPTIONS": { + # Skip libpq's SSLRequest preamble. The compose stack runs + # cleartext postgres; without sslmode=disable, psycopg2 + # sends the SSLRequest byte sequence and waits for an 'S' + # or 'N' response. 
Postgres replies with 'R' (auth + # request, skipping SSL negotiation entirely), keploy's + # v3 recorder/proxy logs this as "unexpected SSL + # response", and on the *first* fresh connection at + # replay time the proxy stalls waiting for an SSL + # handshake the server isn't going to complete — every + # first request to a new endpoint then hangs until the + # client-side timeout fires. Setting sslmode=disable + # makes psycopg2 skip the preamble entirely so every + # request flows on the clean cleartext path the proxy + # already handles. + "sslmode": "disable", + }, + } +} + +DEFAULT_AUTO_FIELD = "django.db.models.AutoField" + +USE_TZ = True + +LOGGING = { + "version": 1, + "disable_existing_loggers": False, + "handlers": { + "console": {"class": "logging.StreamHandler"}, + }, + "root": {"handlers": ["console"], "level": "INFO"}, + "loggers": { + "django.db.backends": {"handlers": ["console"], "level": "INFO"}, + }, +} diff --git a/django-permission-cohort-postgres/myproj/urls.py b/django-permission-cohort-postgres/myproj/urls.py new file mode 100644 index 0000000..611fc7a --- /dev/null +++ b/django-permission-cohort-postgres/myproj/urls.py @@ -0,0 +1,8 @@ +from django.urls import path + +from . 
import views + +urlpatterns = [ + path("health/", views.health), + path("lookup/<str:app_label>/<str:model>/", views.lookup), +] diff --git a/django-permission-cohort-postgres/myproj/views.py b/django-permission-cohort-postgres/myproj/views.py new file mode 100644 index 0000000..462f59c --- /dev/null +++ b/django-permission-cohort-postgres/myproj/views.py @@ -0,0 +1,131 @@ +""" +Two endpoints, both deliberately minimal: + + GET /health/ -> 200 with {"status":"ok"} + GET /lookup/<app_label>/<model>/ -> 200 with the ContentType row + +The lookup view fires the load-bearing query: + + SELECT "django_content_type"."id", + "django_content_type"."app_label", + "django_content_type"."model" + FROM "django_content_type" + WHERE ("django_content_type"."app_label" = $1 + AND "django_content_type"."model" = $2) + LIMIT 21 + +That query is `class: APP` per pgmatch.Classify, so DeriveLifetime +tags every captured invocation `LifetimePerTest`. With the lifetime +gate at pickSessionFallback (pre-fix), once a per-test cohort is +empty for the SQL hash but the agent has lax-promoted the same hash +into the session pool, the matcher misses with `candidates: 0` and +`sessionFallbackCandidates: N>0`. + +The view ALSO spawns a delayed side-query thread (see +_fire_delayed_side_query) on every request. This is the deterministic +mechanism that surfaces the lifetime-gate bug shape end-to-end — +without it the lane is regression coverage but not falsifying. +""" +import threading +import time + +from django.contrib.contenttypes.models import ContentType +from django.db import close_old_connections +from django.http import JsonResponse, HttpResponseNotFound + +# Fixed bind values for the side query. Deliberately one of the +# (app_label, model) pairs the exerciser also calls directly via +# /lookup/, so the response body is well-known and the recorded mock +# is stable. 
The exerciser fires /lookup/auth/user/ as one of its +# round members, which means at record time there's at least one +# perTest capture of this SQL+bind in some test's window — but the +# delayed-fire copies populated by every other request land between +# HTTP test windows, where the agent's lax-promotion path routes +# them into the session pool with their LifetimePerTest tag intact. +SIDE_QUERY_APP_LABEL = "auth" +SIDE_QUERY_MODEL = "user" + + +def health(_request): + return JsonResponse({"status": "ok"}) + + +def _fire_delayed_side_query(): + """ + Fire `SELECT django_content_type WHERE app_label='auth' AND + model='user'` 100 ms after the parent request's response is sent, + on a fresh DB connection (daemon thread → Django gives us a new + thread-local connection). + + Why 100 ms, why fixed binds, why daemon thread: + -------------------------------------------------------------- + The lifetime-gate bug surfaces only when, at replay time, a live + DB call lands *inside an active HTTP test window* for a SQL hash + whose perTest cohort is empty for that bind, *and* the session + pool has a lax-promoted PerTest invocation matching the bind. + + Engineering this asymmetry deterministically requires that the + same code path produces a different test-window attribution at + record vs. replay. That's exactly what a fixed-delay + post-response thread does: + + AT RECORD (exerciser fires HTTP requests ~1 s apart): + the 100 ms post-response delay puts the side query's + timestamp comfortably between test N and test N+1's + windows. Captured invocation lands in the session pool + with its LifetimePerTest tag preserved (lax-promotion). + + AT REPLAY (keploy `test` compresses pacing to ~tens of ms): + the same 100 ms delay puts the side query's timestamp + *inside* a later test's window. 
That test's perTest + cohort holds a capture for *its own* HTTP-driven bind, + not (auth, user) — so for this hash+bind the cohort is + empty, the dispatcher routes through PerTest → + SessionFallback, and the matcher's gate is consulted. + + Pre-fix matcher: lifetime-tag gate rejects every PerTest-tagged + candidate in the session pool → `transactional: no invocation + matched` ERROR → check_for_errors fails the lane. + + Post-fix matcher: mutation-eligibility gate accepts non-mutating + shape-equal candidates regardless of lifetime → serves the + recorded response → green. + + Daemon thread because gunicorn shouldn't wait on it during + worker shutdown; close_old_connections to keep Django's + thread-local connection registry clean. + """ + time.sleep(0.1) + try: + close_old_connections() + ContentType.objects.get( + app_label=SIDE_QUERY_APP_LABEL, model=SIDE_QUERY_MODEL + ) + except Exception: + # Replay-time mock-miss raises here; the parent response is + # already on the wire so we can't surface it to the client. + # The matcher logs the miss as an ERROR which the lane's + # check_for_errors assertion picks up. + pass + finally: + close_old_connections() + + +def lookup(_request, app_label, model): + # Clear the in-process ContentType cache so each call is forced to + # hit the DB. Without this, only the very first call per worker + # would emit the SQL we want under the recorder. + ContentType.objects.clear_cache() + try: + ct = ContentType.objects.get(app_label=app_label, model=model) + except ContentType.DoesNotExist: + return HttpResponseNotFound( + JsonResponse({"error": "not found"}).content, + content_type="application/json", + ) + # Kick off the delayed side query before returning. See + # _fire_delayed_side_query for the timing rationale. 
+ threading.Thread(target=_fire_delayed_side_query, daemon=True).start() + return JsonResponse( + {"id": ct.id, "app_label": ct.app_label, "model": ct.model} + ) diff --git a/django-permission-cohort-postgres/myproj/wsgi.py b/django-permission-cohort-postgres/myproj/wsgi.py new file mode 100644 index 0000000..2419403 --- /dev/null +++ b/django-permission-cohort-postgres/myproj/wsgi.py @@ -0,0 +1,7 @@ +import os + +from django.core.wsgi import get_wsgi_application + +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "myproj.settings") + +application = get_wsgi_application() diff --git a/django-permission-cohort-postgres/requirements.txt b/django-permission-cohort-postgres/requirements.txt new file mode 100644 index 0000000..e1baa50 --- /dev/null +++ b/django-permission-cohort-postgres/requirements.txt @@ -0,0 +1,3 @@ +Django==4.2.16 +gunicorn==21.2.0 +psycopg2-binary==2.9.9