From 829f44fa5325cf18c7dc5aa359f280ce870db2b1 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 17 May 2026 15:13:43 +0800 Subject: [PATCH 1/4] feat(cli): support 'openkb add ' with content-type sniffing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `openkb add` now accepts http(s) URLs in addition to local paths. URLs are fetched into raw/ first, then handed to the existing add_single_file pipeline — so PageIndex / markitdown / hash dedup / remove all work unchanged. Routing is decided by the HTTP Content-Type header, validated against magic bytes (so a CDN that mislabels a PDF as 'application/octet-stream' still routes correctly): PDF response → urllib chunked download → raw/.pdf → existing convert_document → PageIndex (≥20 pages) or markitdown (short) HTML response → trafilatura.fetch_url + extract main content → raw/.md → existing markitdown short-doc path Anything else → clear error, returns None Filename derivation: Content-Disposition header (RFC 5987 / quoted / unquoted forms) → URL basename → URL host fallback. Sanitization preserves identifier dots like arxiv's "2509.11420" (only strips an extension when it matches the target extension), replaces shell- unsafe chars (space / parens / etc.) with '-', collapses repeats, and caps the stem at 80 chars. Failure modes covered with stderr messages, no crashes: - HTTPError (404, 403, etc.) → "[ERROR] HTTP " - URLError (DNS, refused) → "[ERROR] Network error" - trafilatura returns None → "[ERROR] No main content extracted" - <300 chars extracted → "[WARN] only N chars — page may be JS-rendered" (still saved so user can inspect) - Unsupported content type → "[ERROR] Unsupported content type" Architecturally lives in openkb/url_ingest.py — peer to openkb/converter.py and openkb/indexer.py rather than bloating cli.py. cli.py only adds a 4-line intercept at the top of `add`. Live-tested against four URL shapes: - arxiv.org/abs/ → trafilatura → "Trading-R1-...md" (3 KB) - arxiv.org/pdf/ → urllib → "2509.11420v1.pdf" (2.1 MB) (server's Content-Disposition gives the version suffix for free — no special arxiv handling needed) - claude.com/blog/... → trafilatura → ".md" (1.5 KB) - CDN .pdf with %20 → urllib → sanitized filename (465 KB) 31 new tests cover all routing branches, magic-byte override paths, Content-Disposition parsing (3 forms), filename sanitization edge cases (arxiv ID, spaces, long names, empty), and graceful failure on HTTP/network errors. 360 total tests pass (329 prior + 31 new). --- README.md | 5 +- openkb/cli.py | 19 ++- openkb/url_ingest.py | 248 +++++++++++++++++++++++++++ pyproject.toml | 1 + tests/test_url_ingest.py | 361 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 631 insertions(+), 3 deletions(-) create mode 100644 openkb/url_ingest.py create mode 100644 tests/test_url_ingest.py diff --git a/README.md b/README.md index 4dd51c47..02fd401c 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,8 @@ openkb init # 3. Add documents openkb add paper.pdf -openkb add ~/papers/ # Add a whole directory +openkb add ~/papers/ # Add a whole directory +openkb add https://arxiv.org/pdf/2509.11420 # Or fetch from a URL # 4. Ask a question openkb query "What are the main findings?" @@ -148,7 +149,7 @@ A single source might touch 10-15 wiki pages. Knowledge accumulates: each docume | Command | Description | |---|---| | `openkb init` | Initialize a new knowledge base (interactive) | -| openkb add <file_or_dir> | Add documents and compile to wiki | +| openkb add <file_or_dir_or_URL> | Add documents and compile to wiki. URL ingest auto-detects PDF (saved as `.pdf` → PageIndex / markitdown) vs HTML (trafilatura main-content extract → `.md`) | | openkb remove <doc> | Remove a document and clean up its wiki pages, images, registry, and PageIndex state (use `--dry-run` to preview, `--keep-raw` / `--keep-empty-concepts` to retain artifacts) | | openkb query "question" | Ask a question over the knowledge base (use `--save` to save the answer to `wiki/explorations/`) | | `openkb chat` | Start an interactive multi-turn chat (use `--resume`, `--list`, `--delete` to manage sessions) | diff --git a/openkb/cli.py b/openkb/cli.py index 5197a930..fb749776 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -395,12 +395,29 @@ def init(language): @click.argument("path") @click.pass_context def add(ctx, path): - """Add a document or directory of documents at PATH to the knowledge base.""" + """Add a document or directory of documents at PATH to the knowledge base. + + PATH may be a local file, a local directory (which is walked + recursively for supported extensions), or an http(s) URL. URLs are + fetched into ``raw/`` first: PDF responses (by Content-Type and + magic-byte sniff) are saved as ``.pdf``; HTML responses are run + through trafilatura's main-content extractor and saved as ``.md``. + """ kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override")) if kb_dir is None: click.echo("No knowledge base found. Run `openkb init` first.") return + # URL ingest: download into raw/ first, then fall through to the + # normal file-path branch below. fetch_url_to_raw returns the + # local path it wrote, or None on failure. + from openkb.url_ingest import looks_like_url, fetch_url_to_raw + if looks_like_url(path): + fetched = fetch_url_to_raw(path, kb_dir) + if fetched is None: + return + path = str(fetched) + target = Path(path) if not target.exists(): click.echo(f"Path does not exist: {path}") diff --git a/openkb/url_ingest.py b/openkb/url_ingest.py new file mode 100644 index 00000000..85b430d4 --- /dev/null +++ b/openkb/url_ingest.py @@ -0,0 +1,248 @@ +"""Fetch remote URLs into the knowledge base's ``raw/`` directory. + +This module is the input-acquisition layer for ``openkb add ``: it +takes an http(s) URL, decides whether it points at a PDF or an HTML +document (using both the HTTP ``Content-Type`` header and a magic-byte +sniff so a mistyped header doesn't fool us), saves a local file under +``raw/``, and hands the path back to the caller. + +The caller (``openkb.cli.add``) then dispatches the saved path to the +normal local-file ingest pipeline (``add_single_file`` → +``convert_document`` → markitdown / PageIndex), so all the existing +short-vs-long-doc routing applies automatically based on the file +extension and page count. + +PDF responses are streamed to disk in chunks (large papers can be tens +of MB). HTML responses are passed through trafilatura's main-content +extractor — saving the raw HTML directly would feed nav/footer/cookie +chrome into the LLM and produce noisy summaries. +""" +from __future__ import annotations + +import re +import urllib.error +import urllib.request +from pathlib import Path +from urllib.parse import unquote, urlparse + +import click + +_USER_AGENT = "openkb/url-fetcher (+https://github.com/VectifyAI/OpenKB)" +_TIMEOUT_SECONDS = 30 +_CHUNK_BYTES = 64 * 1024 +_SNIFF_BYTES = 512 +_HTML_MIN_EXTRACT_CHARS = 300 +_MAX_FILENAME_STEM = 80 + + +def looks_like_url(s: str) -> bool: + """Cheap check used by ``openkb add`` to branch into URL ingest.""" + return s.startswith(("http://", "https://")) + + +def _sniff_content_type(head: bytes, declared: str) -> str: + """Return ``'pdf'``, ``'html'``, or ``'unknown'`` based on magic bytes, + falling back to the server's declared Content-Type. + + Magic bytes win when they disagree with the header (some servers + mislabel PDFs as ``application/octet-stream`` or send HTML + interstitial pages with ``application/pdf``). + """ + if head.startswith(b"%PDF-"): + return "pdf" + stripped = head.lstrip(b" \t\r\n\xef\xbb\xbf") # strip BOM + leading whitespace + if stripped[:1] == b"<": + return "html" + declared_main = declared.split(";")[0].strip().lower() + if declared_main == "application/pdf": + return "pdf" + if declared_main.startswith("text/html") or declared_main == "application/xhtml+xml": + return "html" + return "unknown" + + +def _sanitize_filename(name: str, ext: str) -> str: + """Make a filename safe for shell + filesystem use. + + - URL-decodes percent escapes. + - Strips the existing extension **only when it matches the target + ``ext``** — so ``"2509.11420"`` keeps its dot when we're saving a + ``.pdf`` (the dot is part of the arxiv identifier, not an + extension). + - Replaces whitespace / parentheses / other non-``[a-zA-Z0-9._-]`` + chars with ``-``, collapses repeated ``-``, and trims. + - Caps the stem at 80 chars to avoid filesystem limits. + - Returns ````, falling back to ``document`` if the + sanitized stem is empty. + """ + decoded = unquote(name) + if ext and decoded.lower().endswith(ext.lower()): + stem = decoded[: -len(ext)] + else: + stem = decoded + stem = re.sub(r"[^a-zA-Z0-9._-]+", "-", stem) + stem = re.sub(r"-+", "-", stem).strip("-._") + stem = stem[:_MAX_FILENAME_STEM].rstrip("-._") + return f"{stem}{ext}" if stem else f"document{ext}" + + +def _parse_content_disposition_filename(header: str | None) -> str | None: + """Extract a filename hint from a ``Content-Disposition`` header. + + Handles three forms (in priority order): + + 1. ``filename*=UTF-8''percent-encoded`` (RFC 5987) + 2. ``filename="quoted with spaces.pdf"`` + 3. ``filename=unquoted-no-spaces.pdf`` + """ + if not header: + return None + # RFC 5987 extended form first + m = re.search(r"filename\*=(?:[\w-]+'[\w-]*')?([^;]+)", header) + if m: + return unquote(m.group(1).strip()) + # Quoted form (may contain spaces / parens / commas) + m = re.search(r'filename="([^"]+)"', header) + if m: + return m.group(1) + # Unquoted form (stops at whitespace or semicolon) + m = re.search(r"filename=([^\s;]+)", header) + if m: + return m.group(1) + return None + + +def _pdf_filename(url: str, content_disposition: str | None) -> str: + """Derive a ``.pdf`` filename for a downloaded PDF. + + Priority: ``Content-Disposition: filename=`` header → URL basename → + URL host. The result is run through :func:`_sanitize_filename`. + """ + cd_name = _parse_content_disposition_filename(content_disposition) + if cd_name: + return _sanitize_filename(cd_name, ".pdf") + parsed = urlparse(url) + last = (parsed.path.rsplit("/", 1)[-1] or parsed.netloc).strip() + return _sanitize_filename(last or "document", ".pdf") + + +def _download_pdf_chunked(response, head_bytes: bytes, target: Path) -> None: + """Write the already-read ``head_bytes`` plus the remaining streamed + body to ``target``. Chunked so very large PDFs (50+ MB) don't sit in + RAM. + """ + with open(target, "wb") as fh: + if head_bytes: + fh.write(head_bytes) + while True: + chunk = response.read(_CHUNK_BYTES) + if not chunk: + break + fh.write(chunk) + + +def _extract_html(url: str, raw_dir: Path) -> Path | None: + """Fetch the URL through trafilatura, extract the main content as + Markdown, and write it to ``raw/.md``. + + Returns the saved path, or None if extraction failed entirely. A + short extraction (< 300 chars) is saved anyway but flagged on + stderr — pages that are JS-rendered or paywalled often produce + near-empty extractions. + """ + import trafilatura + + raw_html = trafilatura.fetch_url(url) + if not raw_html: + click.echo(f" [ERROR] Could not fetch URL: {url}", err=True) + return None + + markdown = trafilatura.extract( + raw_html, output_format="markdown", include_links=True, + ) + if not markdown: + click.echo( + " [ERROR] No main content extracted — page may be empty, " + "JS-rendered, or paywalled.", + err=True, + ) + return None + + if len(markdown) < _HTML_MIN_EXTRACT_CHARS: + click.echo( + f" [WARN] Only {len(markdown)} chars extracted — page may be " + f"JS-rendered or behind a login. Saving anyway; inspect the " + f"resulting wiki entry and use `openkb remove` if it's empty.", + err=True, + ) + + metadata = trafilatura.extract_metadata(raw_html) + title = (metadata.title if metadata else None) or url + filename = _sanitize_filename(title, ".md") + target = raw_dir / filename + target.write_text(markdown, encoding="utf-8") + click.echo( + f" Extracted: {title!r}\n" + f" Saved: raw/{filename} ({len(markdown) // 1024 or 1} KB clean markdown)" + ) + return target + + +def fetch_url_to_raw(url: str, kb_dir: Path) -> Path | None: + """Fetch ``url`` into ``/raw/`` and return the local path. + + Routing is decided by HTTP ``Content-Type`` validated against magic + bytes (in case the server lies): + + - PDF → urllib chunked download → ``raw/.pdf`` + - HTML → trafilatura main-content extract → ``raw/.md`` + - anything else → error, returns None + + The caller then hands the saved path to ``add_single_file``, so the + existing PageIndex / markitdown routing by file extension and page + count takes over from there. + """ + raw_dir = kb_dir / "raw" + raw_dir.mkdir(parents=True, exist_ok=True) + + click.echo(f"Downloading: {url}") + + request = urllib.request.Request( + url, headers={"User-Agent": _USER_AGENT, "Accept": "*/*"}, + ) + try: + response = urllib.request.urlopen(request, timeout=_TIMEOUT_SECONDS) + except urllib.error.HTTPError as exc: + click.echo(f" [ERROR] HTTP {exc.code} {exc.reason}", err=True) + return None + except urllib.error.URLError as exc: + click.echo(f" [ERROR] Network error: {exc.reason}", err=True) + return None + except Exception as exc: + click.echo(f" [ERROR] Fetch failed: {exc}", err=True) + return None + + with response: + declared = response.headers.get("Content-Type", "") or "" + head_bytes = response.read(_SNIFF_BYTES) + actual = _sniff_content_type(head_bytes, declared) + + if actual == "pdf": + filename = _pdf_filename( + url, response.headers.get("Content-Disposition"), + ) + target = raw_dir / filename + _download_pdf_chunked(response, head_bytes, target) + size_mb = target.stat().st_size / (1024 * 1024) + click.echo(f" Saved: raw/{filename} ({size_mb:.1f} MB PDF)") + return target + + if actual == "html": + return _extract_html(url, raw_dir) + + click.echo( + f" [ERROR] Unsupported content type {declared!r} for URL ingest. " + "Download the file manually and pass its path to `openkb add` instead.", + err=True, + ) + return None diff --git a/pyproject.toml b/pyproject.toml index 7fec212c..1d0ec85a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ keywords = ["ai", "rag", "retrieval", "knowledge-base", "llm", "pageindex", "age dependencies = [ "pageindex==0.3.0.dev1", "markitdown[all]", + "trafilatura>=2.0", "click>=8.0", "watchdog>=3.0", "litellm", diff --git a/tests/test_url_ingest.py b/tests/test_url_ingest.py new file mode 100644 index 00000000..efe0ef64 --- /dev/null +++ b/tests/test_url_ingest.py @@ -0,0 +1,361 @@ +"""Tests for `openkb.url_ingest` — the URL → raw/ input-acquisition layer.""" +from __future__ import annotations + +import io +from pathlib import Path +from unittest.mock import MagicMock, patch + +from openkb.url_ingest import ( + _parse_content_disposition_filename, + _pdf_filename, + _sanitize_filename, + _sniff_content_type, + fetch_url_to_raw, + looks_like_url, +) + + +# --------------------------------------------------------------------------- +# Pure helpers (no I/O) +# --------------------------------------------------------------------------- + + +def test_looks_like_url_accepts_http_and_https(): + assert looks_like_url("http://example.com") is True + assert looks_like_url("https://example.com/foo") is True + + +def test_looks_like_url_rejects_paths_and_filenames(): + assert looks_like_url("/tmp/foo.pdf") is False + assert looks_like_url("./relative") is False + assert looks_like_url("foo.pdf") is False + assert looks_like_url("") is False + + +# Content-type sniffing — magic bytes override declared header + + +def test_sniff_pdf_magic_wins_over_octet_stream(): + """Some CDNs mislabel PDFs as application/octet-stream — magic bytes save us.""" + assert _sniff_content_type(b"%PDF-1.4\n...", "application/octet-stream") == "pdf" + + +def test_sniff_html_magic_wins_over_pdf_header(): + """Some servers serve an HTML interstitial 'click to download' page with + Content-Type: application/pdf. Magic bytes must override.""" + assert _sniff_content_type(b"", "application/pdf") == "html" + + +def test_sniff_html_magic_handles_bom_and_whitespace(): + assert _sniff_content_type(b"\xef\xbb\xbf", "") == "html" + assert _sniff_content_type(b" \n", "") == "html" + + +def test_sniff_falls_back_to_declared_when_no_magic_match(): + assert _sniff_content_type(b"\x00\x00", "application/pdf") == "pdf" + assert _sniff_content_type(b"random", "text/html; charset=utf-8") == "html" + assert _sniff_content_type(b"random", "application/xhtml+xml") == "html" + + +def test_sniff_returns_unknown_for_unsupported_types(): + assert _sniff_content_type(b"binary", "image/jpeg") == "unknown" + assert _sniff_content_type(b"binary", "application/json") == "unknown" + assert _sniff_content_type(b"binary", "") == "unknown" + + +# Filename sanitization + + +def test_sanitize_preserves_arxiv_id_with_dot(): + """The dot in arxiv's `2509.11420` is part of the identifier, not an + extension. `_sanitize_filename` must not strip it when re-adding `.pdf`.""" + assert _sanitize_filename("2509.11420", ".pdf") == "2509.11420.pdf" + + +def test_sanitize_strips_matching_extension_then_re_adds_it(): + assert _sanitize_filename("paper.pdf", ".pdf") == "paper.pdf" + assert _sanitize_filename("paper.PDF", ".pdf") == "paper.pdf" + + +def test_sanitize_replaces_shell_unsafe_chars(): + assert _sanitize_filename("hello world (1).pdf", ".pdf") == "hello-world-1.pdf" + assert _sanitize_filename("a:b/c\\d?e*f", ".md") == "a-b-c-d-e-f.md" + + +def test_sanitize_collapses_repeated_dashes_and_trims(): + # Underscores are allowed (part of [a-zA-Z0-9._-]) so they pass through; + # only sequences of non-allowed chars become dashes, and repeated dashes + # collapse to one. Leading/trailing dashes/dots/underscores are stripped. + assert _sanitize_filename("a___b---c", ".pdf") == "a___b-c.pdf" + assert _sanitize_filename("---trim---", ".md") == "trim.md" + assert _sanitize_filename("a b c d", ".md") == "a-b-c-d.md" + + +def test_sanitize_caps_stem_at_80_chars(): + name = "a" * 200 + assert _sanitize_filename(name, ".pdf") == ("a" * 80) + ".pdf" + + +def test_sanitize_falls_back_to_document_when_empty(): + assert _sanitize_filename("", ".md") == "document.md" + assert _sanitize_filename("...", ".md") == "document.md" + assert _sanitize_filename("///", ".pdf") == "document.pdf" + + +# Content-Disposition parsing + + +def test_content_disposition_quoted_with_spaces(): + """Quoted form must capture filenames with spaces / parens / commas.""" + cd = 'attachment; filename="My Paper, v3 (final).pdf"' + assert _parse_content_disposition_filename(cd) == "My Paper, v3 (final).pdf" + + +def test_content_disposition_unquoted_simple(): + assert _parse_content_disposition_filename("attachment; filename=foo.pdf") == "foo.pdf" + + +def test_content_disposition_rfc5987_extended(): + """filename*=UTF-8'' is the modern form for non-ASCII.""" + cd = "attachment; filename*=UTF-8''My%20Paper%20%C3%A9.pdf" + assert _parse_content_disposition_filename(cd) == "My Paper é.pdf" + + +def test_content_disposition_none_or_missing_filename(): + assert _parse_content_disposition_filename(None) is None + assert _parse_content_disposition_filename("attachment") is None + assert _parse_content_disposition_filename("inline") is None + + +# _pdf_filename: header → URL basename fallback chain + + +def test_pdf_filename_prefers_content_disposition(): + name = _pdf_filename( + "https://x.com/dl?id=123", + 'attachment; filename="My Paper.pdf"', + ) + assert name == "My-Paper.pdf" + + +def test_pdf_filename_falls_back_to_url_basename(): + name = _pdf_filename( + "https://cdn.example.com/abc/69fe2a55_The-Founders-Playbook-05062026_v3%20(1).pdf", + None, + ) + assert name == "69fe2a55_The-Founders-Playbook-05062026_v3-1.pdf" + + +def test_pdf_filename_handles_arxiv_pdf_url_without_extension(): + """arxiv's PDF URL `arxiv.org/pdf/2509.11420` ends without `.pdf` — the + content-type tells us it's a PDF and `_sanitize_filename` must keep + the dot in the arxiv ID rather than treating it as an extension.""" + name = _pdf_filename("https://arxiv.org/pdf/2509.11420", None) + assert name == "2509.11420.pdf" + + +def test_pdf_filename_falls_back_to_host_when_path_empty(): + name = _pdf_filename("https://example.com/", None) + assert name == "example.com.pdf" + + +# --------------------------------------------------------------------------- +# fetch_url_to_raw — integration with urllib + trafilatura mocked +# --------------------------------------------------------------------------- + + +def _fake_response(*, body: bytes, headers: dict[str, str]): + """Build a fake urllib response with the given body + headers. + + Headers are case-insensitive in real responses; mimicking that here + so the test doesn't depend on which case `_fetch_url_to_raw` looks up. + """ + class _Headers: + def __init__(self, d): + self._d = {k.lower(): v for k, v in d.items()} + + def get(self, key, default=None): + return self._d.get(key.lower(), default) + + resp = MagicMock() + resp.headers = _Headers(headers) + # read(N) returns chunks; read() with no arg returns rest + stream = io.BytesIO(body) + resp.read = stream.read + resp.__enter__ = lambda self: resp + resp.__exit__ = lambda *a: None + return resp + + +def test_fetch_pdf_writes_chunked_to_raw_dir(tmp_path): + """End-to-end PDF path: urlopen → magic-byte sniff → chunked write → + filename comes from URL basename.""" + body = b"%PDF-1.4\n" + b"x" * 100_000 # 100 KB PDF + resp = _fake_response( + body=body, + headers={"Content-Type": "application/pdf"}, + ) + + with patch("urllib.request.urlopen", return_value=resp): + result = fetch_url_to_raw("https://arxiv.org/pdf/2509.11420", tmp_path) + + assert result is not None + assert result.name == "2509.11420.pdf" + assert result.exists() + assert result.read_bytes() == body + + +def test_fetch_pdf_with_lying_octet_stream_header(tmp_path): + """Server says octet-stream but the body starts with %PDF — magic bytes + must win and the file gets the .pdf extension.""" + body = b"%PDF-1.7\n" + b"\x00" * 1000 + resp = _fake_response( + body=body, + headers={"Content-Type": "application/octet-stream"}, + ) + + with patch("urllib.request.urlopen", return_value=resp): + result = fetch_url_to_raw("https://cdn.example.com/a/b/file", tmp_path) + + assert result is not None + assert result.suffix == ".pdf" + assert result.read_bytes() == body + + +def test_fetch_pdf_chunks_a_very_large_body(tmp_path): + """A 1 MB synthetic body still writes correctly via chunked reads.""" + body = b"%PDF-1.4\n" + b"a" * (1024 * 1024) + resp = _fake_response(body=body, headers={"Content-Type": "application/pdf"}) + + with patch("urllib.request.urlopen", return_value=resp): + result = fetch_url_to_raw("https://x.com/big.pdf", tmp_path) + + assert result is not None + assert result.stat().st_size == len(body) + + +def test_fetch_pdf_uses_content_disposition_filename(tmp_path): + body = b"%PDF-1.4\n..." + resp = _fake_response( + body=body, + headers={ + "Content-Type": "application/pdf", + "Content-Disposition": 'attachment; filename="My Paper, v3.pdf"', + }, + ) + + with patch("urllib.request.urlopen", return_value=resp): + result = fetch_url_to_raw("https://x.com/dl?id=1", tmp_path) + + # comma sanitized to dash + assert result.name == "My-Paper-v3.pdf" + + +def test_fetch_html_routes_to_trafilatura(tmp_path): + """HTML responses skip urllib's body (we already consumed the sniff + head) and hand the URL to trafilatura.fetch_url for proper anti-scrape + handling. trafilatura.extract gives clean markdown which we save as .md.""" + sniff_head = b"\n..." + resp = _fake_response( + body=sniff_head + b"nav nav nav", + headers={"Content-Type": "text/html; charset=utf-8"}, + ) + + fake_md = "# Real Article Title\n\nThis is the body content. " * 20 # ~1 KB + fake_meta = MagicMock() + fake_meta.title = "Real Article Title" + + with patch("urllib.request.urlopen", return_value=resp), \ + patch("trafilatura.fetch_url", return_value="...the real HTML..."), \ + patch("trafilatura.extract", return_value=fake_md), \ + patch("trafilatura.extract_metadata", return_value=fake_meta): + result = fetch_url_to_raw("https://blog.example.com/post", tmp_path) + + assert result is not None + assert result.name == "Real-Article-Title.md" + assert result.read_text(encoding="utf-8") == fake_md + + +def test_fetch_html_warns_on_short_extraction(tmp_path, capsys): + """JS-rendered pages produce near-empty extractions. The save still + happens (so the user can inspect what we got) but a stderr warning + surfaces the suspicion.""" + sniff_head = b"" + resp = _fake_response(body=sniff_head, headers={"Content-Type": "text/html"}) + + short_md = "# Title only" # 11 chars, well under 300 + fake_meta = MagicMock() + fake_meta.title = "Title only" + + with patch("urllib.request.urlopen", return_value=resp), \ + patch("trafilatura.fetch_url", return_value="shell"), \ + patch("trafilatura.extract", return_value=short_md), \ + patch("trafilatura.extract_metadata", return_value=fake_meta): + result = fetch_url_to_raw("https://spa.example.com/page", tmp_path) + + assert result is not None + assert result.read_text() == short_md + err = capsys.readouterr().err + assert "[WARN]" in err + assert f"{len(short_md)} chars extracted" in err + + +def test_fetch_html_aborts_when_trafilatura_extracts_nothing(tmp_path): + """trafilatura.extract returning None means the page is essentially + empty (JS-only, paywall HTML, etc.). We error out rather than save + an empty .md.""" + sniff_head = b"" + resp = _fake_response(body=sniff_head, headers={"Content-Type": "text/html"}) + + with patch("urllib.request.urlopen", return_value=resp), \ + patch("trafilatura.fetch_url", return_value="empty"), \ + patch("trafilatura.extract", return_value=None): + result = fetch_url_to_raw("https://js-only.example.com", tmp_path) + + assert result is None + + +def test_fetch_unsupported_content_type_rejected(tmp_path, capsys): + """JSON / image / etc. — refuse with a clear message rather than + saving binary garbage as `.html` or `.pdf`.""" + resp = _fake_response( + body=b'{"foo": "bar"}', + headers={"Content-Type": "application/json"}, + ) + + with patch("urllib.request.urlopen", return_value=resp): + result = fetch_url_to_raw("https://api.example.com/data.json", tmp_path) + + assert result is None + err = capsys.readouterr().err + assert "Unsupported content type" in err + + +def test_fetch_http_404_returns_none(tmp_path, capsys): + """Server errors don't crash — graceful failure with stderr message.""" + import urllib.error + err_resp = urllib.error.HTTPError( + "https://x.com/missing", 404, "Not Found", {}, None, + ) + + with patch("urllib.request.urlopen", side_effect=err_resp): + result = fetch_url_to_raw("https://x.com/missing", tmp_path) + + assert result is None + err = capsys.readouterr().err + assert "HTTP 404" in err + + +def test_fetch_network_error_returns_none(tmp_path, capsys): + """DNS failure / connection refused — graceful with clear message.""" + import urllib.error + + with patch( + "urllib.request.urlopen", + side_effect=urllib.error.URLError("nodename nor servname provided"), + ): + result = fetch_url_to_raw("https://no-such-host.invalid/foo", tmp_path) + + assert result is None + err = capsys.readouterr().err + assert "Network error" in err From 47e5ec222a53796d489d594c7e73dc91fbf32c12 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 17 May 2026 16:10:39 +0800 Subject: [PATCH 2/4] fix(url-ingest): address PR #55 self-review findings - _unique_path() prevents silent overwrites in raw/ when two URLs sanitize to the same filename (applied at both PDF and HTML write sites) - PDF filename now derives from response.geturl() (the post-redirect URL), so DOI/shortlink resolvers produce sensible names - add_single_file() returns bool; URL branch unlinks the fetched raw/* file when the downstream add is skipped (dedup), preventing orphan files that re-resurrect on every retry --- openkb/cli.py | 35 +++++--- openkb/url_ingest.py | 40 ++++++++- tests/test_url_ingest.py | 182 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 243 insertions(+), 14 deletions(-) diff --git a/openkb/cli.py b/openkb/cli.py index fb749776..20f644b9 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -130,7 +130,7 @@ def _find_kb_dir(override: Path | None = None) -> Path | None: return None -def add_single_file(file_path: Path, kb_dir: Path) -> None: +def add_single_file(file_path: Path, kb_dir: Path) -> bool: """Convert, index, and compile a single document into the knowledge base. Steps: @@ -138,6 +138,13 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None: 2. Convert the document (hash-check; skip if already known). 3. If long doc: run PageIndex then compile_long_doc. 4. Else: compile_short_doc. + + Returns: + True if the file was newly indexed and compiled. False when the + file was skipped as a duplicate (hash already registered) or + when any pipeline stage failed — callers that need to clean up + the source file (URL-ingest path, for example) can act on a + ``False`` return. """ from openkb.agent.compiler import compile_long_doc, compile_short_doc from openkb.state import HashRegistry @@ -156,11 +163,11 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None: except Exception as exc: click.echo(f" [ERROR] Conversion failed: {exc}") logger.debug("Conversion traceback:", exc_info=True) - return + return False if result.skipped: click.echo(f" [SKIP] Already in knowledge base: {file_path.name}") - return + return False doc_name = file_path.stem index_result = None # populated only on the long-doc branch @@ -174,7 +181,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None: except Exception as exc: click.echo(f" [ERROR] Indexing failed: {exc}") logger.debug("Indexing traceback:", exc_info=True) - return + return False summary_path = kb_dir / "wiki" / "summaries" / f"{doc_name}.md" click.echo(f" Compiling long doc (doc_id={index_result.doc_id})...") @@ -192,7 +199,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None: else: click.echo(f" [ERROR] Compilation failed: {exc}") logger.debug("Compilation traceback:", exc_info=True) - return + return False else: click.echo(f" Compiling short doc...") for attempt in range(2): @@ -206,7 +213,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None: else: click.echo(f" [ERROR] Compilation failed: {exc}") logger.debug("Compilation traceback:", exc_info=True) - return + return False # Register hash only after successful compilation if result.file_hash: @@ -225,6 +232,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None: append_log(kb_dir / "wiki", "ingest", file_path.name) click.echo(f" [OK] {file_path.name} added to knowledge base.") + return True # --------------------------------------------------------------------------- @@ -408,15 +416,22 @@ def add(ctx, path): click.echo("No knowledge base found. Run `openkb init` first.") return - # URL ingest: download into raw/ first, then fall through to the - # normal file-path branch below. fetch_url_to_raw returns the - # local path it wrote, or None on failure. + # URL ingest: download into raw/ first, then call add_single_file + # explicitly so we can clean up the just-downloaded file if it + # turns out to be a duplicate (registry already has its hash). + # Without this, re-adding the same URL leaves an orphan in raw/ + # that the registry can't reach via openkb remove. from openkb.url_ingest import looks_like_url, fetch_url_to_raw if looks_like_url(path): fetched = fetch_url_to_raw(path, kb_dir) if fetched is None: return - path = str(fetched) + added = add_single_file(fetched, kb_dir) + if not added: + # Duplicate (or failed mid-pipeline) — drop the just-fetched + # file so raw/ doesn't accumulate orphans across retries. + fetched.unlink(missing_ok=True) + return target = Path(path) if not target.exists(): diff --git a/openkb/url_ingest.py b/openkb/url_ingest.py index 85b430d4..6cd444ec 100644 --- a/openkb/url_ingest.py +++ b/openkb/url_ingest.py @@ -126,6 +126,28 @@ def _pdf_filename(url: str, content_disposition: str | None) -> str: return _sanitize_filename(last or "document", ".pdf") +def _unique_path(target: Path) -> Path: + """Return ``target`` if it doesn't exist yet, otherwise append ``_2``, + ``_3``, … to the stem until an unused name is found. + + Prevents silent overwrites in ``raw/`` when two different URLs + sanitize to the same filename (e.g. two blog posts both titled + "Introduction" → both ``Introduction.md``). + """ + if not target.exists(): + return target + stem = target.stem + suffix = target.suffix + parent = target.parent + for i in range(2, 10_000): + candidate = parent / f"{stem}_{i}{suffix}" + if not candidate.exists(): + return candidate + raise RuntimeError( + f"Could not find a free filename for {target} after 10k attempts" + ) + + def _download_pdf_chunked(response, head_bytes: bytes, target: Path) -> None: """Write the already-read ``head_bytes`` plus the remaining streamed body to ``target``. Chunked so very large PDFs (50+ MB) don't sit in @@ -179,7 +201,10 @@ def _extract_html(url: str, raw_dir: Path) -> Path | None: metadata = trafilatura.extract_metadata(raw_html) title = (metadata.title if metadata else None) or url filename = _sanitize_filename(title, ".md") - target = raw_dir / filename + # Pick a non-colliding name — two blog posts titled "Introduction" + # would otherwise overwrite each other in raw/ and leave the hash + # registry pointing at stale bytes. + target = _unique_path(raw_dir / filename) target.write_text(markdown, encoding="utf-8") click.echo( f" Extracted: {title!r}\n" @@ -228,13 +253,20 @@ def fetch_url_to_raw(url: str, kb_dir: Path) -> Path | None: actual = _sniff_content_type(head_bytes, declared) if actual == "pdf": + # Derive the filename from the *post-redirect* URL — urllib + # follows redirects by default, so the user-typed URL may + # not be the URL that actually served the bytes (DOI / short + # link resolvers, mirror redirects, etc.). Falls back to the + # original input when the response doesn't expose a final + # URL. + final_url = response.geturl() or url filename = _pdf_filename( - url, response.headers.get("Content-Disposition"), + final_url, response.headers.get("Content-Disposition"), ) - target = raw_dir / filename + target = _unique_path(raw_dir / filename) _download_pdf_chunked(response, head_bytes, target) size_mb = target.stat().st_size / (1024 * 1024) - click.echo(f" Saved: raw/{filename} ({size_mb:.1f} MB PDF)") + click.echo(f" Saved: raw/{target.name} ({size_mb:.1f} MB PDF)") return target if actual == "html": diff --git a/tests/test_url_ingest.py b/tests/test_url_ingest.py index efe0ef64..4ea7d0de 100644 --- a/tests/test_url_ingest.py +++ b/tests/test_url_ingest.py @@ -10,6 +10,7 @@ _pdf_filename, _sanitize_filename, _sniff_content_type, + _unique_path, fetch_url_to_raw, looks_like_url, ) @@ -182,6 +183,9 @@ def get(self, key, default=None): # read(N) returns chunks; read() with no arg returns rest stream = io.BytesIO(body) resp.read = stream.read + # urllib's HTTPResponse exposes geturl(); default to empty string so + # callers using `response.geturl() or url` fall through to the input URL. + resp.geturl = lambda: "" resp.__enter__ = lambda self: resp resp.__exit__ = lambda *a: None return resp @@ -359,3 +363,181 @@ def test_fetch_network_error_returns_none(tmp_path, capsys): assert result is None err = capsys.readouterr().err assert "Network error" in err + + +# --------------------------------------------------------------------------- +# Self-review fixes from PR #55 review pass +# --------------------------------------------------------------------------- + + +def test_unique_path_returns_target_when_free(tmp_path): + p = tmp_path / "foo.pdf" + assert _unique_path(p) == p + + +def test_unique_path_finds_next_free_slot(tmp_path): + """_2 / _3 / … must be appended to the stem (not the suffix) and + must keep probing until a free name is found.""" + (tmp_path / "foo.pdf").write_bytes(b"a") + (tmp_path / "foo_2.pdf").write_bytes(b"b") + + result = _unique_path(tmp_path / "foo.pdf") + assert result == tmp_path / "foo_3.pdf" + + +def test_unique_path_handles_no_suffix(tmp_path): + """Files without an extension still get a usable suffix.""" + (tmp_path / "README").write_text("x") + result = _unique_path(tmp_path / "README") + assert result.name == "README_2" + + +def test_fetch_pdf_picks_unique_name_when_target_exists(tmp_path): + """Two URLs that sanitize to the same filename must NOT silently + overwrite — the second one gets `_2` appended.""" + raw_dir = tmp_path / "raw" + raw_dir.mkdir() + (raw_dir / "paper.pdf").write_bytes(b"%PDF-existing\nfirst") + + body = b"%PDF-1.7\nsecond URL content" + resp = _fake_response(body=body, headers={"Content-Type": "application/pdf"}) + # Make response.geturl mimic real urllib (returns the input URL when + # there's no redirect). The fake response builder doesn't set this. + resp.geturl = lambda: "https://mirror.example.com/paper.pdf" + + with patch("urllib.request.urlopen", return_value=resp): + result = fetch_url_to_raw("https://mirror.example.com/paper.pdf", tmp_path) + + # First file is untouched, second went to paper_2.pdf + assert (raw_dir / "paper.pdf").read_bytes() == b"%PDF-existing\nfirst" + assert result == raw_dir / "paper_2.pdf" + assert result.read_bytes() == body + + +def test_fetch_html_picks_unique_name_when_target_exists(tmp_path): + """Two blog posts both titled 'Introduction' must NOT collide.""" + raw_dir = tmp_path / "raw" + raw_dir.mkdir() + (raw_dir / "Introduction.md").write_text("first blog post body") + + resp = _fake_response(body=b"", headers={"Content-Type": "text/html"}) + + second_md = "# Introduction\n\nA completely different second blog post body. " * 10 + fake_meta = MagicMock() + fake_meta.title = "Introduction" + + with patch("urllib.request.urlopen", return_value=resp), \ + patch("trafilatura.fetch_url", return_value="..."), \ + patch("trafilatura.extract", return_value=second_md), \ + patch("trafilatura.extract_metadata", return_value=fake_meta): + result = fetch_url_to_raw("https://blog2.example.com/post", tmp_path) + + assert (raw_dir / "Introduction.md").read_text() == "first blog post body" + assert result == raw_dir / "Introduction_2.md" + assert result.read_text() == second_md + + +def test_fetch_pdf_uses_post_redirect_url_for_filename(tmp_path): + """When urllib follows a redirect (DOI → publisher CDN, short URLs, + etc.), the filename must be derived from the final URL — not the + user's original input — when the response has no Content-Disposition + to override either.""" + body = b"%PDF-1.7\n..." + resp = _fake_response( + body=body, + headers={"Content-Type": "application/pdf"}, # NO Content-Disposition + ) + # urllib's HTTPResponse.geturl() returns the post-redirect URL + resp.geturl = lambda: "https://publisher.example.com/articles/2024/great-paper.pdf" + + with patch("urllib.request.urlopen", return_value=resp): + result = fetch_url_to_raw("https://doi.org/10.1234/abc", tmp_path) + + # Filename comes from the redirected URL's basename, not "abc" + assert result is not None + assert result.name == "great-paper.pdf" + + +def test_add_single_file_returns_true_on_success(tmp_path): + """The new bool return contract: True when the file was actually + indexed. Used by the URL-ingest cleanup path to decide whether the + just-downloaded file in raw/ should be unlinked.""" + from openkb.cli import add_single_file + from openkb.converter import ConvertResult + + # Build a minimal KB scaffold + (tmp_path / ".openkb").mkdir() + (tmp_path / ".openkb" / "config.yaml").write_text("model: gpt-4o-mini\n") + (tmp_path / ".openkb" / "hashes.json").write_text("{}") + (tmp_path / "raw").mkdir() + (tmp_path / "wiki" / "summaries").mkdir(parents=True) + (tmp_path / "wiki" / "sources").mkdir(parents=True) + (tmp_path / "wiki" / "concepts").mkdir(parents=True) + (tmp_path / "wiki" / "log.md").write_text("") + + doc = tmp_path / "raw" / "x.md" + doc.write_text("# Hello") + source_path = tmp_path / "wiki" / "sources" / "x.md" + source_path.write_text("# Hello converted") + + mock_result = ConvertResult( + raw_path=doc, source_path=source_path, + is_long_doc=False, file_hash="cafe" * 16, + ) + + with patch("openkb.cli.convert_document", return_value=mock_result), \ + patch("openkb.cli.asyncio.run"): + added = add_single_file(doc, tmp_path) + + assert added is True + + +def test_add_single_file_returns_false_on_skip(tmp_path): + from openkb.cli import add_single_file + from openkb.converter import ConvertResult + + (tmp_path / ".openkb").mkdir() + (tmp_path / ".openkb" / "config.yaml").write_text("model: gpt-4o-mini\n") + (tmp_path / ".openkb" / "hashes.json").write_text("{}") + (tmp_path / "raw").mkdir() + doc = tmp_path / "raw" / "x.md" + doc.write_text("# Hello") + + skipped = ConvertResult(skipped=True) + with patch("openkb.cli.convert_document", return_value=skipped): + added = add_single_file(doc, tmp_path) + + assert added is False + + +def test_url_ingest_cleans_up_orphan_on_dedup_skip(tmp_path, monkeypatch): + """End-to-end: when the URL-fetched file is already in the registry, + add_single_file returns False and the CLI must unlink it from raw/ + so the user doesn't accumulate untracked duplicates.""" + from click.testing import CliRunner + from openkb.cli import cli + from openkb.converter import ConvertResult + + # Minimal KB + (tmp_path / ".openkb").mkdir() + (tmp_path / ".openkb" / "config.yaml").write_text("model: gpt-4o-mini\n") + (tmp_path / ".openkb" / "hashes.json").write_text("{}") + (tmp_path / "raw").mkdir() + + # Fake the URL fetch — write directly to where url_ingest would + fetched_path = tmp_path / "raw" / "paper.pdf" + fetched_path.write_bytes(b"%PDF-fake") + + runner = CliRunner() + # fetch_url_to_raw is lazy-imported inside `add`, so patch it at the + # source module — that's where the `from ... import` resolves. + with patch("openkb.cli._find_kb_dir", return_value=tmp_path), \ + patch("openkb.url_ingest.fetch_url_to_raw", return_value=fetched_path), \ + patch("openkb.cli.convert_document", + return_value=ConvertResult(skipped=True)): + result = runner.invoke(cli, ["add", "https://example.com/paper.pdf"]) + + assert result.exit_code == 0, result.output + assert "[SKIP]" in result.output + # Orphan cleanup: the URL-fetched file must be gone from raw/. + assert not fetched_path.exists() From d0bb76271a31b0d75b48625dd689e6688703174c Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 17 May 2026 16:31:42 +0800 Subject: [PATCH 3/4] fix(url-ingest): preserve raw file on pipeline failure, fix HTML echo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-2 review on commit 47e5ec2 surfaced two issues: 1. add_single_file returned bool, so the URL branch couldn't tell "dedup skip" from "mid-pipeline failure". A transient LLM error during compile_long_doc would delete the just-downloaded PDF and leave a dangling PageIndex entry (hash registration only happens on full success, so `openkb remove` couldn't recover it). Now the function returns Literal["added","skipped","failed"]; the URL branch only unlinks on "skipped" — failures keep the raw file so the user can retry without re-downloading. 2. _extract_html echoed the pre-collision filename instead of target.name, so on a title collision the user was told the file was saved to a path occupied by the previous document. PDF branch already used target.name correctly; HTML branch now matches. --- openkb/cli.py | 36 ++++++++------ openkb/url_ingest.py | 2 +- tests/test_url_ingest.py | 103 ++++++++++++++++++++++++++++++++++----- 3 files changed, 112 insertions(+), 29 deletions(-) diff --git a/openkb/cli.py b/openkb/cli.py index 20f644b9..32c111df 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -14,6 +14,7 @@ import sys import time from pathlib import Path +from typing import Literal import os @@ -130,7 +131,7 @@ def _find_kb_dir(override: Path | None = None) -> Path | None: return None -def add_single_file(file_path: Path, kb_dir: Path) -> bool: +def add_single_file(file_path: Path, kb_dir: Path) -> Literal["added", "skipped", "failed"]: """Convert, index, and compile a single document into the knowledge base. Steps: @@ -140,11 +141,12 @@ def add_single_file(file_path: Path, kb_dir: Path) -> bool: 4. Else: compile_short_doc. Returns: - True if the file was newly indexed and compiled. False when the - file was skipped as a duplicate (hash already registered) or - when any pipeline stage failed — callers that need to clean up - the source file (URL-ingest path, for example) can act on a - ``False`` return. + ``"added"`` on full success, ``"skipped"`` when the file's hash + is already in the registry (dedup), or ``"failed"`` when any + pipeline stage raised. URL-ingest distinguishes these so it can + unlink the just-downloaded raw file on dedup (it would otherwise + be an orphan) while preserving it on failure so the user can + retry without re-downloading. """ from openkb.agent.compiler import compile_long_doc, compile_short_doc from openkb.state import HashRegistry @@ -163,11 +165,11 @@ def add_single_file(file_path: Path, kb_dir: Path) -> bool: except Exception as exc: click.echo(f" [ERROR] Conversion failed: {exc}") logger.debug("Conversion traceback:", exc_info=True) - return False + return "failed" if result.skipped: click.echo(f" [SKIP] Already in knowledge base: {file_path.name}") - return False + return "skipped" doc_name = file_path.stem index_result = None # populated only on the long-doc branch @@ -181,7 +183,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> bool: except Exception as exc: click.echo(f" [ERROR] Indexing failed: {exc}") logger.debug("Indexing traceback:", exc_info=True) - return False + return "failed" summary_path = kb_dir / "wiki" / "summaries" / f"{doc_name}.md" click.echo(f" Compiling long doc (doc_id={index_result.doc_id})...") @@ -199,7 +201,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> bool: else: click.echo(f" [ERROR] Compilation failed: {exc}") logger.debug("Compilation traceback:", exc_info=True) - return False + return "failed" else: click.echo(f" Compiling short doc...") for attempt in range(2): @@ -213,7 +215,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> bool: else: click.echo(f" [ERROR] Compilation failed: {exc}") logger.debug("Compilation traceback:", exc_info=True) - return False + return "failed" # Register hash only after successful compilation if result.file_hash: @@ -232,7 +234,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> bool: append_log(kb_dir / "wiki", "ingest", file_path.name) click.echo(f" [OK] {file_path.name} added to knowledge base.") - return True + return "added" # --------------------------------------------------------------------------- @@ -426,10 +428,12 @@ def add(ctx, path): fetched = fetch_url_to_raw(path, kb_dir) if fetched is None: return - added = add_single_file(fetched, kb_dir) - if not added: - # Duplicate (or failed mid-pipeline) — drop the just-fetched - # file so raw/ doesn't accumulate orphans across retries. + outcome = add_single_file(fetched, kb_dir) + # Only clean up on dedup-skip. On "failed" we keep the file so + # the user can retry (e.g. transient LLM error during compile) + # without re-downloading — and so they don't lose data when + # indexing has already succeeded but compilation didn't. + if outcome == "skipped": fetched.unlink(missing_ok=True) return diff --git a/openkb/url_ingest.py b/openkb/url_ingest.py index 6cd444ec..a1b17df2 100644 --- a/openkb/url_ingest.py +++ b/openkb/url_ingest.py @@ -208,7 +208,7 @@ def _extract_html(url: str, raw_dir: Path) -> Path | None: target.write_text(markdown, encoding="utf-8") click.echo( f" Extracted: {title!r}\n" - f" Saved: raw/{filename} ({len(markdown) // 1024 or 1} KB clean markdown)" + f" Saved: raw/{target.name} ({len(markdown) // 1024 or 1} KB clean markdown)" ) return target diff --git a/tests/test_url_ingest.py b/tests/test_url_ingest.py index 4ea7d0de..b522d4b5 100644 --- a/tests/test_url_ingest.py +++ b/tests/test_url_ingest.py @@ -414,8 +414,10 @@ def test_fetch_pdf_picks_unique_name_when_target_exists(tmp_path): assert result.read_bytes() == body -def test_fetch_html_picks_unique_name_when_target_exists(tmp_path): - """Two blog posts both titled 'Introduction' must NOT collide.""" +def test_fetch_html_picks_unique_name_when_target_exists(tmp_path, capsys): + """Two blog posts both titled 'Introduction' must NOT collide. The + user-facing 'Saved: ...' echo must also reflect the renamed path — + otherwise the message lies about where the file actually went.""" raw_dir = tmp_path / "raw" raw_dir.mkdir() (raw_dir / "Introduction.md").write_text("first blog post body") @@ -435,6 +437,8 @@ def test_fetch_html_picks_unique_name_when_target_exists(tmp_path): assert (raw_dir / "Introduction.md").read_text() == "first blog post body" assert result == raw_dir / "Introduction_2.md" assert result.read_text() == second_md + out = capsys.readouterr().out + assert "Saved: raw/Introduction_2.md" in out def test_fetch_pdf_uses_post_redirect_url_for_filename(tmp_path): @@ -458,10 +462,10 @@ def test_fetch_pdf_uses_post_redirect_url_for_filename(tmp_path): assert result.name == "great-paper.pdf" -def test_add_single_file_returns_true_on_success(tmp_path): - """The new bool return contract: True when the file was actually - indexed. Used by the URL-ingest cleanup path to decide whether the - just-downloaded file in raw/ should be unlinked.""" +def test_add_single_file_returns_added_on_success(tmp_path): + """Tri-state return contract: ``"added"`` when the file was newly + indexed. URL-ingest uses this to decide whether to keep / unlink + the just-downloaded file.""" from openkb.cli import add_single_file from openkb.converter import ConvertResult @@ -487,12 +491,12 @@ def test_add_single_file_returns_true_on_success(tmp_path): with patch("openkb.cli.convert_document", return_value=mock_result), \ patch("openkb.cli.asyncio.run"): - added = add_single_file(doc, tmp_path) + outcome = add_single_file(doc, tmp_path) - assert added is True + assert outcome == "added" -def test_add_single_file_returns_false_on_skip(tmp_path): +def test_add_single_file_returns_skipped_on_dedup(tmp_path): from openkb.cli import add_single_file from openkb.converter import ConvertResult @@ -505,14 +509,48 @@ def test_add_single_file_returns_false_on_skip(tmp_path): skipped = ConvertResult(skipped=True) with patch("openkb.cli.convert_document", return_value=skipped): - added = add_single_file(doc, tmp_path) + outcome = add_single_file(doc, tmp_path) - assert added is False + assert outcome == "skipped" + + +def test_add_single_file_returns_failed_on_pipeline_error(tmp_path): + """A pipeline failure (e.g. transient LLM error during compilation) + must be distinguishable from dedup-skip, so URL-ingest can preserve + the raw file for retry instead of deleting it.""" + from openkb.cli import add_single_file + from openkb.converter import ConvertResult + + (tmp_path / ".openkb").mkdir() + (tmp_path / ".openkb" / "config.yaml").write_text("model: gpt-4o-mini\n") + (tmp_path / ".openkb" / "hashes.json").write_text("{}") + (tmp_path / "raw").mkdir() + (tmp_path / "wiki" / "summaries").mkdir(parents=True) + (tmp_path / "wiki" / "sources").mkdir(parents=True) + (tmp_path / "wiki" / "log.md").write_text("") + + doc = tmp_path / "raw" / "x.md" + doc.write_text("# Hello") + source_path = tmp_path / "wiki" / "sources" / "x.md" + source_path.write_text("# Hello") + + mock_result = ConvertResult( + raw_path=doc, source_path=source_path, + is_long_doc=False, file_hash="cafe" * 16, + ) + + # Make both compile attempts raise to drive the failure path. + with patch("openkb.cli.convert_document", return_value=mock_result), \ + patch("openkb.cli.asyncio.run", side_effect=RuntimeError("LLM 503")), \ + patch("openkb.cli.time.sleep"): + outcome = add_single_file(doc, tmp_path) + + assert outcome == "failed" def test_url_ingest_cleans_up_orphan_on_dedup_skip(tmp_path, monkeypatch): """End-to-end: when the URL-fetched file is already in the registry, - add_single_file returns False and the CLI must unlink it from raw/ + add_single_file returns "skipped" and the CLI unlinks it from raw/ so the user doesn't accumulate untracked duplicates.""" from click.testing import CliRunner from openkb.cli import cli @@ -541,3 +579,44 @@ def test_url_ingest_cleans_up_orphan_on_dedup_skip(tmp_path, monkeypatch): assert "[SKIP]" in result.output # Orphan cleanup: the URL-fetched file must be gone from raw/. assert not fetched_path.exists() + + +def test_url_ingest_keeps_raw_file_on_pipeline_failure(tmp_path): + """The point of the tri-state return: a pipeline failure (e.g. LLM + timeout during compilation) must NOT delete the downloaded file — + the user can retry without re-downloading, and we don't lose data + when indexing has already succeeded but compilation hasn't.""" + from click.testing import CliRunner + from openkb.cli import cli + from openkb.converter import ConvertResult + + (tmp_path / ".openkb").mkdir() + (tmp_path / ".openkb" / "config.yaml").write_text("model: gpt-4o-mini\n") + (tmp_path / ".openkb" / "hashes.json").write_text("{}") + (tmp_path / "raw").mkdir() + (tmp_path / "wiki" / "summaries").mkdir(parents=True) + (tmp_path / "wiki" / "sources").mkdir(parents=True) + (tmp_path / "wiki" / "log.md").write_text("") + + fetched_path = tmp_path / "raw" / "paper.pdf" + fetched_path.write_bytes(b"%PDF-fake") + source_path = tmp_path / "wiki" / "sources" / "paper.md" + source_path.write_text("# fake") + + mock_result = ConvertResult( + raw_path=fetched_path, source_path=source_path, + is_long_doc=False, file_hash="cafe" * 16, + ) + + runner = CliRunner() + with patch("openkb.cli._find_kb_dir", return_value=tmp_path), \ + patch("openkb.url_ingest.fetch_url_to_raw", return_value=fetched_path), \ + patch("openkb.cli.convert_document", return_value=mock_result), \ + patch("openkb.cli.asyncio.run", side_effect=RuntimeError("LLM 503")), \ + patch("openkb.cli.time.sleep"): + result = runner.invoke(cli, ["add", "https://example.com/paper.pdf"]) + + assert result.exit_code == 0, result.output + assert "[ERROR] Compilation failed" in result.output + # The raw file must be preserved so the user can retry. + assert fetched_path.exists() From 46adaab2aa5db241e9fa5fe2e37536c48d4cf8a8 Mon Sep 17 00:00:00 2001 From: mountain Date: Mon, 18 May 2026 10:29:06 +0800 Subject: [PATCH 4/4] chore(tests): drop unused 'from pathlib import Path' import Flagged by github-code-quality bot on PR #55. The import was left over from an earlier draft; nothing in the file references Path. --- tests/test_url_ingest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_url_ingest.py b/tests/test_url_ingest.py index b522d4b5..1b8548ee 100644 --- a/tests/test_url_ingest.py +++ b/tests/test_url_ingest.py @@ -2,7 +2,6 @@ from __future__ import annotations import io -from pathlib import Path from unittest.mock import MagicMock, patch from openkb.url_ingest import (