From 829f44fa5325cf18c7dc5aa359f280ce870db2b1 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 17 May 2026 15:13:43 +0800
Subject: [PATCH 1/4] feat(cli): support 'openkb add <URL>' with content-type
 sniffing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`openkb add` now accepts http(s) URLs in addition to local paths.
URLs are fetched into raw/ first, then handed to the existing
add_single_file pipeline — so PageIndex / markitdown / hash dedup /
remove all work unchanged.

Routing is decided by the HTTP Content-Type header, validated
against magic bytes (so a CDN that mislabels a PDF as
'application/octet-stream' still routes correctly):

  PDF response   → urllib chunked download → raw/<sanitized>.pdf
                   → existing convert_document → PageIndex (≥20 pages)
                   or markitdown (short)

  HTML response  → trafilatura.fetch_url + extract main content
                   → raw/<title-slug>.md
                   → existing markitdown short-doc path

  Anything else  → clear error, returns None

Filename derivation: Content-Disposition header (RFC 5987 / quoted /
unquoted forms) → URL basename → URL host fallback. Sanitization
preserves identifier dots like arxiv's "2509.11420" (only strips an
extension when it matches the target extension), replaces shell-
unsafe chars (space / parens / etc.) with '-', collapses repeats,
and caps the stem at 80 chars.

Failure modes covered with stderr messages, no crashes:
  - HTTPError (404, 403, etc.) → "[ERROR] HTTP <code>"
  - URLError (DNS, refused)    → "[ERROR] Network error"
  - trafilatura returns None   → "[ERROR] No main content extracted"
  - <300 chars extracted       → "[WARN] only N chars — page may be JS-rendered"
                                  (still saved so user can inspect)
  - Unsupported content type   → "[ERROR] Unsupported content type"

Architecturally lives in openkb/url_ingest.py — peer to
openkb/converter.py and openkb/indexer.py rather than bloating
cli.py. cli.py only adds a 4-line intercept at the top of `add`.

Live-tested against four URL shapes:
  - arxiv.org/abs/<id>   → trafilatura → "Trading-R1-...md" (3 KB)
  - arxiv.org/pdf/<id>   → urllib      → "2509.11420v1.pdf" (2.1 MB)
                             (server's Content-Disposition gives the
                              version suffix for free — no special
                              arxiv handling needed)
  - claude.com/blog/...  → trafilatura → ".md" (1.5 KB)
  - CDN .pdf with %20    → urllib      → sanitized filename (465 KB)

31 new tests cover all routing branches, magic-byte override paths,
Content-Disposition parsing (3 forms), filename sanitization edge
cases (arxiv ID, spaces, long names, empty), and graceful failure
on HTTP/network errors. 360 total tests pass (329 prior + 31 new).
---
 README.md                |   5 +-
 openkb/cli.py            |  19 ++-
 openkb/url_ingest.py     | 248 +++++++++++++++++++++++++++
 pyproject.toml           |   1 +
 tests/test_url_ingest.py | 361 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 631 insertions(+), 3 deletions(-)
 create mode 100644 openkb/url_ingest.py
 create mode 100644 tests/test_url_ingest.py
diff --git a/README.md b/README.md
index 4dd51c47..02fd401c 100644
--- a/README.md
+++ b/README.md
@@ -72,7 +72,8 @@ openkb init
 
 # 3. Add documents
 openkb add paper.pdf
-openkb add ~/papers/  # Add a whole directory
+openkb add ~/papers/                            # Add a whole directory
+openkb add https://arxiv.org/pdf/2509.11420     # Or fetch from a URL
 
 # 4. Ask a question
 openkb query "What are the main findings?"
@@ -148,7 +149,7 @@ A single source might touch 10-15 wiki pages. Knowledge accumulates: each docume
 | Command | Description |
 |---|---|
 | `openkb init` | Initialize a new knowledge base (interactive) |
-| <code>openkb&nbsp;add&nbsp;&lt;file_or_dir&gt;</code> | Add documents and compile to wiki |
+| <code>openkb&nbsp;add&nbsp;&lt;file_or_dir_or_URL&gt;</code> | Add documents and compile to wiki. URL ingest auto-detects PDF (saved as `.pdf` → PageIndex / markitdown) vs HTML (trafilatura main-content extract → `.md`) |
 | <code>openkb&nbsp;remove&nbsp;&lt;doc&gt;</code> | Remove a document and clean up its wiki pages, images, registry, and PageIndex state (use `--dry-run` to preview, `--keep-raw` / `--keep-empty-concepts` to retain artifacts) |
 | <code>openkb&nbsp;query&nbsp;"question"</code> | Ask a question over the knowledge base (use `--save` to save the answer to `wiki/explorations/`) |
 | `openkb chat` | Start an interactive multi-turn chat (use `--resume`, `--list`, `--delete` to manage sessions) |
diff --git a/openkb/cli.py b/openkb/cli.py
index 5197a930..fb749776 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -395,12 +395,29 @@ def init(language):
 @click.argument("path")
 @click.pass_context
 def add(ctx, path):
-    """Add a document or directory of documents at PATH to the knowledge base."""
+    """Add a document or directory of documents at PATH to the knowledge base.
+
+    PATH may be a local file, a local directory (which is walked
+    recursively for supported extensions), or an http(s) URL. URLs are
+    fetched into ``raw/`` first: PDF responses (by Content-Type and
+    magic-byte sniff) are saved as ``.pdf``; HTML responses are run
+    through trafilatura's main-content extractor and saved as ``.md``.
+    """
     kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override"))
     if kb_dir is None:
         click.echo("No knowledge base found. Run `openkb init` first.")
         return
 
+    # URL ingest: download into raw/ first, then fall through to the
+    # normal file-path branch below. fetch_url_to_raw returns the
+    # local path it wrote, or None on failure.
+    from openkb.url_ingest import looks_like_url, fetch_url_to_raw
+    if looks_like_url(path):
+        fetched = fetch_url_to_raw(path, kb_dir)
+        if fetched is None:
+            return
+        path = str(fetched)
+
     target = Path(path)
     if not target.exists():
         click.echo(f"Path does not exist: {path}")
diff --git a/openkb/url_ingest.py b/openkb/url_ingest.py
new file mode 100644
index 00000000..85b430d4
--- /dev/null
+++ b/openkb/url_ingest.py
@@ -0,0 +1,248 @@
+"""Fetch remote URLs into the knowledge base's ``raw/`` directory.
+
+This module is the input-acquisition layer for ``openkb add <URL>``: it
+takes an http(s) URL, decides whether it points at a PDF or an HTML
+document (using both the HTTP ``Content-Type`` header and a magic-byte
+sniff so a mistyped header doesn't fool us), saves a local file under
+``raw/``, and hands the path back to the caller.
+
+The caller (``openkb.cli.add``) then dispatches the saved path to the
+normal local-file ingest pipeline (``add_single_file`` →
+``convert_document`` → markitdown / PageIndex), so all the existing
+short-vs-long-doc routing applies automatically based on the file
+extension and page count.
+
+PDF responses are streamed to disk in chunks (large papers can be tens
+of MB). HTML responses are passed through trafilatura's main-content
+extractor — saving the raw HTML directly would feed nav/footer/cookie
+chrome into the LLM and produce noisy summaries.
+"""
+from __future__ import annotations
+
+import re
+import urllib.error
+import urllib.request
+from pathlib import Path
+from urllib.parse import unquote, urlparse
+
+import click
+
+_USER_AGENT = "openkb/url-fetcher (+https://github.com/VectifyAI/OpenKB)"
+_TIMEOUT_SECONDS = 30
+_CHUNK_BYTES = 64 * 1024
+_SNIFF_BYTES = 512
+_HTML_MIN_EXTRACT_CHARS = 300
+_MAX_FILENAME_STEM = 80
+
+
+def looks_like_url(s: str) -> bool:
+    """Cheap check used by ``openkb add`` to branch into URL ingest."""
+    return s.startswith(("http://", "https://"))
+
+
+def _sniff_content_type(head: bytes, declared: str) -> str:
+    """Return ``'pdf'``, ``'html'``, or ``'unknown'`` based on magic bytes,
+    falling back to the server's declared Content-Type.
+
+    Magic bytes win when they disagree with the header (some servers
+    mislabel PDFs as ``application/octet-stream`` or send HTML
+    interstitial pages with ``application/pdf``).
+    """
+    if head.startswith(b"%PDF-"):
+        return "pdf"
+    stripped = head.lstrip(b" \t\r\n\xef\xbb\xbf")  # strip BOM + leading whitespace
+    if stripped[:1] == b"<":
+        return "html"
+    declared_main = declared.split(";")[0].strip().lower()
+    if declared_main == "application/pdf":
+        return "pdf"
+    if declared_main.startswith("text/html") or declared_main == "application/xhtml+xml":
+        return "html"
+    return "unknown"
+
+
+def _sanitize_filename(name: str, ext: str) -> str:
+    """Make a filename safe for shell + filesystem use.
+
+    - URL-decodes percent escapes.
+    - Strips the existing extension **only when it matches the target
+      ``ext``** — so ``"2509.11420"`` keeps its dot when we're saving a
+      ``.pdf`` (the dot is part of the arxiv identifier, not an
+      extension).
+    - Replaces whitespace / parentheses / other non-``[a-zA-Z0-9._-]``
+      chars with ``-``, collapses repeated ``-``, and trims.
+    - Caps the stem at 80 chars to avoid filesystem limits.
+    - Returns ``<stem><ext>``, falling back to ``document<ext>`` if the
+      sanitized stem is empty.
+    """
+    decoded = unquote(name)
+    if ext and decoded.lower().endswith(ext.lower()):
+        stem = decoded[: -len(ext)]
+    else:
+        stem = decoded
+    stem = re.sub(r"[^a-zA-Z0-9._-]+", "-", stem)
+    stem = re.sub(r"-+", "-", stem).strip("-._")
+    stem = stem[:_MAX_FILENAME_STEM].rstrip("-._")
+    return f"{stem}{ext}" if stem else f"document{ext}"
+
+
+def _parse_content_disposition_filename(header: str | None) -> str | None:
+    """Extract a filename hint from a ``Content-Disposition`` header.
+
+    Handles three forms (in priority order):
+
+    1. ``filename*=UTF-8''percent-encoded`` (RFC 5987)
+    2. ``filename="quoted with spaces.pdf"``
+    3. ``filename=unquoted-no-spaces.pdf``
+    """
+    if not header:
+        return None
+    # RFC 5987 extended form first
+    m = re.search(r"filename\*=(?:[\w-]+'[\w-]*')?([^;]+)", header)
+    if m:
+        return unquote(m.group(1).strip())
+    # Quoted form (may contain spaces / parens / commas)
+    m = re.search(r'filename="([^"]+)"', header)
+    if m:
+        return m.group(1)
+    # Unquoted form (stops at whitespace or semicolon)
+    m = re.search(r"filename=([^\s;]+)", header)
+    if m:
+        return m.group(1)
+    return None
+
+
+def _pdf_filename(url: str, content_disposition: str | None) -> str:
+    """Derive a ``.pdf`` filename for a downloaded PDF.
+
+    Priority: ``Content-Disposition: filename=`` header → URL basename →
+    URL host. The result is run through :func:`_sanitize_filename`.
+    """
+    cd_name = _parse_content_disposition_filename(content_disposition)
+    if cd_name:
+        return _sanitize_filename(cd_name, ".pdf")
+    parsed = urlparse(url)
+    last = (parsed.path.rsplit("/", 1)[-1] or parsed.netloc).strip()
+    return _sanitize_filename(last or "document", ".pdf")
+
+
+def _download_pdf_chunked(response, head_bytes: bytes, target: Path) -> None:
+    """Write the already-read ``head_bytes`` plus the remaining streamed
+    body to ``target``. Chunked so very large PDFs (50+ MB) don't sit in
+    RAM.
+    """
+    with open(target, "wb") as fh:
+        if head_bytes:
+            fh.write(head_bytes)
+        while True:
+            chunk = response.read(_CHUNK_BYTES)
+            if not chunk:
+                break
+            fh.write(chunk)
+
+
+def _extract_html(url: str, raw_dir: Path) -> Path | None:
+    """Fetch the URL through trafilatura, extract the main content as
+    Markdown, and write it to ``raw/<title-slug>.md``.
+
+    Returns the saved path, or None if extraction failed entirely. A
+    short extraction (< 300 chars) is saved anyway but flagged on
+    stderr — pages that are JS-rendered or paywalled often produce
+    near-empty extractions.
+    """
+    import trafilatura
+
+    raw_html = trafilatura.fetch_url(url)
+    if not raw_html:
+        click.echo(f"  [ERROR] Could not fetch URL: {url}", err=True)
+        return None
+
+    markdown = trafilatura.extract(
+        raw_html, output_format="markdown", include_links=True,
+    )
+    if not markdown:
+        click.echo(
+            "  [ERROR] No main content extracted — page may be empty, "
+            "JS-rendered, or paywalled.",
+            err=True,
+        )
+        return None
+
+    if len(markdown) < _HTML_MIN_EXTRACT_CHARS:
+        click.echo(
+            f"  [WARN] Only {len(markdown)} chars extracted — page may be "
+            f"JS-rendered or behind a login. Saving anyway; inspect the "
+            f"resulting wiki entry and use `openkb remove` if it's empty.",
+            err=True,
+        )
+
+    metadata = trafilatura.extract_metadata(raw_html)
+    title = (metadata.title if metadata else None) or url
+    filename = _sanitize_filename(title, ".md")
+    target = raw_dir / filename
+    target.write_text(markdown, encoding="utf-8")
+    click.echo(
+        f"  Extracted: {title!r}\n"
+        f"  Saved: raw/{filename} ({len(markdown) // 1024 or 1} KB clean markdown)"
+    )
+    return target
+
+
+def fetch_url_to_raw(url: str, kb_dir: Path) -> Path | None:
+    """Fetch ``url`` into ``<kb>/raw/`` and return the local path.
+
+    Routing is decided by HTTP ``Content-Type`` validated against magic
+    bytes (in case the server lies):
+
+    - PDF  → urllib chunked download → ``raw/<sanitized>.pdf``
+    - HTML → trafilatura main-content extract → ``raw/<title-slug>.md``
+    - anything else → error, returns None
+
+    The caller then hands the saved path to ``add_single_file``, so the
+    existing PageIndex / markitdown routing by file extension and page
+    count takes over from there.
+    """
+    raw_dir = kb_dir / "raw"
+    raw_dir.mkdir(parents=True, exist_ok=True)
+
+    click.echo(f"Downloading: {url}")
+
+    request = urllib.request.Request(
+        url, headers={"User-Agent": _USER_AGENT, "Accept": "*/*"},
+    )
+    try:
+        response = urllib.request.urlopen(request, timeout=_TIMEOUT_SECONDS)
+    except urllib.error.HTTPError as exc:
+        click.echo(f"  [ERROR] HTTP {exc.code} {exc.reason}", err=True)
+        return None
+    except urllib.error.URLError as exc:
+        click.echo(f"  [ERROR] Network error: {exc.reason}", err=True)
+        return None
+    except Exception as exc:
+        click.echo(f"  [ERROR] Fetch failed: {exc}", err=True)
+        return None
+
+    with response:
+        declared = response.headers.get("Content-Type", "") or ""
+        head_bytes = response.read(_SNIFF_BYTES)
+        actual = _sniff_content_type(head_bytes, declared)
+
+        if actual == "pdf":
+            filename = _pdf_filename(
+                url, response.headers.get("Content-Disposition"),
+            )
+            target = raw_dir / filename
+            _download_pdf_chunked(response, head_bytes, target)
+            size_mb = target.stat().st_size / (1024 * 1024)
+            click.echo(f"  Saved: raw/{filename} ({size_mb:.1f} MB PDF)")
+            return target
+
+    if actual == "html":
+        return _extract_html(url, raw_dir)
+
+    click.echo(
+        f"  [ERROR] Unsupported content type {declared!r} for URL ingest. "
+        "Download the file manually and pass its path to `openkb add` instead.",
+        err=True,
+    )
+    return None
diff --git a/pyproject.toml b/pyproject.toml
index 7fec212c..1d0ec85a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ keywords = ["ai", "rag", "retrieval", "knowledge-base", "llm", "pageindex", "age
 dependencies = [
     "pageindex==0.3.0.dev1",
     "markitdown[all]",
+    "trafilatura>=2.0",
     "click>=8.0",
     "watchdog>=3.0",
     "litellm",
diff --git a/tests/test_url_ingest.py b/tests/test_url_ingest.py
new file mode 100644
index 00000000..efe0ef64
--- /dev/null
+++ b/tests/test_url_ingest.py
@@ -0,0 +1,361 @@
+"""Tests for `openkb.url_ingest` — the URL → raw/ input-acquisition layer."""
+from __future__ import annotations
+
+import io
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+from openkb.url_ingest import (
+    _parse_content_disposition_filename,
+    _pdf_filename,
+    _sanitize_filename,
+    _sniff_content_type,
+    fetch_url_to_raw,
+    looks_like_url,
+)
+
+
+# ---------------------------------------------------------------------------
+# Pure helpers (no I/O)
+# ---------------------------------------------------------------------------
+
+
+def test_looks_like_url_accepts_http_and_https():
+    assert looks_like_url("http://example.com") is True
+    assert looks_like_url("https://example.com/foo") is True
+
+
+def test_looks_like_url_rejects_paths_and_filenames():
+    assert looks_like_url("/tmp/foo.pdf") is False
+    assert looks_like_url("./relative") is False
+    assert looks_like_url("foo.pdf") is False
+    assert looks_like_url("") is False
+
+
+# Content-type sniffing — magic bytes override declared header
+
+
+def test_sniff_pdf_magic_wins_over_octet_stream():
+    """Some CDNs mislabel PDFs as application/octet-stream — magic bytes save us."""
+    assert _sniff_content_type(b"%PDF-1.4\n...", "application/octet-stream") == "pdf"
+
+
+def test_sniff_html_magic_wins_over_pdf_header():
+    """Some servers serve an HTML interstitial 'click to download' page with
+    Content-Type: application/pdf. Magic bytes must override."""
+    assert _sniff_content_type(b"<!doctype html>", "application/pdf") == "html"
+
+
+def test_sniff_html_magic_handles_bom_and_whitespace():
+    assert _sniff_content_type(b"\xef\xbb\xbf<html>", "") == "html"
+    assert _sniff_content_type(b"  \n<html>", "") == "html"
+
+
+def test_sniff_falls_back_to_declared_when_no_magic_match():
+    assert _sniff_content_type(b"\x00\x00", "application/pdf") == "pdf"
+    assert _sniff_content_type(b"random", "text/html; charset=utf-8") == "html"
+    assert _sniff_content_type(b"random", "application/xhtml+xml") == "html"
+
+
+def test_sniff_returns_unknown_for_unsupported_types():
+    assert _sniff_content_type(b"binary", "image/jpeg") == "unknown"
+    assert _sniff_content_type(b"binary", "application/json") == "unknown"
+    assert _sniff_content_type(b"binary", "") == "unknown"
+
+
+# Filename sanitization
+
+
+def test_sanitize_preserves_arxiv_id_with_dot():
+    """The dot in arxiv's `2509.11420` is part of the identifier, not an
+    extension. `_sanitize_filename` must not strip it when re-adding `.pdf`."""
+    assert _sanitize_filename("2509.11420", ".pdf") == "2509.11420.pdf"
+
+
+def test_sanitize_strips_matching_extension_then_re_adds_it():
+    assert _sanitize_filename("paper.pdf", ".pdf") == "paper.pdf"
+    assert _sanitize_filename("paper.PDF", ".pdf") == "paper.pdf"
+
+
+def test_sanitize_replaces_shell_unsafe_chars():
+    assert _sanitize_filename("hello world (1).pdf", ".pdf") == "hello-world-1.pdf"
+    assert _sanitize_filename("a:b/c\\d?e*f", ".md") == "a-b-c-d-e-f.md"
+
+
+def test_sanitize_collapses_repeated_dashes_and_trims():
+    # Underscores are allowed (part of [a-zA-Z0-9._-]) so they pass through;
+    # only sequences of non-allowed chars become dashes, and repeated dashes
+    # collapse to one. Leading/trailing dashes/dots/underscores are stripped.
+    assert _sanitize_filename("a___b---c", ".pdf") == "a___b-c.pdf"
+    assert _sanitize_filename("---trim---", ".md") == "trim.md"
+    assert _sanitize_filename("a b c d", ".md") == "a-b-c-d.md"
+
+
+def test_sanitize_caps_stem_at_80_chars():
+    name = "a" * 200
+    assert _sanitize_filename(name, ".pdf") == ("a" * 80) + ".pdf"
+
+
+def test_sanitize_falls_back_to_document_when_empty():
+    assert _sanitize_filename("", ".md") == "document.md"
+    assert _sanitize_filename("...", ".md") == "document.md"
+    assert _sanitize_filename("///", ".pdf") == "document.pdf"
+
+
+# Content-Disposition parsing
+
+
+def test_content_disposition_quoted_with_spaces():
+    """Quoted form must capture filenames with spaces / parens / commas."""
+    cd = 'attachment; filename="My Paper, v3 (final).pdf"'
+    assert _parse_content_disposition_filename(cd) == "My Paper, v3 (final).pdf"
+
+
+def test_content_disposition_unquoted_simple():
+    assert _parse_content_disposition_filename("attachment; filename=foo.pdf") == "foo.pdf"
+
+
+def test_content_disposition_rfc5987_extended():
+    """filename*=UTF-8''<percent-encoded> is the modern form for non-ASCII."""
+    cd = "attachment; filename*=UTF-8''My%20Paper%20%C3%A9.pdf"
+    assert _parse_content_disposition_filename(cd) == "My Paper é.pdf"
+
+
+def test_content_disposition_none_or_missing_filename():
+    assert _parse_content_disposition_filename(None) is None
+    assert _parse_content_disposition_filename("attachment") is None
+    assert _parse_content_disposition_filename("inline") is None
+
+
+# _pdf_filename: header → URL basename fallback chain
+
+
+def test_pdf_filename_prefers_content_disposition():
+    name = _pdf_filename(
+        "https://x.com/dl?id=123",
+        'attachment; filename="My Paper.pdf"',
+    )
+    assert name == "My-Paper.pdf"
+
+
+def test_pdf_filename_falls_back_to_url_basename():
+    name = _pdf_filename(
+        "https://cdn.example.com/abc/69fe2a55_The-Founders-Playbook-05062026_v3%20(1).pdf",
+        None,
+    )
+    assert name == "69fe2a55_The-Founders-Playbook-05062026_v3-1.pdf"
+
+
+def test_pdf_filename_handles_arxiv_pdf_url_without_extension():
+    """arxiv's PDF URL `arxiv.org/pdf/2509.11420` ends without `.pdf` — the
+    content-type tells us it's a PDF and `_sanitize_filename` must keep
+    the dot in the arxiv ID rather than treating it as an extension."""
+    name = _pdf_filename("https://arxiv.org/pdf/2509.11420", None)
+    assert name == "2509.11420.pdf"
+
+
+def test_pdf_filename_falls_back_to_host_when_path_empty():
+    name = _pdf_filename("https://example.com/", None)
+    assert name == "example.com.pdf"
+
+
+# ---------------------------------------------------------------------------
+# fetch_url_to_raw — integration with urllib + trafilatura mocked
+# ---------------------------------------------------------------------------
+
+
+def _fake_response(*, body: bytes, headers: dict[str, str]):
+    """Build a fake urllib response with the given body + headers.
+
+    Headers are case-insensitive in real responses; mimicking that here
+    so the test doesn't depend on which case `_fetch_url_to_raw` looks up.
+    """
+    class _Headers:
+        def __init__(self, d):
+            self._d = {k.lower(): v for k, v in d.items()}
+
+        def get(self, key, default=None):
+            return self._d.get(key.lower(), default)
+
+    resp = MagicMock()
+    resp.headers = _Headers(headers)
+    # read(N) returns chunks; read() with no arg returns rest
+    stream = io.BytesIO(body)
+    resp.read = stream.read
+    resp.__enter__ = lambda self: resp
+    resp.__exit__ = lambda *a: None
+    return resp
+
+
+def test_fetch_pdf_writes_chunked_to_raw_dir(tmp_path):
+    """End-to-end PDF path: urlopen → magic-byte sniff → chunked write →
+    filename comes from URL basename."""
+    body = b"%PDF-1.4\n" + b"x" * 100_000  # 100 KB PDF
+    resp = _fake_response(
+        body=body,
+        headers={"Content-Type": "application/pdf"},
+    )
+
+    with patch("urllib.request.urlopen", return_value=resp):
+        result = fetch_url_to_raw("https://arxiv.org/pdf/2509.11420", tmp_path)
+
+    assert result is not None
+    assert result.name == "2509.11420.pdf"
+    assert result.exists()
+    assert result.read_bytes() == body
+
+
+def test_fetch_pdf_with_lying_octet_stream_header(tmp_path):
+    """Server says octet-stream but the body starts with %PDF — magic bytes
+    must win and the file gets the .pdf extension."""
+    body = b"%PDF-1.7\n" + b"\x00" * 1000
+    resp = _fake_response(
+        body=body,
+        headers={"Content-Type": "application/octet-stream"},
+    )
+
+    with patch("urllib.request.urlopen", return_value=resp):
+        result = fetch_url_to_raw("https://cdn.example.com/a/b/file", tmp_path)
+
+    assert result is not None
+    assert result.suffix == ".pdf"
+    assert result.read_bytes() == body
+
+
+def test_fetch_pdf_chunks_a_very_large_body(tmp_path):
+    """A 1 MB synthetic body still writes correctly via chunked reads."""
+    body = b"%PDF-1.4\n" + b"a" * (1024 * 1024)
+    resp = _fake_response(body=body, headers={"Content-Type": "application/pdf"})
+
+    with patch("urllib.request.urlopen", return_value=resp):
+        result = fetch_url_to_raw("https://x.com/big.pdf", tmp_path)
+
+    assert result is not None
+    assert result.stat().st_size == len(body)
+
+
+def test_fetch_pdf_uses_content_disposition_filename(tmp_path):
+    body = b"%PDF-1.4\n..."
+    resp = _fake_response(
+        body=body,
+        headers={
+            "Content-Type": "application/pdf",
+            "Content-Disposition": 'attachment; filename="My Paper, v3.pdf"',
+        },
+    )
+
+    with patch("urllib.request.urlopen", return_value=resp):
+        result = fetch_url_to_raw("https://x.com/dl?id=1", tmp_path)
+
+    # comma sanitized to dash
+    assert result.name == "My-Paper-v3.pdf"
+
+
+def test_fetch_html_routes_to_trafilatura(tmp_path):
+    """HTML responses skip urllib's body (we already consumed the sniff
+    head) and hand the URL to trafilatura.fetch_url for proper anti-scrape
+    handling. trafilatura.extract gives clean markdown which we save as .md."""
+    sniff_head = b"<!doctype html>\n<html><head>..."
+    resp = _fake_response(
+        body=sniff_head + b"<body>nav nav nav</body>",
+        headers={"Content-Type": "text/html; charset=utf-8"},
+    )
+
+    fake_md = "# Real Article Title\n\nThis is the body content. " * 20  # ~1 KB
+    fake_meta = MagicMock()
+    fake_meta.title = "Real Article Title"
+
+    with patch("urllib.request.urlopen", return_value=resp), \
+         patch("trafilatura.fetch_url", return_value="<html>...the real HTML...</html>"), \
+         patch("trafilatura.extract", return_value=fake_md), \
+         patch("trafilatura.extract_metadata", return_value=fake_meta):
+        result = fetch_url_to_raw("https://blog.example.com/post", tmp_path)
+
+    assert result is not None
+    assert result.name == "Real-Article-Title.md"
+    assert result.read_text(encoding="utf-8") == fake_md
+
+
+def test_fetch_html_warns_on_short_extraction(tmp_path, capsys):
+    """JS-rendered pages produce near-empty extractions. The save still
+    happens (so the user can inspect what we got) but a stderr warning
+    surfaces the suspicion."""
+    sniff_head = b"<html>"
+    resp = _fake_response(body=sniff_head, headers={"Content-Type": "text/html"})
+
+    short_md = "# Title only"  # 11 chars, well under 300
+    fake_meta = MagicMock()
+    fake_meta.title = "Title only"
+
+    with patch("urllib.request.urlopen", return_value=resp), \
+         patch("trafilatura.fetch_url", return_value="<html>shell</html>"), \
+         patch("trafilatura.extract", return_value=short_md), \
+         patch("trafilatura.extract_metadata", return_value=fake_meta):
+        result = fetch_url_to_raw("https://spa.example.com/page", tmp_path)
+
+    assert result is not None
+    assert result.read_text() == short_md
+    err = capsys.readouterr().err
+    assert "[WARN]" in err
+    assert f"{len(short_md)} chars extracted" in err
+
+
+def test_fetch_html_aborts_when_trafilatura_extracts_nothing(tmp_path):
+    """trafilatura.extract returning None means the page is essentially
+    empty (JS-only, paywall HTML, etc.). We error out rather than save
+    an empty .md."""
+    sniff_head = b"<html>"
+    resp = _fake_response(body=sniff_head, headers={"Content-Type": "text/html"})
+
+    with patch("urllib.request.urlopen", return_value=resp), \
+         patch("trafilatura.fetch_url", return_value="<html>empty</html>"), \
+         patch("trafilatura.extract", return_value=None):
+        result = fetch_url_to_raw("https://js-only.example.com", tmp_path)
+
+    assert result is None
+
+
+def test_fetch_unsupported_content_type_rejected(tmp_path, capsys):
+    """JSON / image / etc. — refuse with a clear message rather than
+    saving binary garbage as `.html` or `.pdf`."""
+    resp = _fake_response(
+        body=b'{"foo": "bar"}',
+        headers={"Content-Type": "application/json"},
+    )
+
+    with patch("urllib.request.urlopen", return_value=resp):
+        result = fetch_url_to_raw("https://api.example.com/data.json", tmp_path)
+
+    assert result is None
+    err = capsys.readouterr().err
+    assert "Unsupported content type" in err
+
+
+def test_fetch_http_404_returns_none(tmp_path, capsys):
+    """Server errors don't crash — graceful failure with stderr message."""
+    import urllib.error
+    err_resp = urllib.error.HTTPError(
+        "https://x.com/missing", 404, "Not Found", {}, None,
+    )
+
+    with patch("urllib.request.urlopen", side_effect=err_resp):
+        result = fetch_url_to_raw("https://x.com/missing", tmp_path)
+
+    assert result is None
+    err = capsys.readouterr().err
+    assert "HTTP 404" in err
+
+
+def test_fetch_network_error_returns_none(tmp_path, capsys):
+    """DNS failure / connection refused — graceful with clear message."""
+    import urllib.error
+
+    with patch(
+        "urllib.request.urlopen",
+        side_effect=urllib.error.URLError("nodename nor servname provided"),
+    ):
+        result = fetch_url_to_raw("https://no-such-host.invalid/foo", tmp_path)
+
+    assert result is None
+    err = capsys.readouterr().err
+    assert "Network error" in err

From 47e5ec222a53796d489d594c7e73dc91fbf32c12 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 17 May 2026 16:10:39 +0800
Subject: [PATCH 2/4] fix(url-ingest): address PR #55 self-review findings

- _unique_path() prevents silent overwrites in raw/ when two URLs
  sanitize to the same filename (applied at both PDF and HTML write
  sites)
- PDF filename now derives from response.geturl() (the post-redirect
  URL), so DOI/shortlink resolvers produce sensible names
- add_single_file() returns bool; URL branch unlinks the fetched
  raw/* file when the downstream add is skipped (dedup), preventing
  orphan files that re-resurrect on every retry
---
 openkb/cli.py            |  35 +++++---
 openkb/url_ingest.py     |  40 ++++++++-
 tests/test_url_ingest.py | 182 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 243 insertions(+), 14 deletions(-)

diff --git a/openkb/cli.py b/openkb/cli.py
index fb749776..20f644b9 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -130,7 +130,7 @@ def _find_kb_dir(override: Path | None = None) -> Path | None:
     return None
 
 
-def add_single_file(file_path: Path, kb_dir: Path) -> None:
+def add_single_file(file_path: Path, kb_dir: Path) -> bool:
     """Convert, index, and compile a single document into the knowledge base.
 
     Steps:
@@ -138,6 +138,13 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
     2. Convert the document (hash-check; skip if already known).
     3. If long doc: run PageIndex then compile_long_doc.
     4. Else: compile_short_doc.
+
+    Returns:
+        True if the file was newly indexed and compiled. False when the
+        file was skipped as a duplicate (hash already registered) or
+        when any pipeline stage failed — callers that need to clean up
+        the source file (URL-ingest path, for example) can act on a
+        ``False`` return.
     """
     from openkb.agent.compiler import compile_long_doc, compile_short_doc
     from openkb.state import HashRegistry
@@ -156,11 +163,11 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
     except Exception as exc:
         click.echo(f"  [ERROR] Conversion failed: {exc}")
         logger.debug("Conversion traceback:", exc_info=True)
-        return
+        return False
 
     if result.skipped:
         click.echo(f"  [SKIP] Already in knowledge base: {file_path.name}")
-        return
+        return False
 
     doc_name = file_path.stem
     index_result = None  # populated only on the long-doc branch
@@ -174,7 +181,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
         except Exception as exc:
             click.echo(f"  [ERROR] Indexing failed: {exc}")
             logger.debug("Indexing traceback:", exc_info=True)
-            return
+            return False
 
         summary_path = kb_dir / "wiki" / "summaries" / f"{doc_name}.md"
         click.echo(f"  Compiling long doc (doc_id={index_result.doc_id})...")
@@ -192,7 +199,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
                 else:
                     click.echo(f"  [ERROR] Compilation failed: {exc}")
                     logger.debug("Compilation traceback:", exc_info=True)
-                    return
+                    return False
     else:
         click.echo(f"  Compiling short doc...")
         for attempt in range(2):
@@ -206,7 +213,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
                 else:
                     click.echo(f"  [ERROR] Compilation failed: {exc}")
                     logger.debug("Compilation traceback:", exc_info=True)
-                    return
+                    return False
 
     # Register hash only after successful compilation
     if result.file_hash:
@@ -225,6 +232,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
 
     append_log(kb_dir / "wiki", "ingest", file_path.name)
     click.echo(f"  [OK] {file_path.name} added to knowledge base.")
+    return True
 
 
 # ---------------------------------------------------------------------------
@@ -408,15 +416,22 @@ def add(ctx, path):
         click.echo("No knowledge base found. Run `openkb init` first.")
         return
 
-    # URL ingest: download into raw/ first, then fall through to the
-    # normal file-path branch below. fetch_url_to_raw returns the
-    # local path it wrote, or None on failure.
+    # URL ingest: download into raw/ first, then call add_single_file
+    # explicitly so we can clean up the just-downloaded file if it
+    # turns out to be a duplicate (registry already has its hash).
+    # Without this, re-adding the same URL leaves an orphan in raw/
+    # that the registry can't reach via openkb remove.
     from openkb.url_ingest import looks_like_url, fetch_url_to_raw
     if looks_like_url(path):
         fetched = fetch_url_to_raw(path, kb_dir)
         if fetched is None:
             return
-        path = str(fetched)
+        added = add_single_file(fetched, kb_dir)
+        if not added:
+            # Duplicate (or failed mid-pipeline) — drop the just-fetched
+            # file so raw/ doesn't accumulate orphans across retries.
+            fetched.unlink(missing_ok=True)
+        return
 
     target = Path(path)
     if not target.exists():
diff --git a/openkb/url_ingest.py b/openkb/url_ingest.py
index 85b430d4..6cd444ec 100644
--- a/openkb/url_ingest.py
+++ b/openkb/url_ingest.py
@@ -126,6 +126,28 @@ def _pdf_filename(url: str, content_disposition: str | None) -> str:
     return _sanitize_filename(last or "document", ".pdf")
 
 
+def _unique_path(target: Path) -> Path:
+    """Return ``target`` if it doesn't exist yet, otherwise append ``_2``,
+    ``_3``, … to the stem until an unused name is found.
+
+    Prevents silent overwrites in ``raw/`` when two different URLs
+    sanitize to the same filename (e.g. two blog posts both titled
+    "Introduction" → both ``Introduction.md``).
+    """
+    if not target.exists():
+        return target
+    stem = target.stem
+    suffix = target.suffix
+    parent = target.parent
+    for i in range(2, 10_000):
+        candidate = parent / f"{stem}_{i}{suffix}"
+        if not candidate.exists():
+            return candidate
+    raise RuntimeError(
+        f"Could not find a free filename for {target} after 10k attempts"
+    )
+
+
 def _download_pdf_chunked(response, head_bytes: bytes, target: Path) -> None:
     """Write the already-read ``head_bytes`` plus the remaining streamed
     body to ``target``. Chunked so very large PDFs (50+ MB) don't sit in
@@ -179,7 +201,10 @@ def _extract_html(url: str, raw_dir: Path) -> Path | None:
     metadata = trafilatura.extract_metadata(raw_html)
     title = (metadata.title if metadata else None) or url
     filename = _sanitize_filename(title, ".md")
-    target = raw_dir / filename
+    # Pick a non-colliding name — two blog posts titled "Introduction"
+    # would otherwise overwrite each other in raw/ and leave the hash
+    # registry pointing at stale bytes.
+    target = _unique_path(raw_dir / filename)
     target.write_text(markdown, encoding="utf-8")
     click.echo(
         f"  Extracted: {title!r}\n"
@@ -228,13 +253,20 @@ def fetch_url_to_raw(url: str, kb_dir: Path) -> Path | None:
         actual = _sniff_content_type(head_bytes, declared)
 
         if actual == "pdf":
+            # Derive the filename from the *post-redirect* URL — urllib
+            # follows redirects by default, so the user-typed URL may
+            # not be the URL that actually served the bytes (DOI / short
+            # link resolvers, mirror redirects, etc.). Falls back to the
+            # original input when the response doesn't expose a final
+            # URL.
+            final_url = response.geturl() or url
             filename = _pdf_filename(
-                url, response.headers.get("Content-Disposition"),
+                final_url, response.headers.get("Content-Disposition"),
             )
-            target = raw_dir / filename
+            target = _unique_path(raw_dir / filename)
             _download_pdf_chunked(response, head_bytes, target)
             size_mb = target.stat().st_size / (1024 * 1024)
-            click.echo(f"  Saved: raw/{filename} ({size_mb:.1f} MB PDF)")
+            click.echo(f"  Saved: raw/{target.name} ({size_mb:.1f} MB PDF)")
             return target
 
     if actual == "html":
diff --git a/tests/test_url_ingest.py b/tests/test_url_ingest.py
index efe0ef64..4ea7d0de 100644
--- a/tests/test_url_ingest.py
+++ b/tests/test_url_ingest.py
@@ -10,6 +10,7 @@
     _pdf_filename,
     _sanitize_filename,
     _sniff_content_type,
+    _unique_path,
     fetch_url_to_raw,
     looks_like_url,
 )
@@ -182,6 +183,9 @@ def get(self, key, default=None):
     # read(N) returns chunks; read() with no arg returns rest
     stream = io.BytesIO(body)
     resp.read = stream.read
+    # urllib's HTTPResponse exposes geturl(); default to empty string so
+    # callers using `response.geturl() or url` fall through to the input URL.
+    resp.geturl = lambda: ""
     resp.__enter__ = lambda self: resp
     resp.__exit__ = lambda *a: None
     return resp
@@ -359,3 +363,181 @@ def test_fetch_network_error_returns_none(tmp_path, capsys):
     assert result is None
     err = capsys.readouterr().err
     assert "Network error" in err
+
+
+# ---------------------------------------------------------------------------
+# Self-review fixes from PR #55 review pass
+# ---------------------------------------------------------------------------
+
+
+def test_unique_path_returns_target_when_free(tmp_path):
+    p = tmp_path / "foo.pdf"
+    assert _unique_path(p) == p
+
+
+def test_unique_path_finds_next_free_slot(tmp_path):
+    """_2 / _3 / … must be appended to the stem (not the suffix) and
+    must keep probing until a free name is found."""
+    (tmp_path / "foo.pdf").write_bytes(b"a")
+    (tmp_path / "foo_2.pdf").write_bytes(b"b")
+
+    result = _unique_path(tmp_path / "foo.pdf")
+    assert result == tmp_path / "foo_3.pdf"
+
+
+def test_unique_path_handles_no_suffix(tmp_path):
+    """Files without an extension still get a usable suffix."""
+    (tmp_path / "README").write_text("x")
+    result = _unique_path(tmp_path / "README")
+    assert result.name == "README_2"
+
+
+def test_fetch_pdf_picks_unique_name_when_target_exists(tmp_path):
+    """Two URLs that sanitize to the same filename must NOT silently
+    overwrite — the second one gets `_2` appended."""
+    raw_dir = tmp_path / "raw"
+    raw_dir.mkdir()
+    (raw_dir / "paper.pdf").write_bytes(b"%PDF-existing\nfirst")
+
+    body = b"%PDF-1.7\nsecond URL content"
+    resp = _fake_response(body=body, headers={"Content-Type": "application/pdf"})
+    # Make response.geturl mimic real urllib (returns the input URL when
+    # there's no redirect). The fake response builder doesn't set this.
+    resp.geturl = lambda: "https://mirror.example.com/paper.pdf"
+
+    with patch("urllib.request.urlopen", return_value=resp):
+        result = fetch_url_to_raw("https://mirror.example.com/paper.pdf", tmp_path)
+
+    # First file is untouched, second went to paper_2.pdf
+    assert (raw_dir / "paper.pdf").read_bytes() == b"%PDF-existing\nfirst"
+    assert result == raw_dir / "paper_2.pdf"
+    assert result.read_bytes() == body
+
+
+def test_fetch_html_picks_unique_name_when_target_exists(tmp_path):
+    """Two blog posts both titled 'Introduction' must NOT collide."""
+    raw_dir = tmp_path / "raw"
+    raw_dir.mkdir()
+    (raw_dir / "Introduction.md").write_text("first blog post body")
+
+    resp = _fake_response(body=b"<html>", headers={"Content-Type": "text/html"})
+
+    second_md = "# Introduction\n\nA completely different second blog post body. " * 10
+    fake_meta = MagicMock()
+    fake_meta.title = "Introduction"
+
+    with patch("urllib.request.urlopen", return_value=resp), \
+         patch("trafilatura.fetch_url", return_value="<html>...</html>"), \
+         patch("trafilatura.extract", return_value=second_md), \
+         patch("trafilatura.extract_metadata", return_value=fake_meta):
+        result = fetch_url_to_raw("https://blog2.example.com/post", tmp_path)
+
+    assert (raw_dir / "Introduction.md").read_text() == "first blog post body"
+    assert result == raw_dir / "Introduction_2.md"
+    assert result.read_text() == second_md
+
+
+def test_fetch_pdf_uses_post_redirect_url_for_filename(tmp_path):
+    """When urllib follows a redirect (DOI → publisher CDN, short URLs,
+    etc.), the filename must be derived from the final URL — not the
+    user's original input — when the response has no Content-Disposition
+    to override either."""
+    body = b"%PDF-1.7\n..."
+    resp = _fake_response(
+        body=body,
+        headers={"Content-Type": "application/pdf"},  # NO Content-Disposition
+    )
+    # urllib's HTTPResponse.geturl() returns the post-redirect URL
+    resp.geturl = lambda: "https://publisher.example.com/articles/2024/great-paper.pdf"
+
+    with patch("urllib.request.urlopen", return_value=resp):
+        result = fetch_url_to_raw("https://doi.org/10.1234/abc", tmp_path)
+
+    # Filename comes from the redirected URL's basename, not "abc"
+    assert result is not None
+    assert result.name == "great-paper.pdf"
+
+
+def test_add_single_file_returns_true_on_success(tmp_path):
+    """The new bool return contract: True when the file was actually
+    indexed. Used by the URL-ingest cleanup path to decide whether the
+    just-downloaded file in raw/ should be unlinked."""
+    from openkb.cli import add_single_file
+    from openkb.converter import ConvertResult
+
+    # Build a minimal KB scaffold
+    (tmp_path / ".openkb").mkdir()
+    (tmp_path / ".openkb" / "config.yaml").write_text("model: gpt-4o-mini\n")
+    (tmp_path / ".openkb" / "hashes.json").write_text("{}")
+    (tmp_path / "raw").mkdir()
+    (tmp_path / "wiki" / "summaries").mkdir(parents=True)
+    (tmp_path / "wiki" / "sources").mkdir(parents=True)
+    (tmp_path / "wiki" / "concepts").mkdir(parents=True)
+    (tmp_path / "wiki" / "log.md").write_text("")
+
+    doc = tmp_path / "raw" / "x.md"
+    doc.write_text("# Hello")
+    source_path = tmp_path / "wiki" / "sources" / "x.md"
+    source_path.write_text("# Hello converted")
+
+    mock_result = ConvertResult(
+        raw_path=doc, source_path=source_path,
+        is_long_doc=False, file_hash="cafe" * 16,
+    )
+
+    with patch("openkb.cli.convert_document", return_value=mock_result), \
+         patch("openkb.cli.asyncio.run"):
+        added = add_single_file(doc, tmp_path)
+
+    assert added is True
+
+
+def test_add_single_file_returns_false_on_skip(tmp_path):
+    from openkb.cli import add_single_file
+    from openkb.converter import ConvertResult
+
+    (tmp_path / ".openkb").mkdir()
+    (tmp_path / ".openkb" / "config.yaml").write_text("model: gpt-4o-mini\n")
+    (tmp_path / ".openkb" / "hashes.json").write_text("{}")
+    (tmp_path / "raw").mkdir()
+    doc = tmp_path / "raw" / "x.md"
+    doc.write_text("# Hello")
+
+    skipped = ConvertResult(skipped=True)
+    with patch("openkb.cli.convert_document", return_value=skipped):
+        added = add_single_file(doc, tmp_path)
+
+    assert added is False
+
+
+def test_url_ingest_cleans_up_orphan_on_dedup_skip(tmp_path, monkeypatch):
+    """End-to-end: when the URL-fetched file is already in the registry,
+    add_single_file returns False and the CLI must unlink it from raw/
+    so the user doesn't accumulate untracked duplicates."""
+    from click.testing import CliRunner
+    from openkb.cli import cli
+    from openkb.converter import ConvertResult
+
+    # Minimal KB
+    (tmp_path / ".openkb").mkdir()
+    (tmp_path / ".openkb" / "config.yaml").write_text("model: gpt-4o-mini\n")
+    (tmp_path / ".openkb" / "hashes.json").write_text("{}")
+    (tmp_path / "raw").mkdir()
+
+    # Fake the URL fetch — write directly to where url_ingest would
+    fetched_path = tmp_path / "raw" / "paper.pdf"
+    fetched_path.write_bytes(b"%PDF-fake")
+
+    runner = CliRunner()
+    # fetch_url_to_raw is lazy-imported inside `add`, so patch it at the
+    # source module — that's where the `from ... import` resolves.
+    with patch("openkb.cli._find_kb_dir", return_value=tmp_path), \
+         patch("openkb.url_ingest.fetch_url_to_raw", return_value=fetched_path), \
+         patch("openkb.cli.convert_document",
+               return_value=ConvertResult(skipped=True)):
+        result = runner.invoke(cli, ["add", "https://example.com/paper.pdf"])
+
+    assert result.exit_code == 0, result.output
+    assert "[SKIP]" in result.output
+    # Orphan cleanup: the URL-fetched file must be gone from raw/.
+    assert not fetched_path.exists()

From d0bb76271a31b0d75b48625dd689e6688703174c Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Sun, 17 May 2026 16:31:42 +0800
Subject: [PATCH 3/4] fix(url-ingest): preserve raw file on pipeline failure,
 fix HTML echo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Round-2 review on commit 47e5ec2 surfaced two issues:

1. add_single_file returned bool, so the URL branch couldn't tell
   "dedup skip" from "mid-pipeline failure". A transient LLM error
   during compile_long_doc would delete the just-downloaded PDF and
   leave a dangling PageIndex entry (hash registration only happens
   on full success, so `openkb remove` couldn't recover it). Now the
   function returns Literal["added","skipped","failed"]; the URL
   branch only unlinks on "skipped" — failures keep the raw file so
   the user can retry without re-downloading.

2. _extract_html echoed the pre-collision filename instead of
   target.name, so on a title collision the user was told the file
   was saved to a path occupied by the previous document. PDF branch
   already used target.name correctly; HTML branch now matches.
---
 openkb/cli.py            |  36 ++++++++------
 openkb/url_ingest.py     |   2 +-
 tests/test_url_ingest.py | 103 ++++++++++++++++++++++++++++++++++-----
 3 files changed, 112 insertions(+), 29 deletions(-)

diff --git a/openkb/cli.py b/openkb/cli.py
index 20f644b9..32c111df 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -14,6 +14,7 @@
 import sys
 import time
 from pathlib import Path
+from typing import Literal
 
 import os
 
@@ -130,7 +131,7 @@ def _find_kb_dir(override: Path | None = None) -> Path | None:
     return None
 
 
-def add_single_file(file_path: Path, kb_dir: Path) -> bool:
+def add_single_file(file_path: Path, kb_dir: Path) -> Literal["added", "skipped", "failed"]:
     """Convert, index, and compile a single document into the knowledge base.
 
     Steps:
@@ -140,11 +141,12 @@ def add_single_file(file_path: Path, kb_dir: Path) -> bool:
     4. Else: compile_short_doc.
 
     Returns:
-        True if the file was newly indexed and compiled. False when the
-        file was skipped as a duplicate (hash already registered) or
-        when any pipeline stage failed — callers that need to clean up
-        the source file (URL-ingest path, for example) can act on a
-        ``False`` return.
+        ``"added"`` on full success, ``"skipped"`` when the file's hash
+        is already in the registry (dedup), or ``"failed"`` when any
+        pipeline stage raised. URL-ingest distinguishes these so it can
+        unlink the just-downloaded raw file on dedup (it would otherwise
+        be an orphan) while preserving it on failure so the user can
+        retry without re-downloading.
     """
     from openkb.agent.compiler import compile_long_doc, compile_short_doc
     from openkb.state import HashRegistry
@@ -163,11 +165,11 @@ def add_single_file(file_path: Path, kb_dir: Path) -> bool:
     except Exception as exc:
         click.echo(f"  [ERROR] Conversion failed: {exc}")
         logger.debug("Conversion traceback:", exc_info=True)
-        return False
+        return "failed"
 
     if result.skipped:
         click.echo(f"  [SKIP] Already in knowledge base: {file_path.name}")
-        return False
+        return "skipped"
 
     doc_name = file_path.stem
     index_result = None  # populated only on the long-doc branch
@@ -181,7 +183,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> bool:
         except Exception as exc:
             click.echo(f"  [ERROR] Indexing failed: {exc}")
             logger.debug("Indexing traceback:", exc_info=True)
-            return False
+            return "failed"
 
         summary_path = kb_dir / "wiki" / "summaries" / f"{doc_name}.md"
         click.echo(f"  Compiling long doc (doc_id={index_result.doc_id})...")
@@ -199,7 +201,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> bool:
                 else:
                     click.echo(f"  [ERROR] Compilation failed: {exc}")
                     logger.debug("Compilation traceback:", exc_info=True)
-                    return False
+                    return "failed"
     else:
         click.echo(f"  Compiling short doc...")
         for attempt in range(2):
@@ -213,7 +215,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> bool:
                 else:
                     click.echo(f"  [ERROR] Compilation failed: {exc}")
                     logger.debug("Compilation traceback:", exc_info=True)
-                    return False
+                    return "failed"
 
     # Register hash only after successful compilation
     if result.file_hash:
@@ -232,7 +234,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> bool:
 
     append_log(kb_dir / "wiki", "ingest", file_path.name)
     click.echo(f"  [OK] {file_path.name} added to knowledge base.")
-    return True
+    return "added"
 
 
 # ---------------------------------------------------------------------------
@@ -426,10 +428,12 @@ def add(ctx, path):
         fetched = fetch_url_to_raw(path, kb_dir)
         if fetched is None:
             return
-        added = add_single_file(fetched, kb_dir)
-        if not added:
-            # Duplicate (or failed mid-pipeline) — drop the just-fetched
-            # file so raw/ doesn't accumulate orphans across retries.
+        outcome = add_single_file(fetched, kb_dir)
+        # Only clean up on dedup-skip. On "failed" we keep the file so
+        # the user can retry (e.g. transient LLM error during compile)
+        # without re-downloading — and so they don't lose data when
+        # indexing has already succeeded but compilation didn't.
+        if outcome == "skipped":
             fetched.unlink(missing_ok=True)
         return
 
diff --git a/openkb/url_ingest.py b/openkb/url_ingest.py
index 6cd444ec..a1b17df2 100644
--- a/openkb/url_ingest.py
+++ b/openkb/url_ingest.py
@@ -208,7 +208,7 @@ def _extract_html(url: str, raw_dir: Path) -> Path | None:
     target.write_text(markdown, encoding="utf-8")
     click.echo(
         f"  Extracted: {title!r}\n"
-        f"  Saved: raw/{filename} ({len(markdown) // 1024 or 1} KB clean markdown)"
+        f"  Saved: raw/{target.name} ({len(markdown) // 1024 or 1} KB clean markdown)"
     )
     return target
 
diff --git a/tests/test_url_ingest.py b/tests/test_url_ingest.py
index 4ea7d0de..b522d4b5 100644
--- a/tests/test_url_ingest.py
+++ b/tests/test_url_ingest.py
@@ -414,8 +414,10 @@ def test_fetch_pdf_picks_unique_name_when_target_exists(tmp_path):
     assert result.read_bytes() == body
 
 
-def test_fetch_html_picks_unique_name_when_target_exists(tmp_path):
-    """Two blog posts both titled 'Introduction' must NOT collide."""
+def test_fetch_html_picks_unique_name_when_target_exists(tmp_path, capsys):
+    """Two blog posts both titled 'Introduction' must NOT collide. The
+    user-facing 'Saved: ...' echo must also reflect the renamed path —
+    otherwise the message lies about where the file actually went."""
     raw_dir = tmp_path / "raw"
     raw_dir.mkdir()
     (raw_dir / "Introduction.md").write_text("first blog post body")
@@ -435,6 +437,8 @@ def test_fetch_html_picks_unique_name_when_target_exists(tmp_path):
     assert (raw_dir / "Introduction.md").read_text() == "first blog post body"
     assert result == raw_dir / "Introduction_2.md"
     assert result.read_text() == second_md
+    out = capsys.readouterr().out
+    assert "Saved: raw/Introduction_2.md" in out
 
 
 def test_fetch_pdf_uses_post_redirect_url_for_filename(tmp_path):
@@ -458,10 +462,10 @@ def test_fetch_pdf_uses_post_redirect_url_for_filename(tmp_path):
     assert result.name == "great-paper.pdf"
 
 
-def test_add_single_file_returns_true_on_success(tmp_path):
-    """The new bool return contract: True when the file was actually
-    indexed. Used by the URL-ingest cleanup path to decide whether the
-    just-downloaded file in raw/ should be unlinked."""
+def test_add_single_file_returns_added_on_success(tmp_path):
+    """Tri-state return contract: ``"added"`` when the file was newly
+    indexed. URL-ingest uses this to decide whether to keep / unlink
+    the just-downloaded file."""
     from openkb.cli import add_single_file
     from openkb.converter import ConvertResult
 
@@ -487,12 +491,12 @@ def test_add_single_file_returns_true_on_success(tmp_path):
 
     with patch("openkb.cli.convert_document", return_value=mock_result), \
          patch("openkb.cli.asyncio.run"):
-        added = add_single_file(doc, tmp_path)
+        outcome = add_single_file(doc, tmp_path)
 
-    assert added is True
+    assert outcome == "added"
 
 
-def test_add_single_file_returns_false_on_skip(tmp_path):
+def test_add_single_file_returns_skipped_on_dedup(tmp_path):
     from openkb.cli import add_single_file
     from openkb.converter import ConvertResult
 
@@ -505,14 +509,48 @@ def test_add_single_file_returns_false_on_skip(tmp_path):
 
     skipped = ConvertResult(skipped=True)
     with patch("openkb.cli.convert_document", return_value=skipped):
-        added = add_single_file(doc, tmp_path)
+        outcome = add_single_file(doc, tmp_path)
 
-    assert added is False
+    assert outcome == "skipped"
+
+
+def test_add_single_file_returns_failed_on_pipeline_error(tmp_path):
+    """A pipeline failure (e.g. transient LLM error during compilation)
+    must be distinguishable from dedup-skip, so URL-ingest can preserve
+    the raw file for retry instead of deleting it."""
+    from openkb.cli import add_single_file
+    from openkb.converter import ConvertResult
+
+    (tmp_path / ".openkb").mkdir()
+    (tmp_path / ".openkb" / "config.yaml").write_text("model: gpt-4o-mini\n")
+    (tmp_path / ".openkb" / "hashes.json").write_text("{}")
+    (tmp_path / "raw").mkdir()
+    (tmp_path / "wiki" / "summaries").mkdir(parents=True)
+    (tmp_path / "wiki" / "sources").mkdir(parents=True)
+    (tmp_path / "wiki" / "log.md").write_text("")
+
+    doc = tmp_path / "raw" / "x.md"
+    doc.write_text("# Hello")
+    source_path = tmp_path / "wiki" / "sources" / "x.md"
+    source_path.write_text("# Hello")
+
+    mock_result = ConvertResult(
+        raw_path=doc, source_path=source_path,
+        is_long_doc=False, file_hash="cafe" * 16,
+    )
+
+    # Make both compile attempts raise to drive the failure path.
+    with patch("openkb.cli.convert_document", return_value=mock_result), \
+         patch("openkb.cli.asyncio.run", side_effect=RuntimeError("LLM 503")), \
+         patch("openkb.cli.time.sleep"):
+        outcome = add_single_file(doc, tmp_path)
+
+    assert outcome == "failed"
 
 
 def test_url_ingest_cleans_up_orphan_on_dedup_skip(tmp_path, monkeypatch):
     """End-to-end: when the URL-fetched file is already in the registry,
-    add_single_file returns False and the CLI must unlink it from raw/
+    add_single_file returns "skipped" and the CLI unlinks it from raw/
     so the user doesn't accumulate untracked duplicates."""
     from click.testing import CliRunner
     from openkb.cli import cli
@@ -541,3 +579,44 @@ def test_url_ingest_cleans_up_orphan_on_dedup_skip(tmp_path, monkeypatch):
     assert "[SKIP]" in result.output
     # Orphan cleanup: the URL-fetched file must be gone from raw/.
     assert not fetched_path.exists()
+
+
+def test_url_ingest_keeps_raw_file_on_pipeline_failure(tmp_path):
+    """The point of the tri-state return: a pipeline failure (e.g. LLM
+    timeout during compilation) must NOT delete the downloaded file —
+    the user can retry without re-downloading, and we don't lose data
+    when indexing has already succeeded but compilation hasn't."""
+    from click.testing import CliRunner
+    from openkb.cli import cli
+    from openkb.converter import ConvertResult
+
+    (tmp_path / ".openkb").mkdir()
+    (tmp_path / ".openkb" / "config.yaml").write_text("model: gpt-4o-mini\n")
+    (tmp_path / ".openkb" / "hashes.json").write_text("{}")
+    (tmp_path / "raw").mkdir()
+    (tmp_path / "wiki" / "summaries").mkdir(parents=True)
+    (tmp_path / "wiki" / "sources").mkdir(parents=True)
+    (tmp_path / "wiki" / "log.md").write_text("")
+
+    fetched_path = tmp_path / "raw" / "paper.pdf"
+    fetched_path.write_bytes(b"%PDF-fake")
+    source_path = tmp_path / "wiki" / "sources" / "paper.md"
+    source_path.write_text("# fake")
+
+    mock_result = ConvertResult(
+        raw_path=fetched_path, source_path=source_path,
+        is_long_doc=False, file_hash="cafe" * 16,
+    )
+
+    runner = CliRunner()
+    with patch("openkb.cli._find_kb_dir", return_value=tmp_path), \
+         patch("openkb.url_ingest.fetch_url_to_raw", return_value=fetched_path), \
+         patch("openkb.cli.convert_document", return_value=mock_result), \
+         patch("openkb.cli.asyncio.run", side_effect=RuntimeError("LLM 503")), \
+         patch("openkb.cli.time.sleep"):
+        result = runner.invoke(cli, ["add", "https://example.com/paper.pdf"])
+
+    assert result.exit_code == 0, result.output
+    assert "[ERROR] Compilation failed" in result.output
+    # The raw file must be preserved so the user can retry.
+    assert fetched_path.exists()

From 46adaab2aa5db241e9fa5fe2e37536c48d4cf8a8 Mon Sep 17 00:00:00 2001
From: mountain <kose2livs@gmail.com>
Date: Mon, 18 May 2026 10:29:06 +0800
Subject: [PATCH 4/4] chore(tests): drop unused 'from pathlib import Path'
 import

Flagged by github-code-quality bot on PR #55. The import was left over
from an earlier draft; nothing in the file references Path.
---
 tests/test_url_ingest.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_url_ingest.py b/tests/test_url_ingest.py
index b522d4b5..1b8548ee 100644
--- a/tests/test_url_ingest.py
+++ b/tests/test_url_ingest.py
@@ -2,7 +2,6 @@
 from __future__ import annotations
 
 import io
-from pathlib import Path
 from unittest.mock import MagicMock, patch
 
 from openkb.url_ingest import (