diff --git a/README.md b/README.md
index 4dd51c47..02fd401c 100644
--- a/README.md
+++ b/README.md
@@ -72,7 +72,8 @@ openkb init
# 3. Add documents
openkb add paper.pdf
-openkb add ~/papers/ # Add a whole directory
+openkb add ~/papers/ # Add a whole directory
+openkb add https://arxiv.org/pdf/2509.11420 # Or fetch from a URL
# 4. Ask a question
openkb query "What are the main findings?"
@@ -148,7 +149,7 @@ A single source might touch 10-15 wiki pages. Knowledge accumulates: each docume
| Command | Description |
|---|---|
| `openkb init` | Initialize a new knowledge base (interactive) |
-| openkb add <file_or_dir> | Add documents and compile to wiki |
+| openkb add <file_or_dir_or_URL> | Add documents and compile to wiki. URL ingest auto-detects PDF (saved as `.pdf` → PageIndex / markitdown) vs HTML (trafilatura main-content extract → `.md`) |
| openkb remove <doc> | Remove a document and clean up its wiki pages, images, registry, and PageIndex state (use `--dry-run` to preview, `--keep-raw` / `--keep-empty-concepts` to retain artifacts) |
| openkb query "question" | Ask a question over the knowledge base (use `--save` to save the answer to `wiki/explorations/`) |
| `openkb chat` | Start an interactive multi-turn chat (use `--resume`, `--list`, `--delete` to manage sessions) |
diff --git a/openkb/cli.py b/openkb/cli.py
index 5197a930..32c111df 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -14,6 +14,7 @@
import sys
import time
from pathlib import Path
+from typing import Literal
import os
@@ -130,7 +131,7 @@ def _find_kb_dir(override: Path | None = None) -> Path | None:
return None
-def add_single_file(file_path: Path, kb_dir: Path) -> None:
+def add_single_file(file_path: Path, kb_dir: Path) -> Literal["added", "skipped", "failed"]:
"""Convert, index, and compile a single document into the knowledge base.
Steps:
@@ -138,6 +139,14 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
2. Convert the document (hash-check; skip if already known).
3. If long doc: run PageIndex then compile_long_doc.
4. Else: compile_short_doc.
+
+ Returns:
+ ``"added"`` on full success, ``"skipped"`` when the file's hash
+ is already in the registry (dedup), or ``"failed"`` when any
+ pipeline stage raised. URL-ingest distinguishes these so it can
+ unlink the just-downloaded raw file on dedup (it would otherwise
+ be an orphan) while preserving it on failure so the user can
+ retry without re-downloading.
"""
from openkb.agent.compiler import compile_long_doc, compile_short_doc
from openkb.state import HashRegistry
@@ -156,11 +165,11 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
except Exception as exc:
click.echo(f" [ERROR] Conversion failed: {exc}")
logger.debug("Conversion traceback:", exc_info=True)
- return
+ return "failed"
if result.skipped:
click.echo(f" [SKIP] Already in knowledge base: {file_path.name}")
- return
+ return "skipped"
doc_name = file_path.stem
index_result = None # populated only on the long-doc branch
@@ -174,7 +183,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
except Exception as exc:
click.echo(f" [ERROR] Indexing failed: {exc}")
logger.debug("Indexing traceback:", exc_info=True)
- return
+ return "failed"
summary_path = kb_dir / "wiki" / "summaries" / f"{doc_name}.md"
click.echo(f" Compiling long doc (doc_id={index_result.doc_id})...")
@@ -192,7 +201,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
else:
click.echo(f" [ERROR] Compilation failed: {exc}")
logger.debug("Compilation traceback:", exc_info=True)
- return
+ return "failed"
else:
click.echo(f" Compiling short doc...")
for attempt in range(2):
@@ -206,7 +215,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
else:
click.echo(f" [ERROR] Compilation failed: {exc}")
logger.debug("Compilation traceback:", exc_info=True)
- return
+ return "failed"
# Register hash only after successful compilation
if result.file_hash:
@@ -225,6 +234,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
append_log(kb_dir / "wiki", "ingest", file_path.name)
click.echo(f" [OK] {file_path.name} added to knowledge base.")
+ return "added"
# ---------------------------------------------------------------------------
@@ -395,12 +405,38 @@ def init(language):
@click.argument("path")
@click.pass_context
def add(ctx, path):
- """Add a document or directory of documents at PATH to the knowledge base."""
+ """Add a document or directory of documents at PATH to the knowledge base.
+
+ PATH may be a local file, a local directory (which is walked
+ recursively for supported extensions), or an http(s) URL. URLs are
+ fetched into ``raw/`` first: PDF responses (by Content-Type and
+ magic-byte sniff) are saved as ``.pdf``; HTML responses are run
+ through trafilatura's main-content extractor and saved as ``.md``.
+ """
kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override"))
if kb_dir is None:
click.echo("No knowledge base found. Run `openkb init` first.")
return
+ # URL ingest: download into raw/ first, then call add_single_file
+ # explicitly so we can clean up the just-downloaded file if it
+ # turns out to be a duplicate (registry already has its hash).
+ # Without this, re-adding the same URL leaves an orphan in raw/
+ # that the registry can't reach via openkb remove.
+ from openkb.url_ingest import looks_like_url, fetch_url_to_raw
+ if looks_like_url(path):
+ fetched = fetch_url_to_raw(path, kb_dir)
+ if fetched is None:
+ return
+ outcome = add_single_file(fetched, kb_dir)
+ # Only clean up on dedup-skip. On "failed" we keep the file so
+ # the user can retry (e.g. transient LLM error during compile)
+ # without re-downloading — and so they don't lose data when
+ # indexing has already succeeded but compilation didn't.
+ if outcome == "skipped":
+ fetched.unlink(missing_ok=True)
+ return
+
target = Path(path)
if not target.exists():
click.echo(f"Path does not exist: {path}")
diff --git a/openkb/url_ingest.py b/openkb/url_ingest.py
new file mode 100644
index 00000000..a1b17df2
--- /dev/null
+++ b/openkb/url_ingest.py
@@ -0,0 +1,280 @@
+"""Fetch remote URLs into the knowledge base's ``raw/`` directory.
+
+This module is the input-acquisition layer for ``openkb add <url>``: it
+takes an http(s) URL, decides whether it points at a PDF or an HTML
+document (using both the HTTP ``Content-Type`` header and a magic-byte
+sniff so a mistyped header doesn't fool us), saves a local file under
+``raw/``, and hands the path back to the caller.
+
+The caller (``openkb.cli.add``) then dispatches the saved path to the
+normal local-file ingest pipeline (``add_single_file`` →
+``convert_document`` → markitdown / PageIndex), so all the existing
+short-vs-long-doc routing applies automatically based on the file
+extension and page count.
+
+PDF responses are streamed to disk in chunks (large papers can be tens
+of MB). HTML responses are passed through trafilatura's main-content
+extractor — saving the raw HTML directly would feed nav/footer/cookie
+chrome into the LLM and produce noisy summaries.
+"""
+from __future__ import annotations
+
+import re
+import urllib.error
+import urllib.request
+from pathlib import Path
+from urllib.parse import unquote, urlparse
+
+import click
+
+_USER_AGENT = "openkb/url-fetcher (+https://github.com/VectifyAI/OpenKB)"
+_TIMEOUT_SECONDS = 30
+_CHUNK_BYTES = 64 * 1024
+_SNIFF_BYTES = 512
+_HTML_MIN_EXTRACT_CHARS = 300
+_MAX_FILENAME_STEM = 80
+
+
+def looks_like_url(s: str) -> bool:
+    """Cheap check used by ``openkb add`` to branch into URL ingest.
+
+    Only ``http://`` and ``https://`` prefixes count as URLs. URI schemes
+    are case-insensitive (RFC 3986 section 3.1), so ``HTTPS://host/x`` is
+    recognized too instead of being mistaken for a local path. Local
+    paths, bare filenames, and the empty string return ``False``.
+    """
+    # Lower-case only the prefix that can hold the scheme ("https://" is
+    # 8 characters) rather than the whole, possibly long, argument.
+    return s[:8].lower().startswith(("http://", "https://"))
+
+
+def _sniff_content_type(head: bytes, declared: str) -> str:
+    """Classify a response as ``'pdf'``, ``'html'``, or ``'unknown'``.
+
+    ``head`` is the first few bytes of the body and ``declared`` is the
+    server's ``Content-Type`` header value. The body prefix is consulted
+    first and overrides the header when the two disagree (a PDF served
+    as ``application/octet-stream``, or an HTML interstitial served as
+    ``application/pdf``). The header is only used when the prefix is
+    inconclusive.
+    """
+    # Magic bytes: a PDF starts with "%PDF-".
+    if head[:5] == b"%PDF-":
+        return "pdf"
+    # Anything whose first byte after BOM bytes / leading whitespace is
+    # "<" is treated as markup.
+    if head.lstrip(b" \t\r\n\xef\xbb\xbf").startswith(b"<"):
+        return "html"
+
+    # Header fallback: drop parameters such as "; charset=utf-8".
+    media_type = declared.partition(";")[0].strip().lower()
+    if media_type == "application/pdf":
+        return "pdf"
+    if media_type == "application/xhtml+xml" or media_type.startswith("text/html"):
+        return "html"
+    return "unknown"
+
+
+def _sanitize_filename(name: str, ext: str) -> str:
+    """Turn an arbitrary name into a shell- and filesystem-safe filename.
+
+    Steps, in order:
+
+    1. Percent-escapes in ``name`` are URL-decoded.
+    2. A trailing extension is removed **only if it equals ``ext``**
+       (case-insensitively). ``"2509.11420"`` therefore keeps its dot
+       when saved as ``.pdf``: the dot belongs to the arxiv identifier.
+    3. Every run of characters outside ``[a-zA-Z0-9._-]`` becomes a
+       single ``-``; repeated ``-`` collapse; leading and trailing
+       ``-``, ``.`` and ``_`` are trimmed.
+    4. The stem is cut to ``_MAX_FILENAME_STEM`` characters and trimmed
+       again so the cut cannot leave a dangling separator.
+
+    Returns ``<stem><ext>``, or ``document<ext>`` when nothing usable
+    is left of the stem.
+    """
+    stem = unquote(name)
+    if ext and stem.lower().endswith(ext.lower()):
+        stem = stem[: len(stem) - len(ext)]
+    stem = re.sub(r"[^a-zA-Z0-9._-]+", "-", stem)
+    stem = re.sub(r"-+", "-", stem).strip("-._")
+    stem = stem[:_MAX_FILENAME_STEM].rstrip("-._")
+    return f"{stem or 'document'}{ext}"
+
+
+def _parse_content_disposition_filename(header: str | None) -> str | None:
+    """Extract a filename hint from a ``Content-Disposition`` header.
+
+    Handles three forms (in priority order):
+
+    1. ``filename*=<charset>'<lang>'<percent-encoded>`` (RFC 5987). The
+       value is percent-decoded using the declared charset, so
+       ``ISO-8859-1''na%EFve.pdf`` yields ``naïve.pdf``. A missing or
+       unrecognized charset label falls back to UTF-8.
+    2. ``filename="quoted with spaces.pdf"``
+    3. ``filename=unquoted-no-spaces.pdf``
+
+    The parameter name is matched case-insensitively. Returns ``None``
+    when ``header`` is empty or carries no filename parameter.
+    """
+    if not header:
+        return None
+    # RFC 5987 extended form first
+    m = re.search(
+        r"filename\*=(?:([\w-]+)'[\w-]*')?([^;]+)", header, re.IGNORECASE,
+    )
+    if m:
+        value = m.group(2).strip()
+        try:
+            return unquote(value, encoding=m.group(1) or "utf-8")
+        except LookupError:
+            # Charset label Python doesn't know: best effort as UTF-8.
+            return unquote(value)
+    # Quoted form (may contain spaces / parens / commas)
+    m = re.search(r'filename="([^"]+)"', header, re.IGNORECASE)
+    if m:
+        return m.group(1)
+    # Unquoted form (stops at whitespace or semicolon)
+    m = re.search(r"filename=([^\s;]+)", header, re.IGNORECASE)
+    if m:
+        return m.group(1)
+    return None
+
+
+def _pdf_filename(url: str, content_disposition: str | None) -> str:
+    """Choose the ``.pdf`` filename for a downloaded PDF.
+
+    Candidates are tried in this order: the filename carried by the
+    ``Content-Disposition`` header, the last path segment of ``url``,
+    the URL's host, and finally the literal ``document``. Whichever
+    wins is passed through :func:`_sanitize_filename`.
+    """
+    hinted = _parse_content_disposition_filename(content_disposition)
+    if hinted:
+        return _sanitize_filename(hinted, ".pdf")
+
+    parts = urlparse(url)
+    basename = parts.path.rsplit("/", 1)[-1]
+    candidate = (basename or parts.netloc).strip()
+    return _sanitize_filename(candidate or "document", ".pdf")
+
+
+def _unique_path(target: Path) -> Path:
+    """Return a path next to ``target`` that does not exist yet.
+
+    ``target`` itself is returned when it is free. Otherwise ``_2``,
+    ``_3``, … is inserted between the stem and the suffix until an
+    unused name turns up, so two URLs that sanitize to the same
+    filename (say, two posts both titled "Introduction") never
+    overwrite each other in ``raw/``.
+
+    Raises:
+        RuntimeError: if every numbered candidate is already taken.
+    """
+    if not target.exists():
+        return target
+
+    numbered = (
+        target.parent / f"{target.stem}_{n}{target.suffix}"
+        for n in range(2, 10_000)
+    )
+    free = next((path for path in numbered if not path.exists()), None)
+    if free is None:
+        raise RuntimeError(
+            f"Could not find a free filename for {target} after 10k attempts"
+        )
+    return free
+
+
+def _download_pdf_chunked(response, head_bytes: bytes, target: Path) -> None:
+    """Write ``head_bytes`` followed by the rest of ``response`` to ``target``.
+
+    ``head_bytes`` is whatever the caller already read from the response
+    (the content-type sniff). The remaining body is pulled
+    ``_CHUNK_BYTES`` at a time so very large PDFs (50+ MB) don't sit in
+    RAM.
+
+    If reading or writing fails part-way, the partially written file is
+    removed before the exception is re-raised: a truncated PDF in
+    ``raw/`` would be an orphan that no registry entry points at.
+    """
+    try:
+        with open(target, "wb") as fh:
+            if head_bytes:
+                fh.write(head_bytes)
+            while chunk := response.read(_CHUNK_BYTES):
+                fh.write(chunk)
+    except BaseException:
+        # BaseException on purpose: Ctrl-C in the middle of a large
+        # download must not leave a half-written file behind either.
+        target.unlink(missing_ok=True)
+        raise
+
+
+def _extract_html(url: str, raw_dir: Path) -> Path | None:
+    """Download ``url`` with trafilatura and save its main content as
+    Markdown under ``raw_dir``.
+
+    The filename is derived from the page title reported by
+    trafilatura's metadata extractor, falling back to the URL itself,
+    and is de-duplicated against files already in ``raw_dir``.
+
+    Returns the path that was written, or ``None`` when the page could
+    not be fetched or no content could be extracted. Extractions
+    shorter than ``_HTML_MIN_EXTRACT_CHARS`` are still saved, with a
+    warning on stderr: JS-rendered or paywalled pages often produce
+    near-empty extractions.
+    """
+    import trafilatura
+
+    page = trafilatura.fetch_url(url)
+    if not page:
+        click.echo(f" [ERROR] Could not fetch URL: {url}", err=True)
+        return None
+
+    markdown = trafilatura.extract(
+        page, output_format="markdown", include_links=True,
+    )
+    if not markdown:
+        click.echo(
+            " [ERROR] No main content extracted — page may be empty, "
+            "JS-rendered, or paywalled.",
+            err=True,
+        )
+        return None
+
+    extracted_chars = len(markdown)
+    if extracted_chars < _HTML_MIN_EXTRACT_CHARS:
+        click.echo(
+            f" [WARN] Only {extracted_chars} chars extracted — page may be "
+            f"JS-rendered or behind a login. Saving anyway; inspect the "
+            f"resulting wiki entry and use `openkb remove` if it's empty.",
+            err=True,
+        )
+
+    # Title from the page metadata when there is one, else the URL.
+    title = url
+    metadata = trafilatura.extract_metadata(page)
+    if metadata and metadata.title:
+        title = metadata.title
+
+    # _unique_path keeps a second page with the same title from
+    # overwriting the first one's file in raw/.
+    target = _unique_path(raw_dir / _sanitize_filename(title, ".md"))
+    target.write_text(markdown, encoding="utf-8")
+
+    size_kb = max(1, extracted_chars // 1024)
+    click.echo(
+        f" Extracted: {title!r}\n"
+        f" Saved: raw/{target.name} ({size_kb} KB clean markdown)"
+    )
+    return target
+
+
+def fetch_url_to_raw(url: str, kb_dir: Path) -> Path | None:
+    """Fetch ``url`` into ``<kb_dir>/raw/`` and return the local path.
+
+    Routing is decided by HTTP ``Content-Type`` validated against magic
+    bytes (in case the server lies):
+
+    - PDF → urllib chunked download → ``raw/<name>.pdf``
+    - HTML → trafilatura main-content extract → ``raw/<title>.md``
+    - anything else → error, returns None
+
+    Every network failure is reported on stderr and yields ``None``.
+    That includes failures *after* the connection was opened (timeout,
+    reset, or truncated body while sniffing or streaming); in that case
+    any partially written PDF is removed so ``raw/`` never holds a
+    truncated file.
+
+    The caller then hands the saved path to ``add_single_file``, so the
+    existing PageIndex / markitdown routing by file extension and page
+    count takes over from there.
+    """
+    import http.client
+
+    raw_dir = kb_dir / "raw"
+    raw_dir.mkdir(parents=True, exist_ok=True)
+
+    click.echo(f"Downloading: {url}")
+
+    request = urllib.request.Request(
+        url, headers={"User-Agent": _USER_AGENT, "Accept": "*/*"},
+    )
+    try:
+        response = urllib.request.urlopen(request, timeout=_TIMEOUT_SECONDS)
+    except urllib.error.HTTPError as exc:
+        click.echo(f" [ERROR] HTTP {exc.code} {exc.reason}", err=True)
+        return None
+    except urllib.error.URLError as exc:
+        click.echo(f" [ERROR] Network error: {exc.reason}", err=True)
+        return None
+    except Exception as exc:
+        click.echo(f" [ERROR] Fetch failed: {exc}", err=True)
+        return None
+
+    target: Path | None = None
+    try:
+        with response:
+            declared = response.headers.get("Content-Type", "") or ""
+            head_bytes = response.read(_SNIFF_BYTES)
+            actual = _sniff_content_type(head_bytes, declared)
+
+            if actual == "pdf":
+                # Derive the filename from the *post-redirect* URL — urllib
+                # follows redirects by default, so the user-typed URL may
+                # not be the URL that actually served the bytes (DOI / short
+                # link resolvers, mirror redirects, etc.). Falls back to the
+                # original input when the response doesn't expose a final
+                # URL.
+                final_url = response.geturl() or url
+                filename = _pdf_filename(
+                    final_url, response.headers.get("Content-Disposition"),
+                )
+                target = _unique_path(raw_dir / filename)
+                _download_pdf_chunked(response, head_bytes, target)
+    except (OSError, http.client.HTTPException) as exc:
+        # OSError covers socket timeouts, resets, and disk errors;
+        # HTTPException covers truncated bodies (IncompleteRead).
+        if target is not None:
+            target.unlink(missing_ok=True)
+        click.echo(f" [ERROR] Download failed: {exc}", err=True)
+        return None
+
+    if target is not None:
+        size_mb = target.stat().st_size / (1024 * 1024)
+        click.echo(f" Saved: raw/{target.name} ({size_mb:.1f} MB PDF)")
+        return target
+
+    if actual == "html":
+        # The urllib response is closed by now; trafilatura makes its
+        # own request for the page.
+        return _extract_html(url, raw_dir)
+
+    click.echo(
+        f" [ERROR] Unsupported content type {declared!r} for URL ingest. "
+        "Download the file manually and pass its path to `openkb add` instead.",
+        err=True,
+    )
+    return None
diff --git a/pyproject.toml b/pyproject.toml
index 7fec212c..1d0ec85a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ keywords = ["ai", "rag", "retrieval", "knowledge-base", "llm", "pageindex", "age
dependencies = [
"pageindex==0.3.0.dev1",
"markitdown[all]",
+ "trafilatura>=2.0",
"click>=8.0",
"watchdog>=3.0",
"litellm",
diff --git a/tests/test_url_ingest.py b/tests/test_url_ingest.py
new file mode 100644
index 00000000..1b8548ee
--- /dev/null
+++ b/tests/test_url_ingest.py
@@ -0,0 +1,621 @@
+"""Tests for `openkb.url_ingest` — the URL → raw/ input-acquisition layer."""
+from __future__ import annotations
+
+import io
+from unittest.mock import MagicMock, patch
+
+from openkb.url_ingest import (
+ _parse_content_disposition_filename,
+ _pdf_filename,
+ _sanitize_filename,
+ _sniff_content_type,
+ _unique_path,
+ fetch_url_to_raw,
+ looks_like_url,
+)
+
+
+# ---------------------------------------------------------------------------
+# Pure helpers (no I/O)
+# ---------------------------------------------------------------------------
+
+
+def test_looks_like_url_accepts_http_and_https():
+    """Both supported schemes, with or without a path, count as URLs."""
+    for candidate in ("http://example.com", "https://example.com/foo"):
+        assert looks_like_url(candidate) is True
+
+
+def test_looks_like_url_rejects_paths_and_filenames():
+    """Local paths, bare filenames, and the empty string are not URLs."""
+    for candidate in ("/tmp/foo.pdf", "./relative", "foo.pdf", ""):
+        assert looks_like_url(candidate) is False
+
+
+# Content-type sniffing — magic bytes override declared header
+
+
+def test_sniff_pdf_magic_wins_over_octet_stream():
+    """A ``%PDF-`` prefix is classified as PDF even when the server
+    declares the generic ``application/octet-stream`` (some CDNs do)."""
+    head = b"%PDF-1.4\n..."
+    declared = "application/octet-stream"
+    assert _sniff_content_type(head, declared) == "pdf"
+
+
+def test_sniff_html_magic_wins_over_pdf_header():
+    """Some servers serve an HTML interstitial 'click to download' page with
+    Content-Type: application/pdf. Magic bytes must override."""
+    # The head has to start with markup. An empty head carries no magic
+    # bytes, so the declared application/pdf would win.
+    assert _sniff_content_type(b"<!DOCTYPE html><html>", "application/pdf") == "html"
+
+
+def test_sniff_html_magic_handles_bom_and_whitespace():
+    """A UTF-8 BOM or leading whitespace in front of the first ``<`` must
+    not hide the markup. The declared type is empty, so only the body
+    bytes can produce ``"html"`` here."""
+    assert _sniff_content_type(b"\xef\xbb\xbf<html>", "") == "html"
+    assert _sniff_content_type(b"  \n<html>", "") == "html"
+
+
+def test_sniff_falls_back_to_declared_when_no_magic_match():
+    """With no recognizable body prefix, the declared Content-Type
+    (parameters ignored) decides."""
+    cases = [
+        (b"\x00\x00", "application/pdf", "pdf"),
+        (b"random", "text/html; charset=utf-8", "html"),
+        (b"random", "application/xhtml+xml", "html"),
+    ]
+    for head, declared, expected in cases:
+        assert _sniff_content_type(head, declared) == expected
+
+
+def test_sniff_returns_unknown_for_unsupported_types():
+    """Neither magic bytes nor a PDF/HTML header: the result is ``unknown``."""
+    for declared in ("image/jpeg", "application/json", ""):
+        assert _sniff_content_type(b"binary", declared) == "unknown"
+
+
+# Filename sanitization
+
+
+def test_sanitize_preserves_arxiv_id_with_dot():
+    """The dot in an arxiv ID such as `2509.11420` belongs to the
+    identifier. It is not an extension and must survive when `.pdf` is
+    appended."""
+    arxiv_id = "2509.11420"
+    assert _sanitize_filename(arxiv_id, ".pdf") == f"{arxiv_id}.pdf"
+
+
+def test_sanitize_strips_matching_extension_then_re_adds_it():
+    """An existing `.pdf` extension, in any letter case, is replaced by
+    the target extension rather than doubled."""
+    for name in ("paper.pdf", "paper.PDF"):
+        assert _sanitize_filename(name, ".pdf") == "paper.pdf"
+
+
+def test_sanitize_replaces_shell_unsafe_chars():
+    """Spaces, parentheses, and path / glob metacharacters become dashes."""
+    cases = [
+        ("hello world (1).pdf", ".pdf", "hello-world-1.pdf"),
+        ("a:b/c\\d?e*f", ".md", "a-b-c-d-e-f.md"),
+    ]
+    for name, ext, expected in cases:
+        assert _sanitize_filename(name, ext) == expected
+
+
+def test_sanitize_collapses_repeated_dashes_and_trims():
+    """Runs of dashes collapse to one and edge separators are trimmed."""
+    # Underscores belong to the allowed set [a-zA-Z0-9._-], so they pass
+    # through untouched. Only runs of disallowed characters turn into a
+    # dash, repeated dashes collapse, and leading/trailing dashes, dots
+    # and underscores are stripped.
+    cases = [
+        ("a___b---c", ".pdf", "a___b-c.pdf"),
+        ("---trim---", ".md", "trim.md"),
+        ("a b c d", ".md", "a-b-c-d.md"),
+    ]
+    for name, ext, expected in cases:
+        assert _sanitize_filename(name, ext) == expected
+
+
+def test_sanitize_caps_stem_at_80_chars():
+    """A 200-character stem is cut to 80 characters before the extension."""
+    long_name = "a" * 200
+    expected = "a" * 80 + ".pdf"
+    assert _sanitize_filename(long_name, ".pdf") == expected
+
+
+def test_sanitize_falls_back_to_document_when_empty():
+    """Names that sanitize to nothing are saved as ``document<ext>``."""
+    cases = [
+        ("", ".md", "document.md"),
+        ("...", ".md", "document.md"),
+        ("///", ".pdf", "document.pdf"),
+    ]
+    for name, ext, expected in cases:
+        assert _sanitize_filename(name, ext) == expected
+
+
+# Content-Disposition parsing
+
+
+def test_content_disposition_quoted_with_spaces():
+    """The quoted form keeps spaces, commas, and parentheses intact."""
+    expected = "My Paper, v3 (final).pdf"
+    header = f'attachment; filename="{expected}"'
+    assert _parse_content_disposition_filename(header) == expected
+
+
+def test_content_disposition_unquoted_simple():
+    """An unquoted token after ``filename=`` is returned as-is."""
+    header = "attachment; filename=foo.pdf"
+    assert _parse_content_disposition_filename(header) == "foo.pdf"
+
+
+def test_content_disposition_rfc5987_extended():
+    """``filename*=UTF-8''<percent-encoded>`` is the modern form for
+    non-ASCII names; the value comes back percent-decoded."""
+    header = "attachment; filename*=UTF-8''My%20Paper%20%C3%A9.pdf"
+    parsed = _parse_content_disposition_filename(header)
+    assert parsed == "My Paper é.pdf"
+
+
+def test_content_disposition_none_or_missing_filename():
+    """No header, or a header without a filename parameter, gives ``None``."""
+    for header in (None, "attachment", "inline"):
+        assert _parse_content_disposition_filename(header) is None
+
+
+# _pdf_filename: header → URL basename fallback chain
+
+
+def test_pdf_filename_prefers_content_disposition():
+    """A Content-Disposition filename beats the URL basename (``dl``)."""
+    url = "https://x.com/dl?id=123"
+    header = 'attachment; filename="My Paper.pdf"'
+    assert _pdf_filename(url, header) == "My-Paper.pdf"
+
+
+def test_pdf_filename_falls_back_to_url_basename():
+    """Without a header, the percent-decoded and sanitized last path
+    segment of the URL is used."""
+    url = (
+        "https://cdn.example.com/abc/"
+        "69fe2a55_The-Founders-Playbook-05062026_v3%20(1).pdf"
+    )
+    expected = "69fe2a55_The-Founders-Playbook-05062026_v3-1.pdf"
+    assert _pdf_filename(url, None) == expected
+
+
+def test_pdf_filename_handles_arxiv_pdf_url_without_extension():
+    """arxiv serves PDFs from URLs like `arxiv.org/pdf/2509.11420`, with no
+    `.pdf` at the end. The dot inside the ID must be kept (it is not an
+    extension) and `.pdf` appended after it."""
+    url = "https://arxiv.org/pdf/2509.11420"
+    assert _pdf_filename(url, None) == "2509.11420.pdf"
+
+
+def test_pdf_filename_falls_back_to_host_when_path_empty():
+    """A URL whose path ends in ``/`` has no basename, so the host is used."""
+    assert _pdf_filename("https://example.com/", None) == "example.com.pdf"
+
+
+# ---------------------------------------------------------------------------
+# fetch_url_to_raw — integration with urllib + trafilatura mocked
+# ---------------------------------------------------------------------------
+
+
+def _fake_response(*, body: bytes, headers: dict[str, str]):
+    """Build a stand-in for a urllib response with the given body and headers.
+
+    Header lookup is case-insensitive, as it is on real responses, so
+    the tests don't depend on which spelling `fetch_url_to_raw` uses.
+    The object works as a context manager, serves ``body`` through
+    ``read``, and reports an empty ``geturl()``.
+    """
+    class _CaseInsensitiveHeaders:
+        def __init__(self, raw):
+            self._lowered = {name.lower(): value for name, value in raw.items()}
+
+        def get(self, name, default=None):
+            return self._lowered.get(name.lower(), default)
+
+    fake = MagicMock()
+    fake.headers = _CaseInsensitiveHeaders(headers)
+    # read(N) hands out the next N bytes; read() with no argument
+    # returns whatever is left.
+    fake.read = io.BytesIO(body).read
+    # An empty geturl() makes `response.geturl() or url` in the code
+    # under test fall through to the URL the test passed in.
+    fake.geturl = lambda: ""
+    fake.__enter__ = lambda self: fake
+    fake.__exit__ = lambda *a: None
+    return fake
+
+
+def test_fetch_pdf_writes_chunked_to_raw_dir(tmp_path):
+    """Full PDF path: urlopen → magic-byte sniff → chunked write, with
+    the filename taken from the URL basename."""
+    payload = b"%PDF-1.4\n" + b"x" * 100_000  # 100 KB PDF
+    fake = _fake_response(
+        body=payload, headers={"Content-Type": "application/pdf"},
+    )
+
+    with patch("urllib.request.urlopen", return_value=fake):
+        saved = fetch_url_to_raw("https://arxiv.org/pdf/2509.11420", tmp_path)
+
+    assert saved is not None
+    assert saved.name == "2509.11420.pdf"
+    assert saved.exists()
+    assert saved.read_bytes() == payload
+
+
+def test_fetch_pdf_with_lying_octet_stream_header(tmp_path):
+    """The header says octet-stream but the body opens with %PDF: the
+    magic bytes decide, and the saved file gets a .pdf extension."""
+    payload = b"%PDF-1.7\n" + b"\x00" * 1000
+    fake = _fake_response(
+        body=payload, headers={"Content-Type": "application/octet-stream"},
+    )
+
+    with patch("urllib.request.urlopen", return_value=fake):
+        saved = fetch_url_to_raw("https://cdn.example.com/a/b/file", tmp_path)
+
+    assert saved is not None
+    assert saved.suffix == ".pdf"
+    assert saved.read_bytes() == payload
+
+
+def test_fetch_pdf_chunks_a_very_large_body(tmp_path):
+    """A synthetic 1 MB body, many chunks long, is written out in full."""
+    payload = b"%PDF-1.4\n" + b"a" * (1024 * 1024)
+    fake = _fake_response(
+        body=payload, headers={"Content-Type": "application/pdf"},
+    )
+
+    with patch("urllib.request.urlopen", return_value=fake):
+        saved = fetch_url_to_raw("https://x.com/big.pdf", tmp_path)
+
+    assert saved is not None
+    assert saved.stat().st_size == len(payload)
+
+
+def test_fetch_pdf_uses_content_disposition_filename(tmp_path):
+    """The Content-Disposition filename, sanitized, names the saved file."""
+    response_headers = {
+        "Content-Type": "application/pdf",
+        "Content-Disposition": 'attachment; filename="My Paper, v3.pdf"',
+    }
+    fake = _fake_response(body=b"%PDF-1.4\n...", headers=response_headers)
+
+    with patch("urllib.request.urlopen", return_value=fake):
+        saved = fetch_url_to_raw("https://x.com/dl?id=1", tmp_path)
+
+    # The comma and the spaces around it are sanitized to a single dash.
+    assert saved.name == "My-Paper-v3.pdf"
+
+
+def test_fetch_html_routes_to_trafilatura(tmp_path):
+    """HTML responses skip urllib's body (we already consumed the sniff
+    head) and hand the URL to trafilatura.fetch_url for proper anti-scrape
+    handling. trafilatura.extract gives clean markdown which we save as .md."""
+    # The head starts with real markup, so the HTML decision comes from
+    # the body bytes and not only from the declared Content-Type.
+    sniff_head = b"<!DOCTYPE html>\n<html>..."
+    resp = _fake_response(
+        body=sniff_head + b"<body>nav nav nav</body></html>",
+        headers={"Content-Type": "text/html; charset=utf-8"},
+    )
+
+    fake_md = "# Real Article Title\n\nThis is the body content. " * 20  # ~1 KB
+    fake_meta = MagicMock()
+    fake_meta.title = "Real Article Title"
+
+    with patch("urllib.request.urlopen", return_value=resp), \
+            patch("trafilatura.fetch_url", return_value="<html>...the real HTML...</html>"), \
+            patch("trafilatura.extract", return_value=fake_md), \
+            patch("trafilatura.extract_metadata", return_value=fake_meta):
+        result = fetch_url_to_raw("https://blog.example.com/post", tmp_path)
+
+    assert result is not None
+    assert result.name == "Real-Article-Title.md"
+    assert result.read_text(encoding="utf-8") == fake_md
+
+
+def test_fetch_html_warns_on_short_extraction(tmp_path, capsys):
+    """JS-rendered pages produce near-empty extractions. The save still
+    happens (so the user can inspect what we got) but a stderr warning
+    surfaces the suspicion."""
+    sniff_head = b"<html><body><div id='root'></div></body></html>"
+    resp = _fake_response(body=sniff_head, headers={"Content-Type": "text/html"})
+
+    short_md = "# Title only"  # 12 chars, well under 300
+    fake_meta = MagicMock()
+    fake_meta.title = "Title only"
+
+    with patch("urllib.request.urlopen", return_value=resp), \
+            patch("trafilatura.fetch_url", return_value="<html>shell</html>"), \
+            patch("trafilatura.extract", return_value=short_md), \
+            patch("trafilatura.extract_metadata", return_value=fake_meta):
+        result = fetch_url_to_raw("https://spa.example.com/page", tmp_path)
+
+    assert result is not None
+    assert result.read_text() == short_md
+    err = capsys.readouterr().err
+    assert "[WARN]" in err
+    assert f"{len(short_md)} chars extracted" in err
+
+
+def test_fetch_html_aborts_when_trafilatura_extracts_nothing(tmp_path):
+    """trafilatura.extract returning None means the page is essentially
+    empty (JS-only, paywall HTML, etc.). We error out rather than save
+    an empty .md."""
+    sniff_head = b"<html><body></body></html>"
+    resp = _fake_response(body=sniff_head, headers={"Content-Type": "text/html"})
+
+    with patch("urllib.request.urlopen", return_value=resp), \
+            patch("trafilatura.fetch_url", return_value="<html>empty</html>"), \
+            patch("trafilatura.extract", return_value=None):
+        result = fetch_url_to_raw("https://js-only.example.com", tmp_path)
+
+    assert result is None
+    # Nothing may be written to raw/ when extraction yields no content.
+    assert list((tmp_path / "raw").iterdir()) == []
+
+
+def test_fetch_unsupported_content_type_rejected(tmp_path, capsys):
+    """JSON, images, and the like are refused with a clear message
+    instead of being saved as bogus `.html` or `.pdf` files."""
+    fake = _fake_response(
+        body=b'{"foo": "bar"}', headers={"Content-Type": "application/json"},
+    )
+
+    with patch("urllib.request.urlopen", return_value=fake):
+        saved = fetch_url_to_raw("https://api.example.com/data.json", tmp_path)
+
+    assert saved is None
+    stderr_text = capsys.readouterr().err
+    assert "Unsupported content type" in stderr_text
+
+
+def test_fetch_http_404_returns_none(tmp_path, capsys):
+    """An HTTP error status is reported on stderr and yields None."""
+    import urllib.error
+
+    missing_url = "https://x.com/missing"
+    not_found = urllib.error.HTTPError(missing_url, 404, "Not Found", {}, None)
+
+    with patch("urllib.request.urlopen", side_effect=not_found):
+        saved = fetch_url_to_raw(missing_url, tmp_path)
+
+    assert saved is None
+    stderr_text = capsys.readouterr().err
+    assert "HTTP 404" in stderr_text
+
+
+def test_fetch_network_error_returns_none(tmp_path, capsys):
+    """DNS failure or connection refused: a stderr message and None."""
+    import urllib.error
+
+    dns_failure = urllib.error.URLError("nodename nor servname provided")
+
+    with patch("urllib.request.urlopen", side_effect=dns_failure):
+        saved = fetch_url_to_raw("https://no-such-host.invalid/foo", tmp_path)
+
+    assert saved is None
+    stderr_text = capsys.readouterr().err
+    assert "Network error" in stderr_text
+
+
+# ---------------------------------------------------------------------------
+# Self-review fixes from PR #55 review pass
+# ---------------------------------------------------------------------------
+
+
+def test_unique_path_returns_target_when_free(tmp_path):
+    """A path that doesn't exist yet is handed back unchanged."""
+    wanted = tmp_path / "foo.pdf"
+    assert _unique_path(wanted) == wanted
+
+
+def test_unique_path_finds_next_free_slot(tmp_path):
+    """The counter goes on the stem, not the suffix, and probing
+    continues past taken names until a free one is found."""
+    for taken, content in (("foo.pdf", b"a"), ("foo_2.pdf", b"b")):
+        (tmp_path / taken).write_bytes(content)
+
+    assert _unique_path(tmp_path / "foo.pdf") == tmp_path / "foo_3.pdf"
+
+
+def test_unique_path_handles_no_suffix(tmp_path):
+    """An extension-less name simply gets the counter appended."""
+    existing = tmp_path / "README"
+    existing.write_text("x")
+    assert _unique_path(existing).name == "README_2"
+
+
+def test_fetch_pdf_picks_unique_name_when_target_exists(tmp_path):
+    """A download whose name is already taken in raw/ must not overwrite
+    the existing file: it is saved with `_2` appended instead."""
+    url = "https://mirror.example.com/paper.pdf"
+    first_bytes = b"%PDF-existing\nfirst"
+    second_bytes = b"%PDF-1.7\nsecond URL content"
+
+    raw_dir = tmp_path / "raw"
+    raw_dir.mkdir()
+    (raw_dir / "paper.pdf").write_bytes(first_bytes)
+
+    fake = _fake_response(
+        body=second_bytes, headers={"Content-Type": "application/pdf"},
+    )
+    # Real urllib reports the request URL from geturl() when there was
+    # no redirect; override the builder's default to mimic that.
+    fake.geturl = lambda: url
+
+    with patch("urllib.request.urlopen", return_value=fake):
+        saved = fetch_url_to_raw(url, tmp_path)
+
+    # The first file is untouched; the second landed in paper_2.pdf.
+    assert (raw_dir / "paper.pdf").read_bytes() == first_bytes
+    assert saved == raw_dir / "paper_2.pdf"
+    assert saved.read_bytes() == second_bytes
+
+
+def test_fetch_html_picks_unique_name_when_target_exists(tmp_path, capsys):
+    """Two blog posts both titled 'Introduction' must NOT collide. The
+    user-facing 'Saved: ...' echo must also reflect the renamed path —
+    otherwise the message lies about where the file actually went."""
+    raw_dir = tmp_path / "raw"
+    raw_dir.mkdir()
+    (raw_dir / "Introduction.md").write_text("first blog post body")
+
+    # The body starts with real markup, so the HTML decision comes from
+    # the body bytes as well as from the declared Content-Type.
+    resp = _fake_response(body=b"<html></html>", headers={"Content-Type": "text/html"})
+
+    second_md = "# Introduction\n\nA completely different second blog post body. " * 10
+    fake_meta = MagicMock()
+    fake_meta.title = "Introduction"
+
+    with patch("urllib.request.urlopen", return_value=resp), \
+            patch("trafilatura.fetch_url", return_value="<html>...</html>"), \
+            patch("trafilatura.extract", return_value=second_md), \
+            patch("trafilatura.extract_metadata", return_value=fake_meta):
+        result = fetch_url_to_raw("https://blog2.example.com/post", tmp_path)
+
+    assert (raw_dir / "Introduction.md").read_text() == "first blog post body"
+    assert result == raw_dir / "Introduction_2.md"
+    assert result.read_text() == second_md
+    out = capsys.readouterr().out
+    assert "Saved: raw/Introduction_2.md" in out
+
+
+def test_fetch_pdf_uses_post_redirect_url_for_filename(tmp_path):
+    """Name the saved PDF after the final URL, not the one the user typed.
+
+    urllib follows redirects (DOI → publisher CDN, short URLs, etc.).
+    When the response has no Content-Disposition header to override the
+    name, the filename must be derived from the post-redirect URL.
+    """
+    final_url = "https://publisher.example.com/articles/2024/great-paper.pdf"
+    pdf_resp = _fake_response(
+        body=b"%PDF-1.7\n...",
+        # Deliberately NO Content-Disposition header.
+        headers={"Content-Type": "application/pdf"},
+    )
+    # urllib's HTTPResponse.geturl() returns the post-redirect URL.
+    pdf_resp.geturl = lambda: final_url
+
+    with patch("urllib.request.urlopen", return_value=pdf_resp):
+        saved = fetch_url_to_raw("https://doi.org/10.1234/abc", tmp_path)
+
+    # The name is the redirected URL's basename, not "abc" from the input.
+    assert saved is not None
+    assert saved.name == "great-paper.pdf"
+
+
+def test_add_single_file_returns_added_on_success(tmp_path):
+    """``add_single_file`` returns ``"added"`` for a newly indexed file.
+
+    This is the success leg of the tri-state return contract; URL ingest
+    uses the value to decide whether to keep or unlink the file it just
+    downloaded.
+    """
+    from openkb.cli import add_single_file
+    from openkb.converter import ConvertResult
+
+    # Minimal KB scaffold.
+    state_dir = tmp_path / ".openkb"
+    state_dir.mkdir()
+    (state_dir / "config.yaml").write_text("model: gpt-4o-mini\n")
+    (state_dir / "hashes.json").write_text("{}")
+    (tmp_path / "raw").mkdir()
+    wiki = tmp_path / "wiki"
+    for section in ("summaries", "sources", "concepts"):
+        (wiki / section).mkdir(parents=True)
+    (wiki / "log.md").write_text("")
+
+    raw_doc = tmp_path / "raw" / "x.md"
+    raw_doc.write_text("# Hello")
+    converted = wiki / "sources" / "x.md"
+    converted.write_text("# Hello converted")
+
+    conversion = ConvertResult(
+        raw_path=raw_doc,
+        source_path=converted,
+        is_long_doc=False,
+        file_hash="cafe" * 16,
+    )
+
+    # convert_document hands back the canned result; asyncio.run is
+    # swapped for a mock so nothing asynchronous actually executes.
+    with patch("openkb.cli.convert_document", return_value=conversion), \
+         patch("openkb.cli.asyncio.run"):
+        outcome = add_single_file(raw_doc, tmp_path)
+
+    assert outcome == "added"
+
+
+def test_add_single_file_returns_skipped_on_dedup(tmp_path):
+    """``add_single_file`` returns ``"skipped"`` on a skipped conversion.
+
+    ``convert_document`` is patched to return ``ConvertResult(skipped=True)``.
+    """
+    from openkb.cli import add_single_file
+    from openkb.converter import ConvertResult
+
+    state_dir = tmp_path / ".openkb"
+    state_dir.mkdir()
+    (state_dir / "config.yaml").write_text("model: gpt-4o-mini\n")
+    (state_dir / "hashes.json").write_text("{}")
+    raw_doc = tmp_path / "raw" / "x.md"
+    raw_doc.parent.mkdir()
+    raw_doc.write_text("# Hello")
+
+    with patch(
+        "openkb.cli.convert_document",
+        return_value=ConvertResult(skipped=True),
+    ):
+        outcome = add_single_file(raw_doc, tmp_path)
+
+    assert outcome == "skipped"
+
+
+def test_add_single_file_returns_failed_on_pipeline_error(tmp_path):
+    """``add_single_file`` returns ``"failed"`` when the pipeline raises.
+
+    A pipeline failure (e.g. a transient LLM error during compilation)
+    has to be distinguishable from a dedup skip, so URL ingest can keep
+    the raw file for a retry instead of deleting it.
+    """
+    from openkb.cli import add_single_file
+    from openkb.converter import ConvertResult
+
+    state_dir = tmp_path / ".openkb"
+    state_dir.mkdir()
+    (state_dir / "config.yaml").write_text("model: gpt-4o-mini\n")
+    (state_dir / "hashes.json").write_text("{}")
+    (tmp_path / "raw").mkdir()
+    wiki = tmp_path / "wiki"
+    for section in ("summaries", "sources"):
+        (wiki / section).mkdir(parents=True)
+    (wiki / "log.md").write_text("")
+
+    raw_doc = tmp_path / "raw" / "x.md"
+    raw_doc.write_text("# Hello")
+    converted = wiki / "sources" / "x.md"
+    converted.write_text("# Hello")
+
+    conversion = ConvertResult(
+        raw_path=raw_doc,
+        source_path=converted,
+        is_long_doc=False,
+        file_hash="cafe" * 16,
+    )
+    llm_error = RuntimeError("LLM 503")
+
+    # Every asyncio.run call raises, so both compile attempts fail and the
+    # failure path is taken. time.sleep is mocked too — presumably to skip
+    # the wait between attempts (confirm against add_single_file).
+    with patch("openkb.cli.convert_document", return_value=conversion), \
+         patch("openkb.cli.asyncio.run", side_effect=llm_error), \
+         patch("openkb.cli.time.sleep"):
+        outcome = add_single_file(raw_doc, tmp_path)
+
+    assert outcome == "failed"
+
+
+def test_url_ingest_cleans_up_orphan_on_dedup_skip(tmp_path):
+    """End-to-end: a URL download that dedups away is removed from ``raw/``.
+
+    When the URL-fetched file is already in the registry,
+    ``add_single_file`` returns "skipped" and the CLI unlinks the file
+    from ``raw/`` so the user doesn't accumulate untracked duplicates.
+    """
+    from click.testing import CliRunner
+    from openkb.cli import cli
+    from openkb.converter import ConvertResult
+
+    # Minimal KB
+    kb_state = tmp_path / ".openkb"
+    kb_state.mkdir()
+    (kb_state / "config.yaml").write_text("model: gpt-4o-mini\n")
+    (kb_state / "hashes.json").write_text("{}")
+    (tmp_path / "raw").mkdir()
+
+    # Fake the URL fetch — write directly to where url_ingest would
+    fetched_path = tmp_path / "raw" / "paper.pdf"
+    fetched_path.write_bytes(b"%PDF-fake")
+
+    runner = CliRunner()
+    # fetch_url_to_raw is lazy-imported inside `add`, so patch it at the
+    # source module — that's where the `from ... import` resolves.
+    with patch("openkb.cli._find_kb_dir", return_value=tmp_path), \
+         patch("openkb.url_ingest.fetch_url_to_raw", return_value=fetched_path), \
+         patch("openkb.cli.convert_document",
+               return_value=ConvertResult(skipped=True)):
+        result = runner.invoke(cli, ["add", "https://example.com/paper.pdf"])
+
+    assert result.exit_code == 0, result.output
+    assert "[SKIP]" in result.output
+    # Orphan cleanup: the URL-fetched file must be gone from raw/.
+    assert not fetched_path.exists()
+
+
+def test_url_ingest_keeps_raw_file_on_pipeline_failure(tmp_path):
+    """A failed pipeline run must leave the downloaded file in ``raw/``.
+
+    This is the point of the tri-state return: when the pipeline fails
+    (e.g. an LLM timeout during compilation) the CLI must NOT delete the
+    download — the user can retry without re-downloading, and no data is
+    lost when indexing has already succeeded but compilation hasn't.
+    """
+    from click.testing import CliRunner
+    from openkb.cli import cli
+    from openkb.converter import ConvertResult
+
+    state_dir = tmp_path / ".openkb"
+    state_dir.mkdir()
+    (state_dir / "config.yaml").write_text("model: gpt-4o-mini\n")
+    (state_dir / "hashes.json").write_text("{}")
+    (tmp_path / "raw").mkdir()
+    wiki = tmp_path / "wiki"
+    for section in ("summaries", "sources"):
+        (wiki / section).mkdir(parents=True)
+    (wiki / "log.md").write_text("")
+
+    downloaded = tmp_path / "raw" / "paper.pdf"
+    downloaded.write_bytes(b"%PDF-fake")
+    converted = wiki / "sources" / "paper.md"
+    converted.write_text("# fake")
+
+    conversion = ConvertResult(
+        raw_path=downloaded,
+        source_path=converted,
+        is_long_doc=False,
+        file_hash="cafe" * 16,
+    )
+
+    cli_runner = CliRunner()
+    with patch("openkb.cli._find_kb_dir", return_value=tmp_path), \
+         patch("openkb.url_ingest.fetch_url_to_raw", return_value=downloaded), \
+         patch("openkb.cli.convert_document", return_value=conversion), \
+         patch("openkb.cli.asyncio.run", side_effect=RuntimeError("LLM 503")), \
+         patch("openkb.cli.time.sleep"):
+        result = cli_runner.invoke(cli, ["add", "https://example.com/paper.pdf"])
+
+    assert result.exit_code == 0, result.output
+    assert "[ERROR] Compilation failed" in result.output
+    # The download has to survive the failure so the user can retry.
+    assert downloaded.exists()