Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ openkb init

# 3. Add documents
openkb add paper.pdf
openkb add ~/papers/ # Add a whole directory
openkb add ~/papers/ # Add a whole directory
openkb add https://arxiv.org/pdf/2509.11420 # Or fetch from a URL

# 4. Ask a question
openkb query "What are the main findings?"
Expand Down Expand Up @@ -148,7 +149,7 @@ A single source might touch 10-15 wiki pages. Knowledge accumulates: each docume
| Command | Description |
|---|---|
| `openkb init` | Initialize a new knowledge base (interactive) |
| <code>openkb&nbsp;add&nbsp;&lt;file_or_dir&gt;</code> | Add documents and compile to wiki |
| <code>openkb&nbsp;add&nbsp;&lt;file_or_dir_or_URL&gt;</code> | Add documents and compile to wiki. URL ingest auto-detects PDF (saved as `.pdf` → PageIndex / markitdown) vs HTML (trafilatura main-content extraction → `.md`) |
| <code>openkb&nbsp;remove&nbsp;&lt;doc&gt;</code> | Remove a document and clean up its wiki pages, images, registry, and PageIndex state (use `--dry-run` to preview, `--keep-raw` / `--keep-empty-concepts` to retain artifacts) |
| <code>openkb&nbsp;query&nbsp;"question"</code> | Ask a question over the knowledge base (use `--save` to save the answer to `wiki/explorations/`) |
| `openkb chat` | Start an interactive multi-turn chat (use `--resume`, `--list`, `--delete` to manage sessions) |
Expand Down
50 changes: 43 additions & 7 deletions openkb/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import sys
import time
from pathlib import Path
from typing import Literal

import os

Expand Down Expand Up @@ -130,14 +131,22 @@ def _find_kb_dir(override: Path | None = None) -> Path | None:
return None


def add_single_file(file_path: Path, kb_dir: Path) -> None:
def add_single_file(file_path: Path, kb_dir: Path) -> Literal["added", "skipped", "failed"]:
"""Convert, index, and compile a single document into the knowledge base.

Steps:
1. Load config to get the model name.
2. Convert the document (hash-check; skip if already known).
3. If long doc: run PageIndex then compile_long_doc.
4. Else: compile_short_doc.

Returns:
``"added"`` on full success, ``"skipped"`` when the file's hash
is already in the registry (dedup), or ``"failed"`` when any
pipeline stage raised. URL-ingest distinguishes these so it can
unlink the just-downloaded raw file on dedup (it would otherwise
be an orphan) while preserving it on failure so the user can
retry without re-downloading.
"""
from openkb.agent.compiler import compile_long_doc, compile_short_doc
from openkb.state import HashRegistry
Expand All @@ -156,11 +165,11 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
except Exception as exc:
click.echo(f" [ERROR] Conversion failed: {exc}")
logger.debug("Conversion traceback:", exc_info=True)
return
return "failed"

if result.skipped:
click.echo(f" [SKIP] Already in knowledge base: {file_path.name}")
return
return "skipped"

doc_name = file_path.stem
index_result = None # populated only on the long-doc branch
Expand All @@ -174,7 +183,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
except Exception as exc:
click.echo(f" [ERROR] Indexing failed: {exc}")
logger.debug("Indexing traceback:", exc_info=True)
return
return "failed"

summary_path = kb_dir / "wiki" / "summaries" / f"{doc_name}.md"
click.echo(f" Compiling long doc (doc_id={index_result.doc_id})...")
Expand All @@ -192,7 +201,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
else:
click.echo(f" [ERROR] Compilation failed: {exc}")
logger.debug("Compilation traceback:", exc_info=True)
return
return "failed"
else:
click.echo(f" Compiling short doc...")
for attempt in range(2):
Expand All @@ -206,7 +215,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:
else:
click.echo(f" [ERROR] Compilation failed: {exc}")
logger.debug("Compilation traceback:", exc_info=True)
return
return "failed"

# Register hash only after successful compilation
if result.file_hash:
Expand All @@ -225,6 +234,7 @@ def add_single_file(file_path: Path, kb_dir: Path) -> None:

append_log(kb_dir / "wiki", "ingest", file_path.name)
click.echo(f" [OK] {file_path.name} added to knowledge base.")
return "added"


# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -395,12 +405,38 @@ def init(language):
@click.argument("path")
@click.pass_context
def add(ctx, path):
"""Add a document or directory of documents at PATH to the knowledge base."""
"""Add a document or directory of documents at PATH to the knowledge base.

PATH may be a local file, a local directory (which is walked
recursively for supported extensions), or an http(s) URL. URLs are
fetched into ``raw/`` first: PDF responses (by Content-Type and
magic-byte sniff) are saved as ``.pdf``; HTML responses are run
through trafilatura's main-content extractor and saved as ``.md``.
"""
kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override"))
if kb_dir is None:
click.echo("No knowledge base found. Run `openkb init` first.")
return

# URL ingest: download into raw/ first, then call add_single_file
# explicitly so we can clean up the just-downloaded file if it
# turns out to be a duplicate (registry already has its hash).
# Without this, re-adding the same URL leaves an orphan in raw/
# that the registry can't reach via openkb remove.
from openkb.url_ingest import looks_like_url, fetch_url_to_raw
if looks_like_url(path):
fetched = fetch_url_to_raw(path, kb_dir)
if fetched is None:
return
outcome = add_single_file(fetched, kb_dir)
# Only clean up on dedup-skip. On "failed" we keep the file so
# the user can retry (e.g. transient LLM error during compile)
# without re-downloading — and so they don't lose data when
# indexing has already succeeded but compilation didn't.
if outcome == "skipped":
fetched.unlink(missing_ok=True)
return

target = Path(path)
if not target.exists():
click.echo(f"Path does not exist: {path}")
Expand Down
Loading