diff --git a/README.md b/README.md index cc8ad4e8..bdac0770 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,26 @@ To run this project in a Docker container, you'll need to pass your API keys as ``` +## 📡 Publish to understand-quickly (opt-in) + +Add `--publish` to land the generated tutorial in [`looptech-ai/understand-quickly`](https://github.com/looptech-ai/understand-quickly), a public registry of code-knowledge graphs that ships an MCP server. The flag emits a small `generic@1` JSON projection of the tutorial (abstractions/chapters as nodes, relationships as edges) at `<output_dir>/<project_name>/tutorial.json` with `metadata.{tool, tool_version, generated_at}` plus `commit` when a local git repo is available (i.e. for `--dir`; not always populated for remote `--repo` crawls). If `UNDERSTAND_QUICKLY_TOKEN` is set, it also fires a `repository_dispatch` so the registry resyncs the entry. + +```bash +python main.py --repo https://github.com/example/demo --publish +``` + +Without the token, only the local file is written. The drop-in CI step is the [`looptech-ai/uq-publish-action`](https://github.com/looptech-ai/uq-publish-action): + +```yaml +- uses: looptech-ai/uq-publish-action@v0.1.0 + with: + graph-path: 'output/<project_name>/tutorial.json' + format: 'generic@1' + token: ${{ secrets.UNDERSTAND_QUICKLY_TOKEN }} +``` + +Submitting via `--publish` is governed by the [Understand-Quickly Data License 1.0](https://github.com/looptech-ai/understand-quickly/blob/main/DATA-LICENSE.md). It is opt-in. + ## 💡 Development Tutorial - I built using [**Agentic Coding**](https://zacharyhuang.substack.com/p/agentic-coding-the-most-fun-way-to), the fastest development paradigm, where humans simply [design](docs/design.md) and agents [code](flow.py). 
diff --git a/main.py b/main.py index dbba523f..397e548d 100644 --- a/main.py +++ b/main.py @@ -56,6 +56,15 @@ def main(): parser.add_argument("--no-cache", action="store_true", help="Disable LLM response caching (default: caching enabled)") # Add max_abstraction_num parameter to control the number of abstractions parser.add_argument("--max-abstractions", type=int, default=10, help="Maximum number of abstractions to identify (default: 10)") + # Opt-in publish to the understand-quickly registry of code-knowledge graphs. + # https://github.com/looptech-ai/understand-quickly + parser.add_argument( + "--publish", + action="store_true", + help="Emit a generic@1 knowledge-graph projection of the tutorial and (if " + "UNDERSTAND_QUICKLY_TOKEN is set) dispatch it to the understand-quickly " + "registry. Opt-in; default behavior is unchanged.", + ) args = parser.parse_args() @@ -88,6 +97,9 @@ def main(): # Add max_abstraction_num parameter "max_abstraction_num": args.max_abstractions, + # Opt-in publish to understand-quickly (looptech-ai/understand-quickly). + "publish_to_uq": args.publish, + # Outputs will be populated by the nodes "files": [], "abstractions": [], diff --git a/nodes.py b/nodes.py index 0e3fa587..8bb85562 100644 --- a/nodes.py +++ b/nodes.py @@ -878,3 +878,31 @@ def exec(self, prep_res): def post(self, shared, prep_res, exec_res): shared["final_output_dir"] = exec_res # Store the output path print(f"\nTutorial generation complete! Files are in: {exec_res}") + + # Opt-in: emit a generic@1 knowledge-graph projection and (if a token is + # set) publish to the understand-quickly registry. Failures here never + # affect tutorial generation — the markdown output is already written. 
+ if shared.get("publish_to_uq"): + try: + from pathlib import Path + from utils.uq_publish import build_generic_graph, publish + + source_dir = Path(shared["local_dir"]).resolve() if shared.get("local_dir") else None + graph = build_generic_graph( + project_name=shared["project_name"], + abstractions=shared.get("abstractions", []), + chapter_order=shared.get("chapter_order", []), + relationships=shared.get("relationships", {}), + repo_url=shared.get("repo_url"), + source_dir=source_dir, + files_data=shared.get("files"), + ) + graph_path = Path(exec_res) / "tutorial.json" + publish( + graph, + graph_path, + repo_url=shared.get("repo_url"), + source_dir=source_dir, + ) + except Exception as exc: + print(f"[uq-publish] warning: {exc}") diff --git a/tests/test_uq_publish.py b/tests/test_uq_publish.py new file mode 100644 index 00000000..cf7830a5 --- /dev/null +++ b/tests/test_uq_publish.py @@ -0,0 +1,101 @@ +"""Tests for utils/uq_publish.py — opt-in understand-quickly publish.""" +from __future__ import annotations + +import json +import os +import sys +import unittest +from pathlib import Path +from unittest import mock + +# Add project root to path so `from utils.uq_publish import ...` works +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from utils.uq_publish import build_generic_graph, publish, TOKEN_ENV # noqa: E402 + + +# Mirrors the actual upstream shape: IdentifyAbstractions emits `files` as a +# list of integer indices into shared["files"] (the (path, content) tuples). 
+SAMPLE_ABSTRACTIONS = [ + {"name": "Flow", "description": "Pipeline orchestrator", "files": [0]}, + {"name": "Node", "description": "Unit of work", "files": [1]}, +] +SAMPLE_FILES_DATA = [ + ("flow.py", "class Flow:\n pass\n"), + ("nodes.py", "class Node:\n pass\n"), +] +SAMPLE_RELATIONSHIPS = { + "summary": "PocketFlow runs nodes in a flow.", + "details": [{"from": 0, "to": 1, "label": "contains"}], +} + + +class BuildGenericGraphTests(unittest.TestCase): + def test_emits_generic_at_1_with_metadata(self) -> None: + graph = build_generic_graph( + project_name="demo", + abstractions=SAMPLE_ABSTRACTIONS, + chapter_order=[0, 1], + relationships=SAMPLE_RELATIONSHIPS, + repo_url="https://github.com/example/demo", + source_dir=None, + files_data=SAMPLE_FILES_DATA, + ) + self.assertEqual(graph["schema"], "generic@1") + md = graph["metadata"] + self.assertEqual(md["tool"], "pocketflow-tutorial-codebase-knowledge") + self.assertEqual(md["project_name"], "demo") + self.assertTrue(md["generated_at"].endswith("Z")) + # Two abstractions -> two nodes; one relationship + one chapter-order edge. + self.assertEqual(len(graph["nodes"]), 2) + self.assertEqual(len(graph["edges"]), 2) + kinds = sorted(e["kind"] for e in graph["edges"]) + self.assertEqual(kinds, ["next_chapter", "relationship"]) + # File indices resolved to repo-relative paths via files_data. + self.assertEqual(graph["nodes"][0]["files"], ["flow.py"]) + self.assertEqual(graph["nodes"][1]["files"], ["nodes.py"]) + # chapter_index precomputed from chapter_order. 
+ self.assertEqual(graph["nodes"][0]["chapter_index"], 0) + self.assertEqual(graph["nodes"][1]["chapter_index"], 1) + + def test_no_files_data_exposes_indices_under_renamed_field(self) -> None: + graph = build_generic_graph( + project_name="demo", + abstractions=SAMPLE_ABSTRACTIONS, + chapter_order=[0, 1], + relationships=SAMPLE_RELATIONSHIPS, + repo_url=None, + source_dir=None, + ) + # Without files_data, expose integer indices as `file_indices` (not + # `files`) so downstream consumers know they're not paths. + self.assertNotIn("files", graph["nodes"][0]) + self.assertEqual(graph["nodes"][0]["file_indices"], [0]) + + +class PublishTests(unittest.TestCase): + def test_no_token_writes_file_and_skips_dispatch(self) -> None: + graph = build_generic_graph( + project_name="demo", + abstractions=SAMPLE_ABSTRACTIONS, + chapter_order=[0, 1], + relationships=SAMPLE_RELATIONSHIPS, + repo_url=None, + source_dir=None, + ) + env = {k: v for k, v in os.environ.items() if k != TOKEN_ENV} + with mock.patch.dict(os.environ, env, clear=True): + import tempfile + with tempfile.TemporaryDirectory() as tmp: + out = Path(tmp) / "tutorial.json" + result = publish(graph, out, source_dir=Path(tmp)) + self.assertFalse(result["dispatched"]) + self.assertTrue(out.exists()) + data = json.loads(out.read_text()) + self.assertEqual(data["schema"], "generic@1") + self.assertEqual(data["metadata"]["tool"], + "pocketflow-tutorial-codebase-knowledge") + + +if __name__ == "__main__": + unittest.main() diff --git a/utils/uq_publish.py b/utils/uq_publish.py new file mode 100644 index 00000000..2a69511f --- /dev/null +++ b/utils/uq_publish.py @@ -0,0 +1,247 @@ +"""Opt-in understand-quickly registry publish for PocketFlow-Tutorial-Codebase-Knowledge. + +Emits a small `generic@1` knowledge-graph projection of the generated tutorial +(nodes = abstractions/chapters, edges = relationships + chapter ordering) and, +if a token is set, fires a `repository_dispatch` at the registry. 
+ +Stdlib-only — no new dependencies. + +Spec: https://github.com/looptech-ai/understand-quickly/blob/main/docs/spec/code-graph-protocol.md +""" +from __future__ import annotations + +import datetime as _dt +import json +import os +import subprocess # nosec B404 — fixed argv, no shell +import sys +import urllib.error +import urllib.request +from pathlib import Path +from typing import Any + +TOOL_NAME = "pocketflow-tutorial-codebase-knowledge" +TOOL_VERSION = "0.1.0" +REGISTRY_REPO = "looptech-ai/understand-quickly" +TOKEN_ENV = "UNDERSTAND_QUICKLY_TOKEN" +DISPATCH_EVENT_TYPE = "uq-publish" + + +def _git(args: list[str], cwd: Path) -> str | None: + try: + r = subprocess.run( # nosec B603 + ["git", *args], cwd=str(cwd), capture_output=True, text=True, + check=False, timeout=5, + ) + except (FileNotFoundError, subprocess.SubprocessError): + return None + return r.stdout.strip() if r.returncode == 0 else None + + +def _git_head(repo_dir: Path) -> str | None: + sha = _git(["rev-parse", "HEAD"], repo_dir) + return sha if sha and len(sha) == 40 else None + + +def _detect_repo_slug(repo_dir: Path, repo_url: str | None = None) -> str | None: + """Best-effort `owner/repo` slug — honours `repo_url` first (PocketFlow + typically tutorialises a remote repo, not the cwd).""" + candidates: list[str] = [] + if repo_url: + candidates.append(repo_url) + origin = _git(["remote", "get-url", "origin"], repo_dir) + if origin: + candidates.append(origin) + for url in candidates: + for prefix in ("https://github.com/", "git@github.com:"): + if url.startswith(prefix): + slug = url[len(prefix):].removesuffix(".git") + if slug and "/" in slug: + return slug + return None + + +def build_generic_graph( + *, + project_name: str, + abstractions: list[dict], + chapter_order: list[int], + relationships: dict, + repo_url: str | None, + source_dir: Path | None, + files_data: list[tuple[str, str]] | None = None, +) -> dict: + """Project the tutorial onto a `generic@1` node/edge graph. 
+ + Each abstraction becomes a node (kind=abstraction). The edges capture the + `relationships.details` produced by AnalyzeRelationships and chapter ordering. + + `abstr["files"]` is a list of integer indices into `files_data` (the + `(path, content)` tuples produced upstream). When `files_data` is supplied + we resolve indices to repo-relative paths in the exported `files` field; + otherwise we expose them as `file_indices` so consumers know what to expect. + """ + # O(1) lookup for chapter_index to avoid O(n^2) scans on large tutorials. + chapter_index_by_abstr = {abstr_idx: pos for pos, abstr_idx in enumerate(chapter_order)} + + nodes: list[dict] = [] + for i, abstr in enumerate(abstractions): + raw_files = list(abstr.get("files", [])) + node: dict[str, Any] = { + "id": f"A{i}", + "label": abstr.get("name", f"abstraction {i}"), + "kind": "abstraction", + "description": abstr.get("description", ""), + "chapter_index": chapter_index_by_abstr.get(i), + } + if files_data is not None: + resolved: list[str] = [] + for idx in raw_files: + if isinstance(idx, int) and 0 <= idx < len(files_data): + resolved.append(files_data[idx][0]) + elif isinstance(idx, str): + resolved.append(idx) + node["files"] = resolved + else: + # Indices verbatim — name the field for what they actually are. + node["file_indices"] = raw_files + nodes.append(node) + edges: list[dict] = [] + for rel in (relationships or {}).get("details", []): + edges.append({ + "source": f"A{rel['from']}", + "target": f"A{rel['to']}", + "label": rel.get("label", ""), + "kind": "relationship", + }) + # Chapter-order edges (A_i -> A_{i+1}) for prerequisite-style traversal. 
+ for prev, curr in zip(chapter_order, chapter_order[1:]): + edges.append({ + "source": f"A{prev}", + "target": f"A{curr}", + "kind": "next_chapter", + }) + + commit = _git_head(source_dir) if source_dir else None + metadata: dict[str, Any] = { + "tool": TOOL_NAME, + "tool_version": TOOL_VERSION, + "generated_at": _dt.datetime.now(_dt.timezone.utc).strftime( + "%Y-%m-%dT%H:%M:%SZ" + ), + "project_name": project_name, + "summary": (relationships or {}).get("summary", ""), + } + if commit: + metadata["commit"] = commit + if repo_url: + metadata["repo_url"] = repo_url + return { + "schema": "generic@1", + "metadata": metadata, + "nodes": nodes, + "edges": edges, + } + + +def write_graph(graph: dict, output_path: Path) -> Path: + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(graph, indent=2), encoding="utf-8") + return output_path + + +def dispatch(repo_slug: str, *, token: str, schema: str, graph_path: str, + commit: str | None = None, timeout: float = 10.0) -> int: + payload = { + "event_type": DISPATCH_EVENT_TYPE, + "client_payload": { + "repo": repo_slug, "schema": schema, "graph_path": graph_path, + "tool": TOOL_NAME, "tool_version": TOOL_VERSION, + **({"commit": commit} if commit else {}), + }, + } + req = urllib.request.Request( # nosec B310 — fixed https URL + f"https://api.github.com/repos/{REGISTRY_REPO}/dispatches", + data=json.dumps(payload).encode("utf-8"), + headers={ + "Accept": "application/vnd.github+json", + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + "User-Agent": f"{TOOL_NAME}/{TOOL_VERSION}", + "X-GitHub-Api-Version": "2022-11-28", + }, + method="POST", + ) + with urllib.request.urlopen(req, timeout=timeout) as resp: # nosec B310 + return resp.status + + +def publish( + graph: dict, + output_path: Path, + *, + repo_url: str | None = None, + source_dir: Path | None = None, + token_env: str = TOKEN_ENV, + log: Any = None, +) -> dict[str, Any]: + """Write the graph and (if 
token set) dispatch. Never raises on network errors.""" + log = log or sys.stderr + write_graph(graph, output_path) + metadata = graph.get("metadata", {}) + + token = os.environ.get(token_env, "").strip() + if not token: + print( + f"[uq-publish] wrote {output_path}; ${token_env} unset — " + f"skipping registry dispatch (see " + f"https://github.com/looptech-ai/uq-publish-action for CI use).", + file=log, + ) + return {"dispatched": False, "metadata": metadata} + + repo_slug = _detect_repo_slug(source_dir or Path.cwd(), repo_url) + if not repo_slug: + print("[uq-publish] could not detect github repo slug — skipping dispatch.", + file=log) + return {"dispatched": False, "metadata": metadata} + + # Registry fetches via raw.githubusercontent.com, so it needs a repo-relative + # POSIX path. If the graph isn't inside the repo (or no repo dir is known), + # fall back to the basename rather than dispatching an absolute filesystem path. + output_path = Path(output_path).resolve() + rel_graph_path: str | None = None + if source_dir is not None: + try: + rel_graph_path = output_path.relative_to(Path(source_dir).resolve()).as_posix() + except ValueError: + rel_graph_path = None + if rel_graph_path is None: + print( + f"[uq-publish] {output_path} is not inside source_dir; " + f"skipping dispatch (registry can only fetch paths inside the repo).", + file=log, + ) + return {"dispatched": False, "metadata": metadata, + "error": "graph_path outside source_dir"} + + try: + status = dispatch( + repo_slug, token=token, schema=graph.get("schema", "generic@1"), + graph_path=rel_graph_path, commit=metadata.get("commit"), + ) + except urllib.error.HTTPError as exc: + if exc.code == 404: + print(f"[uq-publish] {repo_slug} not in registry — register once with: " + "npx @understand-quickly/cli add", file=log) + return {"dispatched": False, "metadata": metadata, "registered": False} + print(f"[uq-publish] dispatch failed ({exc.code}); local file written.", + file=log) + return {"dispatched": 
False, "metadata": metadata, "error": str(exc)} + except (urllib.error.URLError, OSError) as exc: + print(f"[uq-publish] dispatch failed ({exc}); local file written.", file=log) + return {"dispatched": False, "metadata": metadata, "error": str(exc)} + + print(f"[uq-publish] dispatched to {REGISTRY_REPO} (HTTP {status}) for " + f"{repo_slug}.", file=log) + return {"dispatched": True, "metadata": metadata, "status": status}