From 5c4ad357d22790a881a5e29b5848881b4111c5c0 Mon Sep 17 00:00:00 2001 From: Grivn Date: Wed, 27 May 2026 14:29:42 +0000 Subject: [PATCH] feat(import): add mnemon import command with versioned draft schema Add a versioned memory draft schema and import command so historical chat exports can be converted into Mnemon insights with normal deduplication, graph construction, embedding, lifecycle scoring, and pruning behavior. The import path disables real-time temporal edge generation, repairs affected source timelines after all backdated insights land, refreshes effective importance after explicit edges are inserted, and documents the JSON schema plus LLM extraction prompt. Validation includes importdraft unit tests and command integration coverage for temporal repair and EI refresh. --- cmd/import.go | 427 +++++++++++++++++++++++++++++ cmd/import_test.go | 247 +++++++++++++++++ docs/IMPORT.md | 240 ++++++++++++++++ docs/USAGE.md | 5 + internal/graph/engine.go | 35 ++- internal/importdraft/draft.go | 189 +++++++++++++ internal/importdraft/draft_test.go | 115 ++++++++ internal/store/edge.go | 8 + internal/store/node.go | 13 + 9 files changed, 1277 insertions(+), 2 deletions(-) create mode 100644 cmd/import.go create mode 100644 cmd/import_test.go create mode 100644 docs/IMPORT.md create mode 100644 internal/importdraft/draft.go create mode 100644 internal/importdraft/draft_test.go diff --git a/cmd/import.go b/cmd/import.go new file mode 100644 index 0000000..ce89202 --- /dev/null +++ b/cmd/import.go @@ -0,0 +1,427 @@ +package cmd + +import ( + "encoding/json" + "fmt" + "os" + "time" + + "github.com/google/uuid" + "github.com/mnemon-dev/mnemon/internal/embed" + "github.com/mnemon-dev/mnemon/internal/graph" + "github.com/mnemon-dev/mnemon/internal/importdraft" + "github.com/mnemon-dev/mnemon/internal/model" + "github.com/mnemon-dev/mnemon/internal/search" + "github.com/mnemon-dev/mnemon/internal/store" + "github.com/spf13/cobra" +) + +var ( + importNoDiff bool + importDryRun bool +) + +var importCmd = &cobra.Command{ + Use: "import ", + Short: "Import a memory draft file", + Long: `Import insights from a memory draft JSON file (schema_version: "1"). + +Each insight passes through Mnemon's normal write path: deduplication, +graph edge construction, embeddings, and lifecycle scoring are all applied +automatically. + +The draft format and a reference LLM prompt for generating it from chat +exports are documented in docs/IMPORT.md.`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + draft, err := importdraft.Load(args[0]) + if err != nil { + return err + } + if err := draft.Validate(); err != nil { + return fmt.Errorf("invalid draft: %w", err) + } + + if importDryRun { + fmt.Printf("Dry run: %d insights, %d explicit edges — validation passed.\n", + len(draft.Insights), len(draft.Edges)) + return nil + } + + db, err := openDB() + if err != nil { + return fmt.Errorf("open database: %w", err) + } + defer db.Close() + + ec := embed.NewClientWithModel(resolveEmbedModel()) + + // Build embed cache once for all diff and graph operations. + var embedCache graph.EmbedCache + if ec.Available() { + dbEmbeds, err := db.GetAllEmbeddings() + if err == nil { + embedCache = make(graph.EmbedCache, len(dbEmbeds)) + for _, e := range dbEmbeds { + if v := embed.DeserializeVector(e.Embedding); v != nil { + embedCache[e.ID] = v + } + } + } + } + + // imported maps draft index → assigned insight ID (for explicit edge resolution). + imported := make(map[int]string, len(draft.Insights)) + importedIDs := make(map[string]bool, len(draft.Insights)) + importedSources := make(map[string]bool) + refreshIDs := make(map[string]bool) + results := make([]importResult, 0, len(draft.Insights)) + + for idx, di := range draft.Insights { + cat := model.Category(di.Category) + if cat == "" { + cat = model.CategoryGeneral + } + imp := di.Importance + if imp == 0 { + imp = 3 + } + tags := di.Tags + if tags == nil { + tags = []string{} + } + entities := di.Entities + if entities == nil { + entities = []string{} + } + + var createdAt time.Time + if di.CreatedAt != "" { + if t, err := time.Parse(time.RFC3339, di.CreatedAt); err == nil { + createdAt = t.UTC() + } + } + if createdAt.IsZero() { + createdAt = time.Now().UTC() + } + + insight := &model.Insight{ + ID: uuid.New().String(), + Content: di.Content, + Category: cat, + Importance: imp, + Tags: tags, + Entities: entities, + Source: draft.ResolvedSource(idx), + CreatedAt: createdAt, + UpdatedAt: createdAt, + } + + // Compute embedding before acquiring the DB lock. + var embeddingBlob []byte + var embeddingVec []float64 + if ec.Available() { + if vec, err := ec.Embed(insight.Content); err == nil { + embeddingVec = vec + embeddingBlob = embed.SerializeVector(vec) + } + } + + var action string + var replacedID string + + if importNoDiff { + action = "added" + } else { + allInsights, err := db.GetAllActiveInsights() + if err != nil { + results = append(results, importResult{Index: idx, ID: insight.ID, Content: insight.Content, Error: err.Error()}) + continue + } + opts := search.DiffOptions{Limit: 5, NewEmbedding: embeddingVec} + if embedCache != nil { + opts.ExistingEmbed = make([]search.EmbeddedItem, 0, len(embedCache)) + for id, v := range embedCache { + opts.ExistingEmbed = append(opts.ExistingEmbed, search.EmbeddedItem{ID: id, Embedding: v}) + } + } + result := search.Diff(allInsights, insight.Content, opts) + switch result.Suggestion { + case search.DiffDuplicate: + action = "skipped" + if len(result.Matches) > 0 { + replacedID = result.Matches[0].ID + } + case search.DiffConflict, search.DiffUpdate: + action = "updated" + if len(result.Matches) > 0 { + replacedID = result.Matches[0].ID + } + default: + action = "added" + } + } + + if action == "skipped" { + db.LogOp("import-skip", insight.ID, fmt.Sprintf("duplicate of %s", replacedID)) + if replacedID != "" { + imported[idx] = replacedID + } else { + imported[idx] = insight.ID + } + results = append(results, importResult{Index: idx, ID: imported[idx], Content: insight.Content, Action: action}) + continue + } + + var writeErr error + err = db.InTransaction(func() error { + if action == "updated" && replacedID != "" { + if err := db.SoftDeleteInsight(replacedID); err != nil { + fmt.Fprintf(os.Stderr, "warning: soft-delete %s: %v\n", replacedID, err) + } else { + db.LogOp("import-replace", replacedID, fmt.Sprintf("replaced by %s", insight.ID)) + delete(embedCache, replacedID) + } + } + if err := db.InsertInsight(insight); err != nil { + return fmt.Errorf("insert insight: %w", err) + } + if embeddingBlob != nil { + if err := db.UpdateEmbedding(insight.ID, embeddingBlob); err != nil { + return fmt.Errorf("update embedding: %w", err) + } + if embedCache != nil { + embedCache[insight.ID] = embeddingVec + } + } + engine := graph.NewEngineWithOptions(db, embedCache, graph.EngineOptions{ + EntityMode: graph.EntityModeMerge, + TemporalMode: graph.TemporalDisabled, + }) + engine.OnInsightCreated(insight) + if len(insight.Entities) > 0 { + _ = db.UpdateEntities(insight.ID, insight.Entities) + } + if _, err := db.RefreshEffectiveImportance(insight.ID); err != nil { + fmt.Fprintf(os.Stderr, "warning: refresh EI for %s: %v\n", insight.ID, err) + } + db.LogOp("import", insight.ID, insight.Content) + return nil + }) + if err != nil { + writeErr = err + embedCache = nil + } + + if writeErr != nil { + results = append(results, importResult{Index: idx, ID: insight.ID, Content: insight.Content, Error: writeErr.Error()}) + continue + } + + imported[idx] = insight.ID + importedIDs[insight.ID] = true + importedSources[insight.Source] = true + refreshIDs[insight.ID] = true + results = append(results, importResult{Index: idx, ID: insight.ID, Content: insight.Content, Action: action}) + } + + edgesInserted := 0 + temporalEdgesRepaired := 0 + pruned := 0 + if err := db.InTransaction(func() error { + // Insert explicit edges for successfully imported insights. + for _, de := range draft.Edges { + srcID, srcOK := imported[de.SourceIndex] + tgtID, tgtOK := imported[de.TargetIndex] + if !srcOK || !tgtOK { + continue + } + w := de.Weight + if w == 0 { + w = 0.5 + } + meta := map[string]string{} + if de.Reason != "" { + meta["reason"] = de.Reason + } + edge := &model.Edge{ + SourceID: srcID, + TargetID: tgtID, + EdgeType: model.EdgeType(de.EdgeType), + Weight: w, + Metadata: meta, + CreatedAt: time.Now().UTC(), + } + if err := db.InsertEdge(edge); err != nil { + fmt.Fprintf(os.Stderr, "warning: insert explicit edge %d→%d: %v\n", de.SourceIndex, de.TargetIndex, err) + continue + } + edgesInserted++ + refreshIDs[srcID] = true + refreshIDs[tgtID] = true + } + + repaired, touched, err := repairImportedTemporalEdges(db, importedSources, importedIDs) + if err != nil { + return err + } + temporalEdgesRepaired = repaired + for id := range touched { + refreshIDs[id] = true + } + + for id := range refreshIDs { + if _, err := db.RefreshEffectiveImportance(id); err != nil { + fmt.Fprintf(os.Stderr, "warning: refresh EI for %s: %v\n", id, err) + } + } + + var pruneErr error + pruned, pruneErr = db.AutoPrune(store.MaxInsights, nil) + return pruneErr + }); err != nil { + return fmt.Errorf("finalize import graph: %w", err) + } + + _ = temporalEdgesRepaired // computed internally; not surfaced in default output + summary := map[string]interface{}{ + "imported": countAction(results, "added"), + "updated": countAction(results, "updated"), + "skipped": countAction(results, "skipped"), + "errors": countErrors(results), + "edges_inserted": edgesInserted, + "auto_pruned": pruned, + "results": results, + } + enc := json.NewEncoder(os.Stdout) + enc.SetIndent("", " ") + return enc.Encode(summary) + }, +} + +func repairImportedTemporalEdges(db *store.DB, sources map[string]bool, importedIDs map[string]bool) (int, map[string]bool, error) { + touched := make(map[string]bool) + if len(importedIDs) == 0 { + return 0, touched, nil + } + + inserted := 0 + for source := range sources { + timeline, err := db.GetActiveInsightsBySourceOrdered(source) + if err != nil { + return inserted, touched, fmt.Errorf("load temporal timeline for source %q: %w", source, err) + } + if len(timeline) == 0 { + continue + } + + for idx, insight := range timeline { + if !importedIDs[insight.ID] { + continue + } + touched[insight.ID] = true + + prevExisting := nearestNonImportedBefore(timeline, importedIDs, idx) + nextExisting := nearestNonImportedAfter(timeline, importedIDs, idx) + if prevExisting != nil && nextExisting != nil { + if err := db.DeleteEdge(prevExisting.ID, nextExisting.ID, model.EdgeTemporal); err != nil { + return inserted, touched, fmt.Errorf("delete temporal edge %s→%s: %w", prevExisting.ID, nextExisting.ID, err) + } + if err := db.DeleteEdge(nextExisting.ID, prevExisting.ID, model.EdgeTemporal); err != nil { + return inserted, touched, fmt.Errorf("delete temporal edge %s→%s: %w", nextExisting.ID, prevExisting.ID, err) + } + touched[prevExisting.ID] = true + touched[nextExisting.ID] = true + } + } + + now := time.Now().UTC() + for idx := 0; idx < len(timeline)-1; idx++ { + prev := timeline[idx] + next := timeline[idx+1] + if !importedIDs[prev.ID] && !importedIDs[next.ID] { + continue + } + + if err := db.InsertEdge(&model.Edge{ + SourceID: prev.ID, + TargetID: next.ID, + EdgeType: model.EdgeTemporal, + Weight: 1.0, + Metadata: map[string]string{"sub_type": "backbone", "direction": "precedes"}, + CreatedAt: now, + }); err != nil { + return inserted, touched, fmt.Errorf("insert temporal edge %s→%s: %w", prev.ID, next.ID, err) + } + inserted++ + + if err := db.InsertEdge(&model.Edge{ + SourceID: next.ID, + TargetID: prev.ID, + EdgeType: model.EdgeTemporal, + Weight: 1.0, + Metadata: map[string]string{"sub_type": "backbone", "direction": "succeeds"}, + CreatedAt: now, + }); err != nil { + return inserted, touched, fmt.Errorf("insert temporal edge %s→%s: %w", next.ID, prev.ID, err) + } + inserted++ + touched[prev.ID] = true + touched[next.ID] = true + } + } + + return inserted, touched, nil +} + +func nearestNonImportedBefore(timeline []*model.Insight, importedIDs map[string]bool, idx int) *model.Insight { + for i := idx - 1; i >= 0; i-- { + if !importedIDs[timeline[i].ID] { + return timeline[i] + } + } + return nil +} + +func nearestNonImportedAfter(timeline []*model.Insight, importedIDs map[string]bool, idx int) *model.Insight { + for i := idx + 1; i < len(timeline); i++ { + if !importedIDs[timeline[i].ID] { + return timeline[i] + } + } + return nil +} + +func countAction(results []importResult, action string) int { + n := 0 + for _, r := range results { + if r.Action == action { + n++ + } + } + return n +} + +func countErrors(results []importResult) int { + n := 0 + for _, r := range results { + if r.Error != "" { + n++ + } + } + return n +} + +type importResult struct { + Index int `json:"index"` + ID string `json:"id"` + Content string `json:"content"` + Action string `json:"action"` + Error string `json:"error,omitempty"` +} + +func init() { + importCmd.Flags().BoolVar(&importNoDiff, "no-diff", false, "skip deduplication; insert all insights as new") + importCmd.Flags().BoolVar(&importDryRun, "dry-run", false, "validate the draft file without writing to the database") + rootCmd.AddCommand(importCmd) +} diff --git a/cmd/import_test.go b/cmd/import_test.go new file mode 100644 index 0000000..21a978b --- /dev/null +++ b/cmd/import_test.go @@ -0,0 +1,247 @@ +package cmd + +import ( + "io" + "os" + "path/filepath" + "testing" + "time" + + "github.com/mnemon-dev/mnemon/internal/model" + "github.com/mnemon-dev/mnemon/internal/store" +) + +func TestImportRepairsBackdatedTemporalBackbone(t *testing.T) { + t.Setenv("MNEMON_EMBED_ENDPOINT", "http://127.0.0.1:1") + + oldDataDir, oldStoreName, oldReadOnly := dataDir, storeName, readOnly + oldImportNoDiff, oldImportDryRun := importNoDiff, importDryRun + t.Cleanup(func() { + dataDir, storeName, readOnly = oldDataDir, oldStoreName, oldReadOnly + importNoDiff, importDryRun = oldImportNoDiff, oldImportDryRun + }) + + dataDir = t.TempDir() + storeName = "" + readOnly = false + importNoDiff = true + importDryRun = false + + db, err := store.Open(store.StoreDir(dataDir, store.DefaultStoreName)) + if err != nil { + t.Fatalf("open store: %v", err) + } + insertTestInsight(t, db, "old-2023", "older context", "chat", "2023-01-01T00:00:00Z") + insertTestInsight(t, db, "old-2025", "newer context", "chat", "2025-01-01T00:00:00Z") + now := time.Now().UTC() + if err := db.InsertEdge(&model.Edge{ + SourceID: "old-2023", + TargetID: "old-2025", + EdgeType: model.EdgeTemporal, + Weight: 1.0, + Metadata: map[string]string{"sub_type": "backbone", "direction": "precedes"}, + CreatedAt: now, + }); err != nil { + t.Fatalf("insert old forward edge: %v", err) + } + if err := db.InsertEdge(&model.Edge{ + SourceID: "old-2025", + TargetID: "old-2023", + EdgeType: model.EdgeTemporal, + Weight: 1.0, + Metadata: map[string]string{"sub_type": "backbone", "direction": "succeeds"}, + CreatedAt: now, + }); err != nil { + t.Fatalf("insert old reverse edge: %v", err) + } + if err := db.Close(); err != nil { + t.Fatalf("close seed db: %v", err) + } + + draftPath := filepath.Join(t.TempDir(), "memory_draft.json") + draft := `{ + "schema_version": "1", + "source": "chat", + "insights": [ + { + "content": "imported middle context", + "category": "context", + "importance": 3, + "created_at": "2024-01-01T00:00:00Z" + } + ] +}` + if err := os.WriteFile(draftPath, []byte(draft), 0o644); err != nil { + t.Fatalf("write draft: %v", err) + } + + output := captureStdout(t, func() { + if err := importCmd.RunE(importCmd, []string{draftPath}); err != nil { + t.Fatalf("import RunE: %v", err) + } + }) + if output == "" { + t.Fatal("expected import summary output") + } + + db, err = store.Open(store.StoreDir(dataDir, store.DefaultStoreName)) + if err != nil { + t.Fatalf("reopen store: %v", err) + } + defer db.Close() + + imported, err := db.QueryInsights(store.QueryFilter{Keyword: "imported middle", Limit: 1}) + if err != nil { + t.Fatalf("query imported insight: %v", err) + } + if len(imported) != 1 { + t.Fatalf("imported insight count = %d, want 1", len(imported)) + } + importedID := imported[0].ID + + if hasTemporalEdge(t, db, "old-2023", "old-2025") { + t.Fatal("old 2023->2025 temporal edge should have been removed") + } + if hasTemporalEdge(t, db, "old-2025", "old-2023") { + t.Fatal("old 2025->2023 temporal edge should have been removed") + } + if !hasTemporalEdge(t, db, "old-2023", importedID) { + t.Fatal("missing repaired 2023->imported temporal edge") + } + if !hasTemporalEdge(t, db, importedID, "old-2023") { + t.Fatal("missing repaired imported->2023 temporal edge") + } + if !hasTemporalEdge(t, db, importedID, "old-2025") { + t.Fatal("missing repaired imported->2025 temporal edge") + } + if !hasTemporalEdge(t, db, "old-2025", importedID) { + t.Fatal("missing repaired 2025->imported temporal edge") + } +} + +func TestImportRefreshesEffectiveImportanceAfterExplicitEdges(t *testing.T) { + t.Setenv("MNEMON_EMBED_ENDPOINT", "http://127.0.0.1:1") + + oldDataDir, oldStoreName, oldReadOnly := dataDir, storeName, readOnly + oldImportNoDiff, oldImportDryRun := importNoDiff, importDryRun + t.Cleanup(func() { + dataDir, storeName, readOnly = oldDataDir, oldStoreName, oldReadOnly + importNoDiff, importDryRun = oldImportNoDiff, oldImportDryRun + }) + + dataDir = t.TempDir() + storeName = "" + readOnly = false + importNoDiff = true + importDryRun = false + + draftPath := filepath.Join(t.TempDir(), "memory_draft.json") + draft := `{ + "schema_version": "1", + "insights": [ + { + "content": "alpha lowercase memory", + "category": "context", + "importance": 3, + "source": "source-a" + }, + { + "content": "beta lowercase memory", + "category": "context", + "importance": 3, + "source": "source-b" + } + ], + "edges": [ + { + "source_index": 0, + "target_index": 1, + "edge_type": "semantic", + "weight": 0.8, + "reason": "test explicit edge" + } + ] +}` + if err := os.WriteFile(draftPath, []byte(draft), 0o644); err != nil { + t.Fatalf("write draft: %v", err) + } + + captureStdout(t, func() { + if err := importCmd.RunE(importCmd, []string{draftPath}); err != nil { + t.Fatalf("import RunE: %v", err) + } + }) + + db, err := store.Open(store.StoreDir(dataDir, store.DefaultStoreName)) + if err != nil { + t.Fatalf("open store: %v", err) + } + defer db.Close() + + var lowEI float64 + if err := db.Conn().QueryRow( + `SELECT MIN(effective_importance) FROM insights WHERE deleted_at IS NULL`, + ).Scan(&lowEI); err != nil { + t.Fatalf("query effective importance: %v", err) + } + if lowEI <= 0.5 { + t.Fatalf("effective_importance = %f, want refreshed value above no-edge baseline", lowEI) + } +} + +func insertTestInsight(t *testing.T, db *store.DB, id, content, source, createdAt string) { + t.Helper() + ts, err := time.Parse(time.RFC3339, createdAt) + if err != nil { + t.Fatalf("parse time: %v", err) + } + if err := db.InsertInsight(&model.Insight{ + ID: id, + Content: content, + Category: model.CategoryContext, + Importance: 3, + Tags: []string{}, + Entities: []string{}, + Source: source, + CreatedAt: ts, + UpdatedAt: ts, + }); err != nil { + t.Fatalf("insert insight %s: %v", id, err) + } +} + +func hasTemporalEdge(t *testing.T, db *store.DB, sourceID, targetID string) bool { + t.Helper() + edges, err := db.GetEdgesBySourceAndType(sourceID, model.EdgeTemporal) + if err != nil { + t.Fatalf("get temporal edges for %s: %v", sourceID, err) + } + for _, edge := range edges { + if edge.TargetID == targetID { + return true + } + } + return false +} + +func captureStdout(t *testing.T, fn func()) string { + t.Helper() + oldStdout := os.Stdout + r, w, err := os.Pipe() + if err != nil { + t.Fatalf("pipe stdout: %v", err) + } + os.Stdout = w + defer func() { os.Stdout = oldStdout }() + + fn() + + if err := w.Close(); err != nil { + t.Fatalf("close stdout writer: %v", err) + } + data, err := io.ReadAll(r) + if err != nil { + t.Fatalf("read stdout: %v", err) + } + return string(data) +} diff --git a/docs/IMPORT.md b/docs/IMPORT.md new file mode 100644 index 0000000..29c3349 --- /dev/null +++ b/docs/IMPORT.md @@ -0,0 +1,240 @@ +# Mnemon — 记忆导入指南 + +本文档说明如何将历史聊天记录或外部上下文批量导入 Mnemon 的记忆图谱。 + +--- + +## 工作流概览 + +``` +聊天导出 / Markdown → LLM(使用下方提示词) → memory_draft.json → mnemon import +``` + +1. 将原始聊天记录或笔记导出为 Markdown 或纯文本。 +2. 将文本连同下方的**参考提示词**一起发送给 LLM,生成符合本文档格式的 `memory_draft.json`。 +3. 运行 `mnemon import memory_draft.json`,Mnemon 自动完成去重、图谱边构建、向量嵌入和生命周期评分。 + +--- + +## 导入文件格式(schema_version: "1") + +```json +{ + "schema_version": "1", + "source": "chat-export", + "insights": [ + { + "content": "选择了 Qdrant 而非 Milvus 作为向量搜索引擎,主要原因是其过滤查询性能更好。", + "category": "decision", + "importance": 5, + "tags": ["architecture", "search", "vector-db"], + "entities": ["Qdrant", "Milvus"], + "source": "agent", + "created_at": "2024-03-15T09:30:00Z" + }, + { + "content": "用户偏好简洁的 API 响应,不希望看到多余的解释文本。", + "category": "preference", + "importance": 4, + "tags": ["ux", "api"] + } + ], + "edges": [ + { + "source_index": 0, + "target_index": 1, + "edge_type": "causal", + "weight": 0.7, + "reason": "向量引擎选型决策影响了 API 响应设计偏好" + } + ] +} +``` + +--- + +## 字段说明 + +### 顶层字段 + +| 字段 | 类型 | 必填 | 说明 | +|---|---|---|---| +| `schema_version` | string | **是** | 必须为 `"1"` | +| `source` | string | 否 | 整批导入的来源标签(如 `"chat-export"`、`"manual"`)。单条 insight 可通过自身的 `source` 字段覆盖此值 | +| `insights` | array | **是** | 记忆节点列表,至少包含一项 | +| `edges` | array | 否 | 显式关系列表;Mnemon 的图引擎也会自动创建边,此处为补充或强制指定 | + +### insights 条目字段 + +| 字段 | 类型 | 必填 | 约束 | 说明 | +|---|---|---|---|---| +| `content` | string | **是** | 最多 8000 字符 | 记忆的文本内容 | +| `category` | string | 否 | 见下表,默认 `general` | 知识类型 | +| `importance` | integer | 否 | 1–5,默认 3 | 重要程度;影响保留优先级和自动剪枝 | +| `tags` | array | 否 | 最多 20 项,每项最多 100 字符 | 自由标签,用于检索和过滤 | +| `entities` | array | 否 | 最多 50 项,每项最多 200 字符 | 记忆中涉及的命名实体(人名、项目名、工具名等);与 Mnemon 自动提取结果合并 | +| `source` | string | 否 | — | 覆盖顶层 `source`,该条记忆的具体来源 | +| `created_at` | string | 否 | RFC 3339 格式 | 原始创建时间;省略时使用导入时间 | + +#### category 可选值 + +| 值 | 适用场景 | +|---|---| +| `preference` | 用户偏好、习惯、风格要求 | +| `decision` | 已确定的技术或产品决策 | +| `fact` | 客观事实、数据、规格参数 | +| `insight` | 推断、分析结论、经验总结 | +| `context` | 背景信息、项目概况、约束条件 | +| `general` | 其他不适合上述分类的记忆 | + +#### importance 赋值建议 + +| 值 | 含义 | +|---|---| +| 5 | 核心决策或强烈偏好,必须长期保留 | +| 4 | 重要上下文,通常需要保留 | +| 3 | 一般记忆(默认值) | +| 2 | 次要细节,可能在自动剪枝时被移除 | +| 1 | 临时或低价值信息 | + +### edges 条目字段 + +| 字段 | 类型 | 必填 | 约束 | 说明 | +|---|---|---|---|---| +| `source_index` | integer | **是** | insights 数组的零基索引 | 边的起点 | +| `target_index` | integer | **是** | 不能等于 `source_index` | 边的终点 | +| `edge_type` | string | **是** | 见下表 | 关系类型 | +| `weight` | float | 否 | 0.0–1.0,默认 0.5 | 关系强度 | +| `reason` | string | 否 | — | 说明该关系存在的原因(存入边元数据) | + +#### edge_type 可选值 + +| 值 | 含义 | +|---|---| +| `temporal` | 时间顺序关系(事件 A 发生在事件 B 之前) | +| `causal` | 因果关系(A 导致或影响 B) | +| `semantic` | 语义相似关系(A 与 B 讨论同一主题) | +| `entity` | 实体共现关系(A 与 B 涉及同一命名实体) | + +--- + +## 使用命令 + +```bash +# 基本导入 +mnemon import memory_draft.json + +# 验证格式但不写入数据库 +mnemon import --dry-run memory_draft.json + +# 跳过去重检测,将所有条目作为新记忆插入 +mnemon import --no-diff memory_draft.json + +# 指定具体 store +mnemon import --store project-alpha memory_draft.json +``` + +### 输出示例 + +```json +{ + "imported": 8, + "updated": 1, + "skipped": 2, + "errors": 0, + "edges_inserted": 3, + "auto_pruned": 0, + "results": [ + {"index": 0, "id": "a1b2c3d4...", "content": "选择了 Qdrant...", "action": "added"}, + {"index": 1, "id": "e5f6a7b8...", "content": "用户偏好简洁的...", "action": "skipped"} + ] +} +``` + +| 字段 | 说明 | +|---|---| +| `imported` | 新增的记忆数量 | +| `updated` | 替换了已有冲突记忆的数量 | +| `skipped` | 检测为重复而跳过的数量 | +| `errors` | 写入失败的数量;导入允许部分成功,脚本调用方应检查此字段是否为 0 | +| `edges_inserted` | 成功插入的显式边数量 | +| `auto_pruned` | 超出容量限制后自动删除的记忆数量 | + +--- + +## 参考提示词(用于生成 memory_draft.json) + +将以下提示词和你的聊天记录一起发送给 LLM: + +``` +你是一个记忆提取助手。请从下方的聊天记录或文档中提取有价值的知识片段, +生成一个符合 Mnemon memory draft 格式(schema_version: "1")的 JSON 文件。 + +## 提取规则 + +1. 每条 insight 必须是独立、完整的知识单元,不依赖上下文即可理解。 +2. 去除闲聊、重复表述和无实质内容的对话。 +3. 如果同一主题多次出现,合并为一条最完整的表述,不要重复。 +4. 按以下优先级分配 importance: + - 5:关键架构决策、明确的用户核心偏好 + - 4:重要上下文、反复出现的模式 + - 3:一般事实和背景信息 + - 2:细节或一次性提及的内容 + - 1:临时状态或极低价值信息 +5. entities 填写记忆中出现的具体名词:人名、项目名、工具/库名、组织名。 +6. tags 使用小写英文,用连字符分隔词语(如 "vector-db"、"api-design")。 +7. 如果能从上下文推断出原始时间,在 created_at 中填写 RFC 3339 格式的时间。 +8. edges 只填写显而易见的强关联关系(因果、同一决策链),不要过度连接。 + +## 输出要求 + +- 只输出 JSON,不要有任何解释文字。 +- 严格遵守以下 schema: + +{ + "schema_version": "1", + "source": "chat-export", + "insights": [ + { + "content": "...", + "category": "preference|decision|fact|insight|context|general", + "importance": 1-5, + "tags": ["tag1", "tag2"], + "entities": ["Entity1", "Entity2"], + "created_at": "2024-01-15T09:30:00Z" + } + ], + "edges": [ + { + "source_index": 0, + "target_index": 1, + "edge_type": "causal|semantic|temporal|entity", + "weight": 0.0-1.0, + "reason": "..." + } + ] +} + +## 待处理内容 + +[在此粘贴你的聊天记录或文档] +``` + +--- + +## 常见问题 + +**Q: `created_at` 必须填吗?** +不必须。省略时 Mnemon 使用导入时间。如果原始聊天记录有时间戳,建议填写以保留历史顺序。 + +**Q: 导入后如何验证结果?** +运行 `mnemon log` 查看最新记忆,或用 `mnemon search <关键词>` 确认可检索。 + +**Q: `edges` 数组必须填吗?** +不必须。Mnemon 的图引擎会根据时间、语义相似度和实体共现自动创建边。显式 `edges` 用于指定自动检测难以发现的强关联关系。 + +**Q: 如何分批导入大型聊天记录?** +将聊天记录按时间段或主题切分为多个文件,依次执行 `mnemon import`。重复内容会被去重机制自动跳过。 + +**Q: 可以导入非英文内容吗?** +可以,`content`、`tags`、`entities` 均支持任意 Unicode 文本。 diff --git a/docs/USAGE.md b/docs/USAGE.md index 69e620e..306092d 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -79,6 +79,11 @@ mnemon recall "auth" --basic # Search — token-scored keyword search mnemon search "authentication" --limit 10 +# Import — bulk-import a memory draft file (see docs/IMPORT.md for schema and LLM prompt) +mnemon import memory_draft.json +mnemon import --dry-run memory_draft.json # validate without writing +mnemon import --no-diff memory_draft.json # skip deduplication + # Forget — soft-delete an insight mnemon forget ``` diff --git a/internal/graph/engine.go b/internal/graph/engine.go index 4075814..a773fa9 100644 --- a/internal/graph/engine.go +++ b/internal/graph/engine.go @@ -18,6 +18,24 @@ type Engine struct { db *store.DB embedCache EmbedCache entityMode EntityMode + options EngineOptions +} + +// TemporalMode controls whether the engine creates temporal edges. +type TemporalMode string + +const ( + // TemporalEnabled creates temporal edges using the normal real-time write path. + TemporalEnabled TemporalMode = "enabled" + // TemporalDisabled skips temporal edges. Import paths can repair historical + // temporal edges after all backdated insights are written. + TemporalDisabled TemporalMode = "disabled" +) + +// EngineOptions configures automatic edge generation. +type EngineOptions struct { + EntityMode EntityMode + TemporalMode TemporalMode } // NewEngine creates a new graph edge engine. @@ -28,7 +46,18 @@ func NewEngine(db *store.DB, embedCache EmbedCache) *Engine { // NewEngineWithEntityMode creates a graph engine with configurable entity handling. func NewEngineWithEntityMode(db *store.DB, embedCache EmbedCache, entityMode EntityMode) *Engine { - return &Engine{db: db, embedCache: embedCache, entityMode: entityMode} + return NewEngineWithOptions(db, embedCache, EngineOptions{EntityMode: entityMode}) +} + +// NewEngineWithOptions creates a graph engine with explicit generation options. +func NewEngineWithOptions(db *store.DB, embedCache EmbedCache, options EngineOptions) *Engine { + if options.EntityMode == "" { + options.EntityMode = EntityModeMerge + } + if options.TemporalMode == "" { + options.TemporalMode = TemporalEnabled + } + return &Engine{db: db, embedCache: embedCache, entityMode: options.EntityMode, options: options} } // OnInsightCreated runs all edge generators for a newly created insight. @@ -40,7 +69,9 @@ func (e *Engine) OnInsightCreated(insight *model.Insight) EdgeStats { insight.Entities = ResolveEntities(insight.Content, insight.Entities, e.entityMode) // 2. Temporal backbone + proximity edges - stats.Temporal = CreateTemporalEdge(e.db, insight) + if e.options.TemporalMode != TemporalDisabled { + stats.Temporal = CreateTemporalEdge(e.db, insight) + } // 3. Entity co-occurrence edges stats.Entity = CreateEntityEdges(e.db, insight) diff --git a/internal/importdraft/draft.go b/internal/importdraft/draft.go new file mode 100644 index 0000000..849defb --- /dev/null +++ b/internal/importdraft/draft.go @@ -0,0 +1,189 @@ +// Package importdraft defines the versioned public import schema for Mnemon +// memory draft files. The schema intentionally decouples from internal DB +// structures so both can evolve independently. +package importdraft + +import ( + "encoding/json" + "fmt" + "os" + "time" + + "github.com/mnemon-dev/mnemon/internal/model" +) + +// CurrentSchemaVersion is the only schema version this build can import. +const CurrentSchemaVersion = "1" + +// MemoryDraft is the top-level structure of a memory draft file. +type MemoryDraft struct { + // SchemaVersion must equal "1". Checked before any other field. + SchemaVersion string `json:"schema_version"` + + // Source labels where this draft came from (e.g. "chat-export", "manual"). + // Stored on every imported insight unless overridden per-insight. + Source string `json:"source,omitempty"` + + // Insights is the list of memory nodes to import. + Insights []DraftInsight `json:"insights"` + + // Edges is an optional list of explicit relationships between insights. + // Mnemon also auto-creates edges via its graph engine; these supplement + // or override auto-detected relationships. + Edges []DraftEdge `json:"edges,omitempty"` +} + +// DraftInsight represents one memory node in the import file. +type DraftInsight struct { + // Content is the text of the memory. Required; max 8000 characters. + Content string `json:"content"` + + // Category classifies the type of knowledge. + // One of: preference, decision, fact, insight, context, general. + // Defaults to "general" when omitted. + Category string `json:"category,omitempty"` + + // Importance is a 1–5 signal of how significant this memory is. + // Defaults to 3 when omitted or 0. + Importance int `json:"importance,omitempty"` + + // Tags are free-form labels (max 20, each max 100 chars). + Tags []string `json:"tags,omitempty"` + + // Entities are named subjects in the memory (people, projects, tools). + // Mnemon merges these with its own auto-extraction. + Entities []string `json:"entities,omitempty"` + + // Source overrides the top-level source field for this specific insight. + Source string `json:"source,omitempty"` + + // CreatedAt sets the original creation timestamp (RFC 3339). + // Defaults to import time when omitted. + CreatedAt string `json:"created_at,omitempty"` +} + +// DraftEdge declares an explicit relationship between two insights. +// Both source_index and target_index are zero-based indices into the insights array. +type DraftEdge struct { + // SourceIndex is the zero-based index into the insights array. + SourceIndex int `json:"source_index"` + + // TargetIndex is the zero-based index into the insights array. + TargetIndex int `json:"target_index"` + + // EdgeType is the kind of relationship. + // One of: temporal, semantic, causal, entity. + EdgeType string `json:"edge_type"` + + // Weight is the edge strength in [0.0, 1.0]. Defaults to 0.5. + Weight float64 `json:"weight,omitempty"` + + // Reason is an optional human/LLM explanation of why the edge exists. + // Stored as edge metadata. + Reason string `json:"reason,omitempty"` +} + +// Load reads and JSON-decodes a memory draft file. +func Load(path string) (*MemoryDraft, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read file: %w", err) + } + var draft MemoryDraft + if err := json.Unmarshal(data, &draft); err != nil { + return nil, fmt.Errorf("parse JSON: %w", err) + } + return &draft, nil +} + +// Validate checks that the draft is well-formed before any DB writes. +func (d *MemoryDraft) Validate() error { + if d.SchemaVersion != CurrentSchemaVersion { + return fmt.Errorf("unsupported schema_version %q (expected %q)", d.SchemaVersion, CurrentSchemaVersion) + } + if len(d.Insights) == 0 { + return fmt.Errorf("insights array is empty; nothing to import") + } + + for i, ins := range d.Insights { + if ins.Content == "" { + return fmt.Errorf("insights[%d]: content is required", i) + } + if len(ins.Content) > 8000 { + return fmt.Errorf("insights[%d]: content too long (%d chars, max 8000)", i, len(ins.Content)) + } + + cat := ins.Category + if cat == "" { + cat = string(model.CategoryGeneral) + } + if !model.ValidCategories[model.Category(cat)] { + return fmt.Errorf("insights[%d]: invalid category %q", i, cat) + } + + imp := ins.Importance + if imp == 0 { + imp = 3 + } + if imp < 1 || imp > 5 { + return fmt.Errorf("insights[%d]: importance must be 1–5, got %d", i, ins.Importance) + } + + if len(ins.Tags) > 20 { + return fmt.Errorf("insights[%d]: too many tags (%d, max 20)", i, len(ins.Tags)) + } + for j, t := range ins.Tags { + if len(t) > 100 { + return fmt.Errorf("insights[%d]: tag[%d] too long (%d chars, max 100)", i, j, len(t)) + } + } + + if len(ins.Entities) > 50 { + return fmt.Errorf("insights[%d]: too many entities (%d, max 50)", i, len(ins.Entities)) + } + for j, e := range ins.Entities { + if len(e) > 200 { + return fmt.Errorf("insights[%d]: entity[%d] too long (%d chars, max 200)", i, j, len(e)) + } + } + + if ins.CreatedAt != "" { + if _, err := time.Parse(time.RFC3339, ins.CreatedAt); err != nil { + return fmt.Errorf("insights[%d]: created_at %q is not RFC 3339 (e.g. 2024-01-15T09:30:00Z)", i, ins.CreatedAt) + } + } + } + + n := len(d.Insights) + for i, edge := range d.Edges { + if edge.SourceIndex < 0 || edge.SourceIndex >= n { + return fmt.Errorf("edges[%d]: source_index %d out of range [0,%d)", i, edge.SourceIndex, n) + } + if edge.TargetIndex < 0 || edge.TargetIndex >= n { + return fmt.Errorf("edges[%d]: target_index %d out of range [0,%d)", i, edge.TargetIndex, n) + } + if edge.SourceIndex == edge.TargetIndex { + return fmt.Errorf("edges[%d]: source_index and target_index must differ", i) + } + if !model.ValidEdgeTypes[model.EdgeType(edge.EdgeType)] { + return fmt.Errorf("edges[%d]: invalid edge_type %q (valid: temporal, semantic, causal, entity)", i, edge.EdgeType) + } + if edge.Weight < 0 || edge.Weight > 1.0 { + return fmt.Errorf("edges[%d]: weight %g out of range [0.0, 1.0]", i, edge.Weight) + } + } + + return nil +} + +// ResolvedSource returns the source to use for a given insight index, falling +// back to the top-level source, then to "import". +func (d *MemoryDraft) ResolvedSource(idx int) string { + if d.Insights[idx].Source != "" { + return d.Insights[idx].Source + } + if d.Source != "" { + return d.Source + } + return "import" +} diff --git a/internal/importdraft/draft_test.go b/internal/importdraft/draft_test.go new file mode 100644 index 0000000..fde9252 --- /dev/null +++ b/internal/importdraft/draft_test.go @@ -0,0 +1,115 @@ +package importdraft + +import ( + "strings" + "testing" +) + +func TestValidateAcceptsMinimalDraft(t *testing.T) { + draft := &MemoryDraft{ + SchemaVersion: CurrentSchemaVersion, + Insights: []DraftInsight{ + {Content: "User prefers concise answers."}, + }, + } + + if err := draft.Validate(); err != nil { + t.Fatalf("Validate() error = %v", err) + } +} + +func TestValidateRejectsInvalidInsightFields(t *testing.T) { + cases := []struct { + name string + draft *MemoryDraft + want string + }{ + { + name: "missing content", + draft: &MemoryDraft{ + SchemaVersion: CurrentSchemaVersion, + Insights: []DraftInsight{{Category: "fact"}}, + }, + want: "content is required", + }, + { + name: "invalid category", + draft: &MemoryDraft{ + SchemaVersion: CurrentSchemaVersion, + Insights: []DraftInsight{{Content: "x", Category: "note"}}, + }, + want: "invalid category", + }, + { + name: "invalid importance", + draft: &MemoryDraft{ + SchemaVersion: CurrentSchemaVersion, + Insights: []DraftInsight{{Content: "x", Importance: 6}}, + }, + want: "importance", + }, + { + name: "invalid created_at", + draft: &MemoryDraft{ + SchemaVersion: CurrentSchemaVersion, + Insights: []DraftInsight{{Content: "x", CreatedAt: "2024-01-01"}}, + }, + want: "RFC 3339", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + err := tc.draft.Validate() + if err == nil { + t.Fatal("Validate() error = nil") + } + if !strings.Contains(err.Error(), tc.want) { + t.Fatalf("Validate() error = %v, want substring %q", err, tc.want) + } + }) + } +} + +func TestValidateRejectsInvalidEdges(t *testing.T) { + draft := &MemoryDraft{ + SchemaVersion: CurrentSchemaVersion, + Insights: []DraftInsight{ + {Content: "first"}, + {Content: "second"}, + }, + Edges: []DraftEdge{ + {SourceIndex: 0, TargetIndex: 2, EdgeType: "semantic"}, + }, + } + + err := draft.Validate() + if err == nil { + t.Fatal("Validate() error = nil") + } + if !strings.Contains(err.Error(), "target_index") { + t.Fatalf("Validate() error = %v, want target_index", err) + } +} + +func TestResolvedSourceFallsBackFromInsightToDraftToImport(t *testing.T) { + draft := &MemoryDraft{ + Source: "chat-export", + Insights: []DraftInsight{ + {Content: "one", Source: "manual"}, + {Content: "two"}, + }, + } + + if got := draft.ResolvedSource(0); got != "manual" { + t.Fatalf("ResolvedSource(0) = %q, want manual", got) + } + if got := draft.ResolvedSource(1); got != "chat-export" { + t.Fatalf("ResolvedSource(1) = %q, want chat-export", got) + } + + draft.Source = "" + if got := draft.ResolvedSource(1); got != "import" { + t.Fatalf("ResolvedSource(1) = %q, want import", got) + } +} diff --git a/internal/store/edge.go b/internal/store/edge.go index 8050dc2..b1627e4 100644 --- a/internal/store/edge.go +++ b/internal/store/edge.go @@ -98,6 +98,14 @@ func (db *DB) GetAllEdges() ([]*model.Edge, error) { return scanEdges(rows) } +// DeleteEdge removes one typed edge between two nodes. +func (db *DB) DeleteEdge(sourceID string, targetID string, edgeType model.EdgeType) error { + _, err := db.execer().Exec( + `DELETE FROM edges WHERE source_id = ? AND target_id = ? AND edge_type = ?`, + sourceID, targetID, string(edgeType)) + return err +} + // DeleteEdgesByNode removes all edges referencing a node. func (db *DB) DeleteEdgesByNode(nodeID string) error { _, err := db.execer().Exec( diff --git a/internal/store/node.go b/internal/store/node.go index 5447430..02c8d41 100644 --- a/internal/store/node.go +++ b/internal/store/node.go @@ -508,6 +508,19 @@ func (db *DB) GetRecentInsightsBySource(source string, excludeID string, limit i return scanInsights(rows) } +// GetActiveInsightsBySourceOrdered returns active insights for a source in chronological order. +func (db *DB) GetActiveInsightsBySourceOrdered(source string) ([]*model.Insight, error) { + rows, err := db.execer().Query( + `SELECT id, content, category, importance, tags, entities, source, access_count, created_at, updated_at, deleted_at + FROM insights WHERE source = ? AND deleted_at IS NULL + ORDER BY created_at ASC, rowid ASC`, source) + if err != nil { + return nil, err + } + defer rows.Close() + return scanInsights(rows) +} + // GetAllActiveInsights returns all non-deleted insights. func (db *DB) GetAllActiveInsights() ([]*model.Insight, error) { rows, err := db.execer().Query(