diff --git a/cmd/import.go b/cmd/import.go new file mode 100644 index 0000000..ce89202 --- /dev/null +++ b/cmd/import.go @@ -0,0 +1,427 @@ +package cmd + +import ( + "encoding/json" + "fmt" + "os" + "time" + + "github.com/google/uuid" + "github.com/mnemon-dev/mnemon/internal/embed" + "github.com/mnemon-dev/mnemon/internal/graph" + "github.com/mnemon-dev/mnemon/internal/importdraft" + "github.com/mnemon-dev/mnemon/internal/model" + "github.com/mnemon-dev/mnemon/internal/search" + "github.com/mnemon-dev/mnemon/internal/store" + "github.com/spf13/cobra" +) + +var ( + importNoDiff bool + importDryRun bool +) + +var importCmd = &cobra.Command{ + Use: "import ", + Short: "Import a memory draft file", + Long: `Import insights from a memory draft JSON file (schema_version: "1"). + +Each insight passes through Mnemon's normal write path: deduplication, +graph edge construction, embeddings, and lifecycle scoring are all applied +automatically. + +The draft format and a reference LLM prompt for generating it from chat +exports are documented in docs/IMPORT.md.`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + draft, err := importdraft.Load(args[0]) + if err != nil { + return err + } + if err := draft.Validate(); err != nil { + return fmt.Errorf("invalid draft: %w", err) + } + + if importDryRun { + fmt.Printf("Dry run: %d insights, %d explicit edges — validation passed.\n", + len(draft.Insights), len(draft.Edges)) + return nil + } + + db, err := openDB() + if err != nil { + return fmt.Errorf("open database: %w", err) + } + defer db.Close() + + ec := embed.NewClientWithModel(resolveEmbedModel()) + + // Build embed cache once for all diff and graph operations. + var embedCache graph.EmbedCache + if ec.Available() { + dbEmbeds, err := db.GetAllEmbeddings() + if err == nil { + embedCache = make(graph.EmbedCache, len(dbEmbeds)) + for _, e := range dbEmbeds { + if v := embed.DeserializeVector(e.Embedding); v != nil { + embedCache[e.ID] = v + } + } + } + } + + // imported maps draft index → assigned insight ID (for explicit edge resolution). + imported := make(map[int]string, len(draft.Insights)) + importedIDs := make(map[string]bool, len(draft.Insights)) + importedSources := make(map[string]bool) + refreshIDs := make(map[string]bool) + results := make([]importResult, 0, len(draft.Insights)) + + for idx, di := range draft.Insights { + cat := model.Category(di.Category) + if cat == "" { + cat = model.CategoryGeneral + } + imp := di.Importance + if imp == 0 { + imp = 3 + } + tags := di.Tags + if tags == nil { + tags = []string{} + } + entities := di.Entities + if entities == nil { + entities = []string{} + } + + var createdAt time.Time + if di.CreatedAt != "" { + if t, err := time.Parse(time.RFC3339, di.CreatedAt); err == nil { + createdAt = t.UTC() + } + } + if createdAt.IsZero() { + createdAt = time.Now().UTC() + } + + insight := &model.Insight{ + ID: uuid.New().String(), + Content: di.Content, + Category: cat, + Importance: imp, + Tags: tags, + Entities: entities, + Source: draft.ResolvedSource(idx), + CreatedAt: createdAt, + UpdatedAt: createdAt, + } + + // Compute embedding before acquiring the DB lock. + var embeddingBlob []byte + var embeddingVec []float64 + if ec.Available() { + if vec, err := ec.Embed(insight.Content); err == nil { + embeddingVec = vec + embeddingBlob = embed.SerializeVector(vec) + } + } + + var action string + var replacedID string + + if importNoDiff { + action = "added" + } else { + allInsights, err := db.GetAllActiveInsights() + if err != nil { + results = append(results, importResult{Index: idx, ID: insight.ID, Content: insight.Content, Error: err.Error()}) + continue + } + opts := search.DiffOptions{Limit: 5, NewEmbedding: embeddingVec} + if embedCache != nil { + opts.ExistingEmbed = make([]search.EmbeddedItem, 0, len(embedCache)) + for id, v := range embedCache { + opts.ExistingEmbed = append(opts.ExistingEmbed, search.EmbeddedItem{ID: id, Embedding: v}) + } + } + result := search.Diff(allInsights, insight.Content, opts) + switch result.Suggestion { + case search.DiffDuplicate: + action = "skipped" + if len(result.Matches) > 0 { + replacedID = result.Matches[0].ID + } + case search.DiffConflict, search.DiffUpdate: + action = "updated" + if len(result.Matches) > 0 { + replacedID = result.Matches[0].ID + } + default: + action = "added" + } + } + + if action == "skipped" { + db.LogOp("import-skip", insight.ID, fmt.Sprintf("duplicate of %s", replacedID)) + if replacedID != "" { + imported[idx] = replacedID + } else { + imported[idx] = insight.ID + } + results = append(results, importResult{Index: idx, ID: imported[idx], Content: insight.Content, Action: action}) + continue + } + + var writeErr error + err = db.InTransaction(func() error { + if action == "updated" && replacedID != "" { + if err := db.SoftDeleteInsight(replacedID); err != nil { + fmt.Fprintf(os.Stderr, "warning: soft-delete %s: %v\n", replacedID, err) + } else { + db.LogOp("import-replace", replacedID, fmt.Sprintf("replaced by %s", insight.ID)) + delete(embedCache, replacedID) + } + } + if err := db.InsertInsight(insight); err != nil { + return fmt.Errorf("insert insight: %w", err) + } + if embeddingBlob != nil { + if err := db.UpdateEmbedding(insight.ID, embeddingBlob); err != nil { + return fmt.Errorf("update embedding: %w", err) + } + if embedCache != nil { + embedCache[insight.ID] = embeddingVec + } + } + engine := graph.NewEngineWithOptions(db, embedCache, graph.EngineOptions{ + EntityMode: graph.EntityModeMerge, + TemporalMode: graph.TemporalDisabled, + }) + engine.OnInsightCreated(insight) + if len(insight.Entities) > 0 { + _ = db.UpdateEntities(insight.ID, insight.Entities) + } + if _, err := db.RefreshEffectiveImportance(insight.ID); err != nil { + fmt.Fprintf(os.Stderr, "warning: refresh EI for %s: %v\n", insight.ID, err) + } + db.LogOp("import", insight.ID, insight.Content) + return nil + }) + if err != nil { + writeErr = err + embedCache = nil + } + + if writeErr != nil { + results = append(results, importResult{Index: idx, ID: insight.ID, Content: insight.Content, Error: writeErr.Error()}) + continue + } + + imported[idx] = insight.ID + importedIDs[insight.ID] = true + importedSources[insight.Source] = true + refreshIDs[insight.ID] = true + results = append(results, importResult{Index: idx, ID: insight.ID, Content: insight.Content, Action: action}) + } + + edgesInserted := 0 + temporalEdgesRepaired := 0 + pruned := 0 + if err := db.InTransaction(func() error { + // Insert explicit edges for successfully imported insights. + for _, de := range draft.Edges { + srcID, srcOK := imported[de.SourceIndex] + tgtID, tgtOK := imported[de.TargetIndex] + if !srcOK || !tgtOK { + continue + } + w := de.Weight + if w == 0 { + w = 0.5 + } + meta := map[string]string{} + if de.Reason != "" { + meta["reason"] = de.Reason + } + edge := &model.Edge{ + SourceID: srcID, + TargetID: tgtID, + EdgeType: model.EdgeType(de.EdgeType), + Weight: w, + Metadata: meta, + CreatedAt: time.Now().UTC(), + } + if err := db.InsertEdge(edge); err != nil { + fmt.Fprintf(os.Stderr, "warning: insert explicit edge %d→%d: %v\n", de.SourceIndex, de.TargetIndex, err) + continue + } + edgesInserted++ + refreshIDs[srcID] = true + refreshIDs[tgtID] = true + } + + repaired, touched, err := repairImportedTemporalEdges(db, importedSources, importedIDs) + if err != nil { + return err + } + temporalEdgesRepaired = repaired + for id := range touched { + refreshIDs[id] = true + } + + for id := range refreshIDs { + if _, err := db.RefreshEffectiveImportance(id); err != nil { + fmt.Fprintf(os.Stderr, "warning: refresh EI for %s: %v\n", id, err) + } + } + + var pruneErr error + pruned, pruneErr = db.AutoPrune(store.MaxInsights, nil) + return pruneErr + }); err != nil { + return fmt.Errorf("finalize import graph: %w", err) + } + + _ = temporalEdgesRepaired // computed internally; not surfaced in default output + summary := map[string]interface{}{ + "imported": countAction(results, "added"), + "updated": countAction(results, "updated"), + "skipped": countAction(results, "skipped"), + "errors": countErrors(results), + "edges_inserted": edgesInserted, + "auto_pruned": pruned, + "results": results, + } + enc := json.NewEncoder(os.Stdout) + enc.SetIndent("", " ") + return enc.Encode(summary) + }, +} + +func repairImportedTemporalEdges(db *store.DB, sources map[string]bool, importedIDs map[string]bool) (int, map[string]bool, error) { + touched := make(map[string]bool) + if len(importedIDs) == 0 { + return 0, touched, nil + } + + inserted := 0 + for source := range sources { + timeline, err := db.GetActiveInsightsBySourceOrdered(source) + if err != nil { + return inserted, touched, fmt.Errorf("load temporal timeline for source %q: %w", source, err) + } + if len(timeline) == 0 { + continue + } + + for idx, insight := range timeline { + if !importedIDs[insight.ID] { + continue + } + touched[insight.ID] = true + + prevExisting := nearestNonImportedBefore(timeline, importedIDs, idx) + nextExisting := nearestNonImportedAfter(timeline, importedIDs, idx) + if prevExisting != nil && nextExisting != nil { + if err := db.DeleteEdge(prevExisting.ID, nextExisting.ID, model.EdgeTemporal); err != nil { + return inserted, touched, fmt.Errorf("delete temporal edge %s→%s: %w", prevExisting.ID, nextExisting.ID, err) + } + if err := db.DeleteEdge(nextExisting.ID, prevExisting.ID, model.EdgeTemporal); err != nil { + return inserted, touched, fmt.Errorf("delete temporal edge %s→%s: %w", nextExisting.ID, prevExisting.ID, err) + } + touched[prevExisting.ID] = true + touched[nextExisting.ID] = true + } + } + + now := time.Now().UTC() + for idx := 0; idx < len(timeline)-1; idx++ { + prev := timeline[idx] + next := timeline[idx+1] + if !importedIDs[prev.ID] && !importedIDs[next.ID] { + continue + } + + if err := db.InsertEdge(&model.Edge{ + SourceID: prev.ID, + TargetID: next.ID, + EdgeType: model.EdgeTemporal, + Weight: 1.0, + Metadata: map[string]string{"sub_type": "backbone", "direction": "precedes"}, + CreatedAt: now, + }); err != nil { + return inserted, touched, fmt.Errorf("insert temporal edge %s→%s: %w", prev.ID, next.ID, err) + } + inserted++ + + if err := db.InsertEdge(&model.Edge{ + SourceID: next.ID, + TargetID: prev.ID, + EdgeType: model.EdgeTemporal, + Weight: 1.0, + Metadata: map[string]string{"sub_type": "backbone", "direction": "succeeds"}, + CreatedAt: now, + }); err != nil { + return inserted, touched, fmt.Errorf("insert temporal edge %s→%s: %w", next.ID, prev.ID, err) + } + inserted++ + touched[prev.ID] = true + touched[next.ID] = true + } + } + + return inserted, touched, nil +} + +func nearestNonImportedBefore(timeline []*model.Insight, importedIDs map[string]bool, idx int) *model.Insight { + for i := idx - 1; i >= 0; i-- { + if !importedIDs[timeline[i].ID] { + return timeline[i] + } + } + return nil +} + +func nearestNonImportedAfter(timeline []*model.Insight, importedIDs map[string]bool, idx int) *model.Insight { + for i := idx + 1; i < len(timeline); i++ { + if !importedIDs[timeline[i].ID] { + return timeline[i] + } + } + return nil +} + +func countAction(results []importResult, action string) int { + n := 0 + for _, r := range results { + if r.Action == action { + n++ + } + } + return n +} + +func countErrors(results []importResult) int { + n := 0 + for _, r := range results { + if r.Error != "" { + n++ + } + } + return n +} + +type importResult struct { + Index int `json:"index"` + ID string `json:"id"` + Content string `json:"content"` + Action string `json:"action"` + Error string `json:"error,omitempty"` +} + +func init() { + importCmd.Flags().BoolVar(&importNoDiff, "no-diff", false, "skip deduplication; insert all insights as new") + importCmd.Flags().BoolVar(&importDryRun, "dry-run", false, "validate the draft file without writing to the database") + rootCmd.AddCommand(importCmd) +} diff --git a/cmd/import_test.go b/cmd/import_test.go new file mode 100644 index 0000000..21a978b --- /dev/null +++ b/cmd/import_test.go @@ -0,0 +1,247 @@ +package cmd + +import ( + "io" + "os" + "path/filepath" + "testing" + "time" + + "github.com/mnemon-dev/mnemon/internal/model" + "github.com/mnemon-dev/mnemon/internal/store" +) + +func TestImportRepairsBackdatedTemporalBackbone(t *testing.T) { + t.Setenv("MNEMON_EMBED_ENDPOINT", "http://127.0.0.1:1") + + oldDataDir, oldStoreName, oldReadOnly := dataDir, storeName, readOnly + oldImportNoDiff, oldImportDryRun := importNoDiff, importDryRun + t.Cleanup(func() { + dataDir, storeName, readOnly = oldDataDir, oldStoreName, oldReadOnly + importNoDiff, importDryRun = oldImportNoDiff, oldImportDryRun + }) + + dataDir = t.TempDir() + storeName = "" + readOnly = false + importNoDiff = true + importDryRun = false + + db, err := store.Open(store.StoreDir(dataDir, store.DefaultStoreName)) + if err != nil { + t.Fatalf("open store: %v", err) + } + insertTestInsight(t, db, "old-2023", "older context", "chat", "2023-01-01T00:00:00Z") + insertTestInsight(t, db, "old-2025", "newer context", "chat", "2025-01-01T00:00:00Z") + now := time.Now().UTC() + if err := db.InsertEdge(&model.Edge{ + SourceID: "old-2023", + TargetID: "old-2025", + EdgeType: model.EdgeTemporal, + Weight: 1.0, + Metadata: map[string]string{"sub_type": "backbone", "direction": "precedes"}, + CreatedAt: now, + }); err != nil { + t.Fatalf("insert old forward edge: %v", err) + } + if err := db.InsertEdge(&model.Edge{ + SourceID: "old-2025", + TargetID: "old-2023", + EdgeType: model.EdgeTemporal, + Weight: 1.0, + Metadata: map[string]string{"sub_type": "backbone", "direction": "succeeds"}, + CreatedAt: now, + }); err != nil { + t.Fatalf("insert old reverse edge: %v", err) + } + if err := db.Close(); err != nil { + t.Fatalf("close seed db: %v", err) + } + + draftPath := filepath.Join(t.TempDir(), "memory_draft.json") + draft := `{ + "schema_version": "1", + "source": "chat", + "insights": [ + { + "content": "imported middle context", + "category": "context", + "importance": 3, + "created_at": "2024-01-01T00:00:00Z" + } + ] +}` + if err := os.WriteFile(draftPath, []byte(draft), 0o644); err != nil { + t.Fatalf("write draft: %v", err) + } + + output := captureStdout(t, func() { + if err := importCmd.RunE(importCmd, []string{draftPath}); err != nil { + t.Fatalf("import RunE: %v", err) + } + }) + if output == "" { + t.Fatal("expected import summary output") + } + + db, err = store.Open(store.StoreDir(dataDir, store.DefaultStoreName)) + if err != nil { + t.Fatalf("reopen store: %v", err) + } + defer db.Close() + + imported, err := db.QueryInsights(store.QueryFilter{Keyword: "imported middle", Limit: 1}) + if err != nil { + t.Fatalf("query imported insight: %v", err) + } + if len(imported) != 1 { + t.Fatalf("imported insight count = %d, want 1", len(imported)) + } + importedID := imported[0].ID + + if hasTemporalEdge(t, db, "old-2023", "old-2025") { + t.Fatal("old 2023->2025 temporal edge should have been removed") + } + if hasTemporalEdge(t, db, "old-2025", "old-2023") { + t.Fatal("old 2025->2023 temporal edge should have been removed") + } + if !hasTemporalEdge(t, db, "old-2023", importedID) { + t.Fatal("missing repaired 2023->imported temporal edge") + } + if !hasTemporalEdge(t, db, importedID, "old-2023") { + t.Fatal("missing repaired imported->2023 temporal edge") + } + if !hasTemporalEdge(t, db, importedID, "old-2025") { + t.Fatal("missing repaired imported->2025 temporal edge") + } + if !hasTemporalEdge(t, db, "old-2025", importedID) { + t.Fatal("missing repaired 2025->imported temporal edge") + } +} + +func TestImportRefreshesEffectiveImportanceAfterExplicitEdges(t *testing.T) { + t.Setenv("MNEMON_EMBED_ENDPOINT", "http://127.0.0.1:1") + + oldDataDir, oldStoreName, oldReadOnly := dataDir, storeName, readOnly + oldImportNoDiff, oldImportDryRun := importNoDiff, importDryRun + t.Cleanup(func() { + dataDir, storeName, readOnly = oldDataDir, oldStoreName, oldReadOnly + importNoDiff, importDryRun = oldImportNoDiff, oldImportDryRun + }) + + dataDir = t.TempDir() + storeName = "" + readOnly = false + importNoDiff = true + importDryRun = false + + draftPath := filepath.Join(t.TempDir(), "memory_draft.json") + draft := `{ + "schema_version": "1", + "insights": [ + { + "content": "alpha lowercase memory", + "category": "context", + "importance": 3, + "source": "source-a" + }, + { + "content": "beta lowercase memory", + "category": "context", + "importance": 3, + "source": "source-b" + } + ], + "edges": [ + { + "source_index": 0, + "target_index": 1, + "edge_type": "semantic", + "weight": 0.8, + "reason": "test explicit edge" + } + ] +}` + if err := os.WriteFile(draftPath, []byte(draft), 0o644); err != nil { + t.Fatalf("write draft: %v", err) + } + + captureStdout(t, func() { + if err := importCmd.RunE(importCmd, []string{draftPath}); err != nil { + t.Fatalf("import RunE: %v", err) + } + }) + + db, err := store.Open(store.StoreDir(dataDir, store.DefaultStoreName)) + if err != nil { + t.Fatalf("open store: %v", err) + } + defer db.Close() + + var lowEI float64 + if err := db.Conn().QueryRow( + `SELECT MIN(effective_importance) FROM insights WHERE deleted_at IS NULL`, + ).Scan(&lowEI); err != nil { + t.Fatalf("query effective importance: %v", err) + } + if lowEI <= 0.5 { + t.Fatalf("effective_importance = %f, want refreshed value above no-edge baseline", lowEI) + } +} + +func insertTestInsight(t *testing.T, db *store.DB, id, content, source, createdAt string) { + t.Helper() + ts, err := time.Parse(time.RFC3339, createdAt) + if err != nil { + t.Fatalf("parse time: %v", err) + } + if err := db.InsertInsight(&model.Insight{ + ID: id, + Content: content, + Category: model.CategoryContext, + Importance: 3, + Tags: []string{}, + Entities: []string{}, + Source: source, + CreatedAt: ts, + UpdatedAt: ts, + }); err != nil { + t.Fatalf("insert insight %s: %v", id, err) + } +} + +func hasTemporalEdge(t *testing.T, db *store.DB, sourceID, targetID string) bool { + t.Helper() + edges, err := db.GetEdgesBySourceAndType(sourceID, model.EdgeTemporal) + if err != nil { + t.Fatalf("get temporal edges for %s: %v", sourceID, err) + } + for _, edge := range edges { + if edge.TargetID == targetID { + return true + } + } + return false +} + +func captureStdout(t *testing.T, fn func()) string { + t.Helper() + oldStdout := os.Stdout + r, w, err := os.Pipe() + if err != nil { + t.Fatalf("pipe stdout: %v", err) + } + os.Stdout = w + defer func() { os.Stdout = oldStdout }() + + fn() + + if err := w.Close(); err != nil { + t.Fatalf("close stdout writer: %v", err) + } + data, err := io.ReadAll(r) + if err != nil { + t.Fatalf("read stdout: %v", err) + } + return string(data) +} diff --git a/docs/IMPORT.md b/docs/IMPORT.md new file mode 100644 index 0000000..29c3349 --- /dev/null +++ b/docs/IMPORT.md @@ -0,0 +1,240 @@ +# Mnemon — 记忆导入指南 + +本文档说明如何将历史聊天记录或外部上下文批量导入 Mnemon 的记忆图谱。 + +--- + +## 工作流概览 + +``` +聊天导出 / Markdown → LLM(使用下方提示词) → memory_draft.json → mnemon import +``` + +1. 将原始聊天记录或笔记导出为 Markdown 或纯文本。 +2. 将文本连同下方的**参考提示词**一起发送给 LLM,生成符合本文档格式的 `memory_draft.json`。 +3. 运行 `mnemon import memory_draft.json`,Mnemon 自动完成去重、图谱边构建、向量嵌入和生命周期评分。 + +--- + +## 导入文件格式(schema_version: "1") + +```json +{ + "schema_version": "1", + "source": "chat-export", + "insights": [ + { + "content": "选择了 Qdrant 而非 Milvus 作为向量搜索引擎,主要原因是其过滤查询性能更好。", + "category": "decision", + "importance": 5, + "tags": ["architecture", "search", "vector-db"], + "entities": ["Qdrant", "Milvus"], + "source": "agent", + "created_at": "2024-03-15T09:30:00Z" + }, + { + "content": "用户偏好简洁的 API 响应,不希望看到多余的解释文本。", + "category": "preference", + "importance": 4, + "tags": ["ux", "api"] + } + ], + "edges": [ + { + "source_index": 0, + "target_index": 1, + "edge_type": "causal", + "weight": 0.7, + "reason": "向量引擎选型决策影响了 API 响应设计偏好" + } + ] +} +``` + +--- + +## 字段说明 + +### 顶层字段 + +| 字段 | 类型 | 必填 | 说明 | +|---|---|---|---| +| `schema_version` | string | **是** | 必须为 `"1"` | +| `source` | string | 否 | 整批导入的来源标签(如 `"chat-export"`、`"manual"`)。单条 insight 可通过自身的 `source` 字段覆盖此值 | +| `insights` | array | **是** | 记忆节点列表,至少包含一项 | +| `edges` | array | 否 | 显式关系列表;Mnemon 的图引擎也会自动创建边,此处为补充或强制指定 | + +### insights 条目字段 + +| 字段 | 类型 | 必填 | 约束 | 说明 | +|---|---|---|---|---| +| `content` | string | **是** | 最多 8000 字符 | 记忆的文本内容 | +| `category` | string | 否 | 见下表,默认 `general` | 知识类型 | +| `importance` | integer | 否 | 1–5,默认 3 | 重要程度;影响保留优先级和自动剪枝 | +| `tags` | array | 否 | 最多 20 项,每项最多 100 字符 | 自由标签,用于检索和过滤 | +| `entities` | array | 否 | 最多 50 项,每项最多 200 字符 | 记忆中涉及的命名实体(人名、项目名、工具名等);与 Mnemon 自动提取结果合并 | +| `source` | string | 否 | — | 覆盖顶层 `source`,该条记忆的具体来源 | +| `created_at` | string | 否 | RFC 3339 格式 | 原始创建时间;省略时使用导入时间 | + +#### category 可选值 + +| 值 | 适用场景 | +|---|---| +| `preference` | 用户偏好、习惯、风格要求 | +| `decision` | 已确定的技术或产品决策 | +| `fact` | 客观事实、数据、规格参数 | +| `insight` | 推断、分析结论、经验总结 | +| `context` | 背景信息、项目概况、约束条件 | +| `general` | 其他不适合上述分类的记忆 | + +#### importance 赋值建议 + +| 值 | 含义 | +|---|---| +| 5 | 核心决策或强烈偏好,必须长期保留 | +| 4 | 重要上下文,通常需要保留 | +| 3 | 一般记忆(默认值) | +| 2 | 次要细节,可能在自动剪枝时被移除 | +| 1 | 临时或低价值信息 | + +### edges 条目字段 + +| 字段 | 类型 | 必填 | 约束 | 说明 | +|---|---|---|---|---| +| `source_index` | integer | **是** | insights 数组的零基索引 | 边的起点 | +| `target_index` | integer | **是** | 不能等于 `source_index` | 边的终点 | +| `edge_type` | string | **是** | 见下表 | 关系类型 | +| `weight` | float | 否 | 0.0–1.0,默认 0.5 | 关系强度 | +| `reason` | string | 否 | — | 说明该关系存在的原因(存入边元数据) | + +#### edge_type 可选值 + +| 值 | 含义 | +|---|---| +| `temporal` | 时间顺序关系(事件 A 发生在事件 B 之前) | +| `causal` | 因果关系(A 导致或影响 B) | +| `semantic` | 语义相似关系(A 与 B 讨论同一主题) | +| `entity` | 实体共现关系(A 与 B 涉及同一命名实体) | + +--- + +## 使用命令 + +```bash +# 基本导入 +mnemon import memory_draft.json + +# 验证格式但不写入数据库 +mnemon import --dry-run memory_draft.json + +# 跳过去重检测,将所有条目作为新记忆插入 +mnemon import --no-diff memory_draft.json + +# 指定具体 store +mnemon import --store project-alpha memory_draft.json +``` + +### 输出示例 + +```json +{ + "imported": 8, + "updated": 1, + "skipped": 2, + "errors": 0, + "edges_inserted": 3, + "auto_pruned": 0, + "results": [ + {"index": 0, "id": "a1b2c3d4...", "content": "选择了 Qdrant...", "action": "added"}, + {"index": 1, "id": "e5f6a7b8...", "content": "用户偏好简洁的...", "action": "skipped"} + ] +} +``` + +| 字段 | 说明 | +|---|---| +| `imported` | 新增的记忆数量 | +| `updated` | 替换了已有冲突记忆的数量 | +| `skipped` | 检测为重复而跳过的数量 | +| `errors` | 写入失败的数量;导入允许部分成功,脚本调用方应检查此字段是否为 0 | +| `edges_inserted` | 成功插入的显式边数量 | +| `auto_pruned` | 超出容量限制后自动删除的记忆数量 | + +--- + +## 参考提示词(用于生成 memory_draft.json) + +将以下提示词和你的聊天记录一起发送给 LLM: + +``` +你是一个记忆提取助手。请从下方的聊天记录或文档中提取有价值的知识片段, +生成一个符合 Mnemon memory draft 格式(schema_version: "1")的 JSON 文件。 + +## 提取规则 + +1. 每条 insight 必须是独立、完整的知识单元,不依赖上下文即可理解。 +2. 去除闲聊、重复表述和无实质内容的对话。 +3. 如果同一主题多次出现,合并为一条最完整的表述,不要重复。 +4. 按以下优先级分配 importance: + - 5:关键架构决策、明确的用户核心偏好 + - 4:重要上下文、反复出现的模式 + - 3:一般事实和背景信息 + - 2:细节或一次性提及的内容 + - 1:临时状态或极低价值信息 +5. entities 填写记忆中出现的具体名词:人名、项目名、工具/库名、组织名。 +6. tags 使用小写英文,用连字符分隔词语(如 "vector-db"、"api-design")。 +7. 如果能从上下文推断出原始时间,在 created_at 中填写 RFC 3339 格式的时间。 +8. edges 只填写显而易见的强关联关系(因果、同一决策链),不要过度连接。 + +## 输出要求 + +- 只输出 JSON,不要有任何解释文字。 +- 严格遵守以下 schema: + +{ + "schema_version": "1", + "source": "chat-export", + "insights": [ + { + "content": "...", + "category": "preference|decision|fact|insight|context|general", + "importance": 1-5, + "tags": ["tag1", "tag2"], + "entities": ["Entity1", "Entity2"], + "created_at": "2024-01-15T09:30:00Z" + } + ], + "edges": [ + { + "source_index": 0, + "target_index": 1, + "edge_type": "causal|semantic|temporal|entity", + "weight": 0.0-1.0, + "reason": "..." + } + ] +} + +## 待处理内容 + +[在此粘贴你的聊天记录或文档] +``` + +--- + +## 常见问题 + +**Q: `created_at` 必须填吗?** +不必须。省略时 Mnemon 使用导入时间。如果原始聊天记录有时间戳,建议填写以保留历史顺序。 + +**Q: 导入后如何验证结果?** +运行 `mnemon log` 查看最新记忆,或用 `mnemon search <关键词>` 确认可检索。 + +**Q: `edges` 数组必须填吗?** +不必须。Mnemon 的图引擎会根据时间、语义相似度和实体共现自动创建边。显式 `edges` 用于指定自动检测难以发现的强关联关系。 + +**Q: 如何分批导入大型聊天记录?** +将聊天记录按时间段或主题切分为多个文件,依次执行 `mnemon import`。重复内容会被去重机制自动跳过。 + +**Q: 可以导入非英文内容吗?** +可以,`content`、`tags`、`entities` 均支持任意 Unicode 文本。 diff --git a/docs/USAGE.md b/docs/USAGE.md index 69e620e..306092d 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -79,6 +79,11 @@ mnemon recall "auth" --basic # Search — token-scored keyword search mnemon search "authentication" --limit 10 +# Import — bulk-import a memory draft file (see docs/IMPORT.md for schema and LLM prompt) +mnemon import memory_draft.json +mnemon import --dry-run memory_draft.json # validate without writing +mnemon import --no-diff memory_draft.json # skip deduplication + # Forget — soft-delete an insight mnemon forget ``` diff --git a/internal/graph/engine.go b/internal/graph/engine.go index 4075814..a773fa9 100644 --- a/internal/graph/engine.go +++ b/internal/graph/engine.go @@ -18,6 +18,24 @@ type Engine struct { db *store.DB embedCache EmbedCache entityMode EntityMode + options EngineOptions +} + +// TemporalMode controls whether the engine creates temporal edges. +type TemporalMode string + +const ( + // TemporalEnabled creates temporal edges using the normal real-time write path. + TemporalEnabled TemporalMode = "enabled" + // TemporalDisabled skips temporal edges. Import paths can repair historical + // temporal edges after all backdated insights are written. + TemporalDisabled TemporalMode = "disabled" +) + +// EngineOptions configures automatic edge generation. +type EngineOptions struct { + EntityMode EntityMode + TemporalMode TemporalMode } // NewEngine creates a new graph edge engine. @@ -28,7 +46,18 @@ func NewEngine(db *store.DB, embedCache EmbedCache) *Engine { // NewEngineWithEntityMode creates a graph engine with configurable entity handling. func NewEngineWithEntityMode(db *store.DB, embedCache EmbedCache, entityMode EntityMode) *Engine { - return &Engine{db: db, embedCache: embedCache, entityMode: entityMode} + return NewEngineWithOptions(db, embedCache, EngineOptions{EntityMode: entityMode}) +} + +// NewEngineWithOptions creates a graph engine with explicit generation options. +func NewEngineWithOptions(db *store.DB, embedCache EmbedCache, options EngineOptions) *Engine { + if options.EntityMode == "" { + options.EntityMode = EntityModeMerge + } + if options.TemporalMode == "" { + options.TemporalMode = TemporalEnabled + } + return &Engine{db: db, embedCache: embedCache, entityMode: options.EntityMode, options: options} } // OnInsightCreated runs all edge generators for a newly created insight. @@ -40,7 +69,9 @@ func (e *Engine) OnInsightCreated(insight *model.Insight) EdgeStats { insight.Entities = ResolveEntities(insight.Content, insight.Entities, e.entityMode) // 2. Temporal backbone + proximity edges - stats.Temporal = CreateTemporalEdge(e.db, insight) + if e.options.TemporalMode != TemporalDisabled { + stats.Temporal = CreateTemporalEdge(e.db, insight) + } // 3. Entity co-occurrence edges stats.Entity = CreateEntityEdges(e.db, insight) diff --git a/internal/importdraft/draft.go b/internal/importdraft/draft.go new file mode 100644 index 0000000..849defb --- /dev/null +++ b/internal/importdraft/draft.go @@ -0,0 +1,189 @@ +// Package importdraft defines the versioned public import schema for Mnemon +// memory draft files. The schema intentionally decouples from internal DB +// structures so both can evolve independently. +package importdraft + +import ( + "encoding/json" + "fmt" + "os" + "time" + + "github.com/mnemon-dev/mnemon/internal/model" +) + +// CurrentSchemaVersion is the only schema version this build can import. +const CurrentSchemaVersion = "1" + +// MemoryDraft is the top-level structure of a memory draft file. +type MemoryDraft struct { + // SchemaVersion must equal "1". Checked before any other field. + SchemaVersion string `json:"schema_version"` + + // Source labels where this draft came from (e.g. "chat-export", "manual"). + // Stored on every imported insight unless overridden per-insight. + Source string `json:"source,omitempty"` + + // Insights is the list of memory nodes to import. + Insights []DraftInsight `json:"insights"` + + // Edges is an optional list of explicit relationships between insights. + // Mnemon also auto-creates edges via its graph engine; these supplement + // or override auto-detected relationships. + Edges []DraftEdge `json:"edges,omitempty"` +} + +// DraftInsight represents one memory node in the import file. +type DraftInsight struct { + // Content is the text of the memory. Required; max 8000 characters. + Content string `json:"content"` + + // Category classifies the type of knowledge. + // One of: preference, decision, fact, insight, context, general. + // Defaults to "general" when omitted. + Category string `json:"category,omitempty"` + + // Importance is a 1–5 signal of how significant this memory is. + // Defaults to 3 when omitted or 0. + Importance int `json:"importance,omitempty"` + + // Tags are free-form labels (max 20, each max 100 chars). + Tags []string `json:"tags,omitempty"` + + // Entities are named subjects in the memory (people, projects, tools). + // Mnemon merges these with its own auto-extraction. + Entities []string `json:"entities,omitempty"` + + // Source overrides the top-level source field for this specific insight. + Source string `json:"source,omitempty"` + + // CreatedAt sets the original creation timestamp (RFC 3339). + // Defaults to import time when omitted. + CreatedAt string `json:"created_at,omitempty"` +} + +// DraftEdge declares an explicit relationship between two insights. +// Both source_index and target_index are zero-based indices into the insights array. +type DraftEdge struct { + // SourceIndex is the zero-based index into the insights array. + SourceIndex int `json:"source_index"` + + // TargetIndex is the zero-based index into the insights array. + TargetIndex int `json:"target_index"` + + // EdgeType is the kind of relationship. + // One of: temporal, semantic, causal, entity. + EdgeType string `json:"edge_type"` + + // Weight is the edge strength in [0.0, 1.0]. Defaults to 0.5. + Weight float64 `json:"weight,omitempty"` + + // Reason is an optional human/LLM explanation of why the edge exists. + // Stored as edge metadata. + Reason string `json:"reason,omitempty"` +} + +// Load reads and JSON-decodes a memory draft file. +func Load(path string) (*MemoryDraft, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read file: %w", err) + } + var draft MemoryDraft + if err := json.Unmarshal(data, &draft); err != nil { + return nil, fmt.Errorf("parse JSON: %w", err) + } + return &draft, nil +} + +// Validate checks that the draft is well-formed before any DB writes. +func (d *MemoryDraft) Validate() error { + if d.SchemaVersion != CurrentSchemaVersion { + return fmt.Errorf("unsupported schema_version %q (expected %q)", d.SchemaVersion, CurrentSchemaVersion) + } + if len(d.Insights) == 0 { + return fmt.Errorf("insights array is empty; nothing to import") + } + + for i, ins := range d.Insights { + if ins.Content == "" { + return fmt.Errorf("insights[%d]: content is required", i) + } + if len(ins.Content) > 8000 { + return fmt.Errorf("insights[%d]: content too long (%d chars, max 8000)", i, len(ins.Content)) + } + + cat := ins.Category + if cat == "" { + cat = string(model.CategoryGeneral) + } + if !model.ValidCategories[model.Category(cat)] { + return fmt.Errorf("insights[%d]: invalid category %q", i, cat) + } + + imp := ins.Importance + if imp == 0 { + imp = 3 + } + if imp < 1 || imp > 5 { + return fmt.Errorf("insights[%d]: importance must be 1–5, got %d", i, ins.Importance) + } + + if len(ins.Tags) > 20 { + return fmt.Errorf("insights[%d]: too many tags (%d, max 20)", i, len(ins.Tags)) + } + for j, t := range ins.Tags { + if len(t) > 100 { + return fmt.Errorf("insights[%d]: tag[%d] too long (%d chars, max 100)", i, j, len(t)) + } + } + + if len(ins.Entities) > 50 { + return fmt.Errorf("insights[%d]: too many entities (%d, max 50)", i, len(ins.Entities)) + } + for j, e := range ins.Entities { + if len(e) > 200 { + return fmt.Errorf("insights[%d]: entity[%d] too long (%d chars, max 200)", i, j, len(e)) + } + } + + if ins.CreatedAt != "" { + if _, err := time.Parse(time.RFC3339, ins.CreatedAt); err != nil { + return fmt.Errorf("insights[%d]: created_at %q is not RFC 3339 (e.g. 2024-01-15T09:30:00Z)", i, ins.CreatedAt) + } + } + } + + n := len(d.Insights) + for i, edge := range d.Edges { + if edge.SourceIndex < 0 || edge.SourceIndex >= n { + return fmt.Errorf("edges[%d]: source_index %d out of range [0,%d)", i, edge.SourceIndex, n) + } + if edge.TargetIndex < 0 || edge.TargetIndex >= n { + return fmt.Errorf("edges[%d]: target_index %d out of range [0,%d)", i, edge.TargetIndex, n) + } + if edge.SourceIndex == edge.TargetIndex { + return fmt.Errorf("edges[%d]: source_index and target_index must differ", i) + } + if !model.ValidEdgeTypes[model.EdgeType(edge.EdgeType)] { + return fmt.Errorf("edges[%d]: invalid edge_type %q (valid: temporal, semantic, causal, entity)", i, edge.EdgeType) + } + if edge.Weight < 0 || edge.Weight > 1.0 { + return fmt.Errorf("edges[%d]: weight %g out of range [0.0, 1.0]", i, edge.Weight) + } + } + + return nil +} + +// ResolvedSource returns the source to use for a given insight index, falling +// back to the top-level source, then to "import". +func (d *MemoryDraft) ResolvedSource(idx int) string { + if d.Insights[idx].Source != "" { + return d.Insights[idx].Source + } + if d.Source != "" { + return d.Source + } + return "import" +} diff --git a/internal/importdraft/draft_test.go b/internal/importdraft/draft_test.go new file mode 100644 index 0000000..fde9252 --- /dev/null +++ b/internal/importdraft/draft_test.go @@ -0,0 +1,115 @@ +package importdraft + +import ( + "strings" + "testing" +) + +func TestValidateAcceptsMinimalDraft(t *testing.T) { + draft := &MemoryDraft{ + SchemaVersion: CurrentSchemaVersion, + Insights: []DraftInsight{ + {Content: "User prefers concise answers."}, + }, + } + + if err := draft.Validate(); err != nil { + t.Fatalf("Validate() error = %v", err) + } +} + +func TestValidateRejectsInvalidInsightFields(t *testing.T) { + cases := []struct { + name string + draft *MemoryDraft + want string + }{ + { + name: "missing content", + draft: &MemoryDraft{ + SchemaVersion: CurrentSchemaVersion, + Insights: []DraftInsight{{Category: "fact"}}, + }, + want: "content is required", + }, + { + name: "invalid category", + draft: &MemoryDraft{ + SchemaVersion: CurrentSchemaVersion, + Insights: []DraftInsight{{Content: "x", Category: "note"}}, + }, + want: "invalid category", + }, + { + name: "invalid importance", + draft: &MemoryDraft{ + SchemaVersion: CurrentSchemaVersion, + Insights: []DraftInsight{{Content: "x", Importance: 6}}, + }, + want: "importance", + }, + { + name: "invalid created_at", + draft: &MemoryDraft{ + SchemaVersion: CurrentSchemaVersion, + Insights: []DraftInsight{{Content: "x", CreatedAt: "2024-01-01"}}, + }, + want: "RFC 3339", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + err := tc.draft.Validate() + if err == nil { + t.Fatal("Validate() error = nil") + } + if !strings.Contains(err.Error(), tc.want) { + t.Fatalf("Validate() error = %v, want substring %q", err, tc.want) + } + }) + } +} + +func TestValidateRejectsInvalidEdges(t *testing.T) { + draft := &MemoryDraft{ + SchemaVersion: CurrentSchemaVersion, + Insights: []DraftInsight{ + {Content: "first"}, + {Content: "second"}, + }, + Edges: []DraftEdge{ + {SourceIndex: 0, TargetIndex: 2, EdgeType: "semantic"}, + }, + } + + err := draft.Validate() + if err == nil { + t.Fatal("Validate() error = nil") + } + if !strings.Contains(err.Error(), "target_index") { + t.Fatalf("Validate() error = %v, want target_index", err) + } +} + +func TestResolvedSourceFallsBackFromInsightToDraftToImport(t *testing.T) { + draft := &MemoryDraft{ + Source: "chat-export", + Insights: []DraftInsight{ + {Content: "one", Source: "manual"}, + {Content: "two"}, + }, + } + + if got := draft.ResolvedSource(0); got != "manual" { + t.Fatalf("ResolvedSource(0) = %q, want manual", got) + } + if got := draft.ResolvedSource(1); got != "chat-export" { + t.Fatalf("ResolvedSource(1) = %q, want chat-export", got) + } + + draft.Source = "" + if got := draft.ResolvedSource(1); got != "import" { + t.Fatalf("ResolvedSource(1) = %q, want import", got) + } +} diff --git a/internal/store/edge.go b/internal/store/edge.go index 8050dc2..b1627e4 100644 --- a/internal/store/edge.go +++ b/internal/store/edge.go @@ -98,6 +98,14 @@ func (db *DB) GetAllEdges() ([]*model.Edge, error) { return scanEdges(rows) } +// DeleteEdge removes one typed edge between two nodes. +func (db *DB) DeleteEdge(sourceID string, targetID string, edgeType model.EdgeType) error { + _, err := db.execer().Exec( + `DELETE FROM edges WHERE source_id = ? AND target_id = ? AND edge_type = ?`, + sourceID, targetID, string(edgeType)) + return err +} + // DeleteEdgesByNode removes all edges referencing a node. func (db *DB) DeleteEdgesByNode(nodeID string) error { _, err := db.execer().Exec( diff --git a/internal/store/node.go b/internal/store/node.go index 5447430..02c8d41 100644 --- a/internal/store/node.go +++ b/internal/store/node.go @@ -508,6 +508,19 @@ func (db *DB) GetRecentInsightsBySource(source string, excludeID string, limit i return scanInsights(rows) } +// GetActiveInsightsBySourceOrdered returns active insights for a source in chronological order. +func (db *DB) GetActiveInsightsBySourceOrdered(source string) ([]*model.Insight, error) { + rows, err := db.execer().Query( + `SELECT id, content, category, importance, tags, entities, source, access_count, created_at, updated_at, deleted_at + FROM insights WHERE source = ? AND deleted_at IS NULL + ORDER BY created_at ASC, rowid ASC`, source) + if err != nil { + return nil, err + } + defer rows.Close() + return scanInsights(rows) +} + // GetAllActiveInsights returns all non-deleted insights. func (db *DB) GetAllActiveInsights() ([]*model.Insight, error) { rows, err := db.execer().Query(