Skip to content

Commit 2c8cbb4

Browse files
committed
fix: use provider-specific token estimation for manual chunks
Match the document-processing heuristic: Ollama estimates tokens as length/3, while OpenAI uses estimateTokenCount
1 parent 5638d3a commit 2c8cbb4

File tree

1 file changed

+6
-2
lines changed

1 file changed

+6
-2
lines changed

apps/sim/lib/knowledge/chunks/service.ts

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,9 @@ export async function createChunk(
185185
kbOllamaBaseUrl
186186
)
187187

188-
const tokenCount = estimateTokenCount(chunkData.content, 'openai')
188+
const tokenCount = isOllama
189+
? { count: Math.ceil(chunkData.content.length / 3) }
190+
: estimateTokenCount(chunkData.content, 'openai')
189191
const chunkId = generateId()
190192
const now = new Date()
191193

@@ -549,7 +551,9 @@ export async function updateChunk(
549551
kbOllamaBaseUrl
550552
)
551553
newEmbedding = embeddings[0]
552-
newTokenCount = estimateTokenCount(content, 'openai').count
554+
newTokenCount = isOllama
555+
? Math.ceil(content.length / 3)
556+
: estimateTokenCount(content, 'openai').count
553557
}
554558

555559
const newHash = createHash('sha256').update(content).digest('hex')

0 commit comments

Comments (0)