From e07df7a1ae198c7b45478a25434ced1c1ff7513c Mon Sep 17 00:00:00 2001 From: Justin Paul Date: Fri, 22 May 2026 15:11:23 -0400 Subject: [PATCH] =?UTF-8?q?fix(chunker):=20MAX=5FCHARS=206000=20=E2=86=92?= =?UTF-8?q?=204000=20for=20table-dense=20content=20(#6)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- rag/chunk.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/rag/chunk.py b/rag/chunk.py index 81ef39c..c937c1f 100644 --- a/rag/chunk.py +++ b/rag/chunk.py @@ -32,8 +32,12 @@ CHARS_PER_TOKEN = 4 TARGET_TOKENS = 500 TARGET_CHARS = TARGET_TOKENS * CHARS_PER_TOKEN # Hard cap: nomic-embed-text's context is 2048 tokens. Anything larger -# 400s the entire embed batch. 6000 chars ≈ 1500 tokens leaves headroom. -MAX_CHARS = 6000 +# 400s the entire embed batch. 6000 chars works for prose but markdown +# tables with lots of `|` separators tokenize ~1.4× denser; a 5839-char +# table chunk from the HVM qualification matrix tokenized past 2048 and +# crashed the rebuild. 4000 chars stays under 2048 tokens even for +# dense table content while leaving headroom for the query side. +MAX_CHARS = 4000 def _hard_split(text: str) -> list[str]: