fix(chunker): MAX_CHARS 6000 → 4000 for table-dense content #6

Merged
justin merged 1 commit from fix/chunk-cap-table-density into main 2026-05-22 15:11:23 -04:00
+6 -2
View File
@@ -32,8 +32,12 @@ CHARS_PER_TOKEN = 4
 TARGET_TOKENS = 500
 TARGET_CHARS = TARGET_TOKENS * CHARS_PER_TOKEN
 # Hard cap: nomic-embed-text's context is 2048 tokens. Anything larger
-# 400s the entire embed batch. 6000 chars ≈ 1500 tokens leaves headroom.
-MAX_CHARS = 6000
+# 400s the entire embed batch. 6000 chars works for prose but markdown
+# tables with lots of `|` separators tokenize ~1.4× denser; a 5839-char
+# table chunk from the HVM qualification matrix tokenized past 2048 and
+# crashed the rebuild. 4000 chars stays under 2048 tokens even for
+# dense table content while leaving headroom for the query side.
+MAX_CHARS = 4000
 
 
 def _hard_split(text: str) -> list[str]: