Add semantic-index service, deployment assets, and tests
This commit is contained in:
@@ -0,0 +1,25 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import List
|
||||
|
||||
|
||||
def chunk_text(text: str, max_chars: int = 3500, overlap: int = 300) -> List[str]:
    """Split *text* into overlapping chunks of at most ``max_chars`` characters.

    The text is normalised first: trailing whitespace is removed from every
    line and the whole text is stripped. Text that fits in one chunk is
    returned as a single-element list. Longer text is cut into windows of
    ``max_chars`` characters; when a paragraph break (``"\\n\\n"``) or a
    sentence end (``". "``) lies in the second half of a window, the window is
    cut there instead of mid-sentence. Each following window starts
    ``overlap`` characters before the end of the previous one.

    Args:
        text: Raw text to split.
        max_chars: Maximum length of a chunk, in characters. Must be positive.
        overlap: Number of characters shared between consecutive chunks. Must
            not be negative. If the overlap is as large as the chunk that was
            just emitted, it is dropped for that step so that chunking always
            advances.

    Returns:
        The non-empty, whitespace-stripped chunks in document order. An empty
        list when *text* contains nothing but whitespace.

    Raises:
        ValueError: If ``max_chars`` is not positive or ``overlap`` is negative.
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")
    if overlap < 0:
        raise ValueError(f"overlap must be non-negative, got {overlap}")

    cleaned = "\n".join(line.rstrip() for line in text.strip().splitlines()).strip()
    if not cleaned:
        return []
    if len(cleaned) <= max_chars:
        return [cleaned]

    total = len(cleaned)
    # A natural boundary is only honoured past the window's midpoint, so a
    # chunk is never cut down to a tiny fragment.
    min_boundary_offset = int(max_chars * 0.5)

    chunks: List[str] = []
    start = 0
    while start < total:
        end = min(start + max_chars, total)
        if end < total:
            boundary = max(
                cleaned.rfind("\n\n", start, end),
                cleaned.rfind(". ", start, end),
            )
            if boundary > start + min_boundary_offset:
                # Keep the full stop (or the first newline, stripped below).
                end = boundary + 1
        chunks.append(cleaned[start:end].strip())
        if end >= total:
            break
        # Step back by the overlap, but never to or before the current start:
        # that would repeat the same window forever.
        next_start = end - overlap
        start = next_start if next_start > start else end
    return [chunk for chunk in chunks if chunk]
|
||||
Reference in New Issue
Block a user