from __future__ import annotations

from typing import List


def chunk_text(text: str, max_chars: int = 3500, overlap: int = 300) -> List[str]:
    """Split *text* into overlapping chunks of at most ``max_chars`` characters.

    The text is first normalised: leading/trailing whitespace is removed and
    trailing whitespace is stripped from every line. If the normalised text
    fits in ``max_chars`` it is returned as a single chunk.

    Otherwise the text is walked left to right. Each window of ``max_chars``
    characters is cut at the last paragraph break (``"\\n\\n"``) or sentence
    end (``". "``) inside the window, provided that break lies past the
    midpoint of the window; if no such break exists the window is cut hard at
    ``max_chars``. The next window starts ``overlap`` characters before the
    end of the previous chunk.

    Args:
        text: The raw text to split.
        max_chars: Maximum length of a chunk. Must be positive whenever
            *text* contains non-whitespace content.
        overlap: Number of characters shared between consecutive chunks.
            Negative values are treated as ``0``. If the overlap is at least
            as long as the chunk just produced, it is ignored for that step
            so the scan always moves forward.

    Returns:
        The non-empty, whitespace-stripped chunks in document order. An empty
        list is returned when *text* is empty or whitespace only.

    Raises:
        ValueError: If ``max_chars`` is not positive and *text* is not blank.
    """
    cleaned = "\n".join(line.rstrip() for line in text.strip().splitlines()).strip()
    if not cleaned:
        return []
    # Validated after the blank-text shortcut so that blank input keeps
    # returning [] regardless of the other arguments.
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")

    total = len(cleaned)
    if total <= max_chars:
        return [cleaned]

    # A negative overlap would make the scan jump past `end` and silently
    # drop the skipped characters.
    overlap = max(overlap, 0)
    half_window = max_chars // 2

    chunks: List[str] = []
    start = 0
    while start < total:
        end = min(start + max_chars, total)
        if end < total:
            # Prefer a natural break, but only in the second half of the
            # window so chunks do not become too short.
            boundary = max(
                cleaned.rfind("\n\n", start, end),
                cleaned.rfind(". ", start, end),
            )
            if boundary > start + half_window:
                end = boundary + 1

        piece = cleaned[start:end].strip()
        if piece:
            chunks.append(piece)
        if end >= total:
            break

        next_start = end - overlap
        if next_start <= start:
            # The overlap covers the whole chunk just emitted; stepping back
            # by it would repeat the same window forever, so continue from
            # the end of this chunk instead.
            next_start = end
        start = next_start

    return chunks