Add semantic-index service, deployment assets, and tests

2026-05-04 09:50:03 -04:00
parent faad70872b
commit b305544f63
42 changed files with 5059 additions and 0 deletions
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+import re
+from typing import Any, Dict, List, Optional, Protocol
+
+from .models import SearchQuery, SearchResult
+
+
+class QueryEmbedder(Protocol):
+    """Structural type for objects that can embed a query string."""
+
+    def embed_query(self, text: str) -> List[float]:
+        """Map ``text`` to a list of floats (the vector handed to the search store)."""
+
+
+class SearchStore(Protocol):
+    """Structural type for the storage backend used by the search service."""
+
+    def search(self, vector: List[float], query: SearchQuery, limit: int) -> List[SearchResult]:
+        """Find results for ``vector``.
+
+        ``query`` is the originating request and ``limit`` the requested
+        number of results.
+        """
+
+    def get_document(self, document_id: str) -> Optional[Dict[str, Any]]:
+        """Fetch one document by id; the return type allows ``None``."""
+
+
+class HybridSearchService:
+    """Vector search followed by a keyword-based re-ranking pass.
+
+    The query text is embedded, candidates are fetched from the store, and
+    every candidate's score is increased by ``keyword_boost`` before the
+    results are ordered.
+    """
+
+    def __init__(self, embedder: QueryEmbedder, store: SearchStore) -> None:
+        """Keep references to the embedder and the store used for every request."""
+        self.embedder = embedder
+        self.store = store
+
+    def search(self, query: SearchQuery) -> List[SearchResult]:
+        """Return at most ``query.limit`` results, highest boosted score first."""
+        query_vector = self.embedder.embed_query(query.text)
+        # NOTE(review): candidates are requested with the same limit as the
+        # final result size, so the boost can only reorder what the store
+        # already returned -- confirm that this is intended.
+        boosted: List[SearchResult] = []
+        for hit in self.store.search(query_vector, query, limit=query.limit):
+            boosted.append(
+                SearchResult(
+                    id=hit.id,
+                    score=hit.score + keyword_boost(query.text, hit),
+                    text=hit.text,
+                    payload=hit.payload,
+                )
+            )
+        # list.sort is stable, so ties keep the order the store returned them in.
+        boosted.sort(key=lambda hit: hit.score, reverse=True)
+        return boosted[: query.limit]
+
+    def get_document(self, document_id: str) -> Optional[Dict[str, Any]]:
+        """Delegate the document lookup to the store unchanged."""
+        return self.store.get_document(document_id)
+
+
+# Patterns are compiled once at import time; keyword_boost runs once per candidate.
+_PHRASE_RE = re.compile(r'"([^"]+)"')
+_EMAIL_RE = re.compile(r"[\w.+-]+@[\w.-]+\.[A-Za-z]{2,}")
+_IDENTIFIER_RE = re.compile(r"\b(?:#?\d{2,}|[A-Z]{2,}[-_]\d{2,}|[A-Z0-9]{4,}-[A-Z0-9-]{2,})\b")
+_WORD_RE = re.compile(r"\b[A-Za-z][\w.-]{2,}\b")
+
+
+def keyword_boost(query_text: str, result: SearchResult) -> float:
+    """Return an additive score bonus for literal query terms found in ``result``.
+
+    The result's text and all non-``None`` payload values are joined into one
+    lower-cased haystack, and every term extracted from ``query_text`` is
+    looked up in it as a case-insensitive substring:
+
+    * double-quoted phrase: +0.35 each
+    * e-mail address: +0.3 each
+    * identifier-like token (2+ digit number, ``AB-12``, ``ABCD-XY``): +0.25 each
+    * word of 3+ characters starting with a letter: +0.03 each
+
+    A term can match more than one category (an e-mail's parts are also
+    words), and the bonuses simply add up.
+
+    Args:
+        query_text: Raw query string; an empty or ``None`` query yields 0.0.
+        result: Object exposing ``text`` and ``payload``; either may be ``None``.
+
+    Returns:
+        The total boost, 0.0 when nothing matches.
+    """
+    if not query_text:
+        return 0.0
+
+    # A missing text or payload counts as empty content rather than raising.
+    text = result.text or ""
+    payload = result.payload or {}
+    payload_text = " ".join(str(value) for value in payload.values() if value is not None)
+    haystack = f"{text} {payload_text}".lower()
+
+    boost = 0.0
+    for phrase in _PHRASE_RE.findall(query_text):
+        if phrase.lower() in haystack:
+            boost += 0.35
+    for email in _EMAIL_RE.findall(query_text):
+        if email.lower() in haystack:
+            boost += 0.3
+    for token in _IDENTIFIER_RE.findall(query_text):
+        # The "#"-stripped form is a substring of the raw token, so checking
+        # it alone also covers haystacks that contain the "#"-prefixed form.
+        if token.lower().lstrip("#") in haystack:
+            boost += 0.25
+    for word in _WORD_RE.findall(query_text):
+        if word.lower() in haystack:
+            boost += 0.03
+    return boost