Add semantic-index service, deployment assets, and tests
This commit is contained in:
@@ -0,0 +1,61 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, Protocol
|
||||
|
||||
from .models import SearchQuery, SearchResult
|
||||
|
||||
|
||||
class QueryEmbedder(Protocol):
|
||||
def embed_query(self, text: str) -> List[float]:
|
||||
...
|
||||
|
||||
|
||||
class SearchStore(Protocol):
|
||||
def search(self, vector: List[float], query: SearchQuery, limit: int) -> List[SearchResult]:
|
||||
...
|
||||
|
||||
def get_document(self, document_id: str) -> Optional[Dict[str, Any]]:
|
||||
...
|
||||
|
||||
|
||||
class HybridSearchService:
|
||||
def __init__(self, embedder: QueryEmbedder, store: SearchStore) -> None:
|
||||
self.embedder = embedder
|
||||
self.store = store
|
||||
|
||||
def search(self, query: SearchQuery) -> List[SearchResult]:
|
||||
vector = self.embedder.embed_query(query.text)
|
||||
candidates = self.store.search(vector, query, limit=query.limit)
|
||||
rescored = [
|
||||
SearchResult(
|
||||
id=result.id,
|
||||
score=result.score + keyword_boost(query.text, result),
|
||||
text=result.text,
|
||||
payload=result.payload,
|
||||
)
|
||||
for result in candidates
|
||||
]
|
||||
return sorted(rescored, key=lambda result: result.score, reverse=True)[: query.limit]
|
||||
|
||||
def get_document(self, document_id: str) -> Optional[Dict[str, Any]]:
|
||||
return self.store.get_document(document_id)
|
||||
|
||||
|
||||
def keyword_boost(query_text: str, result: SearchResult) -> float:
|
||||
haystack = " ".join([result.text, " ".join(str(value) for value in result.payload.values() if value is not None)]).lower()
|
||||
boost = 0.0
|
||||
for phrase in re.findall(r'"([^"]+)"', query_text):
|
||||
if phrase.lower() in haystack:
|
||||
boost += 0.35
|
||||
for email in re.findall(r"[\w.+-]+@[\w.-]+\.[A-Za-z]{2,}", query_text):
|
||||
if email.lower() in haystack:
|
||||
boost += 0.3
|
||||
for token in re.findall(r"\b(?:#?\d{2,}|[A-Z]{2,}[-_]\d{2,}|[A-Z0-9]{4,}-[A-Z0-9-]{2,})\b", query_text):
|
||||
normalized = token.lower().lstrip("#")
|
||||
if token.lower() in haystack or normalized in haystack:
|
||||
boost += 0.25
|
||||
for word in re.findall(r"\b[A-Za-z][\w.-]{2,}\b", query_text):
|
||||
if word.lower() in haystack:
|
||||
boost += 0.03
|
||||
return boost
|
||||
Reference in New Issue
Block a user