Add semantic-index service, deployment assets, and tests

This commit is contained in:
Jason Thistlethwaite
2026-05-04 09:50:03 -04:00
parent faad70872b
commit b305544f63
42 changed files with 5059 additions and 0 deletions
+225
View File
@@ -0,0 +1,225 @@
from __future__ import annotations
import json
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Protocol, Sequence
from .ingest import deduplicate_documents
from .models import IndexDocument
from .redmine import RedmineMapper
class RedmineRefreshSource(Protocol):
project_identifier: str | None
def recent_helpdesk_issues(self, limit: int) -> Iterable[Dict[str, Any]]:
...
class RefreshEmbedder(Protocol):
def embed_documents(self, docs: Sequence[IndexDocument]) -> List[List[float]]:
...
class RefreshStore(Protocol):
def list_documents(
self,
limit: int = 10,
source: Optional[str] = None,
project_identifier: Optional[str] = None,
doc_type: Optional[str] = None,
issue_id: Optional[int] = None,
) -> List[Dict[str, Any]]:
...
def upsert(self, docs: Sequence[IndexDocument], vectors: Sequence[Sequence[float]]) -> None:
...
def delete_documents(self, document_ids: Sequence[str]) -> None:
...
class FileRefreshState:
def __init__(self, path: Path) -> None:
self.path = path
def load(self) -> Dict[str, Any]:
if not self.path.exists():
return {}
return json.loads(self.path.read_text(encoding="utf-8"))
def mark_success(self, project_identifier: str, timestamp: Optional[str] = None) -> None:
payload = self.load()
payload.setdefault("projects", {})
payload["projects"][project_identifier] = {
"last_successful_refresh_at": timestamp or datetime.now(timezone.utc).isoformat()
}
self.path.parent.mkdir(parents=True, exist_ok=True)
self.path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")
class RedmineRefreshService:
def __init__(
self,
source: RedmineRefreshSource,
embedder: RefreshEmbedder,
store: RefreshStore,
mapper: Optional[RedmineMapper] = None,
state: Optional[FileRefreshState] = None,
) -> None:
self.source = source
self.embedder = embedder
self.store = store
self.mapper = mapper or RedmineMapper(redmine_url="")
self.state = state
def refresh_redmine_project_limits(
self,
project_limits: Dict[str, int],
dry_run: bool = False,
force_rebuild: bool = False,
overlap_minutes: int = 15,
) -> Dict[str, Any]:
previous_source_project = getattr(self.source, "project_identifier", None)
previous_mapper_project = getattr(self.mapper, "project_identifier", None)
project_results: List[Dict[str, Any]] = []
totals = {
"issues": 0,
"scanned_issues": 0,
"detail_fetched_issues": 0,
"skipped_issues": 0,
"documents": 0,
"unchanged_documents": 0,
"changed_documents": 0,
"new_documents": 0,
"stale_documents": 0,
"force_rebuilt_documents": 0,
"would_embed_documents": 0,
"embedded_documents": 0,
}
try:
for project, limit in project_limits.items():
if hasattr(self.source, "project_identifier"):
self.source.project_identifier = project
if hasattr(self.mapper, "project_identifier"):
self.mapper.project_identifier = project
project_result = self._refresh_project(project, limit, dry_run, force_rebuild, overlap_minutes)
project_results.append(project_result)
for key in totals:
totals[key] += int(project_result.get(key, 0))
if not dry_run and self.state is not None:
self.state.mark_success(project)
finally:
if hasattr(self.source, "project_identifier"):
self.source.project_identifier = previous_source_project
if hasattr(self.mapper, "project_identifier"):
self.mapper.project_identifier = previous_mapper_project
return {
"source": "redmine",
"projects": len(project_limits),
"dry_run": dry_run,
"force_rebuild": force_rebuild,
"overlap_minutes": overlap_minutes,
**totals,
"project_results": project_results,
}
def _refresh_project(self, project: str, limit: int, dry_run: bool, force_rebuild: bool, overlap_minutes: int) -> Dict[str, Any]:
summaries = list(self._recent_issue_summaries(limit))
result: Dict[str, Any] = {
"project_identifier": project,
"issues": len(summaries),
"scanned_issues": len(summaries),
"detail_fetched_issues": 0,
"skipped_issues": 0,
"documents": 0,
"unchanged_documents": 0,
"changed_documents": 0,
"new_documents": 0,
"stale_documents": 0,
"force_rebuilt_documents": 0,
"would_embed_documents": 0,
"embedded_documents": 0,
}
cutoff = self._cutoff_for_project(project, overlap_minutes)
docs_to_embed: List[IndexDocument] = []
stale_ids: List[str] = []
for summary in summaries:
if cutoff is not None and not force_rebuild and not self._issue_is_in_refresh_window(summary, cutoff):
result["skipped_issues"] += 1
continue
issue = self._issue_detail(summary)
result["detail_fetched_issues"] += 1
candidates = deduplicate_documents(self.mapper.issue_to_documents(issue))
result["documents"] += len(candidates)
existing = self.store.list_documents(
limit=5000,
source="redmine",
project_identifier=project,
issue_id=int(issue["id"]),
)
existing_by_id = {document["id"]: document for document in existing}
candidate_by_id = {document.id: document for document in candidates}
for stale_id in sorted(set(existing_by_id) - set(candidate_by_id)):
stale_ids.append(stale_id)
result["stale_documents"] += 1
for document in candidates:
existing_document = existing_by_id.get(document.id)
if existing_document is None:
result["new_documents"] += 1
docs_to_embed.append(document)
continue
existing_hash = (existing_document.get("payload") or {}).get("source_hash")
document_hash = document.payload.get("source_hash")
if force_rebuild:
result["force_rebuilt_documents"] += 1
docs_to_embed.append(document)
elif existing_hash != document_hash:
result["changed_documents"] += 1
docs_to_embed.append(document)
else:
result["unchanged_documents"] += 1
result["would_embed_documents"] = len(docs_to_embed)
if dry_run:
return result
if stale_ids:
self.store.delete_documents(stale_ids)
if docs_to_embed:
vectors = self.embedder.embed_documents(docs_to_embed)
self.store.upsert(docs_to_embed, vectors)
result["embedded_documents"] = len(docs_to_embed)
return result
def _recent_issue_summaries(self, limit: int) -> Iterable[Dict[str, Any]]:
if hasattr(self.source, "recent_issue_summaries"):
return self.source.recent_issue_summaries(limit) # type: ignore[attr-defined]
return self.source.recent_helpdesk_issues(limit)
def _issue_detail(self, summary: Dict[str, Any]) -> Dict[str, Any]:
if hasattr(self.source, "issue_detail"):
return self.source.issue_detail(int(summary["id"])) # type: ignore[attr-defined]
return summary
def _cutoff_for_project(self, project: str, overlap_minutes: int) -> Optional[datetime]:
if self.state is None:
return None
timestamp = ((self.state.load().get("projects") or {}).get(project) or {}).get("last_successful_refresh_at")
if not timestamp:
return None
parsed = parse_redmine_datetime(timestamp)
return parsed - timedelta(minutes=overlap_minutes)
def _issue_is_in_refresh_window(self, issue: Dict[str, Any], cutoff: datetime) -> bool:
updated_on = issue.get("updated_on")
if not updated_on:
return True
return parse_redmine_datetime(str(updated_on)) >= cutoff
def parse_redmine_datetime(raw: str) -> datetime:
normalized = raw.replace("Z", "+00:00")
parsed = datetime.fromisoformat(normalized)
if parsed.tzinfo is None:
return parsed.replace(tzinfo=timezone.utc)
return parsed.astimezone(timezone.utc)