diff --git a/deploy/semantic-index/install.sh b/deploy/semantic-index/install.sh new file mode 100755 index 0000000..9bccaaf --- /dev/null +++ b/deploy/semantic-index/install.sh @@ -0,0 +1,183 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat >&2 <<'EOF' +Usage: + deploy/semantic-index/install.sh [--dry-run] [--apply] [--start] [--no-system] [--skip-deps] + +Modes: + --dry-run Print commands that would run. This is the default. + --apply Install files, venv, dependencies, env template, and systemd units. + --start With --apply, reload systemd and start only semantic-index.service. + --no-system Skip sudo/systemd operations. Useful for tests and local validation. + --skip-deps Skip venv creation and dependency install. + +The installer never runs backfill, never enables the refresh timer, and never +passes --force-rebuild. +EOF +} + +mode=dry-run +start_service=0 +system_ops=1 +skip_deps=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --dry-run) + mode=dry-run + shift + ;; + --apply) + mode=apply + shift + ;; + --start) + start_service=1 + shift + ;; + --no-system) + system_ops=0 + shift + ;; + --skip-deps) + skip_deps=1 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + usage + exit 2 + ;; + esac +done + +if [[ "$start_service" -eq 1 && "$mode" != "apply" ]]; then + echo "--start requires --apply" >&2 + exit 2 +fi + +repo_root=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
# Copy the example env file to $env_file unless one already exists.
#
# - In dry-run mode only a message is printed; nothing is touched.
# - An existing env file is never overwritten.
# - Both the system and --no-system paths create the file with mode 0640 so
#   the file that later holds API keys is not left at the umask default.
install_env_template() {
  if [[ "$mode" == "dry-run" ]]; then
    echo "would copy env template only if missing: $env_file"
    return
  fi
  if [[ -e "$env_file" ]]; then
    echo "keeping existing $env_file"
    return
  fi
  local template="$repo_root/deploy/semantic-index/semantic-index.env.example"
  if [[ "$system_ops" -eq 0 ]]; then
    # --no-system: no sudo, but keep the same restrictive mode as the sudo path.
    mkdir -p "$(dirname "$env_file")"
    install -m 0640 "$template" "$env_file"
  else
    sudo install -m 0640 "$template" "$env_file"
  fi
}
+ +Search response shape is shared by HTTP, MCP, and the Python client: + +```json +{ + "query": "candidate follow up", + "filters": {"project_identifier": "hiring", "limit": 5}, + "results": [ + { + "id": "redmine:issue:123:chunk:0", + "score": 0.72, + "snippet": "Candidate follow up...", + "payload": {}, + "citation": { + "source": "redmine", + "doc_type": "issue", + "issue_id": 123, + "project_identifier": "hiring", + "url": "http://redmine/issues/123" + } + } + ] +} +``` + +HTTP examples: + +```sh +curl -sS -H "Authorization: Bearer $SEMANTIC_INDEX_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"query":"candidate follow up","project_identifier":"hiring","limit":5}' \ + http://127.0.0.1:8787/search + +curl -sS -H "Authorization: Bearer $SEMANTIC_INDEX_API_KEY" \ + http://127.0.0.1:8787/projects +``` + +## Python Client + +Use the client in-process when running from this repo/environment: + +```python +from semantic_index.client import SemanticIndexClient + +client = SemanticIndexClient.local() +results = client.search("callum@safetagtracking.com", project_identifier="customer-service", limit=5) +document = client.get_document(results["results"][0]["id"]) +``` + +Use HTTP mode from another local program: + +```python +from semantic_index.client import SemanticIndexClient + +client = SemanticIndexClient(base_url="http://127.0.0.1:8787", api_key="...") +results = client.search("candidate follow up", project_identifier="hiring", limit=5) +``` + +## Backfill + +Refresh the configured Redmine sample from the command line: + +```sh +python3 -m semantic_index --backfill-redmine-sample --limit 50 +``` + +When `REDMINE_PROJECT_IDENTIFIER` is set, the rebuild deletes and replaces only +indexed Redmine documents for that project. Without a project identifier, it +rebuilds the Redmine source sample for the collection. 
+ +Refresh a balanced multi-project sample: + +```sh +python3 -m semantic_index --backfill-redmine-projects \ + --projects customer-service,hiring,todo-jason,sales-inbox,business-development,dock-scheduling,prep-standardization \ + --per-project-limit 100 +``` + +Use project-specific limits when Customer Service should stay larger than the +internal project sample: + +```sh +python3 -m semantic_index --backfill-redmine-projects \ + --project-limits customer-service=500,hiring=200,todo-jason=200,sales-inbox=100,business-development=100,dock-scheduling=100,prep-standardization=100 +``` + +Multi-project backfill rebuilds each project scope independently. Non-Helpdesk +projects are indexed as ordinary Redmine issues and journals; they are not +expected to have Helpdesk contact metadata. + +## Rolling Refresh + +Use rolling refresh for routine updates after an initial backfill: + +```sh +python3 -m semantic_index --refresh-redmine-projects \ + --project-limits customer-service=500,hiring=200,todo-jason=200,sales-inbox=100,business-development=100,dock-scheduling=100,prep-standardization=100 \ + --dry-run +``` + +Dry-run reports what would change without calling OpenAI or writing to Qdrant. +Remove `--dry-run` to apply the refresh. + +The refresh maps each recent Redmine issue to stable document IDs, reads the +existing Qdrant payloads for that issue, and compares `source_hash` values. +Only new or changed documents are embedded and upserted. Unchanged documents +are left alone, and stale documents for refreshed issues are deleted without +embedding. Use `--force-rebuild` only when you explicitly want to re-embed +matching documents. + +The default local state file is `.cache/semantic_index/refresh_state.json`. +After a successful refresh, later runs skip issues older than the previous +success timestamp minus `--overlap-minutes` unless `--force-rebuild` is used. 
+Override it with: + +```sh +python3 -m semantic_index --refresh-redmine-projects \ + --project-limits customer-service=500 \ + --state-path /tmp/semantic-refresh-state.json +``` + +The HTTP endpoint exposes the same behavior: + +```sh +curl -sS -X POST http://127.0.0.1:8787/sources/redmine/refresh \ + -H 'Content-Type: application/json' \ + -d '{"project_limits":{"customer-service":500},"dry_run":true}' +``` + +For production-style operation, use the wrapper script. It defaults to dry-run +and writes timestamped logs under `.cache/semantic_index/logs`: + +```sh +semantic_index/refresh.sh +semantic_index/refresh.sh --apply +``` + +For a quick smoke check of the wrapper path: + +```sh +SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=5' semantic_index/refresh.sh +``` + +Override project limits, state path, or log location through environment +variables: + +```sh +SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=500,hiring=200' \ +SEMANTIC_INDEX_LOG_DIR=/var/log/semantic-index \ +SEMANTIC_INDEX_STATE_PATH=/var/lib/semantic-index/refresh_state.json \ +semantic_index/refresh.sh --apply +``` + +Do not schedule `--force-rebuild`. Force rebuilds should stay manual because +they intentionally re-embed unchanged documents. + +## MCP Stdio + +```sh +python3 -m semantic_index --mcp-stdio +``` + +Tools: + +- `semantic_search` +- `semantic_get_document` +- `semantic_list_projects` +- `semantic_backfill_redmine_sample` +- `semantic_refresh_redmine` + +For agent workflows, list projects first when the user has not named a project, +search broadly or with `project_identifier` when known, then call +`semantic_get_document` for any promising result. Treat returned citations and +Redmine URLs as the authoritative references. Backfill tools are operational and +should not be part of normal search behavior. + +## Inspection CLI + +Use the inspect commands before larger backfills to see what is already indexed +or preview what Redmine would produce without writing to Qdrant. 
+ +```sh +python3 -m semantic_index inspect count --source redmine --project customer-service +python3 -m semantic_index inspect list --limit 20 --source redmine --project customer-service +python3 -m semantic_index inspect search "order status" --limit 5 --project customer-service +python3 -m semantic_index inspect search "customer@example.com" --limit 5 --project customer-service +python3 -m semantic_index inspect show redmine:issue:39778:chunk:0 +python3 -m semantic_index inspect preview-redmine --limit 10 --project customer-service +python3 -m semantic_index inspect audit --source redmine --project customer-service --limit 500 +python3 -m semantic_index inspect compare-redmine --project customer-service --limit 20 +python3 -m semantic_index inspect smoke-search --project customer-service +``` + +`count`, `list`, `show`, and `preview-redmine` do not call OpenAI. +`search` embeds the query text. List/search output shows snippets by default; +pass `--full-text` when you need the full indexed text. +`audit` summarizes indexed document coverage without calling OpenAI. +`compare-redmine` previews live Redmine chunks and compares them to indexed +Qdrant documents without writing to Qdrant. `smoke-search` runs known search +checks and calls OpenAI for query embeddings. Pass `--json` to `audit`, +`compare-redmine`, or `smoke-search` for machine-readable output. +For mixed project samples, run `audit` without `--project` to see project-level +counts and Helpdesk-contact coverage separately from ordinary internal issues. + +For Helpdesk tickets, Redmine issue ingestion expects +`/issues/:id.json?include=journals,helpdesk` to return `helpdesk_ticket` +metadata with an expanded contact. See +`docs/redmine_issue_api_helpdesk_include.md` for the Redmine API patch notes. 
def parse_project_limits(raw: str) -> Dict[str, int]:
    """Parse a comma-separated ``project=limit`` list into a mapping.

    Blank items (for example from a trailing comma) are ignored and
    whitespace around project names and limits is stripped.

    Args:
        raw: Text such as ``"customer-service=500,hiring=200"``.

    Returns:
        Mapping of project identifier to integer limit, in input order.

    Raises:
        ValueError: If an item has no ``=``, has an empty project name, or
            has a limit that is not an integer. The message names the
            offending item so CLI users can see what to fix.
    """
    project_limits: Dict[str, int] = {}
    for item in raw.split(","):
        if not item.strip():
            continue
        project, separator, limit = item.partition("=")
        project = project.strip()
        if not separator or not project:
            raise ValueError(f"invalid project limit {item.strip()!r}: expected project=limit")
        try:
            project_limits[project] = int(limit.strip())
        except ValueError:
            raise ValueError(f"invalid limit for project {project!r}: {limit.strip()!r}") from None
    return project_limits
inspect_parser.add_subparsers(dest="inspect_command", required=True) + + def add_filters(command_parser: argparse.ArgumentParser) -> None: + command_parser.add_argument("--source", default="redmine") + command_parser.add_argument("--project", dest="project_identifier") + command_parser.add_argument("--doc-type") + + count_parser = inspect_subparsers.add_parser("count", help="Count indexed documents") + add_filters(count_parser) + + list_parser = inspect_subparsers.add_parser("list", help="List indexed documents") + add_filters(list_parser) + list_parser.add_argument("--limit", type=int, default=20) + list_parser.add_argument("--full-text", action="store_true") + + search_parser = inspect_subparsers.add_parser("search", help="Search indexed documents") + search_parser.add_argument("query") + add_filters(search_parser) + search_parser.add_argument("--limit", type=int, default=10) + search_parser.add_argument("--full-text", action="store_true") + + show_parser = inspect_subparsers.add_parser("show", help="Show one indexed document") + show_parser.add_argument("document_id") + + preview_parser = inspect_subparsers.add_parser("preview-redmine", help="Preview Redmine chunks without writing to Qdrant") + preview_parser.add_argument("--limit", type=int, default=10) + preview_parser.add_argument("--project", dest="project_identifier") + preview_parser.add_argument("--full-text", action="store_true") + + audit_parser = inspect_subparsers.add_parser("audit", help="Audit indexed documents for trust-check coverage") + add_filters(audit_parser) + audit_parser.add_argument("--limit", type=int, default=500) + audit_parser.add_argument("--json", action="store_true") + + compare_parser = inspect_subparsers.add_parser("compare-redmine", help="Compare live Redmine preview chunks with indexed documents") + compare_parser.add_argument("--limit", type=int, default=20) + compare_parser.add_argument("--project", dest="project_identifier") + compare_parser.add_argument("--json", 
action="store_true") + + smoke_parser = inspect_subparsers.add_parser("smoke-search", help="Run repeatable search checks against indexed documents") + smoke_parser.add_argument("--project", dest="project_identifier") + smoke_parser.add_argument("--email", default="callum@safetagtracking.com") + smoke_parser.add_argument("--issue-id", type=int, default=39779) + smoke_parser.add_argument("--order-token") + smoke_parser.add_argument("--natural-query", default="customer needs goods returned") + smoke_parser.add_argument("--json", action="store_true") + + args = parser.parse_args(argv) + + if not args.command and not args.backfill_redmine_sample and not args.backfill_redmine_projects and not args.refresh_redmine_projects and not args.mcp_stdio: + parser.print_help() + return + + if args.command == "inspect" and args.inspect_command == "preview-redmine": + if preview_service_builder is not None: + services = preview_service_builder(settings_loader()) + elif service_builder is build_services: + services = build_preview_services(settings_loader()) + else: + services = service_builder() + project = args.project_identifier or services["settings"].redmine_project_identifier + print_preview_redmine(services["redmine_source"], services["settings"].redmine_url, project, args.limit, args.full_text) + return + + services = service_builder() + if args.state_path and "refresh" in services and hasattr(services["refresh"], "state"): + services["refresh"].state = FileRefreshState(Path(args.state_path)) + if args.backfill_redmine_sample: + print(services["backfill"].backfill_redmine_sample(limit=args.limit)) + return + if args.backfill_redmine_projects: + if args.project_limits: + print(services["backfill"].backfill_redmine_project_limits(parse_project_limits(args.project_limits))) + return + projects = parse_projects(args.projects or "") + if not projects: + parser.error("--projects or --project-limits is required with --backfill-redmine-projects") + 
print(services["backfill"].backfill_redmine_projects(projects, per_project_limit=args.per_project_limit)) + return + if args.refresh_redmine_projects: + if args.project_limits: + project_limits = parse_project_limits(args.project_limits) + else: + projects = parse_projects(args.projects or "") + if not projects: + parser.error("--projects or --project-limits is required with --refresh-redmine-projects") + project_limits = {project: args.per_project_limit for project in projects} + print( + services["refresh"].refresh_redmine_project_limits( + project_limits, + dry_run=args.dry_run, + force_rebuild=args.force_rebuild, + overlap_minutes=args.overlap_minutes, + ) + ) + return + if args.mcp_stdio: + serve_stdio(SemanticMCP(search_service=services["search"], backfill_service=services["backfill"], store=services["store"], refresh_service=services.get("refresh"))) + return + if args.command == "inspect": + if args.inspect_command == "count": + print_count(services["store"], args.source, args.project_identifier, args.doc_type) + return + if args.inspect_command == "list": + print_list(services["store"], args.limit, args.source, args.project_identifier, args.doc_type, args.full_text) + return + if args.inspect_command == "search": + print_search(services["search"], args.query, args.limit, args.source, args.project_identifier, args.doc_type, args.full_text) + return + if args.inspect_command == "show": + print_show(services["search"], args.document_id) + return + if args.inspect_command == "audit": + print_audit(services["store"], args.limit, args.source, args.project_identifier, args.doc_type, args.json) + return + if args.inspect_command == "compare-redmine": + project = args.project_identifier or services["settings"].redmine_project_identifier + print_compare_redmine(services["store"], services["redmine_source"], services["settings"].redmine_url, project, args.limit, args.json) + return + if args.inspect_command == "smoke-search": + project = args.project_identifier or 
services["settings"].redmine_project_identifier + print_smoke_search( + services["search"], + project, + args.email, + args.issue_id, + args.order_token, + args.natural_query, + args.json, + ) + return + parser.print_help() + + +if __name__ == "__main__": + main() diff --git a/semantic_index/app.py b/semantic_index/app.py new file mode 100644 index 0000000..d37a5dd --- /dev/null +++ b/semantic_index/app.py @@ -0,0 +1,153 @@ +from __future__ import annotations + +from typing import Any, Callable, Dict, Optional + +from .config import Settings, load_settings +from .embeddings import OpenAIEmbedder, OpenAIEmbeddingClient +from .ingest import BackfillService +from .models import SearchQuery, search_response +from .qdrant_store import QdrantStore +from .refresh import FileRefreshState, RedmineRefreshService +from .redmine import RedmineApiSource, RedmineMapper +from .search import HybridSearchService + + +def build_services(settings: Optional[Settings] = None) -> Dict[str, Any]: + settings = settings or load_settings() + embedding_client = OpenAIEmbeddingClient(api_key=settings.openai_api_key) + embedder = OpenAIEmbedder(client=embedding_client) + store = QdrantStore( + url=settings.qdrant_url, + api_key=settings.qdrant_api_key, + collection=settings.qdrant_collection, + ) + redmine_source = RedmineApiSource( + redmine_url=settings.redmine_url, + api_key=settings.redmine_api_key or "", + project_identifier=settings.redmine_project_identifier, + ) + search_service = HybridSearchService(embedder=embedder, store=store) + backfill_service = BackfillService( + source=redmine_source, + embedder=embedder, + store=store, + mapper=RedmineMapper(redmine_url=settings.redmine_url, project_identifier=settings.redmine_project_identifier), + ) + refresh_service = RedmineRefreshService( + source=redmine_source, + embedder=embedder, + store=store, + mapper=RedmineMapper(redmine_url=settings.redmine_url, project_identifier=settings.redmine_project_identifier), + 
def create_app(settings: Optional[Settings] = None, service_builder: Optional[Callable[[], Dict[str, Any]]] = None):
    """Build the FastAPI application exposing the semantic index over HTTP.

    Services are created lazily on first use: ``service_builder()`` when one
    is given, otherwise ``build_services(settings)``. Every route except
    ``/health`` calls ``authorize`` first; authorization is only enforced
    when the active settings define ``service_api_key``.

    Args:
        settings: Optional settings passed to ``build_services``.
        service_builder: Optional zero-argument factory returning the
            services mapping (used instead of ``build_services``).

    Returns:
        The configured FastAPI application.

    Raises:
        RuntimeError: If ``fastapi`` is not installed.
    """
    try:
        from fastapi import FastAPI, Header, HTTPException
    except ImportError as exc:
        raise RuntimeError("Install fastapi and uvicorn to run the HTTP service") from exc
    import hmac

    services: Optional[Dict[str, Any]] = None
    app = FastAPI(title="Redmine Semantic Index", version="0.1.0")

    def get_services() -> Dict[str, Any]:
        # Build the services mapping once and reuse it for later requests.
        nonlocal services
        if services is None:
            if service_builder is not None:
                services = service_builder()
            else:
                services = build_services(settings)
        return services

    def authorize(authorization: Optional[str]) -> None:
        # No configured key means the service runs without authentication.
        api_key = get_services()["settings"].service_api_key
        if not api_key:
            return
        expected = f"Bearer {api_key}"
        # Compare as bytes in constant time so response timing does not leak
        # how much of the token matched; bytes also accept non-ASCII input.
        supplied = (authorization or "").encode("utf-8")
        if not hmac.compare_digest(supplied, expected.encode("utf-8")):
            raise HTTPException(status_code=401, detail="unauthorized")

    @app.get("/health")
    def health() -> Dict[str, str]:
        return {"status": "ok"}

    @app.post("/sources/redmine/backfill-sample")
    def backfill(payload: Dict[str, Any] | None = None, authorization: Optional[str] = Header(default=None)) -> Dict[str, Any]:
        authorize(authorization)
        active_services = get_services()
        # Fall back to the configured sample limit when the body omits "limit".
        limit = int((payload or {}).get("limit", active_services["settings"].sample_limit))
        return active_services["backfill"].backfill_redmine_sample(limit=limit)

    @app.post("/sources/redmine/refresh")
    def refresh(payload: Dict[str, Any] | None = None, authorization: Optional[str] = Header(default=None)) -> Dict[str, Any]:
        authorize(authorization)
        payload = payload or {}
        project_limits = payload.get("project_limits")
        if not project_limits:
            # Single-project form: explicit project_identifier, else the configured default.
            project = payload.get("project_identifier") or get_services()["settings"].redmine_project_identifier
            if not project:
                raise HTTPException(status_code=400, detail="project_limits or project_identifier is required")
            project_limits = {project: int(payload.get("limit", get_services()["settings"].sample_limit))}
        return get_services()["refresh"].refresh_redmine_project_limits(
            {str(project): int(limit) for project, limit in project_limits.items()},
            dry_run=bool(payload.get("dry_run", False)),
            force_rebuild=bool(payload.get("force_rebuild", False)),
            overlap_minutes=int(payload.get("overlap_minutes", 15)),
        )

    @app.post("/search")
    def search(payload: Dict[str, Any], authorization: Optional[str] = Header(default=None)) -> Dict[str, Any]:
        authorize(authorization)
        # "query" is preferred; "text" is accepted as an alternative key.
        query = SearchQuery(
            text=payload.get("query") or payload.get("text") or "",
            source=payload.get("source"),
            project_id=payload.get("project_id"),
            project_identifier=payload.get("project_identifier"),
            doc_type=payload.get("doc_type"),
            issue_id=payload.get("issue_id"),
            contact_id=payload.get("contact_id"),
            contact_email=payload.get("contact_email"),
            date_from=payload.get("date_from"),
            date_to=payload.get("date_to"),
            limit=int(payload.get("limit", 10)),
            include_snippets=bool(payload.get("include_snippets", True)),
        )
        results = get_services()["search"].search(query)
        return search_response(query, results)

    @app.get("/projects")
    def projects(authorization: Optional[str] = Header(default=None)) -> Dict[str, Any]:
        authorize(authorization)
        return {"projects": get_services()["store"].list_projects(source="redmine")}

    @app.get("/documents/{document_id}")
    def document(document_id: str, authorization: Optional[str] = Header(default=None)) -> Dict[str, Any]:
        authorize(authorization)
        found = get_services()["search"].get_document(document_id)
        if found is None:
            raise HTTPException(status_code=404, detail="not_found")
        return found

    return app
def chunk_text(text: str, max_chars: int = 3500, overlap: int = 300) -> List[str]:
    """Split text into overlapping chunks of at most ``max_chars`` characters.

    Trailing whitespace is removed from every line and the whole text is
    stripped first. When a chunk would be cut mid-text, the cut is moved back
    to the last paragraph break (``"\\n\\n"``) or sentence end (``". "``)
    found in the second half of the window. Each following chunk starts
    ``overlap`` characters before the previous cut.

    Args:
        text: Raw text to split.
        max_chars: Maximum size of one chunk; must be positive.
        overlap: Number of characters shared between neighbouring chunks. If
            the overlap would prevent the window from advancing, the next
            chunk starts at the previous cut instead.

    Returns:
        Non-empty, stripped chunks in document order; ``[]`` for blank text.

    Raises:
        ValueError: If ``max_chars`` is not positive and the text is not blank.
    """
    cleaned = "\n".join(line.rstrip() for line in text.strip().splitlines()).strip()
    if not cleaned:
        return []
    if max_chars <= 0:
        # A non-positive window can never consume the text.
        raise ValueError("max_chars must be positive")
    if len(cleaned) <= max_chars:
        return [cleaned]

    chunks: List[str] = []
    total = len(cleaned)
    start = 0
    while start < total:
        end = min(start + max_chars, total)
        if end < total:
            boundary = max(cleaned.rfind("\n\n", start, end), cleaned.rfind(". ", start, end))
            # Only honour a boundary that keeps the chunk at least half full.
            if boundary > start + int(max_chars * 0.5):
                end = boundary + 1
        chunks.append(cleaned[start:end].strip())
        if end >= total:
            break
        next_start = end - overlap
        # Guarantee forward progress: an overlap as large as the chunk just
        # emitted would otherwise restart at the same offset forever.
        start = next_start if next_start > start else end
    return [chunk for chunk in chunks if chunk]
+ return self._post_json("/search", {"query": query, **filters}) + search_service = self.search_service or build_services()["search"] + search_query = SearchQuery( + text=query, + source=filters.get("source"), + project_id=filters.get("project_id"), + project_identifier=filters.get("project_identifier"), + doc_type=filters.get("doc_type"), + issue_id=filters.get("issue_id"), + contact_id=filters.get("contact_id"), + contact_email=filters.get("contact_email"), + date_from=filters.get("date_from"), + date_to=filters.get("date_to"), + limit=int(filters.get("limit", 10)), + include_snippets=bool(filters.get("include_snippets", True)), + ) + return search_response(search_query, search_service.search(search_query)) + + def get_document(self, document_id: str) -> Dict[str, Any]: + if self.base_url: + return self._get_json(f"/documents/{document_id}") + search_service = self.search_service or build_services()["search"] + return search_service.get_document(document_id) or {"error": "not_found", "id": document_id} + + def _post_json(self, path: str, payload: Dict[str, Any]) -> Dict[str, Any]: + data = json.dumps(payload).encode("utf-8") + request = urllib.request.Request( + f"{self.base_url}{path}", + data=data, + headers=self._headers(), + method="POST", + ) + with urllib.request.urlopen(request, timeout=60) as response: + return json.loads(response.read().decode("utf-8")) + + def _get_json(self, path: str) -> Dict[str, Any]: + request = urllib.request.Request(f"{self.base_url}{path}", headers=self._headers()) + with urllib.request.urlopen(request, timeout=60) as response: + return json.loads(response.read().decode("utf-8")) + + def _headers(self) -> Dict[str, str]: + headers = {"Content-Type": "application/json"} + if self.api_key: + headers["Authorization"] = f"Bearer {self.api_key}" + return headers diff --git a/semantic_index/config.py b/semantic_index/config.py new file mode 100644 index 0000000..112d01e --- /dev/null +++ b/semantic_index/config.py @@ -0,0 +1,64 @@ 
def load_dotenv(path: str | Path = ".env") -> Dict[str, str]:
    """Read ``KEY=value`` pairs from a dotenv-style file.

    Blank lines, ``#`` comment lines and lines without ``=`` are ignored. A
    leading ``export `` on the key is dropped, and lines with an empty key
    are skipped. Only one *matching* pair of surrounding quotes is removed
    from a value, so quote characters that are part of the value survive.

    Args:
        path: Location of the dotenv file.

    Returns:
        Parsed key/value pairs; an empty dict when ``path`` is not a file.
    """
    values: Dict[str, str] = {}
    dotenv = Path(path)
    # is_file() also guards against a directory that happens to be named .env.
    if not dotenv.is_file():
        return values
    for raw_line in dotenv.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        key = key.strip()
        if key.startswith("export "):
            key = key[len("export "):].strip()
        if not key:
            continue
        value = value.strip()
        # Strip exactly one balanced quote pair; chained str.strip() calls
        # would also eat unbalanced or nested quotes that belong to the value.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        values[key] = value
    return values
class OpenAIEmbedder:
    """Validate texts and embed them in batches through an embedding client."""

    def __init__(
        self,
        client: EmbeddingClient,
        model: str = "text-embedding-3-small",
        dimensions: int = 1536,
        batch_size: int = 100,
        max_chars: int = 12000,
    ) -> None:
        """Store the client together with the model and batching settings."""
        self.client = client
        self.model, self.dimensions = model, dimensions
        self.batch_size, self.max_chars = batch_size, max_chars

    def embed_documents(self, documents: Sequence[IndexDocument]) -> List[List[float]]:
        """Embed the ``text`` attribute of each document, preserving order."""
        texts = [document.text for document in documents]
        return self.embed_texts(texts)

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string and return its vector."""
        vectors = self.embed_texts([text])
        return vectors[0]

    def embed_texts(self, texts: Iterable[str]) -> List[List[float]]:
        """Validate all texts, then embed them ``batch_size`` at a time.

        Validation runs over the whole input before the client is called, so
        an invalid text raises before any request is made.
        """
        pending = list(texts)
        self._validate(pending)
        size = self.batch_size
        batches = [pending[offset : offset + size] for offset in range(0, len(pending), size)]
        collected: List[List[float]] = []
        for batch in batches:
            collected.extend(self.client.create_embeddings(self.model, batch, dimensions=self.dimensions))
        return collected

    def _validate(self, texts: Sequence[str]) -> None:
        """Raise ``ValueError`` for the first blank or over-long text."""
        limit = self.max_chars
        for candidate in texts:
            if not candidate.strip():
                raise ValueError("embedding text cannot be empty")
            if len(candidate) > limit:
                raise ValueError(f"embedding text exceeds {limit} characters")
+ + +class BackfillService: + def __init__(self, source: RedmineSource, embedder: DocumentEmbedder, store: RebuildStore, mapper: RedmineMapper | None = None) -> None: + self.source = source + self.embedder = embedder + self.store = store + self.mapper = mapper or RedmineMapper(redmine_url="") + + def backfill_redmine_sample(self, limit: int = 500) -> Dict[str, int | str]: + issues = list(self.source.recent_helpdesk_issues(limit)) + documents: List[IndexDocument] = [] + for issue in issues: + documents.extend(self.mapper.issue_to_documents(issue)) + documents = deduplicate_documents(documents) + vectors = self.embedder.embed_documents(documents) if documents else [] + self.store.rebuild_source("redmine", documents, vectors, project_identifier=self._project_identifier()) + return {"source": "redmine", "issues": len(issues), "documents": len(documents)} + + def backfill_redmine_projects(self, projects: Sequence[str], per_project_limit: int = 500) -> Dict[str, object]: + return self.backfill_redmine_project_limits({project: per_project_limit for project in projects}) + + def backfill_redmine_project_limits(self, project_limits: Dict[str, int]) -> Dict[str, object]: + previous_source_project = getattr(self.source, "project_identifier", None) + previous_mapper_project = getattr(self.mapper, "project_identifier", None) + project_results: List[Dict[str, int | str]] = [] + total_issues = 0 + total_documents = 0 + try: + for project, project_limit in project_limits.items(): + if hasattr(self.source, "project_identifier"): + self.source.project_identifier = project + if hasattr(self.mapper, "project_identifier"): + self.mapper.project_identifier = project + issues = list(self.source.recent_helpdesk_issues(project_limit)) + documents: List[IndexDocument] = [] + for issue in issues: + documents.extend(self.mapper.issue_to_documents(issue)) + documents = deduplicate_documents(documents) + vectors = self.embedder.embed_documents(documents) if documents else [] + 
self.store.rebuild_source("redmine", documents, vectors, project_identifier=project) + project_results.append( + {"project_identifier": project, "issues": len(issues), "documents": len(documents)} + ) + total_issues += len(issues) + total_documents += len(documents) + finally: + if hasattr(self.source, "project_identifier"): + self.source.project_identifier = previous_source_project + if hasattr(self.mapper, "project_identifier"): + self.mapper.project_identifier = previous_mapper_project + return { + "source": "redmine", + "projects": len(project_limits), + "issues": total_issues, + "documents": total_documents, + "project_results": project_results, + } + + def _project_identifier(self) -> str | None: + mapper_project = getattr(self.mapper, "project_identifier", None) + if mapper_project: + return mapper_project + return getattr(self.source, "project_identifier", None) + + +def deduplicate_documents(documents: Sequence[IndexDocument]) -> List[IndexDocument]: + unique: Dict[str, IndexDocument] = {} + for document in documents: + unique[document.id] = document + return list(unique.values()) diff --git a/semantic_index/inspect.py b/semantic_index/inspect.py new file mode 100644 index 0000000..a034cf8 --- /dev/null +++ b/semantic_index/inspect.py @@ -0,0 +1,292 @@ +from __future__ import annotations + +import json +from collections import Counter +from typing import Any, Dict, Iterable, List, Optional + +from .models import SearchQuery, SearchResult +from .redmine import RedmineMapper + + +def print_count(store: Any, source: Optional[str], project: Optional[str], doc_type: Optional[str]) -> None: + count = store.count_documents(source=source, project_identifier=project, doc_type=doc_type) + print(count) + + +def print_list(store: Any, limit: int, source: Optional[str], project: Optional[str], doc_type: Optional[str], full_text: bool) -> None: + documents = store.list_documents(limit=limit, source=source, project_identifier=project, doc_type=doc_type) + for document in 
documents: + print_document(document, full_text=full_text) + + +def print_search(search_service: Any, query_text: str, limit: int, source: Optional[str], project: Optional[str], doc_type: Optional[str], full_text: bool) -> None: + query = SearchQuery( + text=query_text, + source=source, + project_identifier=project, + doc_type=doc_type, + limit=limit, + include_snippets=not full_text, + ) + for result in search_service.search(query): + print_result(result, full_text=full_text) + + +def print_show(search_service: Any, document_id: str) -> None: + document = search_service.get_document(document_id) + if document is None: + print(f"not found: {document_id}") + return + print_document(document, full_text=True) + + +def print_preview_redmine(source: Any, redmine_url: str, project: Optional[str], limit: int, full_text: bool) -> None: + previous_project = getattr(source, "project_identifier", None) + if project and hasattr(source, "project_identifier"): + source.project_identifier = project + try: + mapper = RedmineMapper(redmine_url=redmine_url, project_identifier=project) + documents = [] + for issue in source.recent_helpdesk_issues(limit): + documents.extend(mapper.issue_to_documents(issue)) + finally: + if hasattr(source, "project_identifier"): + source.project_identifier = previous_project + for document in documents: + print_document({"id": document.id, "text": document.text, "payload": document.payload}, full_text=full_text) + + +def print_audit(store: Any, limit: int, source: Optional[str], project: Optional[str], doc_type: Optional[str], as_json: bool) -> None: + documents = store.list_documents(limit=limit, source=source, project_identifier=project, doc_type=doc_type) + report = audit_documents(documents) + if as_json: + print(json.dumps(report, sort_keys=True)) + return + print(f"documents={report['total_documents']}") + for name, count in sorted(report["doc_type_counts"].items()): + print(f"doc_type {name}={count}") + for name, count in 
sorted(report["project_counts"].items()): + print(f"project {name}={count}") + print(f"contact_metadata {report['contact_metadata_count']}/{report['total_documents']}") + print(f"helpdesk_contact_metadata {report['helpdesk_contact_metadata_count']}/{report['helpdesk_documents']}") + print(f"attachments={report['attachment_documents']}") + for document_id in report["missing_helpdesk_contact_metadata"]: + print(f"missing_contact {document_id}") + for document_id in report["unexpected_attachment_documents"]: + print(f"unexpected_attachment {document_id}") + + +def print_compare_redmine(store: Any, source: Any, redmine_url: str, project: Optional[str], limit: int, as_json: bool) -> None: + preview_documents = preview_redmine_documents(source, redmine_url, project, limit) + indexed_documents = store.list_documents(limit=max(5000, limit * 100), source="redmine", project_identifier=project) + report = compare_documents(preview_documents, indexed_documents) + if as_json: + print(json.dumps(report, sort_keys=True)) + return + print(f"preview_documents={report['preview_documents']}") + print(f"indexed_documents={report['indexed_documents']}") + for document_id in report["missing"]: + print(f"missing {document_id}") + for document_id in report["stale"]: + print(f"stale {document_id}") + for mismatch in report["contact_mismatches"]: + print(f"contact_mismatch {mismatch['id']}") + + +def print_smoke_search( + search_service: Any, + project: Optional[str], + email: str, + issue_id: Optional[int], + order_token: Optional[str], + natural_query: str, + as_json: bool, +) -> None: + checks = smoke_search(search_service, project, email, issue_id, order_token, natural_query) + report = {"project_identifier": project, "checks": checks} + if as_json: + print(json.dumps(report, sort_keys=True)) + return + for check in checks: + status = "PASS" if check["passed"] else "FAIL" + print(f"{status} {check['kind']} {check['query']}") + for result in check["results"]: + payload = 
result["payload"] + print( + f" {result['id']} score={result['score']:.4f} " + f"doc_type={payload.get('doc_type')} issue={payload.get('issue_id')} " + f"contact={contact_display(payload)} url={result['citation'].get('url')}" + ) + + +def audit_documents(documents: List[Dict[str, Any]]) -> Dict[str, Any]: + doc_type_counts = Counter(str((document.get("payload") or {}).get("doc_type") or "unknown") for document in documents) + project_counts = Counter(str((document.get("payload") or {}).get("project_identifier") or "unknown") for document in documents) + missing_contact = [] + missing_helpdesk_contact = [] + contact_metadata_count = 0 + helpdesk_documents = 0 + helpdesk_contact_metadata_count = 0 + unexpected_attachments = [] + for document in documents: + payload = document.get("payload") or {} + doc_type = str(payload.get("doc_type") or "") + has_contact = bool(payload.get("contact_id") and payload.get("contact_email")) + has_helpdesk_ticket = bool(payload.get("has_helpdesk_ticket")) + if has_contact: + contact_metadata_count += 1 + elif doc_type in {"issue", "journal", "message", "contact"} and has_helpdesk_ticket: + missing_contact.append(str(document.get("id"))) + if has_helpdesk_ticket: + helpdesk_documents += 1 + if has_contact: + helpdesk_contact_metadata_count += 1 + elif doc_type in {"issue", "journal", "message", "contact"}: + missing_helpdesk_contact.append(str(document.get("id"))) + if doc_type == "attachment": + unexpected_attachments.append(str(document.get("id"))) + return { + "total_documents": len(documents), + "doc_type_counts": dict(doc_type_counts), + "project_counts": dict(project_counts), + "contact_metadata_count": contact_metadata_count, + "helpdesk_documents": helpdesk_documents, + "helpdesk_contact_metadata_count": helpdesk_contact_metadata_count, + "missing_contact_metadata": missing_contact, + "missing_helpdesk_contact_metadata": missing_helpdesk_contact, + "attachment_documents": len(unexpected_attachments), + 
"unexpected_attachment_documents": unexpected_attachments, + } + + +def preview_redmine_documents(source: Any, redmine_url: str, project: Optional[str], limit: int) -> List[Dict[str, Any]]: + previous_project = getattr(source, "project_identifier", None) + if project and hasattr(source, "project_identifier"): + source.project_identifier = project + try: + mapper = RedmineMapper(redmine_url=redmine_url, project_identifier=project) + documents = [] + for issue in source.recent_helpdesk_issues(limit): + documents.extend(mapper.issue_to_documents(issue)) + return [{"id": document.id, "text": document.text, "payload": document.payload} for document in documents] + finally: + if hasattr(source, "project_identifier"): + source.project_identifier = previous_project + + +def compare_documents(preview_documents: List[Dict[str, Any]], indexed_documents: List[Dict[str, Any]]) -> Dict[str, Any]: + indexed_by_id = {str(document.get("id")): document for document in indexed_documents} + missing = [] + stale = [] + contact_mismatches = [] + for preview in preview_documents: + document_id = str(preview.get("id")) + indexed = indexed_by_id.get(document_id) + if indexed is None: + missing.append(document_id) + continue + preview_payload = preview.get("payload") or {} + indexed_payload = indexed.get("payload") or {} + if preview_payload.get("source_hash") != indexed_payload.get("source_hash"): + stale.append(document_id) + contact_fields = ("contact_id", "contact_name", "contact_email", "contact_company") + if any(preview_payload.get(field) != indexed_payload.get(field) for field in contact_fields): + contact_mismatches.append({"id": document_id}) + return { + "preview_documents": len(preview_documents), + "indexed_documents": len(indexed_documents), + "missing": missing, + "stale": stale, + "contact_mismatches": contact_mismatches, + } + + +def smoke_search( + search_service: Any, + project: Optional[str], + email: str, + issue_id: Optional[int], + order_token: Optional[str], + 
natural_query: str, +) -> List[Dict[str, Any]]: + checks = [run_smoke_query(search_service, "email", email, project, expected_email=email)] + if issue_id is not None: + checks.append(run_smoke_query(search_service, "issue", str(issue_id), project, expected_issue_id=issue_id)) + if order_token: + checks.append(run_smoke_query(search_service, "order", order_token, project)) + if natural_query: + checks.append(run_smoke_query(search_service, "natural", natural_query, project)) + return checks + + +def run_smoke_query( + search_service: Any, + kind: str, + text: str, + project: Optional[str], + expected_email: Optional[str] = None, + expected_issue_id: Optional[int] = None, +) -> Dict[str, Any]: + query = SearchQuery(text=text, source="redmine", project_identifier=project, issue_id=expected_issue_id, limit=5) + results = search_service.search(query) + result_dicts = [result.to_dict(include_snippet=True) for result in results] + passed = bool(result_dicts) + if expected_email: + passed = passed and any((result["payload"] or {}).get("contact_email") == expected_email for result in result_dicts) + if expected_issue_id is not None: + passed = passed and any((result["payload"] or {}).get("issue_id") == expected_issue_id for result in result_dicts) + return {"kind": kind, "query": text, "passed": passed, "results": result_dicts} + + +def print_result(result: SearchResult, full_text: bool) -> None: + print(f"{result.id} score={result.score:.4f}") + print_metadata(result.payload) + print(f"url={result.citation.get('url')}") + print(result.text if full_text else snippet(result.text)) + print() + + +def print_document(document: Dict[str, Any], full_text: bool) -> None: + payload = document.get("payload") or {} + print(document.get("id")) + print_metadata(payload) + url = payload.get("redmine_url") + if url: + print(f"url={url}") + print(document.get("text", "") if full_text else snippet(document.get("text", ""))) + print() + + +def print_metadata(payload: Dict[str, Any]) -> 
None: + contact = contact_display(payload) + fields = [ + ("source", payload.get("source")), + ("doc_type", payload.get("doc_type")), + ("issue", payload.get("issue_id")), + ("project", payload.get("project_identifier")), + ("contact", contact), + ("created", payload.get("created_on")), + ("updated", payload.get("updated_on")), + ] + print(" ".join(f"{name}={value}" for name, value in fields if value is not None)) + + +def contact_display(payload: Dict[str, Any]) -> Optional[str]: + contact_id = payload.get("contact_id") + pieces = [] + if contact_id is not None: + pieces.append(f"#{contact_id}") + if payload.get("contact_name"): + pieces.append(str(payload["contact_name"])) + if payload.get("contact_email"): + pieces.append(str(payload["contact_email"])) + if payload.get("contact_company"): + pieces.append(str(payload["contact_company"])) + return " | ".join(pieces) if pieces else None + + +def snippet(text: str, max_chars: int = 240) -> str: + compact = " ".join(text.split()) + if len(compact) <= max_chars: + return compact + return compact[: max_chars - 3].rstrip() + "..." 
diff --git a/semantic_index/mcp.py b/semantic_index/mcp.py new file mode 100644 index 0000000..cfb3631 --- /dev/null +++ b/semantic_index/mcp.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +import json +import sys +from typing import Any, Dict, Optional + +from .models import SearchQuery, search_response + + +class SemanticMCP: + def __init__(self, search_service: Any, backfill_service: Optional[Any], store: Optional[Any] = None, refresh_service: Optional[Any] = None) -> None: + self.search_service = search_service + self.backfill_service = backfill_service + self.store = store + self.refresh_service = refresh_service + + def tools(self) -> Dict[str, Dict[str, str]]: + return { + "semantic_search": {"description": "Search the semantic index and return cited snippets."}, + "semantic_get_document": {"description": "Fetch one indexed document by stable id."}, + "semantic_list_projects": {"description": "List indexed project identifiers and document counts."}, + "semantic_backfill_redmine_sample": {"description": "Rebuild the Redmine sample collection."}, + "semantic_refresh_redmine": {"description": "Refresh recent Redmine issues without re-embedding unchanged documents."}, + } + + def call_tool(self, name: str, arguments: Dict[str, Any]) -> Dict[str, Any]: + if name == "semantic_search": + query = SearchQuery( + text=arguments.get("query") or arguments.get("text") or "", + source=arguments.get("source"), + project_id=arguments.get("project_id"), + project_identifier=arguments.get("project_identifier"), + doc_type=arguments.get("doc_type"), + issue_id=arguments.get("issue_id"), + contact_id=arguments.get("contact_id"), + contact_email=arguments.get("contact_email"), + date_from=arguments.get("date_from"), + date_to=arguments.get("date_to"), + limit=int(arguments.get("limit", 10)), + include_snippets=bool(arguments.get("include_snippets", True)), + ) + results = self.search_service.search(query) + return search_response(query, results) + if name == 
"semantic_get_document": + return self.search_service.get_document(arguments["id"]) or {"error": "not_found", "id": arguments["id"]} + if name == "semantic_list_projects": + if self.store is None: + return {"error": "project_listing_unavailable"} + return {"projects": self.store.list_projects(source=arguments.get("source", "redmine"))} + if name == "semantic_backfill_redmine_sample": + if self.backfill_service is None: + return {"error": "backfill_unavailable"} + return self.backfill_service.backfill_redmine_sample(limit=int(arguments.get("limit", 500))) + if name == "semantic_refresh_redmine": + if self.refresh_service is None: + return {"error": "refresh_unavailable"} + project_limits = arguments.get("project_limits") + if not project_limits: + project = arguments.get("project_identifier") + if not project: + return {"error": "project_required"} + project_limits = {project: int(arguments.get("limit", 500))} + return self.refresh_service.refresh_redmine_project_limits( + {str(project): int(limit) for project, limit in project_limits.items()}, + dry_run=bool(arguments.get("dry_run", False)), + force_rebuild=bool(arguments.get("force_rebuild", False)), + overlap_minutes=int(arguments.get("overlap_minutes", 15)), + ) + raise ValueError(f"unknown tool: {name}") + + +def serve_stdio(mcp: SemanticMCP) -> None: + for line in sys.stdin: + request = json.loads(line) + try: + result = mcp.call_tool(request["name"], request.get("arguments") or {}) + response = {"id": request.get("id"), "result": result} + except Exception as exc: + response = {"id": request.get("id"), "error": str(exc)} + print(json.dumps(response), flush=True) diff --git a/semantic_index/models.py b/semantic_index/models.py new file mode 100644 index 0000000..07ce58d --- /dev/null +++ b/semantic_index/models.py @@ -0,0 +1,100 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, Dict, Optional + + +Payload = Dict[str, Any] + + +@dataclass(frozen=True) 
+class IndexDocument: + id: str + text: str + payload: Payload = field(default_factory=dict) + + def __post_init__(self) -> None: + if not self.id.strip(): + raise ValueError("document id is required") + if not self.text.strip(): + raise ValueError("document text is required") + + +@dataclass(frozen=True) +class SearchQuery: + text: str + source: Optional[str] = None + project_id: Optional[int] = None + project_identifier: Optional[str] = None + doc_type: Optional[str] = None + issue_id: Optional[int] = None + contact_id: Optional[int] = None + contact_email: Optional[str] = None + date_from: Optional[str] = None + date_to: Optional[str] = None + limit: int = 10 + include_snippets: bool = True + + def __post_init__(self) -> None: + if not self.text.strip(): + raise ValueError("search text is required") + if self.limit < 1 or self.limit > 100: + raise ValueError("limit must be between 1 and 100") + + +@dataclass(frozen=True) +class SearchResult: + id: str + score: float + text: str + payload: Payload + + @property + def snippet(self) -> str: + return self.text[:500] + + @property + def citation(self) -> Payload: + return { + "id": self.id, + "source": self.payload.get("source"), + "doc_type": self.payload.get("doc_type"), + "issue_id": self.payload.get("issue_id"), + "project_identifier": self.payload.get("project_identifier"), + "contact_id": self.payload.get("contact_id"), + "contact_name": self.payload.get("contact_name"), + "contact_email": self.payload.get("contact_email"), + "url": self.payload.get("redmine_url"), + "record_id": self.payload.get("source_record_id"), + } + + def to_dict(self, include_snippet: bool = True) -> Payload: + data: Payload = { + "id": self.id, + "score": self.score, + "payload": self.payload, + "citation": self.citation, + } + if include_snippet: + data["snippet"] = self.snippet + return data + + +def search_response(query: SearchQuery, results: list[SearchResult]) -> Payload: + filters = { + "source": query.source, + "project_id": 
query.project_id, + "project_identifier": query.project_identifier, + "doc_type": query.doc_type, + "issue_id": query.issue_id, + "contact_id": query.contact_id, + "contact_email": query.contact_email, + "date_from": query.date_from, + "date_to": query.date_to, + "limit": query.limit, + } + return { + "query": query.text, + "filters": {key: value for key, value in filters.items() if value is not None}, + "results": [result.to_dict(include_snippet=query.include_snippets) for result in results], + } diff --git a/semantic_index/qdrant_store.py b/semantic_index/qdrant_store.py new file mode 100644 index 0000000..5460875 --- /dev/null +++ b/semantic_index/qdrant_store.py @@ -0,0 +1,219 @@ +from __future__ import annotations + +import uuid +from typing import Any, Dict, List, Optional, Sequence +from collections import Counter + +from .models import IndexDocument, SearchQuery, SearchResult + + +def point_id_for_document(document_id: str) -> str: + return str(uuid.uuid5(uuid.NAMESPACE_URL, document_id)) + + +def build_filter(query: SearchQuery) -> Dict[str, List[Dict[str, Any]]]: + must: List[Dict[str, Any]] = [] + equality_fields = { + "source": query.source, + "project_id": query.project_id, + "project_identifier": query.project_identifier, + "doc_type": query.doc_type, + "issue_id": query.issue_id, + "contact_id": query.contact_id, + "contact_email": query.contact_email, + } + for key, value in equality_fields.items(): + if value is not None: + must.append({"key": key, "match": {"value": value}}) + if query.date_from or query.date_to: + range_filter: Dict[str, str] = {} + if query.date_from: + range_filter["gte"] = query.date_from + if query.date_to: + range_filter["lte"] = query.date_to + must.append({"key": "created_on", "range": range_filter}) + return {"must": must} + + +class QdrantStore: + def __init__(self, url: str, api_key: Optional[str], collection: str, vector_size: int = 1536, upsert_batch_size: int = 64) -> None: + try: + from qdrant_client import 
QdrantClient + from qdrant_client.http import models as qmodels + except ImportError as exc: + raise RuntimeError("Install qdrant-client to use live Qdrant storage") from exc + self.client = QdrantClient(url=url, api_key=api_key) + self.collection = collection + self.vector_size = vector_size + self.upsert_batch_size = upsert_batch_size + self.qmodels = qmodels + + def ensure_collection(self) -> None: + collections = self.client.get_collections().collections + if any(collection.name == self.collection for collection in collections): + return + self.client.create_collection( + collection_name=self.collection, + vectors_config=self.qmodels.VectorParams(size=self.vector_size, distance=self.qmodels.Distance.COSINE), + ) + + def upsert(self, documents: Sequence[IndexDocument], vectors: Sequence[Sequence[float]]) -> None: + if len(documents) != len(vectors): + raise ValueError("documents and vectors length mismatch") + self.ensure_collection() + points = [ + self.qmodels.PointStruct( + id=point_id_for_document(document.id), + vector=list(vector), + payload={**document.payload, "document_id": document.id, "text": document.text}, + ) + for document, vector in zip(documents, vectors) + ] + for start in range(0, len(points), self.upsert_batch_size): + batch = points[start : start + self.upsert_batch_size] + if batch: + self.client.upsert(collection_name=self.collection, points=batch) + + def delete_by_source(self, source: str, project_identifier: Optional[str] = None) -> None: + self.ensure_collection() + query = SearchQuery(text="*", source=source, project_identifier=project_identifier) + self.client.delete( + collection_name=self.collection, + points_selector=self.qmodels.FilterSelector( + filter=self._to_qdrant_filter(build_filter(query)) + ), + ) + + def delete_documents(self, document_ids: Sequence[str]) -> None: + self.ensure_collection() + if not document_ids: + return + self.client.delete( + collection_name=self.collection, + 
points_selector=self.qmodels.PointIdsList( + points=[point_id_for_document(document_id) for document_id in document_ids] + ), + ) + + def rebuild_source( + self, + source: str, + documents: Sequence[IndexDocument], + vectors: Sequence[Sequence[float]], + project_identifier: Optional[str] = None, + ) -> None: + self.delete_by_source(source, project_identifier=project_identifier) + self.upsert(documents, vectors) + + def search(self, vector: Sequence[float], query: SearchQuery, limit: int) -> List[SearchResult]: + self.ensure_collection() + qfilter = self._to_qdrant_filter(build_filter(query)) + if hasattr(self.client, "query_points"): + response = self.client.query_points( + collection_name=self.collection, + query=list(vector), + query_filter=qfilter, + limit=limit, + with_payload=True, + ) + results = response.points + else: + results = self.client.search( + collection_name=self.collection, + query_vector=list(vector), + query_filter=qfilter, + limit=limit, + with_payload=True, + ) + return [self._point_to_result(point) for point in results] + + def get_document(self, document_id: str) -> Optional[Dict[str, Any]]: + self.ensure_collection() + points = self.client.retrieve(collection_name=self.collection, ids=[point_id_for_document(document_id)], with_payload=True) + if not points: + return None + payload = dict(points[0].payload or {}) + text = payload.pop("text", "") + payload.pop("document_id", None) + return {"id": document_id, "text": text, "payload": payload} + + def count_documents( + self, + source: Optional[str] = None, + project_identifier: Optional[str] = None, + doc_type: Optional[str] = None, + ) -> int: + self.ensure_collection() + query = SearchQuery(text="*", source=source, project_identifier=project_identifier, doc_type=doc_type) + result = self.client.count( + collection_name=self.collection, + count_filter=self._to_qdrant_filter(build_filter(query)), + exact=True, + ) + return int(result.count) + + def list_documents( + self, + limit: int = 10, + 
source: Optional[str] = None, + project_identifier: Optional[str] = None, + doc_type: Optional[str] = None, + issue_id: Optional[int] = None, + ) -> List[Dict[str, Any]]: + self.ensure_collection() + query = SearchQuery(text="*", source=source, project_identifier=project_identifier, doc_type=doc_type, issue_id=issue_id) + qfilter = self._to_qdrant_filter(build_filter(query)) + records = [] + offset = None + while len(records) < limit: + batch_limit = limit - len(records) + batch, offset = self.client.scroll( + collection_name=self.collection, + scroll_filter=qfilter, + limit=batch_limit, + with_payload=True, + with_vectors=False, + offset=offset, + ) + records.extend(batch[:batch_limit]) + if not offset or not batch: + break + return [self._record_to_document(record) for record in records] + + def list_projects(self, source: Optional[str] = None, limit: int = 5000) -> List[Dict[str, Any]]: + documents = self.list_documents(limit=limit, source=source) + counts = Counter( + str((document.get("payload") or {}).get("project_identifier")) + for document in documents + if (document.get("payload") or {}).get("project_identifier") + ) + return [ + {"project_identifier": project, "document_count": count} + for project, count in sorted(counts.items()) + ] + + def _to_qdrant_filter(self, raw_filter: Dict[str, List[Dict[str, Any]]]) -> Any: + conditions = [] + for condition in raw_filter.get("must", []): + if "match" in condition: + conditions.append( + self.qmodels.FieldCondition( + key=condition["key"], + match=self.qmodels.MatchValue(value=condition["match"]["value"]), + ) + ) + elif "range" in condition: + conditions.append(self.qmodels.FieldCondition(key=condition["key"], range=self.qmodels.DatetimeRange(**condition["range"]))) + return self.qmodels.Filter(must=conditions) if conditions else None + + def _point_to_result(self, point: Any) -> SearchResult: + payload = dict(point.payload or {}) + text = payload.pop("text", "") + document_id = payload.pop("document_id", 
class RedmineMapper:
    """Map Redmine issue dictionaries to chunked ``IndexDocument`` records.

    One issue can yield four kinds of documents: the issue itself, each
    journal note, each helpdesk message, and the associated contact. Every
    record's text is split with ``chunk_text`` and every chunk shares a
    payload built by ``_base_payload``.
    """

    def __init__(self, redmine_url: str, chunk_chars: int = 3500, project_identifier: Optional[str] = None) -> None:
        """Store mapping configuration.

        Args:
            redmine_url: Base Redmine URL; a trailing ``/`` is removed.
            chunk_chars: ``max_chars`` value passed to ``chunk_text``.
            project_identifier: Fallback project identifier used when the
                issue's ``project`` object carries no ``identifier``.
        """
        self.redmine_url = redmine_url.rstrip("/")
        self.chunk_chars = chunk_chars
        self.project_identifier = project_identifier

    def issue_to_documents(self, issue: Issue) -> List[IndexDocument]:
        """Return all documents for ``issue``: issue, journals, messages, contact."""
        docs: List[IndexDocument] = []
        docs.extend(self._issue_documents(issue))
        docs.extend(self._journal_documents(issue))
        docs.extend(self._message_documents(issue))
        docs.extend(self._contact_documents(issue))
        return docs

    def _issue_documents(self, issue: Issue) -> List[IndexDocument]:
        """Build documents from the issue subject, description and contact text."""
        issue_id = int(issue["id"])
        subject = issue.get("subject") or ""
        description = issue.get("description") or ""
        contact = self._issue_contact(issue)
        contact_text = self._contact_text(contact)
        text = f"Issue #{issue_id}: {subject}\n\n{description}\n\n{contact_text}".strip()
        return self._documents_for_record(
            base_id=f"redmine:issue:{issue_id}",
            text=text,
            issue=issue,
            doc_type="issue",
            source_record_id=f"issue:{issue_id}",
            record=issue,
        )

    def _journal_documents(self, issue: Issue) -> List[IndexDocument]:
        """Build documents from every journal entry that has non-blank notes."""
        docs: List[IndexDocument] = []
        issue_id = int(issue["id"])
        for journal in issue.get("journals") or []:
            notes = journal.get("notes") or ""
            if not notes.strip():
                continue
            docs.extend(
                self._documents_for_record(
                    base_id=f"redmine:issue:{issue_id}:journal:{journal['id']}",
                    text=notes,
                    issue=issue,
                    doc_type="journal",
                    source_record_id=f"journal:{journal['id']}",
                    record=journal,
                    extra={
                        "journal_id": journal.get("id"),
                        "visibility": "private" if journal.get("private_notes") else "public",
                        "created_on": journal.get("created_on") or issue.get("updated_on"),
                    },
                )
            )
        return docs

    def _message_documents(self, issue: Issue) -> List[IndexDocument]:
        """Build documents from ``messages`` (or ``journal_messages``) with a non-blank body."""
        docs: List[IndexDocument] = []
        issue_id = int(issue["id"])
        for message in issue.get("messages") or issue.get("journal_messages") or []:
            # The body may live under any of three keys; the first non-empty one wins.
            body = message.get("body") or message.get("content") or message.get("message") or ""
            if not body.strip():
                continue
            docs.extend(
                self._documents_for_record(
                    base_id=f"redmine:issue:{issue_id}:message:{message['id']}",
                    text=body,
                    issue=issue,
                    doc_type="message",
                    source_record_id=f"message:{message['id']}",
                    record=message,
                    extra={
                        "message_id": message.get("id"),
                        "direction": message.get("direction"),
                        "created_on": message.get("created_on") or issue.get("updated_on"),
                    },
                )
            )
        return docs

    def _contact_documents(self, issue: Issue) -> List[IndexDocument]:
        """Build documents for the issue's contact; empty without an id or any text."""
        contact = self._issue_contact(issue)
        contact_id = contact.get("id")
        if not contact_id:
            return []
        text = self._contact_text(contact)
        if not text.strip():
            return []
        return self._documents_for_record(
            base_id=f"redmine:contact:{contact_id}:issue:{issue['id']}",
            text=text,
            issue=issue,
            doc_type="contact",
            source_record_id=f"contact:{contact_id}",
            record=contact,
        )

    def _documents_for_record(
        self,
        base_id: str,
        text: str,
        issue: Issue,
        doc_type: str,
        source_record_id: str,
        record: Dict[str, Any],
        extra: Optional[Payload] = None,
    ) -> List[IndexDocument]:
        """Chunk ``text`` and wrap each chunk in an ``IndexDocument``.

        Document ids are ``"{base_id}:chunk:{index}"``. Non-``None`` entries
        of ``extra`` override the base payload; each chunk additionally gets
        its ``chunk_index``.
        """
        chunks = chunk_text(text, max_chars=self.chunk_chars)
        payload = self._base_payload(issue, doc_type, source_record_id, record)
        if extra:
            payload.update({key: value for key, value in extra.items() if value is not None})
        return [
            IndexDocument(id=f"{base_id}:chunk:{index}", text=chunk, payload={**payload, "chunk_index": index})
            for index, chunk in enumerate(chunks)
        ]

    def _base_payload(self, issue: Issue, doc_type: str, source_record_id: str, record: Dict[str, Any]) -> Payload:
        """Return the payload shared by all chunks of one source record.

        Timestamps prefer the record's own values and fall back to the
        issue's. ``source_hash`` is ``stable_hash(record)``.
        """
        project = issue.get("project") or {}
        helpdesk_ticket = issue.get("helpdesk_ticket") or {}
        contact = self._issue_contact(issue)
        issue_id = int(issue["id"])
        redmine_url = issue.get("url") or f"{self.redmine_url}/issues/{issue_id}"
        created_on = record.get("created_on") or issue.get("created_on")
        updated_on = record.get("updated_on") or issue.get("updated_on")
        return {
            "source": "redmine",
            "doc_type": doc_type,
            "issue_id": issue_id,
            "project_id": project.get("id"),
            "project_identifier": project.get("identifier") or self.project_identifier,
            "project_name": project.get("name"),
            "has_helpdesk_ticket": bool(helpdesk_ticket.get("id")),
            "helpdesk_ticket_id": helpdesk_ticket.get("id"),
            "contact_id": contact.get("id"),
            "contact_email": contact.get("email"),
            "contact_name": contact.get("name"),
            "contact_company": contact.get("company"),
            "created_on": created_on,
            "updated_on": updated_on,
            "visibility": "public",
            "redmine_url": redmine_url,
            "source_record_id": source_record_id,
            "source_hash": stable_hash(record),
        }

    @staticmethod
    def _without_blanks(values: Dict[str, Any]) -> Dict[str, Any]:
        """Return ``values`` without entries whose value is ``None`` or ``""``."""
        return {key: value for key, value in values.items() if value not in (None, "")}

    def _issue_contact(self, issue: Issue) -> Payload:
        """Merge the contact details available on ``issue`` into one dict.

        Precedence, highest first: ``issue["contact"]`` (or ``"customer"``),
        the helpdesk ticket's nested ``contact``, then the ticket's flat
        ``contact_*`` / ``from_address`` fields. Blank values are dropped
        before merging, so an empty field on a higher-priority source no
        longer hides a real value from a lower-priority one.
        """
        contact = issue.get("contact") or issue.get("customer") or {}
        helpdesk_ticket = issue.get("helpdesk_ticket") or {}
        helpdesk_contact = helpdesk_ticket.get("contact") or {}
        merged = {**self._without_blanks(helpdesk_contact), **self._without_blanks(contact)}
        if not merged.get("id"):
            merged["id"] = helpdesk_ticket.get("contact_id")
        if not merged.get("email"):
            merged["email"] = helpdesk_ticket.get("contact_email") or helpdesk_ticket.get("from_address")
        if not merged.get("name"):
            merged["name"] = helpdesk_ticket.get("contact_name")
        if not merged.get("company"):
            merged["company"] = helpdesk_ticket.get("contact_company")
        # The fallbacks above may have inserted None; strip those again.
        return self._without_blanks(merged)

    def _contact_text(self, contact: Payload) -> str:
        """Join the contact's name, email, phone and company, one per line."""
        text_parts = [
            contact.get("name"),
            contact.get("email"),
            contact.get("phone"),
            contact.get("company"),
        ]
        return "\n".join(str(part) for part in text_parts if part)
def stable_hash(record: Dict[str, Any]) -> str:
    """Return the SHA-256 hex digest of ``record``'s canonical JSON form.

    The record is serialized with sorted keys and compact separators, so
    key order does not affect the digest. Values that JSON cannot encode
    natively are converted with ``str``.
    """
    encoded = json.dumps(
        record,
        sort_keys=True,
        separators=(",", ":"),
        default=str,
    ).encode("utf-8")
    digest = hashlib.sha256()
    digest.update(encoded)
    return digest.hexdigest()
class FileRefreshState:
    """JSON-file-backed record of the last successful refresh per project.

    The file holds ``{"projects": {<identifier>: {"last_successful_refresh_at": <str>}}}``.
    """

    def __init__(self, path: Path) -> None:
        """Remember ``path``; the file is only touched by ``load``/``mark_success``."""
        self.path = path

    def load(self) -> Dict[str, Any]:
        """Return the stored state, or ``{}`` when the file is missing or empty.

        Raises:
            json.JSONDecodeError: If the file has content that is not valid JSON.
        """
        try:
            raw = self.path.read_text(encoding="utf-8")
        except FileNotFoundError:
            return {}
        # A zero-length or whitespace-only file means "no state yet".
        if not raw.strip():
            return {}
        return json.loads(raw)

    def mark_success(self, project_identifier: str, timestamp: Optional[str] = None) -> None:
        """Record a successful refresh for ``project_identifier``.

        Args:
            project_identifier: Project whose entry is replaced.
            timestamp: Value to store; defaults to the current UTC time in
                ISO 8601 format.
        """
        payload = self.load()
        payload.setdefault("projects", {})
        payload["projects"][project_identifier] = {
            "last_successful_refresh_at": timestamp or datetime.now(timezone.utc).isoformat()
        }
        self.path.parent.mkdir(parents=True, exist_ok=True)
        serialized = json.dumps(payload, indent=2, sort_keys=True) + "\n"
        # Write to a sibling file and rename it over the target so an
        # interrupted write never leaves a truncated state file behind.
        temp_path = self.path.with_name(f"{self.path.name}.tmp")
        temp_path.write_text(serialized, encoding="utf-8")
        temp_path.replace(self.path)
    def _refresh_project(self, project: str, limit: int, dry_run: bool, force_rebuild: bool, overlap_minutes: int) -> Dict[str, Any]:
        """Refresh one project's documents and return its counters.

        Scans up to ``limit`` recent issue summaries, skips those outside
        the refresh window, diffs freshly mapped documents against the
        store by ``source_hash``, and (unless ``dry_run``) deletes stale
        documents and embeds/upserts new, changed or force-rebuilt ones.

        Args:
            project: Project identifier used for the store lookup and state cutoff.
            limit: Maximum number of recent issues to scan.
            dry_run: When true, only count; the store and embedder are not called
                for writes.
            force_rebuild: When true, ignore the refresh window and re-embed
                documents even if their hash is unchanged.
            overlap_minutes: Passed to ``_cutoff_for_project`` to widen the window.

        Returns:
            A dict of counters keyed like the ``result`` literal below.
        """
        summaries = list(self._recent_issue_summaries(limit))
        result: Dict[str, Any] = {
            "project_identifier": project,
            "issues": len(summaries),
            "scanned_issues": len(summaries),
            "detail_fetched_issues": 0,
            "skipped_issues": 0,
            "documents": 0,
            "unchanged_documents": 0,
            "changed_documents": 0,
            "new_documents": 0,
            "stale_documents": 0,
            "force_rebuilt_documents": 0,
            "would_embed_documents": 0,
            "embedded_documents": 0,
        }
        # None means "no usable previous refresh", so every issue is processed.
        cutoff = self._cutoff_for_project(project, overlap_minutes)
        docs_to_embed: List[IndexDocument] = []
        stale_ids: List[str] = []
        for summary in summaries:
            # Skip issues not updated since the cutoff, unless a rebuild is forced.
            if cutoff is not None and not force_rebuild and not self._issue_is_in_refresh_window(summary, cutoff):
                result["skipped_issues"] += 1
                continue
            issue = self._issue_detail(summary)
            result["detail_fetched_issues"] += 1
            candidates = deduplicate_documents(self.mapper.issue_to_documents(issue))
            result["documents"] += len(candidates)
            # NOTE(review): the lookup is capped at 5000 documents per issue;
            # anything beyond that would not be seen as existing — confirm acceptable.
            existing = self.store.list_documents(
                limit=5000,
                source="redmine",
                project_identifier=project,
                issue_id=int(issue["id"]),
            )
            existing_by_id = {document["id"]: document for document in existing}
            candidate_by_id = {document.id: document for document in candidates}
            # Stored ids that the mapper no longer produces are stale.
            for stale_id in sorted(set(existing_by_id) - set(candidate_by_id)):
                stale_ids.append(stale_id)
                result["stale_documents"] += 1
            for document in candidates:
                existing_document = existing_by_id.get(document.id)
                if existing_document is None:
                    # Counted as new even under force_rebuild.
                    result["new_documents"] += 1
                    docs_to_embed.append(document)
                    continue
                existing_hash = (existing_document.get("payload") or {}).get("source_hash")
                document_hash = document.payload.get("source_hash")
                if force_rebuild:
                    result["force_rebuilt_documents"] += 1
                    docs_to_embed.append(document)
                elif existing_hash != document_hash:
                    result["changed_documents"] += 1
                    docs_to_embed.append(document)
                else:
                    result["unchanged_documents"] += 1
        result["would_embed_documents"] = len(docs_to_embed)
        if dry_run:
            # embedded_documents stays 0: nothing was written.
            return result
        if stale_ids:
            self.store.delete_documents(stale_ids)
        if docs_to_embed:
            vectors = self.embedder.embed_documents(docs_to_embed)
            self.store.upsert(docs_to_embed, vectors)
        result["embedded_documents"] = len(docs_to_embed)
        return result
def parse_redmine_datetime(raw: str) -> datetime:
    """Parse an ISO 8601 timestamp string into a UTC-aware ``datetime``.

    ``Z`` is rewritten to ``+00:00`` before parsing. A value without an
    offset is taken to be UTC; a value with an offset is converted to UTC.
    """
    moment = datetime.fromisoformat(raw.replace("Z", "+00:00"))
    if moment.tzinfo is not None:
        return moment.astimezone(timezone.utc)
    return moment.replace(tzinfo=timezone.utc)
&& pwd) + +load_env_defaults() { + local file=$1 + local key value + [[ -r "$file" ]] || return 0 + while IFS= read -r line || [[ -n "$line" ]]; do + line=${line#"${line%%[![:space:]]*}"} + line=${line%"${line##*[![:space:]]}"} + [[ -z "$line" || "$line" == \#* || "$line" != *=* ]] && continue + key=${line%%=*} + value=${line#*=} + key=${key%"${key##*[![:space:]]}"} + value=${value#"${value%%[![:space:]]*}"} + value=${value%"${value##*[![:space:]]}"} + value=${value%\"} + value=${value#\"} + value=${value%\'} + value=${value#\'} + if [[ -z "${!key+x}" ]]; then + export "$key=$value" + fi + done < "$file" +} + +load_env_defaults /etc/semantic-index.env + +mode=dry-run +while [[ $# -gt 0 ]]; do + case "$1" in + --apply) + mode=apply + shift + ;; + --dry-run) + mode=dry-run + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + usage + exit 2 + ;; + esac +done + +project_limits=${SEMANTIC_INDEX_PROJECT_LIMITS:-customer-service=500,hiring=200,todo-jason=200,sales-inbox=100,business-development=100,dock-scheduling=100,prep-standardization=100} +log_dir=${SEMANTIC_INDEX_LOG_DIR:-.cache/semantic_index/logs} +state_path=${SEMANTIC_INDEX_STATE_PATH:-.cache/semantic_index/refresh_state.json} +overlap_minutes=${SEMANTIC_INDEX_OVERLAP_MINUTES:-15} +python_bin=${PYTHON:-$install_root/.venv/bin/python} + +mkdir -p "$log_dir" "$(dirname "$state_path")" +timestamp=$(date -u +"%Y%m%dT%H%M%SZ") +log_file="$log_dir/redmine-refresh-$timestamp.log" + +args=( + -m semantic_index + --refresh-redmine-projects + --project-limits "$project_limits" + --state-path "$state_path" + --overlap-minutes "$overlap_minutes" +) + +if [[ "$mode" == "dry-run" ]]; then + args+=(--dry-run) +fi + +{ + printf 'started_at=%s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" + printf 'mode=%s\n' "$mode" + printf 'project_limits=%s\n' "$project_limits" + printf 'state_path=%s\n' "$state_path" + printf 'overlap_minutes=%s\n' "$overlap_minutes" + cd "$install_root" + "$python_bin" "${args[@]}" + printf '\nfinished_at=%s\n' 
def keyword_boost(query_text: str, result: SearchResult) -> float:
    """Return an additive score boost for literal query terms found in ``result``.

    The haystack is the lower-cased result text followed by every
    non-``None`` payload value. Each occurrence in ``query_text`` of one of
    the following adds its weight when it appears in the haystack as a
    substring: a double-quoted phrase (0.35), an email address (0.3), an
    identifier-like token such as a number or ``ABC-123`` (0.25, also tried
    without a leading ``#``), and a plain word of three or more characters
    (0.03).
    """
    payload_text = " ".join(str(value) for value in result.payload.values() if value is not None)
    haystack = " ".join((result.text, payload_text)).lower()

    # (pattern, weight, retry the match with leading '#' removed)
    rules = (
        (r'"([^"]+)"', 0.35, False),
        (r"[\w.+-]+@[\w.-]+\.[A-Za-z]{2,}", 0.3, False),
        (r"\b(?:#?\d{2,}|[A-Z]{2,}[-_]\d{2,}|[A-Z0-9]{4,}-[A-Z0-9-]{2,})\b", 0.25, True),
        (r"\b[A-Za-z][\w.-]{2,}\b", 0.03, False),
    )

    total = 0.0
    for pattern, weight, retry_without_hash in rules:
        for found in re.findall(pattern, query_text):
            needle = found.lower()
            matched = needle in haystack
            if not matched and retry_without_hash:
                matched = needle.lstrip("#") in haystack
            if matched:
                total += weight
    return total
&& pwd) + +read_env_value() { + local key=$1 + local file + for file in /etc/semantic-index.env "$install_root/semantic_index/.env" "$install_root/.env" semantic_index/.env .env; do + if [[ -f "$file" ]]; then + awk -F= -v key="$key" ' + $1 == key { + value = substr($0, index($0, "=") + 1) + gsub(/^[ \t"'\''"]+|[ \t"'\''"]+$/, "", value) + print value + exit + } + ' "$file" + return + fi + done +} + +json_escape() { + sed \ + -e 's/\\/\\\\/g' \ + -e 's/"/\\"/g' \ + -e ':a;N;$!ba;s/\n/\\n/g' +} + +escaped_query=$(printf '%s' "$query" | json_escape) +payload="{\"query\":\"$escaped_query\",\"limit\":$limit" +if [[ -n "$project" ]]; then + escaped_project=$(printf '%s' "$project" | json_escape) + payload="$payload,\"project_identifier\":\"$escaped_project\"" +fi +payload="$payload}" + +api_key=${SEMANTIC_INDEX_API_KEY:-$(read_env_value SEMANTIC_INDEX_API_KEY)} + +args=(-sS -H "Content-Type: application/json" -d "$payload") +if [[ -n "${api_key:-}" ]]; then + args+=(-H "Authorization: Bearer $api_key") +fi + +curl "${args[@]}" "$base_url/search" diff --git a/semantic_index/search.sh.md b/semantic_index/search.sh.md new file mode 100644 index 0000000..0c3366a --- /dev/null +++ b/semantic_index/search.sh.md @@ -0,0 +1,31 @@ +• Added semantic_index/search.sh. It outputs raw JSON from the running HTTP service. + + Usage: + + semantic_index/search.sh "goods return" customer-service 3 + semantic_index/search.sh "candidate follow up" hiring 5 | jq '.results[] | {id, score, citation}' + + It defaults to http://127.0.0.1:8787, reads SEMANTIC_INDEX_API_KEY from the environment or semantic_i + ndex/.env, and accepts: + + semantic_index/search.sh "query text" [project_identifier] [limit] + + Verified: + + - Script works against the local service. + - Semantic tests still pass: 49 tests, OK. + - Python compile check passed. + + +› Okay yeah, that's nice and fast + + +• Good. 
That script is the right shape for quick manual probing: fast because it avoids Python startup, + plain JSON so it pipes cleanly, and thin enough that it won’t hide what the API is returning. + + A couple useful examples: + + semantic_index/search.sh "goods return" customer-service 3 | jq '.results[].citation' + semantic_index/search.sh "candidate follow up" hiring 5 | jq -r '.results[] | "\(.score) + \(.citation.url) \(.snippet)"' + semantic_index/search.sh "callum@safetagtracking.com" customer-service 3 | jq '.results[0]' diff --git a/tests/semantic_index/test_app.py b/tests/semantic_index/test_app.py new file mode 100644 index 0000000..8e15ed7 --- /dev/null +++ b/tests/semantic_index/test_app.py @@ -0,0 +1,115 @@ +import unittest +from pathlib import Path + +from semantic_index.app import create_app +from semantic_index.config import Settings +from semantic_index.models import SearchResult + + +class FakeSearchService: + def search(self, query): + return [ + SearchResult( + id="redmine:issue:1:chunk:0", + score=0.8, + text="Snippet text", + payload={ + "source": "redmine", + "project_identifier": "customer-service", + "doc_type": "issue", + "issue_id": 1, + "redmine_url": "http://redmine/issues/1", + "source_record_id": "issue:1", + }, + ) + ] + + def get_document(self, document_id): + return {"id": document_id, "text": "Full text", "payload": {}} + + +class FakeStore: + def list_projects(self, source=None, limit=1000): + return [{"project_identifier": "customer-service", "document_count": 10}] + + +class FakeRefreshService: + def __init__(self): + self.calls = [] + + def refresh_redmine_project_limits(self, project_limits, dry_run=False, force_rebuild=False, overlap_minutes=15): + self.calls.append((project_limits, dry_run, force_rebuild, overlap_minutes)) + return {"source": "redmine", "projects": len(project_limits), "dry_run": dry_run} + + +def fake_services(): + refresh = FakeRefreshService() + return { + "settings": Settings( + openai_api_key="", + 
qdrant_url="http://qdrant", + qdrant_api_key=None, + qdrant_collection="semantic", + redmine_url="http://redmine", + redmine_api_key="", + redmine_project_identifier=None, + sample_limit=50, + bind_host="127.0.0.1", + bind_port=8787, + service_api_key=None, + refresh_state_path=Path(".cache/semantic_index/refresh_state.json"), + ), + "search": FakeSearchService(), + "store": FakeStore(), + "refresh": refresh, + } + + +class SemanticIndexAppTest(unittest.TestCase): + def test_health_does_not_build_live_services(self): + def broken_builder(): + raise AssertionError("health should not build live clients") + + app = create_app(service_builder=broken_builder) + routes = {route.path: route.endpoint for route in app.routes} + + self.assertEqual({"status": "ok"}, routes["/health"]()) + + def test_search_endpoint_returns_normalized_agent_response(self): + app = create_app(service_builder=fake_services) + routes = {route.path: route.endpoint for route in app.routes} + + response = routes["/search"]({"query": "printer", "project_identifier": "customer-service", "limit": 3}) + + self.assertEqual("printer", response["query"]) + self.assertEqual("customer-service", response["filters"]["project_identifier"]) + self.assertEqual("customer-service", response["results"][0]["citation"]["project_identifier"]) + + def test_projects_endpoint_lists_indexed_projects(self): + app = create_app(service_builder=fake_services) + routes = {route.path: route.endpoint for route in app.routes} + + response = routes["/projects"]() + + self.assertEqual("customer-service", response["projects"][0]["project_identifier"]) + + def test_refresh_endpoint_passes_project_limits_and_cost_flags(self): + services = fake_services() + app = create_app(service_builder=lambda: services) + routes = {route.path: route.endpoint for route in app.routes} + + response = routes["/sources/redmine/refresh"]( + { + "project_limits": {"customer-service": 5}, + "dry_run": True, + "force_rebuild": False, + "overlap_minutes": 30, 
+ } + ) + + self.assertTrue(response["dry_run"]) + self.assertEqual(({"customer-service": 5}, True, False, 30), services["refresh"].calls[0]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/semantic_index/test_backfill_mcp.py b/tests/semantic_index/test_backfill_mcp.py new file mode 100644 index 0000000..9a7c3e0 --- /dev/null +++ b/tests/semantic_index/test_backfill_mcp.py @@ -0,0 +1,182 @@ +import unittest + +from semantic_index.ingest import BackfillService +from semantic_index.mcp import SemanticMCP +from semantic_index.models import SearchQuery, SearchResult +from semantic_index.redmine import RedmineMapper + + +class FakeRedmineSource: + project_identifier = None + + def recent_helpdesk_issues(self, limit): + return [ + { + "id": 1, + "subject": "First", + "description": "First body", + "project": {"identifier": self.project_identifier}, + }, + { + "id": 2, + "subject": "Second", + "description": "Second body", + "project": {"identifier": self.project_identifier}, + }, + ][:limit] + + +class DuplicateDocumentRedmineSource: + project_identifier = "customer-service" + + def recent_helpdesk_issues(self, limit): + return [ + {"id": 1, "subject": "First", "description": "First body", "project": {"identifier": "customer-service"}}, + {"id": 1, "subject": "First duplicate", "description": "Duplicate body", "project": {"identifier": "customer-service"}}, + ][:limit] + + +class FakeEmbedder: + def embed_documents(self, docs): + return [[float(i), 0.0, 0.0] for i, _ in enumerate(docs, start=1)] + + def embed_query(self, text): + return [0.1, 0.0, 0.0] + + +class FakeStore: + def __init__(self): + self.deleted = [] + self.upserts = [] + + def rebuild_source(self, source, docs, vectors, project_identifier=None): + self.deleted.append((source, project_identifier)) + self.upserts.append((docs, vectors)) + + def list_projects(self, source=None, limit=1000): + return [ + {"project_identifier": "customer-service", "document_count": 1684}, + 
{"project_identifier": "hiring", "document_count": 409}, + ] + + +class FakeRefreshService: + def __init__(self): + self.calls = [] + + def refresh_redmine_project_limits(self, project_limits, dry_run=False, force_rebuild=False, overlap_minutes=15): + self.calls.append((project_limits, dry_run, force_rebuild, overlap_minutes)) + return {"source": "redmine", "projects": len(project_limits), "dry_run": dry_run} + + +class FakeSearchService: + def __init__(self): + self.queries = [] + + def search(self, query): + self.queries.append(query) + return [SearchResult(id="doc1", score=0.5, text="Snippet", payload={"redmine_url": "http://redmine/issues/1"})] + + def get_document(self, document_id): + return {"id": document_id, "text": "Snippet"} + + +class BackfillAndMCPTest(unittest.TestCase): + def test_sample_backfill_rebuilds_redmine_source(self): + service = BackfillService(source=FakeRedmineSource(), embedder=FakeEmbedder(), store=FakeStore()) + + result = service.backfill_redmine_sample(limit=2) + + self.assertEqual({"source": "redmine", "issues": 2, "documents": 2}, result) + self.assertEqual([("redmine", None)], service.store.deleted) + docs, vectors = service.store.upserts[0] + self.assertEqual(["redmine:issue:1:chunk:0", "redmine:issue:2:chunk:0"], [doc.id for doc in docs]) + self.assertEqual(2, len(vectors)) + + def test_sample_backfill_rebuilds_only_the_configured_project_scope(self): + store = FakeStore() + service = BackfillService( + source=FakeRedmineSource(), + embedder=FakeEmbedder(), + store=store, + mapper=RedmineMapper(redmine_url="", project_identifier="customer-service"), + ) + + service.backfill_redmine_sample(limit=1) + + self.assertEqual([("redmine", "customer-service")], store.deleted) + + def test_multi_project_backfill_rebuilds_each_project_scope(self): + store = FakeStore() + service = BackfillService(source=FakeRedmineSource(), embedder=FakeEmbedder(), store=store) + + result = service.backfill_redmine_projects(["customer-service", "hiring"], 
per_project_limit=1) + + self.assertEqual( + { + "source": "redmine", + "projects": 2, + "issues": 2, + "documents": 2, + "project_results": [ + {"project_identifier": "customer-service", "issues": 1, "documents": 1}, + {"project_identifier": "hiring", "issues": 1, "documents": 1}, + ], + }, + result, + ) + self.assertEqual([("redmine", "customer-service"), ("redmine", "hiring")], store.deleted) + self.assertEqual("customer-service", store.upserts[0][0][0].payload["project_identifier"]) + self.assertEqual("hiring", store.upserts[1][0][0].payload["project_identifier"]) + + def test_multi_project_backfill_accepts_per_project_limits(self): + store = FakeStore() + service = BackfillService(source=FakeRedmineSource(), embedder=FakeEmbedder(), store=store) + + result = service.backfill_redmine_project_limits({"customer-service": 2, "hiring": 1}) + + self.assertEqual(3, result["issues"]) + self.assertEqual( + [ + {"project_identifier": "customer-service", "issues": 2, "documents": 2}, + {"project_identifier": "hiring", "issues": 1, "documents": 1}, + ], + result["project_results"], + ) + + def test_backfill_deduplicates_documents_by_stable_id_before_embedding(self): + store = FakeStore() + service = BackfillService(source=DuplicateDocumentRedmineSource(), embedder=FakeEmbedder(), store=store) + + result = service.backfill_redmine_sample(limit=2) + + self.assertEqual({"source": "redmine", "issues": 2, "documents": 1}, result) + docs, vectors = store.upserts[0] + self.assertEqual(["redmine:issue:1:chunk:0"], [doc.id for doc in docs]) + self.assertEqual(1, len(vectors)) + + def test_mcp_tools_return_json_ready_results(self): + search = FakeSearchService() + refresh = FakeRefreshService() + mcp = SemanticMCP(search_service=search, backfill_service=None, store=FakeStore(), refresh_service=refresh) + + response = mcp.call_tool("semantic_search", {"query": "printer", "source": "redmine", "project_identifier": "hiring", "limit": 3}) + document = 
mcp.call_tool("semantic_get_document", {"id": "doc1"}) + projects = mcp.call_tool("semantic_list_projects", {"source": "redmine"}) + refresh_response = mcp.call_tool("semantic_refresh_redmine", {"project_identifier": "customer-service", "limit": 5, "dry_run": True}) + + self.assertEqual("printer", response["query"]) + self.assertEqual("hiring", response["filters"]["project_identifier"]) + self.assertEqual("doc1", response["results"][0]["id"]) + self.assertEqual("http://redmine/issues/1", response["results"][0]["citation"]["url"]) + self.assertIsInstance(search.queries[0], SearchQuery) + self.assertEqual("redmine", search.queries[0].source) + self.assertEqual("hiring", search.queries[0].project_identifier) + self.assertEqual({"id": "doc1", "text": "Snippet"}, document) + self.assertEqual("customer-service", projects["projects"][0]["project_identifier"]) + self.assertTrue(refresh_response["dry_run"]) + self.assertEqual(({"customer-service": 5}, True, False, 15), refresh.calls[0]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/semantic_index/test_cli.py b/tests/semantic_index/test_cli.py new file mode 100644 index 0000000..00c0843 --- /dev/null +++ b/tests/semantic_index/test_cli.py @@ -0,0 +1,37 @@ +import subprocess +import sys +from pathlib import Path +from tempfile import TemporaryDirectory +import unittest + +from semantic_index.config import load_settings + + +class SemanticIndexCliTest(unittest.TestCase): + def test_help_does_not_require_http_runtime_dependencies(self): + result = subprocess.run( + [sys.executable, "-m", "semantic_index", "--help"], + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + + self.assertEqual("", result.stderr) + self.assertEqual(0, result.returncode) + self.assertIn("--mcp-stdio", result.stdout) + + def test_settings_load_from_package_env_when_root_env_missing(self): + with TemporaryDirectory() as tmp: + env_path = Path(tmp) / "semantic_index" / ".env" + env_path.parent.mkdir() 
+ env_path.write_text("QDRANT_URL=http://qdrant.example:6333\nREDMINE_SAMPLE_LIMIT=7\n", encoding="utf-8") + + settings = load_settings(Path(tmp) / ".env") + + self.assertEqual("http://qdrant.example:6333", settings.qdrant_url) + self.assertEqual(7, settings.sample_limit) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/semantic_index/test_client.py b/tests/semantic_index/test_client.py new file mode 100644 index 0000000..c3235cc --- /dev/null +++ b/tests/semantic_index/test_client.py @@ -0,0 +1,87 @@ +import json +import unittest +from unittest.mock import patch + +from semantic_index.client import SemanticIndexClient +from semantic_index.models import SearchResult + + +class FakeSearchService: + def __init__(self): + self.queries = [] + + def search(self, query): + self.queries.append(query) + return [ + SearchResult( + id="redmine:issue:1:chunk:0", + score=0.7, + text="Candidate follow up", + payload={ + "source": "redmine", + "project_identifier": "hiring", + "doc_type": "issue", + "issue_id": 1, + "redmine_url": "http://redmine/issues/1", + "source_record_id": "issue:1", + }, + ) + ] + + def get_document(self, document_id): + return {"id": document_id, "text": "Full text", "payload": {"project_identifier": "hiring"}} + + +class SemanticIndexClientTest(unittest.TestCase): + def test_in_process_client_returns_normalized_search_response(self): + search = FakeSearchService() + client = SemanticIndexClient(search_service=search) + + response = client.search("candidate follow up", project_identifier="hiring", limit=3) + + self.assertEqual("candidate follow up", response["query"]) + self.assertEqual({"project_identifier": "hiring", "limit": 3}, response["filters"]) + self.assertEqual("redmine:issue:1:chunk:0", response["results"][0]["id"]) + self.assertEqual("hiring", response["results"][0]["citation"]["project_identifier"]) + self.assertEqual("hiring", search.queries[0].project_identifier) + + def test_in_process_client_get_document(self): + 
client = SemanticIndexClient(search_service=FakeSearchService()) + + document = client.get_document("redmine:issue:1:chunk:0") + + self.assertEqual("Full text", document["text"]) + + def test_http_client_sends_auth_header_and_parses_search_response(self): + body = json.dumps({"query": "printer", "filters": {}, "results": []}).encode() + + class FakeResponse: + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def read(self): + return body + + captured = {} + + def fake_urlopen(request, timeout): + captured["url"] = request.full_url + captured["authorization"] = request.headers.get("Authorization") + captured["body"] = json.loads(request.data.decode()) + return FakeResponse() + + with patch("urllib.request.urlopen", fake_urlopen): + client = SemanticIndexClient(base_url="http://semantic.local", api_key="secret") + response = client.search("printer", project_identifier="customer-service") + + self.assertEqual("http://semantic.local/search", captured["url"]) + self.assertEqual("Bearer secret", captured["authorization"]) + self.assertEqual("customer-service", captured["body"]["project_identifier"]) + self.assertEqual("printer", response["query"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/semantic_index/test_documents.py b/tests/semantic_index/test_documents.py new file mode 100644 index 0000000..b1cd3ec --- /dev/null +++ b/tests/semantic_index/test_documents.py @@ -0,0 +1,138 @@ +import unittest + +from semantic_index.models import IndexDocument +from semantic_index.redmine import RedmineMapper + + +class RedmineMapperTest(unittest.TestCase): + def test_issue_chunks_have_stable_ids_and_metadata(self): + issue = { + "id": 42, + "subject": "Widget order ORD-12345 cannot ship", + "description": "Customer reports that widget order ORD-12345 is blocked.", + "project": {"id": 7, "identifier": "fud-helpdesk"}, + "contact": {"id": 9, "email": "ada@example.com", "name": "Ada Lovelace"}, + "created_on": 
"2026-04-01T10:00:00Z", + "updated_on": "2026-04-02T10:00:00Z", + "url": "http://redmine.local/issues/42", + } + + first = RedmineMapper(redmine_url="http://redmine.local").issue_to_documents(issue) + second = RedmineMapper(redmine_url="http://redmine.local").issue_to_documents(issue) + + self.assertEqual([doc.id for doc in first], [doc.id for doc in second]) + self.assertEqual("redmine:issue:42:chunk:0", first[0].id) + self.assertEqual("issue", first[0].payload["doc_type"]) + self.assertEqual(42, first[0].payload["issue_id"]) + self.assertEqual("fud-helpdesk", first[0].payload["project_identifier"]) + self.assertIsNone(first[0].payload["project_name"]) + self.assertFalse(first[0].payload["has_helpdesk_ticket"]) + self.assertEqual("ada@example.com", first[0].payload["contact_email"]) + self.assertEqual("Ada Lovelace", first[0].payload["contact_name"]) + self.assertEqual("http://redmine.local/issues/42", first[0].payload["redmine_url"]) + self.assertIn("source_hash", first[0].payload) + + def test_helpdesk_ticket_contact_is_mapped_to_all_issue_chunks(self): + issue = { + "id": 39779, + "subject": "Goods return", + "description": "Please arrange to return these goods.", + "project": {"id": 1, "identifier": "customer-service"}, + "helpdesk_ticket": { + "id": 35159, + "contact_id": 1890, + "from_address": "callum@safetagtracking.com", + "contact": { + "id": 1890, + "name": "Callum Mackeonis", + "company": "SafeTag Tracking", + "email": "callum@safetagtracking.com", + }, + }, + "journals": [ + {"id": 71570, "notes": "Hello, yes we can arrange this today.", "created_on": "2026-04-14T14:29:49Z"} + ], + } + + docs = RedmineMapper(redmine_url="http://redmine.local").issue_to_documents(issue) + issue_doc = next(doc for doc in docs if doc.payload["doc_type"] == "issue") + journal_doc = next(doc for doc in docs if doc.payload["doc_type"] == "journal") + contact_doc = next(doc for doc in docs if doc.payload["doc_type"] == "contact") + + for doc in (issue_doc, journal_doc, 
contact_doc): + self.assertEqual(35159, doc.payload["helpdesk_ticket_id"]) + self.assertTrue(doc.payload["has_helpdesk_ticket"]) + self.assertEqual(1890, doc.payload["contact_id"]) + self.assertEqual("Callum Mackeonis", doc.payload["contact_name"]) + self.assertEqual("SafeTag Tracking", doc.payload["contact_company"]) + self.assertEqual("callum@safetagtracking.com", doc.payload["contact_email"]) + self.assertIn("Callum Mackeonis", issue_doc.text) + self.assertIn("callum@safetagtracking.com", contact_doc.text) + + def test_configured_project_identifier_is_used_when_issue_payload_omits_identifier(self): + issue = { + "id": 42, + "subject": "Widget order", + "description": "Body", + "project": {"id": 1, "name": "Customer Service"}, + } + + docs = RedmineMapper( + redmine_url="http://redmine.local", + project_identifier="customer-service", + ).issue_to_documents(issue) + + self.assertEqual("customer-service", docs[0].payload["project_identifier"]) + self.assertEqual("Customer Service", docs[0].payload["project_name"]) + + def test_internal_non_helpdesk_issue_keeps_project_metadata_without_contact(self): + issue = { + "id": 55, + "subject": "Internal hiring task", + "description": "Follow up with candidate.", + "project": {"id": 68, "identifier": "hiring", "name": "Hiring"}, + } + + docs = RedmineMapper(redmine_url="http://redmine.local").issue_to_documents(issue) + + self.assertEqual(1, len(docs)) + self.assertEqual("hiring", docs[0].payload["project_identifier"]) + self.assertEqual("Hiring", docs[0].payload["project_name"]) + self.assertFalse(docs[0].payload["has_helpdesk_ticket"]) + self.assertIsNone(docs[0].payload["contact_id"]) + + def test_issue_journals_messages_and_contact_are_mapped(self): + issue = { + "id": 42, + "subject": "Widget order", + "description": "Ticket envelope", + "project": {"id": 7, "identifier": "fud-helpdesk"}, + "contact": {"id": 9, "email": "ada@example.com", "name": "Ada Lovelace"}, + "journals": [ + {"id": 5, "notes": "Private escalation 
note", "private_notes": True, "created_on": "2026-04-03T10:00:00Z"} + ], + "messages": [ + {"id": 6, "body": "Customer reply body", "direction": "incoming", "created_on": "2026-04-03T11:00:00Z"} + ], + } + + docs = RedmineMapper(redmine_url="http://redmine.local").issue_to_documents(issue) + ids = {doc.id for doc in docs} + types = {doc.payload["doc_type"] for doc in docs} + + self.assertIn("redmine:issue:42:journal:5:chunk:0", ids) + self.assertIn("redmine:issue:42:message:6:chunk:0", ids) + self.assertIn("redmine:contact:9:issue:42:chunk:0", ids) + self.assertEqual({"issue", "journal", "message", "contact"}, types) + journal = next(doc for doc in docs if doc.payload["doc_type"] == "journal") + message = next(doc for doc in docs if doc.payload["doc_type"] == "message") + self.assertEqual("private", journal.payload["visibility"]) + self.assertEqual("incoming", message.payload["direction"]) + + def test_empty_documents_are_rejected(self): + with self.assertRaises(ValueError): + IndexDocument(id="x", text=" ", payload={}) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/semantic_index/test_embeddings.py b/tests/semantic_index/test_embeddings.py new file mode 100644 index 0000000..66ca723 --- /dev/null +++ b/tests/semantic_index/test_embeddings.py @@ -0,0 +1,46 @@ +import unittest + +from semantic_index.embeddings import OpenAIEmbedder +from semantic_index.models import IndexDocument + + +class FakeOpenAIClient: + def __init__(self): + self.calls = [] + + def create_embeddings(self, model, inputs, dimensions=None): + self.calls.append({"model": model, "inputs": list(inputs), "dimensions": dimensions}) + return [[float(i)] * 3 for i, _ in enumerate(inputs, start=1)] + + +class OpenAIEmbedderTest(unittest.TestCase): + def test_batches_embedding_requests(self): + client = FakeOpenAIClient() + embedder = OpenAIEmbedder(client=client, batch_size=2, dimensions=1536) + docs = [ + IndexDocument(id="a", text="alpha", payload={}), + IndexDocument(id="b", 
text="bravo", payload={}), + IndexDocument(id="c", text="charlie", payload={}), + ] + + vectors = embedder.embed_documents(docs) + + self.assertEqual([[1.0, 1.0, 1.0], [2.0, 2.0, 2.0], [1.0, 1.0, 1.0]], vectors) + self.assertEqual(2, len(client.calls)) + self.assertEqual(["alpha", "bravo"], client.calls[0]["inputs"]) + self.assertEqual("text-embedding-3-small", client.calls[0]["model"]) + self.assertEqual(1536, client.calls[0]["dimensions"]) + + def test_rejects_empty_or_oversized_chunks_before_api_call(self): + client = FakeOpenAIClient() + embedder = OpenAIEmbedder(client=client, max_chars=5) + + with self.assertRaises(ValueError): + embedder.embed_texts(["ok", " "]) + with self.assertRaises(ValueError): + embedder.embed_texts(["toolong"]) + self.assertEqual([], client.calls) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/semantic_index/test_inspect_cli.py b/tests/semantic_index/test_inspect_cli.py new file mode 100644 index 0000000..d45f60c --- /dev/null +++ b/tests/semantic_index/test_inspect_cli.py @@ -0,0 +1,394 @@ +import io +import json +import unittest +from contextlib import redirect_stdout +from pathlib import Path + +from semantic_index.__main__ import main +from semantic_index.config import Settings +from semantic_index.models import SearchResult + + +class FakeSearchService: + def __init__(self): + self.queries = [] + + def search(self, query): + self.queries.append(query) + if "missing@example.test" in query.text: + return [] + return [ + SearchResult( + id="redmine:contact:1890:issue:39779:chunk:0" if "callum" in query.text else "redmine:issue:39779:chunk:0", + score=0.58, + text="Callum Mackeonis callum@safetagtracking.com SafeTag Tracking", + payload={ + "source": "redmine", + "doc_type": "contact" if "callum" in query.text else "issue", + "issue_id": 39779, + "project_identifier": "customer-service", + "contact_id": 1890, + "contact_name": "Callum Mackeonis", + "contact_email": "callum@safetagtracking.com", + 
"contact_company": "SafeTag Tracking", + "redmine_url": "http://redmine/issues/39779", + }, + ) + ] + + def get_document(self, document_id): + return { + "id": document_id, + "text": "Full indexed text", + "payload": { + "source": "redmine", + "doc_type": "journal", + "issue_id": 39778, + "project_identifier": "customer-service", + "contact_id": 1890, + "contact_name": "Callum Mackeonis", + "contact_email": "callum@safetagtracking.com", + "redmine_url": "http://redmine/issues/39778", + }, + } + + +class FakeStore: + def __init__(self): + self.list_limits = [] + + def count_documents(self, source=None, project_identifier=None, doc_type=None): + return 12 + + def list_documents(self, limit=10, source=None, project_identifier=None, doc_type=None): + self.list_limits.append(limit) + return [ + { + "id": "redmine:issue:39779:chunk:0", + "text": "Issue #39779: Goods return\nPlease return our goods.", + "payload": { + "source": "redmine", + "doc_type": "issue", + "issue_id": 39779, + "project_identifier": "customer-service", + "project_name": "Customer Service", + "has_helpdesk_ticket": True, + "contact_id": 1890, + "contact_name": "Callum Mackeonis", + "contact_email": "callum@safetagtracking.com", + "contact_company": "SafeTag Tracking", + "source_hash": "issue-hash", + "redmine_url": "http://redmine/issues/39779", + }, + }, + { + "id": "redmine:issue:39779:journal:71570:chunk:0", + "text": "Hello, we can arrange this today.", + "payload": { + "source": "redmine", + "doc_type": "journal", + "issue_id": 39779, + "project_identifier": "customer-service", + "project_name": "Customer Service", + "has_helpdesk_ticket": True, + "contact_id": 1890, + "contact_name": "Callum Mackeonis", + "contact_email": "callum@safetagtracking.com", + "contact_company": "SafeTag Tracking", + "source_hash": "journal-hash", + "redmine_url": "http://redmine/issues/39779", + }, + }, + { + "id": "redmine:contact:1890:issue:39779:chunk:0", + "text": "Callum Mackeonis callum@safetagtracking.com 
SafeTag Tracking", + "payload": { + "source": "redmine", + "doc_type": "contact", + "issue_id": 39779, + "project_identifier": "customer-service", + "project_name": "Customer Service", + "has_helpdesk_ticket": True, + "contact_id": 1890, + "contact_name": "Callum Mackeonis", + "contact_email": "callum@safetagtracking.com", + "contact_company": "SafeTag Tracking", + "source_hash": "contact-hash", + "redmine_url": "http://redmine/issues/39779", + }, + }, + { + "id": "redmine:issue:39800:chunk:0", + "text": "Ordinary issue with no helpdesk contact.", + "payload": { + "source": "redmine", + "doc_type": "issue", + "issue_id": 39800, + "project_identifier": "hiring", + "project_name": "Hiring", + "has_helpdesk_ticket": False, + "source_hash": "ordinary-hash", + "redmine_url": "http://redmine/issues/39800", + }, + }, + ] + + +class FakeRedmineSource: + def recent_helpdesk_issues(self, limit): + return [ + { + "id": 39779, + "subject": "Goods return", + "description": "Please return our goods.", + "project": {"id": 1, "identifier": "customer-service"}, + "helpdesk_ticket": { + "id": 35159, + "contact_id": 1890, + "contact": { + "id": 1890, + "name": "Callum Mackeonis", + "email": "callum@safetagtracking.com", + "company": "SafeTag Tracking", + }, + }, + } + ][:limit] + + +def fake_services(store=None, search=None): + settings = Settings( + openai_api_key="", + qdrant_url="http://qdrant", + qdrant_api_key=None, + qdrant_collection="semantic", + redmine_url="http://redmine", + redmine_api_key="", + redmine_project_identifier="customer-service", + sample_limit=50, + bind_host="127.0.0.1", + bind_port=8787, + service_api_key=None, + refresh_state_path=Path(".cache/semantic_index/refresh_state.json"), + ) + return { + "settings": settings, + "search": search or FakeSearchService(), + "store": store or FakeStore(), + "redmine_source": FakeRedmineSource(), + "backfill": FakeBackfillService(), + } + + +class FakeBackfillService: + def __init__(self): + self.calls = [] + + def 
backfill_redmine_sample(self, limit): + self.calls.append(("sample", limit)) + return {"source": "redmine", "issues": limit, "documents": limit} + + def backfill_redmine_projects(self, projects, per_project_limit): + self.calls.append(("projects", projects, per_project_limit)) + return { + "source": "redmine", + "projects": len(projects), + "issues": len(projects) * per_project_limit, + "documents": len(projects) * per_project_limit, + "project_results": [ + {"project_identifier": project, "issues": per_project_limit, "documents": per_project_limit} + for project in projects + ], + } + + def backfill_redmine_project_limits(self, project_limits): + self.calls.append(("project_limits", project_limits)) + return { + "source": "redmine", + "projects": len(project_limits), + "issues": sum(project_limits.values()), + "documents": sum(project_limits.values()), + "project_results": [ + {"project_identifier": project, "issues": limit, "documents": limit} + for project, limit in project_limits.items() + ], + } + + +class InspectCliTest(unittest.TestCase): + def run_cli(self, args): + out = io.StringIO() + with redirect_stdout(out): + main(args, service_builder=fake_services) + return out.getvalue() + + def test_no_args_prints_help_without_building_services(self): + def broken_services(): + raise AssertionError("help should not build live services") + + out = io.StringIO() + with redirect_stdout(out): + main([], service_builder=broken_services) + + self.assertIn("inspect", out.getvalue()) + + def test_count_lists_matching_document_count(self): + output = self.run_cli(["inspect", "count", "--source", "redmine", "--project", "customer-service"]) + + self.assertIn("12", output) + + def test_list_shows_snippet_and_metadata_by_default(self): + output = self.run_cli(["inspect", "list", "--limit", "5", "--source", "redmine", "--project", "customer-service"]) + + self.assertIn("redmine:issue:39779:chunk:0", output) + self.assertIn("issue #39779", output.lower()) + 
self.assertIn("customer-service", output) + self.assertIn("contact=#1890", output) + self.assertIn("Callum Mackeonis", output) + self.assertIn("callum@safetagtracking.com", output) + self.assertNotIn("Full indexed text", output) + + def test_search_runs_query_and_prints_citation(self): + output = self.run_cli(["inspect", "search", "order status", "--limit", "3", "--project", "customer-service"]) + + self.assertIn("score=0.5800", output) + self.assertIn("http://redmine/issues/39779", output) + + def test_show_prints_full_document_text(self): + output = self.run_cli(["inspect", "show", "redmine:issue:39778:chunk:0"]) + + self.assertIn("Full indexed text", output) + self.assertIn("doc_type=journal", output) + + def test_preview_redmine_maps_documents_without_writing(self): + output = self.run_cli(["inspect", "preview-redmine", "--limit", "1", "--project", "customer-service"]) + + self.assertIn("redmine:issue:39779:chunk:0", output) + self.assertIn("project=customer-service", output) + self.assertIn("Please return our goods", output) + + def test_preview_redmine_uses_minimal_service_builder(self): + services = [] + + def minimal_builder(settings): + services.append(settings.redmine_project_identifier) + return {"settings": settings, "redmine_source": FakeRedmineSource()} + + out = io.StringIO() + with redirect_stdout(out): + main( + ["inspect", "preview-redmine", "--limit", "1", "--project", "customer-service"], + service_builder=lambda: (_ for _ in ()).throw(AssertionError("full services should not be built")), + preview_service_builder=minimal_builder, + settings_loader=lambda: fake_services()["settings"], + ) + + self.assertEqual(["customer-service"], services) + self.assertIn("redmine:issue:39779:chunk:0", out.getvalue()) + + def test_audit_prints_doc_type_counts_contact_coverage_and_attachment_check(self): + output = self.run_cli(["inspect", "audit", "--limit", "10", "--source", "redmine", "--project", "customer-service"]) + + self.assertIn("documents=4", output) 
+ self.assertIn("doc_type issue=2", output) + self.assertIn("doc_type journal=1", output) + self.assertIn("doc_type contact=1", output) + self.assertIn("contact_metadata 3/4", output) + self.assertIn("helpdesk_contact_metadata 3/3", output) + self.assertIn("project customer-service=3", output) + self.assertIn("project hiring=1", output) + self.assertIn("attachments=0", output) + self.assertNotIn("missing_contact redmine:issue:39800:chunk:0", output) + + def test_audit_json_returns_machine_readable_summary(self): + output = self.run_cli(["inspect", "audit", "--limit", "10", "--project", "customer-service", "--json"]) + payload = json.loads(output) + + self.assertEqual(4, payload["total_documents"]) + self.assertEqual(2, payload["doc_type_counts"]["issue"]) + self.assertEqual(3, payload["project_counts"]["customer-service"]) + self.assertEqual(1, payload["project_counts"]["hiring"]) + self.assertEqual([], payload["missing_helpdesk_contact_metadata"]) + + def test_compare_redmine_reports_missing_stale_and_contact_mismatches(self): + output = self.run_cli(["inspect", "compare-redmine", "--limit", "1", "--project", "customer-service"]) + + self.assertIn("preview_documents=2", output) + self.assertIn("indexed_documents=4", output) + self.assertIn("stale", output) + self.assertIn("redmine:issue:39779:chunk:0", output) + + def test_compare_redmine_fetches_a_large_index_window_to_avoid_false_missing_results(self): + store = FakeStore() + out = io.StringIO() + with redirect_stdout(out): + main(["inspect", "compare-redmine", "--limit", "3", "--project", "customer-service"], service_builder=lambda: fake_services(store=store)) + + self.assertEqual(5000, store.list_limits[0]) + + def test_smoke_search_prints_pass_fail_for_known_queries(self): + output = self.run_cli(["inspect", "smoke-search", "--project", "customer-service", "--email", "callum@safetagtracking.com", "--issue-id", "39779"]) + + self.assertIn("PASS email callum@safetagtracking.com", output) + self.assertIn("PASS 
issue 39779", output) + self.assertIn("redmine:contact:1890:issue:39779:chunk:0", output) + + def test_smoke_search_uses_issue_id_filter_for_issue_checks(self): + search = FakeSearchService() + out = io.StringIO() + with redirect_stdout(out): + main(["inspect", "smoke-search", "--project", "customer-service", "--issue-id", "39779"], service_builder=lambda: fake_services(search=search)) + + issue_queries = [query for query in search.queries if query.text == "39779"] + self.assertEqual(39779, issue_queries[0].issue_id) + + def test_smoke_search_json_returns_check_results(self): + output = self.run_cli(["inspect", "smoke-search", "--project", "customer-service", "--email", "missing@example.test", "--json"]) + payload = json.loads(output) + + self.assertFalse(payload["checks"][0]["passed"]) + self.assertEqual("email", payload["checks"][0]["kind"]) + + def test_backfill_redmine_projects_cli_parses_comma_separated_projects(self): + backfill = FakeBackfillService() + services = fake_services() + services["backfill"] = backfill + out = io.StringIO() + + with redirect_stdout(out): + main( + [ + "--backfill-redmine-projects", + "--projects", + "customer-service,hiring", + "--per-project-limit", + "25", + ], + service_builder=lambda: services, + ) + + self.assertEqual(("projects", ["customer-service", "hiring"], 25), backfill.calls[0]) + self.assertIn("'projects': 2", out.getvalue()) + + def test_backfill_redmine_projects_cli_parses_project_specific_limits(self): + backfill = FakeBackfillService() + services = fake_services() + services["backfill"] = backfill + out = io.StringIO() + + with redirect_stdout(out): + main( + [ + "--backfill-redmine-projects", + "--project-limits", + "customer-service=500,hiring=200", + ], + service_builder=lambda: services, + ) + + self.assertEqual(("project_limits", {"customer-service": 500, "hiring": 200}), backfill.calls[0]) + self.assertIn("'issues': 700", out.getvalue()) + + +if __name__ == "__main__": + unittest.main() diff --git 
a/tests/semantic_index/test_installer.py b/tests/semantic_index/test_installer.py new file mode 100644 index 0000000..1e6e71a --- /dev/null +++ b/tests/semantic_index/test_installer.py @@ -0,0 +1,58 @@ +import subprocess +import tempfile +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[2] +INSTALLER = ROOT / "deploy" / "semantic-index" / "install.sh" + + +class SemanticIndexInstallerTest(unittest.TestCase): + def run_installer(self, *args, env=None): + return subprocess.run( + [str(INSTALLER), *args], + cwd=ROOT, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, + env=env, + ) + + def test_default_mode_is_dry_run(self): + result = self.run_installer() + + self.assertEqual(0, result.returncode, result.stderr) + self.assertIn("mode=dry-run", result.stdout) + self.assertIn("would run: sudo mkdir -p /opt/semantic-index", result.stdout) + self.assertIn("would run: sudo rsync", result.stdout) + self.assertNotIn("Semantic Index installed, but deployment is not complete.", result.stdout) + + def test_apply_prints_manual_next_step_warning(self): + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + env = { + "PATH": "/usr/bin:/bin", + "SEMANTIC_INDEX_INSTALL_DIR": str(tmp_path / "opt" / "semantic-index"), + "SEMANTIC_INDEX_ENV_FILE": str(tmp_path / "etc" / "semantic-index.env"), + "SEMANTIC_INDEX_STATE_DIR": str(tmp_path / "var" / "lib" / "semantic-index"), + "SEMANTIC_INDEX_LOG_DIR": str(tmp_path / "var" / "log" / "semantic-index"), + "SEMANTIC_INDEX_SYSTEMD_DIR": str(tmp_path / "etc" / "systemd" / "system"), + } + result = self.run_installer("--apply", "--no-system", "--skip-deps", env=env) + + self.assertEqual(0, result.returncode, result.stderr) + self.assertIn("Semantic Index installed, but deployment is not complete.", result.stdout) + self.assertIn("The refresh timer was NOT enabled automatically.", result.stdout) + self.assertIn("Do not use --force-rebuild", result.stdout) + + def 
test_invalid_argument_fails_with_usage(self): + result = self.run_installer("--force-rebuild") + + self.assertEqual(2, result.returncode) + self.assertIn("Usage:", result.stderr) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/semantic_index/test_qdrant_store.py b/tests/semantic_index/test_qdrant_store.py new file mode 100644 index 0000000..437e958 --- /dev/null +++ b/tests/semantic_index/test_qdrant_store.py @@ -0,0 +1,187 @@ +import unittest + +from semantic_index.models import IndexDocument +from semantic_index.qdrant_store import QdrantStore + + +class FakeMatchValue: + def __init__(self, value): + self.value = value + + +class FakeFieldCondition: + def __init__(self, key, match=None, range=None): + self.key = key + self.match = match + self.range = range + + +class FakeFilter: + def __init__(self, must): + self.must = must + + +class FakeFilterSelector: + def __init__(self, filter): + self.filter = filter + + +class FakePointIdsList: + def __init__(self, points): + self.points = points + + +class FakeQModels: + MatchValue = FakeMatchValue + FieldCondition = FakeFieldCondition + Filter = FakeFilter + FilterSelector = FakeFilterSelector + PointIdsList = FakePointIdsList + + class PointStruct: + def __init__(self, id, vector, payload): + self.id = id + self.vector = vector + self.payload = payload + + +class FakeCountResult: + count = 7 + + +class FakeRecord: + def __init__(self): + self.id = "point-id" + self.payload = { + "document_id": "redmine:issue:1:chunk:0", + "text": "Indexed text", + "source": "redmine", + "project_identifier": "customer-service", + } + + +class FakeClient: + def __init__(self): + self.count_filter = None + self.scroll_filter = None + self.delete_filter = None + self.delete_selector = None + self.upsert_batches = [] + + def get_collections(self): + collection = type("Collection", (), {"name": "semantic"})() + return type("Collections", (), {"collections": [collection]})() + + def count(self, collection_name, 
count_filter, exact): + self.count_filter = count_filter + return FakeCountResult() + + def scroll(self, collection_name, scroll_filter, limit, with_payload, with_vectors, offset=None): + self.scroll_filter = scroll_filter + return [FakeRecord()], None + + def delete(self, collection_name, points_selector): + self.delete_selector = points_selector + self.delete_filter = getattr(points_selector, "filter", None) + + def upsert(self, collection_name, points): + self.upsert_batches.append(points) + + +class QdrantStoreReadTest(unittest.TestCase): + def make_store(self): + store = object.__new__(QdrantStore) + store.client = FakeClient() + store.collection = "semantic" + store.vector_size = 1536 + store.qmodels = FakeQModels + store.upsert_batch_size = 2 + return store + + def test_count_documents_builds_metadata_filter(self): + store = self.make_store() + + count = store.count_documents(source="redmine", project_identifier="customer-service", doc_type="issue") + + self.assertEqual(7, count) + conditions = store.client.count_filter.must + self.assertEqual(["source", "project_identifier", "doc_type"], [condition.key for condition in conditions]) + self.assertEqual("customer-service", conditions[1].match.value) + + def test_list_documents_strips_internal_payload_fields(self): + store = self.make_store() + + documents = store.list_documents(limit=5, source="redmine", project_identifier="customer-service") + + self.assertEqual("redmine:issue:1:chunk:0", documents[0]["id"]) + self.assertEqual("Indexed text", documents[0]["text"]) + self.assertNotIn("document_id", documents[0]["payload"]) + self.assertNotIn("text", documents[0]["payload"]) + + def test_delete_by_source_can_be_limited_to_project_scope(self): + store = self.make_store() + + store.delete_by_source("redmine", project_identifier="customer-service") + + conditions = store.client.delete_filter.must + self.assertEqual(["source", "project_identifier"], [condition.key for condition in conditions]) + 
self.assertEqual("redmine", conditions[0].match.value) + self.assertEqual("customer-service", conditions[1].match.value) + + def test_list_documents_can_be_limited_to_issue_scope(self): + store = self.make_store() + + store.list_documents(limit=5, source="redmine", project_identifier="customer-service", issue_id=39779) + + conditions = store.client.scroll_filter.must + self.assertEqual(["source", "project_identifier", "issue_id"], [condition.key for condition in conditions]) + self.assertEqual(39779, conditions[2].match.value) + + def test_delete_documents_deletes_stable_document_point_ids(self): + store = self.make_store() + + store.delete_documents(["redmine:issue:39779:chunk:0"]) + + self.assertEqual(1, len(store.client.delete_selector.points)) + self.assertNotEqual("redmine:issue:39779:chunk:0", store.client.delete_selector.points[0]) + + def test_upsert_sends_points_in_batches(self): + store = self.make_store() + documents = [ + IndexDocument(id=f"redmine:issue:{issue_id}:chunk:0", text=f"Issue {issue_id}", payload={"source": "redmine"}) + for issue_id in range(5) + ] + vectors = [[0.1, 0.2, 0.3] for _ in documents] + + store.upsert(documents, vectors) + + self.assertEqual([2, 2, 1], [len(batch) for batch in store.client.upsert_batches]) + self.assertEqual("Issue 0", store.client.upsert_batches[0][0].payload["text"]) + + def test_list_documents_paginates_qdrant_scroll_until_requested_limit(self): + class PagedClient(FakeClient): + def __init__(self): + super().__init__() + self.offsets = [] + + def scroll(self, collection_name, scroll_filter, limit, with_payload, with_vectors, offset=None): + self.offsets.append(offset) + first = FakeRecord() + first.payload = {**first.payload, "document_id": f"doc:{len(self.offsets)}a"} + second = FakeRecord() + second.payload = {**second.payload, "document_id": f"doc:{len(self.offsets)}b"} + if offset is None: + return [first, second], "next" + return [first, second], None + + store = self.make_store() + store.client = 
PagedClient() + + documents = store.list_documents(limit=3, source="redmine") + + self.assertEqual(["doc:1a", "doc:1b", "doc:2a"], [document["id"] for document in documents]) + self.assertEqual([None, "next"], store.client.offsets) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/semantic_index/test_redmine_source.py b/tests/semantic_index/test_redmine_source.py new file mode 100644 index 0000000..d54f168 --- /dev/null +++ b/tests/semantic_index/test_redmine_source.py @@ -0,0 +1,102 @@ +import unittest + +from semantic_index.redmine import RedmineApiSource + + +class RecordingRedmineSource(RedmineApiSource): + def __init__(self): + super().__init__(redmine_url="http://redmine.local", api_key="secret", project_identifier="customer-service") + self.urls = [] + + def _get_json(self, url): + self.urls.append(url) + if url.startswith("http://redmine.local/issues.json"): + return {"issues": [{"id": 39779}]} + return {"issue": {"id": 39779, "subject": "Goods return"}} + + +class PagedRedmineSource(RedmineApiSource): + def __init__(self): + super().__init__(redmine_url="http://redmine.local", api_key="secret", project_identifier="customer-service") + self.urls = [] + + def _get_json(self, url): + self.urls.append(url) + if url.startswith("http://redmine.local/issues.json"): + query = url.split("?", 1)[1] + params = dict(part.split("=", 1) for part in query.split("&")) + offset = int(params.get("offset", "0")) + limit = int(params.get("limit", "0")) + return {"issues": [{"id": issue_id} for issue_id in range(offset + 1, offset + limit + 1)]} + issue_id = int(url.split("/issues/", 1)[1].split(".", 1)[0]) + return {"issue": {"id": issue_id, "subject": f"Issue {issue_id}"}} + + +class DuplicatePagedRedmineSource(RedmineApiSource): + def __init__(self): + super().__init__(redmine_url="http://redmine.local", api_key="secret", project_identifier="customer-service") + + def _get_json(self, url): + if url.startswith("http://redmine.local/issues.json"): + query = 
url.split("?", 1)[1] + params = dict(part.split("=", 1) for part in query.split("&")) + offset = int(params.get("offset", "0")) + if offset == 0: + return {"issues": [{"id": 1}, {"id": 2}]} + if offset == 2: + return {"issues": [{"id": 2}, {"id": 3}]} + return {"issues": []} + issue_id = int(url.split("/issues/", 1)[1].split(".", 1)[0]) + return {"issue": {"id": issue_id, "subject": f"Issue {issue_id}"}} + + +class RedmineApiSourceTest(unittest.TestCase): + def test_recent_issue_summaries_do_not_fetch_issue_details(self): + source = RecordingRedmineSource() + + summaries = list(source.recent_issue_summaries(limit=1)) + + self.assertEqual(39779, summaries[0]["id"]) + self.assertEqual(1, len(source.urls)) + self.assertTrue(source.urls[0].startswith("http://redmine.local/issues.json")) + + def test_issue_detail_fetches_journals_and_helpdesk(self): + source = RecordingRedmineSource() + + detail = source.issue_detail(39779) + + self.assertEqual(39779, detail["id"]) + self.assertIn("include=journals%2Chelpdesk", source.urls[0]) + + def test_recent_helpdesk_issues_requests_helpdesk_include_with_journals(self): + source = RecordingRedmineSource() + + issues = list(source.recent_helpdesk_issues(limit=1)) + + self.assertEqual(39779, issues[0]["id"]) + self.assertIn("include=journals%2Chelpdesk", source.urls[1]) + self.assertIn("subproject_id=%21%2A", source.urls[0]) + + def test_recent_helpdesk_issues_paginates_past_redmine_page_limit(self): + source = PagedRedmineSource() + + issues = list(source.recent_helpdesk_issues(limit=250)) + + self.assertEqual(250, len(issues)) + list_urls = [url for url in source.urls if url.startswith("http://redmine.local/issues.json")] + self.assertEqual(3, len(list_urls)) + self.assertIn("limit=100", list_urls[0]) + self.assertIn("offset=0", list_urls[0]) + self.assertIn("offset=100", list_urls[1]) + self.assertIn("offset=200", list_urls[2]) + + def test_recent_helpdesk_issues_skips_duplicate_issue_ids_across_pages(self): + source = 
DuplicatePagedRedmineSource() + + issues = list(source.recent_helpdesk_issues(limit=3)) + + self.assertEqual([1, 2, 3], [issue["id"] for issue in issues]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/semantic_index/test_refresh.py b/tests/semantic_index/test_refresh.py new file mode 100644 index 0000000..1c13b69 --- /dev/null +++ b/tests/semantic_index/test_refresh.py @@ -0,0 +1,277 @@ +import io +import json +import tempfile +import unittest +from contextlib import redirect_stdout +from pathlib import Path + +from semantic_index.__main__ import main +from semantic_index.models import IndexDocument +from semantic_index.refresh import FileRefreshState, RedmineRefreshService + + +def issue(updated_on="2026-04-25T12:00:00Z"): + return { + "id": 39779, + "subject": "Goods return", + "description": "Please return our goods.", + "updated_on": updated_on, + "project": {"id": 1, "identifier": "customer-service", "name": "Customer Service"}, + } + + +class FakeRedmineSource: + project_identifier = None + + def __init__(self, issues=None): + self.issues = issues or [issue()] + self.calls = [] + + def recent_helpdesk_issues(self, limit): + self.calls.append((self.project_identifier, limit)) + return self.issues[:limit] + + +class SummaryDetailRedmineSource(FakeRedmineSource): + def __init__(self, summaries, details): + super().__init__([]) + self.summaries = summaries + self.details = details + self.summary_calls = [] + self.detail_calls = [] + + def recent_issue_summaries(self, limit): + self.summary_calls.append((self.project_identifier, limit)) + return self.summaries[:limit] + + def issue_detail(self, issue_id): + self.detail_calls.append(issue_id) + return self.details[issue_id] + + +class RecordingEmbedder: + def __init__(self): + self.calls = [] + + def embed_documents(self, docs): + self.calls.append(list(docs)) + return [[0.1, 0.2, 0.3] for _ in docs] + + +class RefreshStore: + def __init__(self, existing=None): + self.existing = existing or 
{} + self.upserts = [] + self.deleted_ids = [] + + def list_documents(self, limit=10, source=None, project_identifier=None, doc_type=None, issue_id=None): + return list(self.existing.values())[:limit] + + def upsert(self, docs, vectors): + self.upserts.append((list(docs), list(vectors))) + + def delete_documents(self, document_ids): + self.deleted_ids.extend(document_ids) + + +class RedmineRefreshServiceTest(unittest.TestCase): + def test_refresh_skips_embeddings_when_source_hash_matches_existing_document(self): + source = FakeRedmineSource() + embedder = RecordingEmbedder() + service = RedmineRefreshService(source=source, embedder=embedder, store=RefreshStore()) + candidate = service.mapper.issue_to_documents(issue())[0] + service.store.existing[candidate.id] = { + "id": candidate.id, + "text": candidate.text, + "payload": dict(candidate.payload), + } + + result = service.refresh_redmine_project_limits({"customer-service": 1}) + + self.assertEqual(1, result["unchanged_documents"]) + self.assertEqual(0, result["embedded_documents"]) + self.assertEqual([], embedder.calls) + self.assertEqual([], service.store.upserts) + + def test_refresh_embeds_only_changed_and_new_documents(self): + source = FakeRedmineSource() + embedder = RecordingEmbedder() + service = RedmineRefreshService(source=source, embedder=embedder, store=RefreshStore()) + candidate = service.mapper.issue_to_documents(issue())[0] + service.store.existing[candidate.id] = { + "id": candidate.id, + "text": "Old text", + "payload": {**candidate.payload, "source_hash": "old-hash"}, + } + + result = service.refresh_redmine_project_limits({"customer-service": 1}) + + self.assertEqual(1, result["changed_documents"]) + self.assertEqual(1, result["embedded_documents"]) + self.assertEqual([[candidate]], embedder.calls) + self.assertEqual([candidate.id], [doc.id for doc in service.store.upserts[0][0]]) + + def test_refresh_deletes_stale_issue_documents_without_embedding(self): + source = FakeRedmineSource() + 
embedder = RecordingEmbedder() + service = RedmineRefreshService(source=source, embedder=embedder, store=RefreshStore()) + candidate = service.mapper.issue_to_documents(issue())[0] + service.store.existing[candidate.id] = {"id": candidate.id, "text": candidate.text, "payload": dict(candidate.payload)} + service.store.existing["redmine:issue:39779:journal:1:chunk:0"] = { + "id": "redmine:issue:39779:journal:1:chunk:0", + "text": "Deleted note", + "payload": {"source_hash": "gone", "issue_id": 39779}, + } + + result = service.refresh_redmine_project_limits({"customer-service": 1}) + + self.assertEqual(1, result["stale_documents"]) + self.assertEqual(["redmine:issue:39779:journal:1:chunk:0"], service.store.deleted_ids) + self.assertEqual([], embedder.calls) + + def test_dry_run_reports_planned_embeddings_without_embedding_or_mutating(self): + source = FakeRedmineSource() + embedder = RecordingEmbedder() + service = RedmineRefreshService(source=source, embedder=embedder, store=RefreshStore()) + + result = service.refresh_redmine_project_limits({"customer-service": 1}, dry_run=True) + + self.assertEqual(1, result["new_documents"]) + self.assertEqual(1, result["would_embed_documents"]) + self.assertEqual(0, result["embedded_documents"]) + self.assertEqual([], embedder.calls) + self.assertEqual([], service.store.upserts) + self.assertEqual([], service.store.deleted_ids) + + def test_force_rebuild_embeds_unchanged_documents(self): + source = FakeRedmineSource() + embedder = RecordingEmbedder() + service = RedmineRefreshService(source=source, embedder=embedder, store=RefreshStore()) + candidate = service.mapper.issue_to_documents(issue())[0] + service.store.existing[candidate.id] = {"id": candidate.id, "text": candidate.text, "payload": dict(candidate.payload)} + + result = service.refresh_redmine_project_limits({"customer-service": 1}, force_rebuild=True) + + self.assertEqual(1, result["force_rebuilt_documents"]) + self.assertEqual(1, result["embedded_documents"]) + 
self.assertEqual([[candidate]], embedder.calls) + + def test_force_rebuild_ignores_refresh_state_window_for_fetched_candidates(self): + source = FakeRedmineSource([issue(updated_on="2026-04-25T10:00:00Z")]) + embedder = RecordingEmbedder() + with tempfile.TemporaryDirectory() as tmp: + state = FileRefreshState(Path(tmp) / "refresh.json") + state.mark_success("customer-service", "2026-04-25T12:00:00Z") + service = RedmineRefreshService(source=source, embedder=embedder, store=RefreshStore(), state=state) + + result = service.refresh_redmine_project_limits({"customer-service": 1}, force_rebuild=True, overlap_minutes=15) + + self.assertEqual(0, result["skipped_issues"]) + self.assertEqual(1, result["embedded_documents"]) + + def test_file_refresh_state_updates_only_when_called(self): + with tempfile.TemporaryDirectory() as tmp: + state = FileRefreshState(Path(tmp) / "refresh.json") + self.assertEqual({}, state.load()) + + state.mark_success("customer-service", "2026-04-25T12:00:00Z") + + self.assertEqual( + {"projects": {"customer-service": {"last_successful_refresh_at": "2026-04-25T12:00:00Z"}}}, + json.loads((Path(tmp) / "refresh.json").read_text(encoding="utf-8")), + ) + + def test_refresh_state_skips_issues_older_than_overlap_window(self): + source = FakeRedmineSource([issue(updated_on="2026-04-25T10:00:00Z")]) + embedder = RecordingEmbedder() + with tempfile.TemporaryDirectory() as tmp: + state = FileRefreshState(Path(tmp) / "refresh.json") + state.mark_success("customer-service", "2026-04-25T12:00:00Z") + service = RedmineRefreshService(source=source, embedder=embedder, store=RefreshStore(), state=state) + + result = service.refresh_redmine_project_limits({"customer-service": 1}, dry_run=True, overlap_minutes=15) + + self.assertEqual(1, result["issues"]) + self.assertEqual(1, result["skipped_issues"]) + self.assertEqual(0, result["documents"]) + self.assertEqual([], embedder.calls) + + def test_refresh_skips_old_summaries_without_fetching_issue_detail(self): + 
old_summary = {"id": 39779, "updated_on": "2026-04-25T10:00:00Z"} + new_summary = {"id": 39780, "updated_on": "2026-04-25T11:50:00Z"} + source = SummaryDetailRedmineSource( + summaries=[old_summary, new_summary], + details={39780: {**issue("2026-04-25T11:50:00Z"), "id": 39780}}, + ) + embedder = RecordingEmbedder() + with tempfile.TemporaryDirectory() as tmp: + state = FileRefreshState(Path(tmp) / "refresh.json") + state.mark_success("customer-service", "2026-04-25T12:00:00Z") + service = RedmineRefreshService(source=source, embedder=embedder, store=RefreshStore(), state=state) + + result = service.refresh_redmine_project_limits({"customer-service": 2}, dry_run=True, overlap_minutes=15) + + self.assertEqual(2, result["scanned_issues"]) + self.assertEqual(1, result["skipped_issues"]) + self.assertEqual(1, result["detail_fetched_issues"]) + self.assertEqual([39780], source.detail_calls) + + +class RefreshCliTest(unittest.TestCase): + def test_refresh_redmine_projects_cli_parses_project_limits_and_dry_run(self): + class FakeRefresh: + def __init__(self): + self.calls = [] + + def refresh_redmine_project_limits(self, project_limits, dry_run=False, force_rebuild=False, overlap_minutes=15): + self.calls.append((project_limits, dry_run, force_rebuild, overlap_minutes)) + return {"source": "redmine", "projects": len(project_limits), "issues": sum(project_limits.values())} + + refresh = FakeRefresh() + services = {"refresh": refresh} + out = io.StringIO() + + with redirect_stdout(out): + main( + [ + "--refresh-redmine-projects", + "--project-limits", + "customer-service=5,hiring=2", + "--dry-run", + "--overlap-minutes", + "30", + ], + service_builder=lambda: services, + ) + + self.assertEqual(({"customer-service": 5, "hiring": 2}, True, False, 30), refresh.calls[0]) + self.assertIn("'projects': 2", out.getvalue()) + + def test_refresh_redmine_projects_cli_can_override_state_path(self): + class FakeRefresh: + def __init__(self): + self.state = None + + def 
refresh_redmine_project_limits(self, project_limits, dry_run=False, force_rebuild=False, overlap_minutes=15): + return {"state_path": str(self.state.path)} + + refresh = FakeRefresh() + out = io.StringIO() + + with redirect_stdout(out): + main( + [ + "--refresh-redmine-projects", + "--project-limits", + "customer-service=1", + "--state-path", + "/tmp/semantic-refresh-state.json", + ], + service_builder=lambda: {"refresh": refresh}, + ) + + self.assertIn("/tmp/semantic-refresh-state.json", out.getvalue()) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/semantic_index/test_search.py b/tests/semantic_index/test_search.py new file mode 100644 index 0000000..e4a638a --- /dev/null +++ b/tests/semantic_index/test_search.py @@ -0,0 +1,85 @@ +import unittest + +from semantic_index.models import IndexDocument, SearchQuery, SearchResult +from semantic_index.qdrant_store import build_filter, point_id_for_document +from semantic_index.search import HybridSearchService, keyword_boost + + +class FakeEmbedder: + def embed_query(self, text): + return [0.1, 0.2, 0.3] + + +class FakeStore: + def __init__(self): + self.query = None + + def search(self, vector, query, limit): + self.query = query + return [ + SearchResult( + id="weak", + score=0.7, + text="general support text", + payload={"redmine_url": "http://redmine/issues/1"}, + ), + SearchResult( + id="strong", + score=0.6, + text="Customer ada@example.com asked about ORD-12345", + payload={"redmine_url": "http://redmine/issues/2"}, + ), + ][:limit] + + +class SearchTest(unittest.TestCase): + def test_qdrant_point_id_is_deterministic_uuid_for_stable_document_id(self): + first = point_id_for_document("redmine:issue:42:journal:5:chunk:0") + second = point_id_for_document("redmine:issue:42:journal:5:chunk:0") + + self.assertEqual(first, second) + self.assertRegex(first, r"^[0-9a-f-]{36}$") + + def test_filter_maps_supported_metadata(self): + query = SearchQuery( + text="printer", + source="redmine", + 
project_identifier="fud-helpdesk", + doc_type="message", + issue_id=42, + contact_email="ada@example.com", + date_from="2026-04-01T00:00:00Z", + date_to="2026-04-30T23:59:59Z", + ) + + qfilter = build_filter(query) + + self.assertEqual( + [ + {"key": "source", "match": {"value": "redmine"}}, + {"key": "project_identifier", "match": {"value": "fud-helpdesk"}}, + {"key": "doc_type", "match": {"value": "message"}}, + {"key": "issue_id", "match": {"value": 42}}, + {"key": "contact_email", "match": {"value": "ada@example.com"}}, + {"key": "created_on", "range": {"gte": "2026-04-01T00:00:00Z", "lte": "2026-04-30T23:59:59Z"}}, + ], + qfilter["must"], + ) + + def test_keyword_boost_prioritizes_exact_email_and_order_matches(self): + weak = SearchResult(id="weak", score=0.7, text="general support text", payload={}) + strong = SearchResult(id="strong", score=0.6, text="Customer ada@example.com asked about ORD-12345", payload={}) + + self.assertGreater( + keyword_boost('ada@example.com "ORD-12345"', strong), + keyword_boost('ada@example.com "ORD-12345"', weak), + ) + + service = HybridSearchService(embedder=FakeEmbedder(), store=FakeStore()) + results = service.search(SearchQuery(text='ada@example.com "ORD-12345"', limit=2)) + self.assertEqual("strong", results[0].id) + self.assertEqual("http://redmine/issues/2", results[0].citation["url"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/semantic_index/test_shell_wrappers.py b/tests/semantic_index/test_shell_wrappers.py new file mode 100644 index 0000000..f8a2e66 --- /dev/null +++ b/tests/semantic_index/test_shell_wrappers.py @@ -0,0 +1,41 @@ +import os +import subprocess +import tempfile +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[2] +REFRESH = ROOT / "semantic_index" / "refresh.sh" + + +class SemanticIndexShellWrapperTest(unittest.TestCase): + def test_refresh_wrapper_is_self_locating_when_called_from_another_directory(self): + with 
tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + env = { + **os.environ, + "PYTHON": "/bin/echo", + "SEMANTIC_INDEX_PROJECT_LIMITS": "customer-service=5", + "SEMANTIC_INDEX_LOG_DIR": str(tmp_path / "logs"), + "SEMANTIC_INDEX_STATE_PATH": str(tmp_path / "state" / "refresh_state.json"), + } + + result = subprocess.run( + [str(REFRESH)], + cwd=tmp, + env=env, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + check=False, + ) + + self.assertEqual(0, result.returncode, result.stderr) + self.assertIn("-m semantic_index --refresh-redmine-projects", result.stdout) + self.assertIn("--project-limits customer-service=5", result.stdout) + self.assertIn("log_file=", result.stdout) + + +if __name__ == "__main__": + unittest.main()