Add semantic-index service, deployment assets, and tests
This commit is contained in:
Executable
+183
@@ -0,0 +1,183 @@
|
||||
#!/usr/bin/env bash
#
# Installer for the semantic-index service.
#
# Usage: deploy/semantic-index/install.sh [--dry-run] [--apply] [--start]
#                                         [--no-system] [--skip-deps]
#
# Defaults to a dry run that only prints the commands it would execute.
# Optional environment overrides: SEMANTIC_INDEX_INSTALL_DIR,
# SEMANTIC_INDEX_ENV_FILE, SEMANTIC_INDEX_STATE_DIR, SEMANTIC_INDEX_LOG_DIR,
# SEMANTIC_INDEX_SYSTEMD_DIR, PYTHON.

# Strict mode: abort on command failure, unset variables, and failed pipeline
# stages.
set -euo pipefail
|
||||
|
||||
#######################################
# Print the installer's command-line help.
# Arguments: none
# Outputs:   writes the help text to stderr; a caller that wants it on
#            stdout can redirect the call (usage 2>&1).
#######################################
usage() {
  # Quoted delimiter: the help text is literal, nothing is expanded.
  cat >&2 <<'EOF'
Usage:
  deploy/semantic-index/install.sh [--dry-run] [--apply] [--start] [--no-system] [--skip-deps]

Modes:
  --dry-run    Print commands that would run. This is the default.
  --apply      Install files, venv, dependencies, env template, and systemd units.
  --start      With --apply, reload systemd and start only semantic-index.service.
  --no-system  Skip sudo/systemd operations. Useful for tests and local validation.
  --skip-deps  Skip venv creation and dependency install.
  -h, --help   Show this help and exit.

The installer never runs backfill, never enables the refresh timer, and never
passes --force-rebuild.
EOF
}
|
||||
|
||||
# Option defaults. The installer is a dry run unless --apply is given.
mode=dry-run     # "dry-run" or "apply"
start_service=0  # 1 when --start was requested
system_ops=1     # 0 when --no-system was requested
skip_deps=0      # 1 when --skip-deps was requested

# Parse flags. Every recognised flag is a plain switch, so a single shift
# after the case statement consumes it.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --dry-run)
      mode=dry-run
      ;;
    --apply)
      mode=apply
      ;;
    --start)
      start_service=1
      ;;
    --no-system)
      system_ops=0
      ;;
    --skip-deps)
      skip_deps=1
      ;;
    -h | --help)
      # usage writes to stderr; explicitly requested help belongs on stdout.
      usage 2>&1
      exit 0
      ;;
    *)
      # Name the offending argument before printing the help text.
      echo "unknown argument: $1" >&2
      usage
      exit 2
      ;;
  esac
  shift
done

# --start is only meaningful when something is actually installed.
if [[ "$start_service" -eq 1 && "$mode" != "apply" ]]; then
  echo "--start requires --apply" >&2
  exit 2
fi
|
||||
|
||||
# Repository root, resolved from this script's own location (two directory
# levels up) so the installer works from any working directory.
repo_root=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)

# Target locations. Each one can be overridden through the environment
# variable named in its expansion; the fallback is the production-style path.
install_dir=${SEMANTIC_INDEX_INSTALL_DIR:-/opt/semantic-index}
env_file=${SEMANTIC_INDEX_ENV_FILE:-/etc/semantic-index.env}
state_dir=${SEMANTIC_INDEX_STATE_DIR:-/var/lib/semantic-index}
log_dir=${SEMANTIC_INDEX_LOG_DIR:-/var/log/semantic-index}
systemd_dir=${SEMANTIC_INDEX_SYSTEMD_DIR:-/etc/systemd/system}

# Interpreter used to create the virtualenv.
python_bin=${PYTHON:-python3}
|
||||
|
||||
#######################################
# Run a command, or only print it when in dry-run mode.
# Globals:   mode (read)
# Arguments: the command and its arguments, one word each
# Outputs:   in dry-run mode, "would run:" followed by the shell-quoted
#            command on stdout; otherwise whatever the command prints
# Returns:   0 in dry-run mode, else the command's exit status
#######################################
run() {
  if [[ "$mode" == "dry-run" ]]; then
    # %q quotes each word so the printed line can be pasted into a shell.
    printf 'would run:'
    printf ' %q' "$@"
    printf '\n'
  else
    "$@"
  fi
}
|
||||
|
||||
#######################################
# Run a command through run, prefixed with sudo unless --no-system was given.
# Globals:   system_ops (read)
# Arguments: the command and its arguments, one word each
# Returns:   the status of run
#######################################
run_sudo() {
  if [[ "$system_ops" -eq 0 ]]; then
    # --no-system: same command, without privilege escalation.
    run "$@"
  else
    run sudo "$@"
  fi
}
|
||||
|
||||
#######################################
# Install the environment-file template, never overwriting an existing file.
# Globals:   mode, env_file, system_ops, repo_root (all read)
# Arguments: none
# Outputs:   a one-line status message on stdout in dry-run mode or when the
#            file already exists
# Returns:   0 on success, non-zero if the copy fails
#######################################
install_env_template() {
  local template="$repo_root/deploy/semantic-index/semantic-index.env.example"
  local env_parent

  if [[ "$mode" == "dry-run" ]]; then
    echo "would copy env template only if missing: $env_file"
    return
  fi

  # An existing file is left untouched so it is never clobbered.
  if [[ -e "$env_file" ]]; then
    echo "keeping existing $env_file"
    return
  fi

  env_parent=$(dirname "$env_file")

  # The file is meant to hold API keys, so both branches create it with mode
  # 0640 rather than relying on the caller's umask.
  if [[ "$system_ops" -eq 0 ]]; then
    mkdir -p "$env_parent"
    install -m 0640 "$template" "$env_file"
  else
    # Only escalate for mkdir when the parent is actually missing.
    [[ -d "$env_parent" ]] || sudo mkdir -p "$env_parent"
    sudo install -m 0640 "$template" "$env_file"
  fi
}
|
||||
|
||||
#######################################
# Print the manual follow-up steps an operator must complete after --apply.
# Globals:   env_file, install_dir (read; expanded into the message)
# Arguments: none
# Outputs:   the checklist on stdout
#######################################
print_next_steps_warning() {
  # Unquoted delimiter on purpose: $env_file and $install_dir are expanded so
  # the printed commands match this installation's paths.
  cat <<EOF

Semantic Index installed, but deployment is not complete.

Required manual steps:
  1. Edit $env_file and fill real secrets/URLs.
  2. Start or restart the HTTP service:
       sudo systemctl daemon-reload
       sudo systemctl start semantic-index.service
  3. Validate:
       curl -sS http://127.0.0.1:8787/health
       $install_dir/semantic_index/search.sh "goods return" customer-service 3
  4. Before enabling scheduled refresh, run:
       SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=5' $install_dir/semantic_index/refresh.sh
       $install_dir/semantic_index/refresh.sh --apply
  5. Create/confirm a Qdrant snapshot before any production-scale backfill.

The refresh timer was NOT enabled automatically.
Do not use --force-rebuild unless you intentionally want to pay to re-embed unchanged documents.
EOF
}
|
||||
|
||||
# ---- Main installation flow ----

# Report the effective settings before doing (or simulating) any work.
echo "mode=$mode"
echo "install_dir=$install_dir"
echo "env_file=$env_file"
echo "state_dir=$state_dir"
echo "log_dir=$log_dir"

# Create target directories and copy the deployable trees into the install
# directory. The excludes keep local secrets, caches, virtualenvs, and
# bytecode out of the installed copy.
run_sudo mkdir -p "$install_dir" "$state_dir" "$log_dir" "$systemd_dir"
run_sudo rsync -a \
  --exclude ".env" \
  --exclude ".cache/" \
  --exclude ".venv/" \
  --exclude "__pycache__/" \
  --exclude "*.pyc" \
  "$repo_root/semantic_index" \
  "$repo_root/tests" \
  "$repo_root/docs" \
  "$repo_root/deploy" \
  "$repo_root/dist" \
  "$install_dir/"

# Virtualenv and runtime dependencies. run_sudo already drops the sudo prefix
# under --no-system and only prints the command in dry-run mode, so a single
# branch covers every combination.
if [[ "$skip_deps" -eq 1 ]]; then
  echo "skipping venv/dependency install because --skip-deps was used"
else
  run_sudo "$python_bin" -m venv "$install_dir/.venv"
  run_sudo "$install_dir/.venv/bin/pip" install openai qdrant-client fastapi uvicorn
fi

install_env_template

# Install the systemd units. Nothing is enabled or started here.
run_sudo install -m 0644 "$repo_root/deploy/semantic-index/semantic-index.service" "$systemd_dir/semantic-index.service"
run_sudo install -m 0644 "$repo_root/deploy/semantic-index/semantic-index-refresh.service" "$systemd_dir/semantic-index-refresh.service"
run_sudo install -m 0644 "$repo_root/deploy/semantic-index/semantic-index-refresh.timer" "$systemd_dir/semantic-index-refresh.timer"

# Validate the installed copy with the virtualenv's interpreter. This needs
# the venv, so it is skipped together with the dependency install.
if [[ "$mode" == "apply" && "$skip_deps" -eq 0 ]]; then
  "$install_dir/.venv/bin/python" -m py_compile "$install_dir"/semantic_index/*.py
  "$install_dir/.venv/bin/python" -m unittest discover -s "$install_dir/tests/semantic_index"
  bash -n "$install_dir/semantic_index/refresh.sh"
elif [[ "$mode" == "apply" ]]; then
  echo "skipping installed-code validation because --skip-deps was used"
fi

# Optionally start the HTTP service. The refresh service and timer are never
# started or enabled by this script.
if [[ "$mode" == "apply" && "$start_service" -eq 1 ]]; then
  if [[ "$system_ops" -eq 0 ]]; then
    echo "skipping systemctl start because --no-system was used"
  else
    sudo systemctl daemon-reload
    sudo systemctl start semantic-index.service
  fi
fi

if [[ "$mode" == "apply" ]]; then
  print_next_steps_warning
fi
|
||||
@@ -0,0 +1,12 @@
|
||||
[Unit]
|
||||
Description=Redmine Semantic Index Rolling Refresh
|
||||
After=network-online.target
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
WorkingDirectory=/opt/semantic-index
|
||||
EnvironmentFile=/etc/semantic-index.env
|
||||
ExecStart=/bin/bash -lc 'exec /opt/semantic-index/semantic_index/refresh.sh --apply'
|
||||
NoNewPrivileges=true
|
||||
PrivateTmp=true
|
||||
@@ -0,0 +1,10 @@
|
||||
[Unit]
|
||||
Description=Run Redmine Semantic Index Rolling Refresh
|
||||
|
||||
[Timer]
|
||||
OnBootSec=10min
|
||||
OnUnitActiveSec=30min
|
||||
Unit=semantic-index-refresh.service
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
@@ -0,0 +1,22 @@
|
||||
# Copy to /etc/semantic-index.env and fill secrets on the target host.
|
||||
# Do not commit real values.
|
||||
|
||||
OPENAI_API_KEY=
|
||||
QDRANT_URL=http://qdrant-host:6333
|
||||
QDRANT_API_KEY=
|
||||
QDRANT_COLLECTION=redmine_semantic_sample
|
||||
|
||||
REDMINE_URL=http://redmine-host
|
||||
REDMINE_API_KEY=
|
||||
REDMINE_PROJECT_IDENTIFIER=
|
||||
REDMINE_SAMPLE_LIMIT=500
|
||||
|
||||
SEMANTIC_INDEX_HOST=127.0.0.1
|
||||
SEMANTIC_INDEX_PORT=8787
|
||||
SEMANTIC_INDEX_API_KEY=
|
||||
SEMANTIC_INDEX_REFRESH_STATE_PATH=/var/lib/semantic-index/refresh_state.json
|
||||
|
||||
SEMANTIC_INDEX_PROJECT_LIMITS=customer-service=500,hiring=200,todo-jason=200,sales-inbox=100,business-development=100,dock-scheduling=100,prep-standardization=100
|
||||
SEMANTIC_INDEX_LOG_DIR=/var/log/semantic-index
|
||||
SEMANTIC_INDEX_STATE_PATH=/var/lib/semantic-index/refresh_state.json
|
||||
SEMANTIC_INDEX_OVERLAP_MINUTES=15
|
||||
@@ -0,0 +1,17 @@
|
||||
[Unit]
|
||||
Description=Redmine Semantic Index HTTP API
|
||||
After=network-online.target
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
WorkingDirectory=/opt/semantic-index
|
||||
EnvironmentFile=/etc/semantic-index.env
|
||||
ExecStart=/bin/bash -lc 'exec /opt/semantic-index/.venv/bin/uvicorn semantic_index.app:app --host "${SEMANTIC_INDEX_HOST}" --port "${SEMANTIC_INDEX_PORT}"'
|
||||
Restart=on-failure
|
||||
RestartSec=5
|
||||
NoNewPrivileges=true
|
||||
PrivateTmp=true
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@@ -0,0 +1,64 @@
|
||||
# Semantic Index V1 Pre-Deployment Manifest
|
||||
|
||||
- Patch set: `semantic-index-v1-predeployment-20260425T150000Z`
|
||||
- Created: `2026-04-25T15:00:00Z`
|
||||
- Purpose: deployment manifest for the Redmine semantic index service and its
|
||||
LAN/production preparation docs.
|
||||
|
||||
## Files To Install
|
||||
|
||||
```text
|
||||
semantic_index/
|
||||
tests/semantic_index/
|
||||
deploy/semantic-index/
|
||||
docs/semantic_index_deployment_runbook.md
|
||||
docs/semantic_index_production_notes.md
|
||||
docs/semantic_index_predeployment_validation.md
|
||||
docs/redmine_issue_api_helpdesk_include.md
|
||||
dist/semantic-index-v1-predeployment-20260425T150000Z.MANIFEST.md
|
||||
```
|
||||
|
||||
## Files Not To Install
|
||||
|
||||
```text
|
||||
semantic_index/.env
|
||||
.cache/
|
||||
.venv/
|
||||
__pycache__/
|
||||
*.pyc
|
||||
```
|
||||
|
||||
Keep runtime secrets in `semantic_index/.env` or in the service manager
|
||||
environment on the target host. Do not commit or copy local secrets into a
|
||||
source bundle.
|
||||
|
||||
## External Dependencies
|
||||
|
||||
- Redmine Helpdesk API patch documented in
|
||||
`docs/redmine_issue_api_helpdesk_include.md`
|
||||
- Qdrant reachable through `QDRANT_URL`
|
||||
- OpenAI API key for `text-embedding-3-small`
|
||||
- Python packages: `openai`, `qdrant-client`, `fastapi`, `uvicorn`
|
||||
|
||||
## Validation Commands
|
||||
|
||||
```sh
|
||||
deploy/semantic-index/install.sh
|
||||
.venv/bin/python -m py_compile semantic_index/*.py
|
||||
.venv/bin/python -m unittest discover -s tests/semantic_index
|
||||
bash -n semantic_index/refresh.sh
|
||||
SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=5' semantic_index/refresh.sh
|
||||
```
|
||||
|
||||
Before any production backfill, follow
|
||||
`docs/semantic_index_deployment_runbook.md` and confirm Qdrant snapshot or
|
||||
volume rollback is available.
|
||||
|
||||
## Operational Rules
|
||||
|
||||
- Run `semantic_index/refresh.sh` in dry-run mode before `--apply`.
|
||||
- Do not schedule `--force-rebuild`; keep it manual-only.
|
||||
- Review refresh logs for `detail_fetched_issues`, `would_embed_documents`, and
|
||||
`embedded_documents`.
|
||||
- Bind HTTP to localhost unless LAN access is explicitly required and protected
|
||||
with `SEMANTIC_INDEX_API_KEY`.
|
||||
@@ -0,0 +1,336 @@
|
||||
# Semantic Index Deployment Runbook
|
||||
|
||||
This runbook captures the current deployment shape for the Redmine semantic
|
||||
index. It is written for the LAN test server first, with the same steps intended
|
||||
to carry forward to production after paths and secrets are adjusted.
|
||||
The latest LAN validation record is in
|
||||
`docs/semantic_index_predeployment_validation.md`.
|
||||
|
||||
## Deployable Files
|
||||
|
||||
Copy or update these tracked paths together:
|
||||
|
||||
- `semantic_index/`
|
||||
- `tests/semantic_index/`
|
||||
- `deploy/semantic-index/`
|
||||
- `docs/semantic_index_production_notes.md`
|
||||
- `docs/semantic_index_deployment_runbook.md`
|
||||
- `docs/semantic_index_predeployment_validation.md`
|
||||
- `docs/redmine_issue_api_helpdesk_include.md`
|
||||
|
||||
The Helpdesk contact metadata dependency is the Redmine plugin API patch
|
||||
documented in `docs/redmine_issue_api_helpdesk_include.md`. Deploy that plugin
|
||||
patch before expecting Helpdesk contact fields in indexed results.
|
||||
|
||||
Do not copy local-only runtime files:
|
||||
|
||||
- `semantic_index/.env`
|
||||
- `.cache/`
|
||||
- `.venv/`
|
||||
- `__pycache__/`
|
||||
- Qdrant storage snapshots or rollback tarballs unless deliberately restoring
|
||||
|
||||
## Runtime Prerequisites
|
||||
|
||||
Python runtime dependencies:
|
||||
|
||||
```sh
|
||||
pip install openai qdrant-client fastapi uvicorn
|
||||
```
|
||||
|
||||
Qdrant is expected to run on the larger host and be reachable from the semantic
|
||||
index host through `QDRANT_URL`. The current collection default is
|
||||
`redmine_semantic_sample`.
|
||||
|
||||
Qdrant Docker example:
|
||||
|
||||
```sh
|
||||
docker run -p 6333:6333 -p 6334:6334 \
|
||||
-v qdrant_storage:/qdrant/storage \
|
||||
qdrant/qdrant
|
||||
```
|
||||
|
||||
Before destructive maintenance, create a Qdrant snapshot or preserve the Docker
|
||||
volume.
|
||||
|
||||
## Environment
|
||||
|
||||
For a production-style install, use:
|
||||
|
||||
- code: `/opt/semantic-index`
|
||||
- environment file: `/etc/semantic-index.env`
|
||||
- refresh state: `/var/lib/semantic-index/refresh_state.json`
|
||||
- refresh logs: `/var/log/semantic-index`
|
||||
|
||||
Create `/etc/semantic-index.env` from
|
||||
`deploy/semantic-index/semantic-index.env.example` and fill secrets on the
|
||||
target host:
|
||||
|
||||
```sh
|
||||
OPENAI_API_KEY=
|
||||
QDRANT_URL=http://qdrant-host:6333
|
||||
QDRANT_API_KEY=
|
||||
QDRANT_COLLECTION=redmine_semantic_sample
|
||||
REDMINE_URL=http://redmine-host
|
||||
REDMINE_API_KEY=
|
||||
REDMINE_PROJECT_IDENTIFIER=
|
||||
REDMINE_SAMPLE_LIMIT=500
|
||||
SEMANTIC_INDEX_HOST=127.0.0.1
|
||||
SEMANTIC_INDEX_PORT=8787
|
||||
SEMANTIC_INDEX_API_KEY=
|
||||
SEMANTIC_INDEX_REFRESH_STATE_PATH=/var/lib/semantic-index/refresh_state.json
|
||||
```
|
||||
|
||||
Recommended production-style refresh overrides:
|
||||
|
||||
```sh
|
||||
SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=500,hiring=200,todo-jason=200,sales-inbox=100,business-development=100,dock-scheduling=100,prep-standardization=100'
|
||||
SEMANTIC_INDEX_LOG_DIR=/var/log/semantic-index
|
||||
SEMANTIC_INDEX_STATE_PATH=/var/lib/semantic-index/refresh_state.json
|
||||
SEMANTIC_INDEX_OVERLAP_MINUTES=15
|
||||
```
|
||||
|
||||
Keep `SEMANTIC_INDEX_API_KEY` set when binding outside localhost. Do not commit
|
||||
API keys or `.env` files.
|
||||
|
||||
## Systemd Templates
|
||||
|
||||
Templates live in `deploy/semantic-index/`:
|
||||
|
||||
```text
|
||||
install.sh
|
||||
semantic-index.service
|
||||
semantic-index-refresh.service
|
||||
semantic-index-refresh.timer
|
||||
semantic-index.env.example
|
||||
```
|
||||
|
||||
Use the installer first. It defaults to dry-run:
|
||||
|
||||
```sh
|
||||
deploy/semantic-index/install.sh
|
||||
```
|
||||
|
||||
Apply the install:
|
||||
|
||||
```sh
|
||||
deploy/semantic-index/install.sh --apply
|
||||
```
|
||||
|
||||
Optionally start only the HTTP service after installing:
|
||||
|
||||
```sh
|
||||
deploy/semantic-index/install.sh --apply --start
|
||||
```
|
||||
|
||||
The installer creates `/opt/semantic-index`, `/var/lib/semantic-index`, and
`/var/log/semantic-index`; copies the deployable trees (`semantic_index/`,
`tests/`, `docs/`, `deploy/`, and `dist/`) into `/opt/semantic-index`; creates a
virtualenv at `/opt/semantic-index/.venv` and installs the Python dependencies
unless `--skip-deps` is given; creates `/etc/semantic-index.env` only if it does
not already exist; installs systemd unit files; and runs local validation. It
does not run backfill, does not enable the refresh timer, and never passes
`--force-rebuild`.
|
||||
|
||||
Manual install shape, if the installer cannot be used:
|
||||
|
||||
```sh
|
||||
sudo mkdir -p /opt/semantic-index /var/lib/semantic-index /var/log/semantic-index
|
||||
sudo rsync -a \
|
||||
--exclude '.env' \
|
||||
--exclude '__pycache__/' \
|
||||
--exclude '*.pyc' \
|
||||
semantic_index tests docs deploy dist /opt/semantic-index/
|
||||
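# Skip the next line if /etc/semantic-index.env already exists; it holds secrets, hence mode 0640.
sudo install -m 0640 deploy/semantic-index/semantic-index.env.example /etc/semantic-index.env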
|
||||
sudo install -m 0644 deploy/semantic-index/semantic-index.service /etc/systemd/system/semantic-index.service
|
||||
sudo install -m 0644 deploy/semantic-index/semantic-index-refresh.service /etc/systemd/system/semantic-index-refresh.service
|
||||
sudo install -m 0644 deploy/semantic-index/semantic-index-refresh.timer /etc/systemd/system/semantic-index-refresh.timer
|
||||
```
|
||||
|
||||
After editing `/etc/semantic-index.env`, validate manually before enabling the
|
||||
timer:
|
||||
|
||||
```sh
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl start semantic-index.service
|
||||
sudo systemctl status semantic-index.service
|
||||
sudo systemctl start semantic-index-refresh.service
|
||||
sudo journalctl -u semantic-index-refresh.service -n 100 --no-pager
|
||||
```
|
||||
|
||||
Enable the timer only after manual dry-run and `--apply` logs look normal:
|
||||
|
||||
```sh
|
||||
sudo systemctl enable --now semantic-index-refresh.timer
|
||||
```
|
||||
|
||||
## Initial Validation
|
||||
|
||||
Run syntax and test checks after copying code:
|
||||
|
||||
```sh
|
||||
.venv/bin/python -m py_compile semantic_index/*.py
|
||||
.venv/bin/python -m unittest discover -s tests/semantic_index
|
||||
bash -n semantic_index/refresh.sh
|
||||
```
|
||||
|
||||
Confirm service startup:
|
||||
|
||||
```sh
|
||||
uvicorn semantic_index.app:app --host 127.0.0.1 --port 8787
|
||||
curl -sS http://127.0.0.1:8787/health
|
||||
```
|
||||
|
||||
If `SEMANTIC_INDEX_API_KEY` is set:
|
||||
|
||||
```sh
|
||||
curl -sS -H "Authorization: Bearer $SEMANTIC_INDEX_API_KEY" \
|
||||
http://127.0.0.1:8787/projects
|
||||
```
|
||||
|
||||
## Initial Backfill
|
||||
|
||||
Preview Redmine mapping before writing to Qdrant:
|
||||
|
||||
```sh
|
||||
.venv/bin/python -m semantic_index inspect preview-redmine \
|
||||
--project customer-service \
|
||||
--limit 5
|
||||
```
|
||||
|
||||
Backfill the current balanced sample:
|
||||
|
||||
```sh
|
||||
.venv/bin/python -m semantic_index --backfill-redmine-projects \
|
||||
--project-limits customer-service=500,hiring=200,todo-jason=200,sales-inbox=100,business-development=100,dock-scheduling=100,prep-standardization=100
|
||||
```
|
||||
|
||||
Audit the result:
|
||||
|
||||
```sh
|
||||
.venv/bin/python -m semantic_index inspect audit --source redmine --limit 5000
|
||||
.venv/bin/python -m semantic_index inspect smoke-search --project customer-service
|
||||
```
|
||||
|
||||
Expected broad shape for the current LAN sample is roughly:
|
||||
|
||||
- Customer Service is the largest project.
|
||||
- Helpdesk tickets have contact metadata.
|
||||
- Internal projects may have no Helpdesk contact metadata.
|
||||
- `attachments=0`.
|
||||
|
||||
## Routine Refresh
|
||||
|
||||
Use the wrapper for production-style refresh. It defaults to dry-run:
|
||||
|
||||
```sh
|
||||
semantic_index/refresh.sh
|
||||
```
|
||||
|
||||
Small smoke check:
|
||||
|
||||
```sh
|
||||
SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=5' semantic_index/refresh.sh
|
||||
```
|
||||
|
||||
Apply refresh manually:
|
||||
|
||||
```sh
|
||||
semantic_index/refresh.sh --apply
|
||||
```
|
||||
|
||||
Installed wrappers can also be called by absolute path, for example
|
||||
`/opt/semantic-index/semantic_index/refresh.sh`. The wrapper uses its own
|
||||
install root as the working directory and reads defaults from
|
||||
`/etc/semantic-index.env` when that file is readable.
|
||||
|
||||
Review the log path printed by the wrapper. For a healthy routine run after
|
||||
state exists, expect:
|
||||
|
||||
- `scanned_issues` greater than or equal to `detail_fetched_issues`
|
||||
- old issues counted under `skipped_issues`
|
||||
- `would_embed_documents` and `embedded_documents` near zero when Redmine has
|
||||
not changed
|
||||
- no scheduled use of `--force-rebuild`
|
||||
|
||||
Only schedule the wrapper after manual dry-run and apply logs look normal.
|
||||
|
||||
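Cron shape, when ready. The path below is a repository checkout; adjust it for
the target host. For a production-style install, call
`/opt/semantic-index/semantic_index/refresh.sh --apply` by absolute path instead: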
|
||||
|
||||
```cron
|
||||
*/30 * * * * cd /home/iadnah/redmine && semantic_index/refresh.sh --apply
|
||||
```
|
||||
|
||||
## Search Validation
|
||||
|
||||
HTTP search:
|
||||
|
||||
```sh
|
||||
semantic_index/search.sh "goods return" customer-service 3
|
||||
semantic_index/search.sh "candidate follow up" hiring 5
|
||||
```
|
||||
|
||||
CLI inspection:
|
||||
|
||||
```sh
|
||||
.venv/bin/python -m semantic_index inspect search "goods return" \
|
||||
--project customer-service \
|
||||
--limit 3
|
||||
|
||||
.venv/bin/python -m semantic_index inspect list \
|
||||
--source redmine \
|
||||
--project customer-service \
|
||||
--limit 10
|
||||
```
|
||||
|
||||
MCP stdio:
|
||||
|
||||
```sh
|
||||
.venv/bin/python -m semantic_index --mcp-stdio
|
||||
```
|
||||
|
||||
Available tools:
|
||||
|
||||
- `semantic_search`
|
||||
- `semantic_get_document`
|
||||
- `semantic_list_projects`
|
||||
- `semantic_backfill_redmine_sample`
|
||||
- `semantic_refresh_redmine`
|
||||
|
||||
## Rollback
|
||||
|
||||
Code rollback:
|
||||
|
||||
- Stop `uvicorn` or the service manager unit.
|
||||
- Restore the previous `semantic_index/` code.
|
||||
- Restore the previous Redmine Helpdesk plugin patch if contact metadata broke.
|
||||
- Restart the service.
|
||||
|
||||
Index rollback options:
|
||||
|
||||
- Restore a Qdrant snapshot or preserved Docker volume.
|
||||
- Or rebuild from Redmine with the known-good code using the multi-project
|
||||
backfill command above.
|
||||
|
||||
Refresh rollback:
|
||||
|
||||
- Disable cron/systemd schedule if enabled.
|
||||
- Preserve the failing log file for diagnosis.
|
||||
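- If the refresh state is wrong, move the state file aside rather than editing
  it in place. The example below uses the repository-checkout default path; for
  a production-style install the state file is
  `/var/lib/semantic-index/refresh_state.json`: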
|
||||
|
||||
```sh
|
||||
mv .cache/semantic_index/refresh_state.json .cache/semantic_index/refresh_state.json.bad
|
||||
```
|
||||
|
||||
The next refresh will behave like a first refresh for state purposes, while the
|
||||
`source_hash` guard still prevents embedding unchanged documents.
|
||||
|
||||
## Production Readiness Checklist
|
||||
|
||||
- Redmine API key is scoped appropriately and stored outside git.
|
||||
- Qdrant URL and collection are confirmed.
|
||||
- Qdrant snapshot/export path is known.
|
||||
- Helpdesk API patch is deployed and validated.
|
||||
- HTTP service is bound only to trusted localhost/LAN as intended.
|
||||
- `SEMANTIC_INDEX_API_KEY` is set for non-localhost use.
|
||||
- Initial backfill audit and smoke searches pass.
|
||||
- Refresh dry-run and apply logs show expected low embedding counts.
|
||||
- `--force-rebuild` is documented as manual-only.
|
||||
@@ -0,0 +1,182 @@
|
||||
# Semantic Index Pre-Deployment Validation
|
||||
|
||||
Validation date: `2026-04-25`
|
||||
|
||||
This records the current LAN pre-deployment checks for the semantic index. It
|
||||
does not include secrets.
|
||||
|
||||
## Deploy Unit
|
||||
|
||||
Semantic-index deployable files are documented in:
|
||||
|
||||
- `dist/semantic-index-v1-predeployment-20260425T150000Z.MANIFEST.md`
|
||||
- `docs/semantic_index_deployment_runbook.md`
|
||||
|
||||
Current known unrelated worktree changes are outside the semantic-index deploy
|
||||
unit and should not be mixed into the semantic-index release package:
|
||||
|
||||
- `redMCP/README.md`
|
||||
- `redMCP/app/McpDispatcher.php`
|
||||
- `redMCP/app/RedmineClient.php`
|
||||
- `redMCP/composer.json`
|
||||
- `redMCP/bin/test-redmine-structure.php`
|
||||
- `TODO.md`
|
||||
|
||||
## Local Verification
|
||||
|
||||
Passed:
|
||||
|
||||
```sh
|
||||
.venv/bin/python -m py_compile semantic_index/*.py
|
||||
.venv/bin/python -m unittest discover -s tests/semantic_index
|
||||
bash -n semantic_index/refresh.sh
|
||||
```
|
||||
|
||||
Observed semantic test result:
|
||||
|
||||
```text
|
||||
Ran 65 tests in 1.041s
|
||||
OK
|
||||
```
|
||||
|
||||
## LAN Redmine Preview
|
||||
|
||||
Passed:
|
||||
|
||||
```sh
|
||||
.venv/bin/python -m semantic_index inspect preview-redmine \
|
||||
--project customer-service \
|
||||
--limit 5
|
||||
```
|
||||
|
||||
Observed:
|
||||
|
||||
- Helpdesk issue chunks include contact id, name, email, and company metadata.
|
||||
- Issue `39779` includes Callum Mackeonis and `callum@safetagtracking.com`.
|
||||
- Journals are present as separate indexed documents.
|
||||
- Contact documents are present as separate indexed documents.
|
||||
|
||||
## Qdrant Audit
|
||||
|
||||
Passed:
|
||||
|
||||
```sh
|
||||
.venv/bin/python -m semantic_index inspect audit --source redmine --limit 5000 --json
|
||||
```
|
||||
|
||||
Observed:
|
||||
|
||||
```text
|
||||
total_documents=2947
|
||||
doc_type contact=714
|
||||
doc_type issue=1208
|
||||
doc_type journal=1025
|
||||
project business-development=66
|
||||
project customer-service=1684
|
||||
project dock-scheduling=63
|
||||
project hiring=409
|
||||
project prep-standardization=25
|
||||
project sales-inbox=192
|
||||
project todo-jason=508
|
||||
contact_metadata=2232
|
||||
helpdesk_contact_metadata=2232/2232
|
||||
attachments=0
|
||||
```
|
||||
|
||||
## HTTP Validation
|
||||
|
||||
Passed:
|
||||
|
||||
```sh
|
||||
curl -sS http://127.0.0.1:8787/health
|
||||
```
|
||||
|
||||
Observed:
|
||||
|
||||
```json
|
||||
{"status":"ok"}
|
||||
```
|
||||
|
||||
Unauthenticated `/projects` correctly returned unauthorized when
|
||||
`SEMANTIC_INDEX_API_KEY` was configured.
|
||||
|
||||
Authenticated `/projects` passed and returned the expected seven projects:
|
||||
|
||||
```text
|
||||
business-development
|
||||
customer-service
|
||||
dock-scheduling
|
||||
hiring
|
||||
prep-standardization
|
||||
sales-inbox
|
||||
todo-jason
|
||||
```
|
||||
|
||||
HTTP search passed:
|
||||
|
||||
```sh
|
||||
semantic_index/search.sh "goods return" customer-service 3
|
||||
```
|
||||
|
||||
Observed:
|
||||
|
||||
- Top result was `redmine:issue:39779:chunk:0`.
|
||||
- Citation included project `customer-service`.
|
||||
- Citation included contact id `1890`, contact name, contact email, and Redmine
|
||||
URL.
|
||||
|
||||
## Refresh Validation
|
||||
|
||||
Passed safe dry-run smoke check:
|
||||
|
||||
```sh
|
||||
SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=5' semantic_index/refresh.sh
|
||||
```
|
||||
|
||||
Observed:
|
||||
|
||||
```text
|
||||
mode=dry-run
|
||||
issues=5
|
||||
scanned_issues=5
|
||||
detail_fetched_issues=0
|
||||
skipped_issues=5
|
||||
would_embed_documents=0
|
||||
embedded_documents=0
|
||||
```
|
||||
|
||||
This confirms the refresh state prefilter skips old issues before Redmine detail
|
||||
fetch and before embedding.
|
||||
|
||||
## Qdrant Validation
|
||||
|
||||
Read-only collection check passed:
|
||||
|
||||
```text
|
||||
collection=redmine_semantic_sample
|
||||
status=green
|
||||
vector_size=1536
|
||||
distance=Cosine
|
||||
points_count=2947
|
||||
update_queue.length=0
|
||||
```
|
||||
|
||||
Read-only snapshot listing endpoint responded successfully:
|
||||
|
||||
```text
|
||||
/collections/redmine_semantic_sample/snapshots
|
||||
result=[]
|
||||
```
|
||||
|
||||
No snapshot was created during this validation.
|
||||
|
||||
## Remaining Pre-Deployment Items
|
||||
|
||||
- Decide final target host paths for logs and refresh state.
|
||||
- Decide service manager shape: manual `uvicorn`, systemd service, or another
|
||||
supervisor.
|
||||
- Create or confirm a Qdrant snapshot immediately before production backfill.
|
||||
- Package only the semantic-index deploy unit, keeping unrelated `redMCP`
|
||||
worktree changes out of the release.
|
||||
- Keep scheduled refresh disabled until manual dry-run and `--apply` logs are
|
||||
reviewed on the target host.
|
||||
@@ -0,0 +1,76 @@
|
||||
# Semantic Index Production Notes
|
||||
|
||||
These notes capture the current production direction for the Redmine semantic
|
||||
index. The service is still local-agent oriented, but the refresh command is now
|
||||
shaped so it can later be run by cron or systemd without changing the command.
|
||||
Use `docs/semantic_index_deployment_runbook.md` for the full deploy, validation,
|
||||
and rollback checklist.
|
||||
|
||||
## Routine Refresh
|
||||
|
||||
Use the wrapper from the repository root:
|
||||
|
||||
```sh
|
||||
semantic_index/refresh.sh
|
||||
```
|
||||
|
||||
By default this is a dry-run. It does not call OpenAI for document embeddings
|
||||
and does not write to Qdrant. To apply a rolling refresh:
|
||||
|
||||
```sh
|
||||
semantic_index/refresh.sh --apply
|
||||
```
|
||||
|
||||
The wrapper writes a timestamped log under `.cache/semantic_index/logs` and uses
|
||||
`.cache/semantic_index/refresh_state.json` for rolling refresh state.
|
||||
|
||||
## Production Overrides
|
||||
|
||||
Use environment variables rather than editing the script:
|
||||
|
||||
```sh
|
||||
SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=500,hiring=200,todo-jason=200,sales-inbox=100,business-development=100,dock-scheduling=100,prep-standardization=100'
|
||||
SEMANTIC_INDEX_LOG_DIR=/var/log/semantic-index
|
||||
SEMANTIC_INDEX_STATE_PATH=/var/lib/semantic-index/refresh_state.json
|
||||
SEMANTIC_INDEX_OVERLAP_MINUTES=15
|
||||
```
|
||||
|
||||
Keep `OPENAI_API_KEY`, `QDRANT_URL`, `REDMINE_URL`, and `REDMINE_API_KEY` in the
|
||||
existing `.env` workflow or in the service manager environment.
|
||||
|
||||
For production-style deployment, use `/opt/semantic-index` for code,
|
||||
`/etc/semantic-index.env` for service environment, `/var/lib/semantic-index`
|
||||
for refresh state, and `/var/log/semantic-index` for refresh logs. Systemd
|
||||
templates live in `deploy/semantic-index/`.
|
||||
|
||||
## Embedding Cost Guard
|
||||
|
||||
Normal refresh embeds only documents that are new or whose Redmine-derived
|
||||
`source_hash` changed. Unchanged documents are left alone. Stale indexed
|
||||
documents for refreshed issues are deleted without embedding.
|
||||
|
||||
Do not schedule `--force-rebuild`. Use it only as a manual maintenance action
|
||||
when intentionally re-embedding unchanged documents.
|
||||
|
||||
## Cron Shape
|
||||
|
||||
A later cron entry can call the same wrapper:
|
||||
|
||||
```cron
|
||||
*/30 * * * * cd /home/iadnah/redmine && semantic_index/refresh.sh --apply
|
||||
```
|
||||
|
||||
Before adding a real schedule, run the wrapper manually and confirm the log
|
||||
shows expected `embedded_documents`, `unchanged_documents`, and
|
||||
`skipped_issues` counts.
|
||||
|
||||
For a quick wrapper smoke check, reduce the project limits:
|
||||
|
||||
```sh
|
||||
SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=5' semantic_index/refresh.sh
|
||||
```
|
||||
|
||||
After refresh state exists, routine dry-runs should show old issues as
|
||||
`skipped_issues` without matching `detail_fetched_issues`. That indicates the
|
||||
refresh is avoiding unnecessary Redmine detail requests before it reaches the
|
||||
embedding cost guard.
|
||||
@@ -0,0 +1,12 @@
|
||||
OPENAI_API_KEY=
|
||||
QDRANT_URL=http://localhost:6333
|
||||
QDRANT_API_KEY=
|
||||
QDRANT_COLLECTION=redmine_semantic_sample
|
||||
REDMINE_URL=http://192.168.50.170
|
||||
REDMINE_API_KEY=
|
||||
REDMINE_PROJECT_IDENTIFIER=fud-helpdesk
|
||||
REDMINE_SAMPLE_LIMIT=500
|
||||
SEMANTIC_INDEX_HOST=127.0.0.1
|
||||
SEMANTIC_INDEX_PORT=8787
|
||||
SEMANTIC_INDEX_API_KEY=
|
||||
SEMANTIC_INDEX_REFRESH_STATE_PATH=.cache/semantic_index/refresh_state.json
|
||||
@@ -0,0 +1,271 @@
|
||||
# Redmine Semantic Index
|
||||
|
||||
Local semantic index service for a recent Redmine Helpdesk sample. V1 uses
|
||||
OpenAI `text-embedding-3-small` embeddings and Qdrant vectors, with Redmine as
|
||||
the first source adapter.
|
||||
|
||||
For deploy, validation, and rollback steps, see
|
||||
`docs/semantic_index_deployment_runbook.md`.
|
||||
|
||||
## Configuration
|
||||
|
||||
Copy `.env.example` to `.env` and set local secrets there. Do not commit `.env`.
|
||||
|
||||
Required for live use:
|
||||
|
||||
- `OPENAI_API_KEY`
|
||||
- `QDRANT_URL`
|
||||
- `REDMINE_URL`
|
||||
- `REDMINE_API_KEY`
|
||||
|
||||
Optional:
|
||||
|
||||
- `QDRANT_API_KEY`
|
||||
- `QDRANT_COLLECTION`
|
||||
- `REDMINE_PROJECT_IDENTIFIER`
|
||||
- `REDMINE_SAMPLE_LIMIT`
|
||||
- `SEMANTIC_INDEX_API_KEY`
- `SEMANTIC_INDEX_REFRESH_STATE_PATH`
|
||||
|
||||
## HTTP
|
||||
|
||||
Install runtime dependencies in your chosen environment:
|
||||
|
||||
```sh
|
||||
pip install openai qdrant-client fastapi uvicorn
|
||||
```
|
||||
|
||||
Run:
|
||||
|
||||
```sh
|
||||
uvicorn semantic_index.app:app --host 127.0.0.1 --port 8787
|
||||
```
|
||||
|
||||
Endpoints:
|
||||
|
||||
- `GET /health`
|
||||
- `POST /sources/redmine/backfill-sample`
- `POST /sources/redmine/refresh`
|
||||
- `POST /search`
|
||||
- `GET /documents/{id}`
|
||||
- `GET /projects`
|
||||
|
||||
If `SEMANTIC_INDEX_API_KEY` is set, pass `Authorization: Bearer <key>`.
|
||||
|
||||
Search response shape is shared by HTTP, MCP, and the Python client:
|
||||
|
||||
```json
|
||||
{
|
||||
"query": "candidate follow up",
|
||||
"filters": {"project_identifier": "hiring", "limit": 5},
|
||||
"results": [
|
||||
{
|
||||
"id": "redmine:issue:123:chunk:0",
|
||||
"score": 0.72,
|
||||
"snippet": "Candidate follow up...",
|
||||
"payload": {},
|
||||
"citation": {
|
||||
"source": "redmine",
|
||||
"doc_type": "issue",
|
||||
"issue_id": 123,
|
||||
"project_identifier": "hiring",
|
||||
"url": "http://redmine/issues/123"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
HTTP examples:
|
||||
|
||||
```sh
|
||||
curl -sS -H "Authorization: Bearer $SEMANTIC_INDEX_API_KEY" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"query":"candidate follow up","project_identifier":"hiring","limit":5}' \
|
||||
http://127.0.0.1:8787/search
|
||||
|
||||
curl -sS -H "Authorization: Bearer $SEMANTIC_INDEX_API_KEY" \
|
||||
http://127.0.0.1:8787/projects
|
||||
```
|
||||
|
||||
## Python Client
|
||||
|
||||
Use the client in-process when running from this repo/environment:
|
||||
|
||||
```python
|
||||
from semantic_index.client import SemanticIndexClient
|
||||
|
||||
client = SemanticIndexClient.local()
|
||||
results = client.search("callum@safetagtracking.com", project_identifier="customer-service", limit=5)
|
||||
document = client.get_document(results["results"][0]["id"])
|
||||
```
|
||||
|
||||
Use HTTP mode from another local program:
|
||||
|
||||
```python
|
||||
from semantic_index.client import SemanticIndexClient
|
||||
|
||||
client = SemanticIndexClient(base_url="http://127.0.0.1:8787", api_key="...")
|
||||
results = client.search("candidate follow up", project_identifier="hiring", limit=5)
|
||||
```
|
||||
|
||||
## Backfill
|
||||
|
||||
Refresh the configured Redmine sample from the command line:
|
||||
|
||||
```sh
|
||||
python3 -m semantic_index --backfill-redmine-sample --limit 50
|
||||
```
|
||||
|
||||
When `REDMINE_PROJECT_IDENTIFIER` is set, the rebuild deletes and replaces only
|
||||
indexed Redmine documents for that project. Without a project identifier, it
|
||||
rebuilds the Redmine source sample for the collection.
|
||||
|
||||
Refresh a balanced multi-project sample:
|
||||
|
||||
```sh
|
||||
python3 -m semantic_index --backfill-redmine-projects \
|
||||
--projects customer-service,hiring,todo-jason,sales-inbox,business-development,dock-scheduling,prep-standardization \
|
||||
--per-project-limit 100
|
||||
```
|
||||
|
||||
Use project-specific limits when Customer Service should stay larger than the
|
||||
internal project sample:
|
||||
|
||||
```sh
|
||||
python3 -m semantic_index --backfill-redmine-projects \
|
||||
--project-limits customer-service=500,hiring=200,todo-jason=200,sales-inbox=100,business-development=100,dock-scheduling=100,prep-standardization=100
|
||||
```
|
||||
|
||||
Multi-project backfill rebuilds each project scope independently. Non-Helpdesk
|
||||
projects are indexed as ordinary Redmine issues and journals; they are not
|
||||
expected to have Helpdesk contact metadata.
|
||||
|
||||
## Rolling Refresh
|
||||
|
||||
Use rolling refresh for routine updates after an initial backfill:
|
||||
|
||||
```sh
|
||||
python3 -m semantic_index --refresh-redmine-projects \
|
||||
--project-limits customer-service=500,hiring=200,todo-jason=200,sales-inbox=100,business-development=100,dock-scheduling=100,prep-standardization=100 \
|
||||
--dry-run
|
||||
```
|
||||
|
||||
Dry-run reports what would change without calling OpenAI or writing to Qdrant.
|
||||
Remove `--dry-run` to apply the refresh.
|
||||
|
||||
The refresh maps each recent Redmine issue to stable document IDs, reads the
|
||||
existing Qdrant payloads for that issue, and compares `source_hash` values.
|
||||
Only new or changed documents are embedded and upserted. Unchanged documents
|
||||
are left alone, and stale documents for refreshed issues are deleted without
|
||||
embedding. Use `--force-rebuild` only when you explicitly want to re-embed
|
||||
matching documents.
|
||||
|
||||
The default local state file is `.cache/semantic_index/refresh_state.json`.
|
||||
After a successful refresh, later runs skip issues older than the previous
|
||||
success timestamp minus `--overlap-minutes` unless `--force-rebuild` is used.
|
||||
Override the state file location with `--state-path`:
|
||||
|
||||
```sh
|
||||
python3 -m semantic_index --refresh-redmine-projects \
|
||||
--project-limits customer-service=500 \
|
||||
--state-path /tmp/semantic-refresh-state.json
|
||||
```
|
||||
|
||||
The HTTP endpoint exposes the same behavior:
|
||||
|
||||
```sh
|
||||
curl -sS -X POST http://127.0.0.1:8787/sources/redmine/refresh \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"project_limits":{"customer-service":500},"dry_run":true}'
|
||||
```
|
||||
|
||||
For production-style operation, use the wrapper script. It defaults to dry-run
|
||||
and writes timestamped logs under `.cache/semantic_index/logs`:
|
||||
|
||||
```sh
|
||||
semantic_index/refresh.sh
|
||||
semantic_index/refresh.sh --apply
|
||||
```
|
||||
|
||||
For a quick smoke check of the wrapper path:
|
||||
|
||||
```sh
|
||||
SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=5' semantic_index/refresh.sh
|
||||
```
|
||||
|
||||
Override project limits, state path, or log location through environment
|
||||
variables:
|
||||
|
||||
```sh
|
||||
SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=500,hiring=200' \
|
||||
SEMANTIC_INDEX_LOG_DIR=/var/log/semantic-index \
|
||||
SEMANTIC_INDEX_STATE_PATH=/var/lib/semantic-index/refresh_state.json \
|
||||
semantic_index/refresh.sh --apply
|
||||
```
|
||||
|
||||
Do not schedule `--force-rebuild`. Force rebuilds should stay manual because
|
||||
they intentionally re-embed unchanged documents.
|
||||
|
||||
## MCP Stdio
|
||||
|
||||
```sh
|
||||
python3 -m semantic_index --mcp-stdio
|
||||
```
|
||||
|
||||
Tools:
|
||||
|
||||
- `semantic_search`
|
||||
- `semantic_get_document`
|
||||
- `semantic_list_projects`
|
||||
- `semantic_backfill_redmine_sample`
|
||||
- `semantic_refresh_redmine`
|
||||
|
||||
For agent workflows, list projects first when the user has not named a project,
|
||||
search broadly or with `project_identifier` when known, then call
|
||||
`semantic_get_document` for any promising result. Treat returned citations and
|
||||
Redmine URLs as the authoritative references. Backfill tools are operational and
|
||||
should not be part of normal search behavior.
|
||||
|
||||
## Inspection CLI
|
||||
|
||||
Use the inspect commands before larger backfills to see what is already indexed
|
||||
or preview what Redmine would produce without writing to Qdrant.
|
||||
|
||||
```sh
|
||||
python3 -m semantic_index inspect count --source redmine --project customer-service
|
||||
python3 -m semantic_index inspect list --limit 20 --source redmine --project customer-service
|
||||
python3 -m semantic_index inspect search "order status" --limit 5 --project customer-service
|
||||
python3 -m semantic_index inspect search "customer@example.com" --limit 5 --project customer-service
|
||||
python3 -m semantic_index inspect show redmine:issue:39778:chunk:0
|
||||
python3 -m semantic_index inspect preview-redmine --limit 10 --project customer-service
|
||||
python3 -m semantic_index inspect audit --source redmine --project customer-service --limit 500
|
||||
python3 -m semantic_index inspect compare-redmine --project customer-service --limit 20
|
||||
python3 -m semantic_index inspect smoke-search --project customer-service
|
||||
```
|
||||
|
||||
`count`, `list`, `show`, and `preview-redmine` do not call OpenAI.
|
||||
`search` embeds the query text. List/search output shows snippets by default;
|
||||
pass `--full-text` when you need the full indexed text.
|
||||
`audit` summarizes indexed document coverage without calling OpenAI.
|
||||
`compare-redmine` previews live Redmine chunks and compares them to indexed
|
||||
Qdrant documents without writing to Qdrant. `smoke-search` runs known search
|
||||
checks and calls OpenAI for query embeddings. Pass `--json` to `audit`,
|
||||
`compare-redmine`, or `smoke-search` for machine-readable output.
|
||||
For mixed project samples, run `audit` without `--project` to see project-level
|
||||
counts and Helpdesk-contact coverage separately from ordinary internal issues.
|
||||
|
||||
For Helpdesk tickets, Redmine issue ingestion expects
|
||||
`/issues/:id.json?include=journals,helpdesk` to return `helpdesk_ticket`
|
||||
metadata with an expanded contact. See
|
||||
`docs/redmine_issue_api_helpdesk_include.md` for the Redmine API patch notes.
|
||||
|
||||
## Qdrant
|
||||
|
||||
For local Docker-hosted Qdrant:
|
||||
|
||||
```sh
|
||||
docker run -p 6333:6333 -p 6334:6334 -v qdrant_storage:/qdrant/storage qdrant/qdrant
|
||||
```
|
||||
|
||||
Create snapshots with Qdrant's snapshot API or mounted storage tooling before
|
||||
destructive maintenance. The default collection name is
|
||||
`redmine_semantic_sample`.
|
||||
@@ -0,0 +1,12 @@
|
||||
"""Local semantic index service for Redmine and future source adapters."""
|
||||
|
||||
__all__ = [
|
||||
"config",
|
||||
"embeddings",
|
||||
"ingest",
|
||||
"mcp",
|
||||
"models",
|
||||
"qdrant_store",
|
||||
"redmine",
|
||||
"search",
|
||||
]
|
||||
@@ -0,0 +1,206 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Callable, Dict, List, Optional
|
||||
|
||||
from .app import build_services
|
||||
from .config import Settings, load_settings
|
||||
from .inspect import (
|
||||
print_audit,
|
||||
print_compare_redmine,
|
||||
print_count,
|
||||
print_list,
|
||||
print_preview_redmine,
|
||||
print_search,
|
||||
print_show,
|
||||
print_smoke_search,
|
||||
)
|
||||
from .mcp import SemanticMCP, serve_stdio
|
||||
from .refresh import FileRefreshState
|
||||
from .redmine import RedmineApiSource
|
||||
|
||||
|
||||
def build_preview_services(settings: Settings) -> Dict[str, object]:
    """Build the reduced service map used by ``inspect preview-redmine``.

    Only the settings object and a Redmine API source are created, so no
    embedding client or vector store is constructed on this path.

    Args:
        settings: Loaded configuration supplying the Redmine URL, API key,
            and optional project identifier.

    Returns:
        A dict with the keys ``"settings"`` and ``"redmine_source"``.
    """
    source = RedmineApiSource(
        redmine_url=settings.redmine_url,
        # An unset key is passed on as an empty string rather than None.
        api_key=settings.redmine_api_key or "",
        project_identifier=settings.redmine_project_identifier,
    )
    preview: Dict[str, object] = {"settings": settings}
    preview["redmine_source"] = source
    return preview
|
||||
|
||||
|
||||
def parse_projects(raw: str) -> List[str]:
    """Split a comma-separated list of project identifiers.

    Surrounding whitespace is removed from every entry and entries that are
    empty after stripping are dropped; input order is preserved.
    """
    trimmed = map(str.strip, raw.split(","))
    return list(filter(None, trimmed))
|
||||
|
||||
|
||||
def parse_project_limits(raw: str) -> Dict[str, int]:
    """Parse ``project=limit`` pairs separated by commas.

    Whitespace around items, project names, and limits is ignored, and blank
    items (for example from a trailing comma) are skipped. When a project is
    listed more than once the last limit wins.

    Args:
        raw: Text such as ``"customer-service=500,hiring=200"``.

    Returns:
        Mapping of project identifier to integer limit.

    Raises:
        ValueError: If an item has no ``=``, has an empty project name, or
            has a limit that is not an integer. The message names the
            offending item so it can be shown directly to the user.
    """
    project_limits: Dict[str, int] = {}
    for item in raw.split(","):
        entry = item.strip()
        if not entry:
            continue
        project, separator, limit = entry.partition("=")
        project = project.strip()
        limit = limit.strip()
        # Reject "hiring" (no "=") and "=5" (no project) explicitly instead of
        # failing on tuple unpacking or silently storing an empty key.
        if not separator or not project:
            raise ValueError(f"invalid project limit {entry!r}: expected project=limit")
        try:
            project_limits[project] = int(limit)
        except ValueError:
            raise ValueError(
                f"invalid limit {limit!r} for project {project!r}: expected an integer"
            ) from None
    return project_limits
|
||||
|
||||
|
||||
def main(
    argv: Optional[List[str]] = None,
    service_builder: Callable[[], Dict[str, object]] = build_services,
    preview_service_builder: Optional[Callable[[Settings], Dict[str, object]]] = None,
    settings_loader: Callable[[], Settings] = load_settings,
) -> None:
    """Command-line entry point for the semantic index helper.

    Exactly one action runs per invocation, checked in this order:
    ``inspect preview-redmine``, ``--backfill-redmine-sample``,
    ``--backfill-redmine-projects``, ``--refresh-redmine-projects``,
    ``--mcp-stdio``, then the remaining ``inspect`` subcommands. With no
    action selected the help text is printed.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.
        service_builder: Zero-argument factory returning the service map.
        preview_service_builder: Optional factory used only for
            ``inspect preview-redmine``; it receives the loaded settings.
        settings_loader: Zero-argument factory returning settings for the
            preview path.

    Raises:
        SystemExit: Raised by argparse for invalid arguments, including a
            malformed ``--project-limits`` value.
    """
    parser = argparse.ArgumentParser(description="Semantic index helper", allow_abbrev=False)
    parser.add_argument("--mcp-stdio", action="store_true", help="Run the MCP-compatible stdio tool server")
    parser.add_argument("--backfill-redmine-sample", action="store_true", help="Backfill the configured Redmine sample")
    parser.add_argument("--backfill-redmine-projects", action="store_true", help="Backfill multiple Redmine projects")
    parser.add_argument("--refresh-redmine-projects", action="store_true", help="Refresh recent Redmine issues without re-embedding unchanged documents")
    parser.add_argument("--projects", help="Comma-separated Redmine project identifiers for multi-project backfill")
    parser.add_argument("--project-limits", help="Comma-separated project=limit pairs for multi-project backfill")
    parser.add_argument("--per-project-limit", type=int, default=500)
    parser.add_argument("--limit", type=int, default=500)
    parser.add_argument("--dry-run", action="store_true", help="Report planned refresh work without embeddings or writes")
    parser.add_argument("--force-rebuild", action="store_true", help="Embed and upsert refresh candidates even when source hashes match")
    parser.add_argument("--overlap-minutes", type=int, default=15, help="Refresh overlap window for rolling update state")
    parser.add_argument("--state-path", help="Override rolling refresh state file path")
    subparsers = parser.add_subparsers(dest="command")

    inspect_parser = subparsers.add_parser("inspect", help="Inspect indexed documents and preview Redmine chunks")
    inspect_subparsers = inspect_parser.add_subparsers(dest="inspect_command", required=True)

    def add_filters(command_parser: argparse.ArgumentParser) -> None:
        # Filters shared by the subcommands that read indexed documents.
        command_parser.add_argument("--source", default="redmine")
        command_parser.add_argument("--project", dest="project_identifier")
        command_parser.add_argument("--doc-type")

    count_parser = inspect_subparsers.add_parser("count", help="Count indexed documents")
    add_filters(count_parser)

    list_parser = inspect_subparsers.add_parser("list", help="List indexed documents")
    add_filters(list_parser)
    list_parser.add_argument("--limit", type=int, default=20)
    list_parser.add_argument("--full-text", action="store_true")

    search_parser = inspect_subparsers.add_parser("search", help="Search indexed documents")
    search_parser.add_argument("query")
    add_filters(search_parser)
    search_parser.add_argument("--limit", type=int, default=10)
    search_parser.add_argument("--full-text", action="store_true")

    show_parser = inspect_subparsers.add_parser("show", help="Show one indexed document")
    show_parser.add_argument("document_id")

    preview_parser = inspect_subparsers.add_parser("preview-redmine", help="Preview Redmine chunks without writing to Qdrant")
    preview_parser.add_argument("--limit", type=int, default=10)
    preview_parser.add_argument("--project", dest="project_identifier")
    preview_parser.add_argument("--full-text", action="store_true")

    audit_parser = inspect_subparsers.add_parser("audit", help="Audit indexed documents for trust-check coverage")
    add_filters(audit_parser)
    audit_parser.add_argument("--limit", type=int, default=500)
    audit_parser.add_argument("--json", action="store_true")

    compare_parser = inspect_subparsers.add_parser("compare-redmine", help="Compare live Redmine preview chunks with indexed documents")
    compare_parser.add_argument("--limit", type=int, default=20)
    compare_parser.add_argument("--project", dest="project_identifier")
    compare_parser.add_argument("--json", action="store_true")

    smoke_parser = inspect_subparsers.add_parser("smoke-search", help="Run repeatable search checks against indexed documents")
    smoke_parser.add_argument("--project", dest="project_identifier")
    smoke_parser.add_argument("--email", default="callum@safetagtracking.com")
    smoke_parser.add_argument("--issue-id", type=int, default=39779)
    smoke_parser.add_argument("--order-token")
    smoke_parser.add_argument("--natural-query", default="customer needs goods returned")
    smoke_parser.add_argument("--json", action="store_true")

    args = parser.parse_args(argv)

    def project_limits_from_args() -> Dict[str, int]:
        # Report a malformed value as a usage error (exit status 2) instead of
        # letting the ValueError surface as a traceback.
        try:
            return parse_project_limits(args.project_limits)
        except ValueError as exc:
            parser.error(f"invalid --project-limits value: {exc}")

    action_requested = (
        args.command
        or args.backfill_redmine_sample
        or args.backfill_redmine_projects
        or args.refresh_redmine_projects
        or args.mcp_stdio
    )
    if not action_requested:
        parser.print_help()
        return

    if args.command == "inspect" and args.inspect_command == "preview-redmine":
        # The preview path avoids the default full service build: an explicit
        # preview builder wins, then the lightweight built-in preview services,
        # and only a caller-supplied service_builder is used as-is.
        if preview_service_builder is not None:
            services = preview_service_builder(settings_loader())
        elif service_builder is build_services:
            services = build_preview_services(settings_loader())
        else:
            services = service_builder()
        project = args.project_identifier or services["settings"].redmine_project_identifier
        print_preview_redmine(services["redmine_source"], services["settings"].redmine_url, project, args.limit, args.full_text)
        return

    services = service_builder()
    if args.state_path and "refresh" in services and hasattr(services["refresh"], "state"):
        services["refresh"].state = FileRefreshState(Path(args.state_path))

    if args.backfill_redmine_sample:
        print(services["backfill"].backfill_redmine_sample(limit=args.limit))
        return

    if args.backfill_redmine_projects:
        if args.project_limits:
            print(services["backfill"].backfill_redmine_project_limits(project_limits_from_args()))
            return
        projects = parse_projects(args.projects or "")
        if not projects:
            parser.error("--projects or --project-limits is required with --backfill-redmine-projects")
        print(services["backfill"].backfill_redmine_projects(projects, per_project_limit=args.per_project_limit))
        return

    if args.refresh_redmine_projects:
        if args.project_limits:
            project_limits = project_limits_from_args()
        else:
            projects = parse_projects(args.projects or "")
            if not projects:
                parser.error("--projects or --project-limits is required with --refresh-redmine-projects")
            project_limits = {project: args.per_project_limit for project in projects}
        print(
            services["refresh"].refresh_redmine_project_limits(
                project_limits,
                dry_run=args.dry_run,
                force_rebuild=args.force_rebuild,
                overlap_minutes=args.overlap_minutes,
            )
        )
        return

    if args.mcp_stdio:
        serve_stdio(SemanticMCP(search_service=services["search"], backfill_service=services["backfill"], store=services["store"], refresh_service=services.get("refresh")))
        return

    if args.command == "inspect":
        if args.inspect_command == "count":
            print_count(services["store"], args.source, args.project_identifier, args.doc_type)
            return
        if args.inspect_command == "list":
            print_list(services["store"], args.limit, args.source, args.project_identifier, args.doc_type, args.full_text)
            return
        if args.inspect_command == "search":
            print_search(services["search"], args.query, args.limit, args.source, args.project_identifier, args.doc_type, args.full_text)
            return
        if args.inspect_command == "show":
            print_show(services["search"], args.document_id)
            return
        if args.inspect_command == "audit":
            print_audit(services["store"], args.limit, args.source, args.project_identifier, args.doc_type, args.json)
            return
        if args.inspect_command == "compare-redmine":
            project = args.project_identifier or services["settings"].redmine_project_identifier
            print_compare_redmine(services["store"], services["redmine_source"], services["settings"].redmine_url, project, args.limit, args.json)
            return
        if args.inspect_command == "smoke-search":
            project = args.project_identifier or services["settings"].redmine_project_identifier
            print_smoke_search(
                services["search"],
                project,
                args.email,
                args.issue_id,
                args.order_token,
                args.natural_query,
                args.json,
            )
            return
    parser.print_help()


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,153 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Callable, Dict, Optional
|
||||
|
||||
from .config import Settings, load_settings
|
||||
from .embeddings import OpenAIEmbedder, OpenAIEmbeddingClient
|
||||
from .ingest import BackfillService
|
||||
from .models import SearchQuery, search_response
|
||||
from .qdrant_store import QdrantStore
|
||||
from .refresh import FileRefreshState, RedmineRefreshService
|
||||
from .redmine import RedmineApiSource, RedmineMapper
|
||||
from .search import HybridSearchService
|
||||
|
||||
|
||||
def build_services(settings: Optional[Settings] = None) -> Dict[str, Any]:
    """Construct every service object and return them keyed by role.

    Args:
        settings: Configuration to use; when omitted, ``load_settings()``
            supplies it.

    Returns:
        A dict with the keys ``"settings"``, ``"search"``, ``"backfill"``,
        ``"refresh"``, ``"store"``, and ``"redmine_source"``. The search,
        backfill, and refresh services share one embedder and one store.
    """
    if not settings:
        settings = load_settings()

    def new_mapper() -> RedmineMapper:
        # Backfill and refresh each receive their own mapper instance.
        return RedmineMapper(redmine_url=settings.redmine_url, project_identifier=settings.redmine_project_identifier)

    embedder = OpenAIEmbedder(client=OpenAIEmbeddingClient(api_key=settings.openai_api_key))
    store = QdrantStore(
        url=settings.qdrant_url,
        api_key=settings.qdrant_api_key,
        collection=settings.qdrant_collection,
    )
    redmine_source = RedmineApiSource(
        redmine_url=settings.redmine_url,
        api_key=settings.redmine_api_key or "",
        project_identifier=settings.redmine_project_identifier,
    )

    registry: Dict[str, Any] = {"settings": settings}
    registry["search"] = HybridSearchService(embedder=embedder, store=store)
    registry["backfill"] = BackfillService(
        source=redmine_source,
        embedder=embedder,
        store=store,
        mapper=new_mapper(),
    )
    registry["refresh"] = RedmineRefreshService(
        source=redmine_source,
        embedder=embedder,
        store=store,
        mapper=new_mapper(),
        state=FileRefreshState(settings.refresh_state_path),
    )
    registry["store"] = store
    registry["redmine_source"] = redmine_source
    return registry
|
||||
|
||||
|
||||
def create_app(settings: Optional[Settings] = None, service_builder: Optional[Callable[[], Dict[str, Any]]] = None):
    """Create the FastAPI application exposing the semantic index over HTTP.

    Services are built lazily on the first request that needs them and are
    then reused for the lifetime of the returned app.

    Args:
        settings: Optional configuration forwarded to ``build_services`` when
            no ``service_builder`` is given.
        service_builder: Optional zero-argument factory returning the service
            map; when provided it is used instead of ``build_services``.

    Returns:
        The configured FastAPI application.

    Raises:
        RuntimeError: If FastAPI cannot be imported.
    """
    import hmac

    try:
        from fastapi import FastAPI, Header, HTTPException
    except ImportError as exc:
        raise RuntimeError("Install fastapi and uvicorn to run the HTTP service") from exc

    services: Optional[Dict[str, Any]] = None
    app = FastAPI(title="Redmine Semantic Index", version="0.1.0")

    def get_services() -> Dict[str, Any]:
        # Build once, on first use, then hand back the cached map.
        nonlocal services
        if services is None:
            if service_builder is not None:
                services = service_builder()
            else:
                services = build_services(settings)
        return services

    def authorize(authorization: Optional[str]) -> None:
        # Authentication is disabled when no service API key is configured.
        api_key = get_services()["settings"].service_api_key
        if not api_key:
            return
        expected = f"Bearer {api_key}".encode("utf-8")
        provided = (authorization or "").encode("utf-8")
        # Constant-time comparison so response timing does not leak how much
        # of the token matched.
        if not hmac.compare_digest(provided, expected):
            raise HTTPException(status_code=401, detail="unauthorized")

    def int_field(payload: Dict[str, Any], name: str, default: Any) -> int:
        # Malformed client input is a 400, not an unhandled 500.
        try:
            return int(payload.get(name, default))
        except (TypeError, ValueError):
            raise HTTPException(status_code=400, detail=f"{name} must be an integer") from None

    @app.get("/health")
    def health() -> Dict[str, str]:
        return {"status": "ok"}

    # ``Optional[...]`` rather than ``X | None``: FastAPI evaluates these
    # annotations at registration time, and the ``|`` form fails before 3.10.
    @app.post("/sources/redmine/backfill-sample")
    def backfill(payload: Optional[Dict[str, Any]] = None, authorization: Optional[str] = Header(default=None)) -> Dict[str, Any]:
        authorize(authorization)
        active_services = get_services()
        limit = int_field(payload or {}, "limit", active_services["settings"].sample_limit)
        return active_services["backfill"].backfill_redmine_sample(limit=limit)

    @app.post("/sources/redmine/refresh")
    def refresh(payload: Optional[Dict[str, Any]] = None, authorization: Optional[str] = Header(default=None)) -> Dict[str, Any]:
        authorize(authorization)
        payload = payload or {}
        active_services = get_services()
        project_limits = payload.get("project_limits")
        if not project_limits:
            # Fall back to a single project: the one named in the request, or
            # else the configured default project.
            project = payload.get("project_identifier") or active_services["settings"].redmine_project_identifier
            if not project:
                raise HTTPException(status_code=400, detail="project_limits or project_identifier is required")
            project_limits = {project: int_field(payload, "limit", active_services["settings"].sample_limit)}
        elif not isinstance(project_limits, dict):
            raise HTTPException(status_code=400, detail="project_limits must be an object of project=limit pairs")
        try:
            normalized_limits = {str(project): int(limit) for project, limit in project_limits.items()}
        except (TypeError, ValueError):
            raise HTTPException(status_code=400, detail="project_limits values must be integers") from None
        return active_services["refresh"].refresh_redmine_project_limits(
            normalized_limits,
            dry_run=bool(payload.get("dry_run", False)),
            force_rebuild=bool(payload.get("force_rebuild", False)),
            overlap_minutes=int_field(payload, "overlap_minutes", 15),
        )

    @app.post("/search")
    def search(payload: Dict[str, Any], authorization: Optional[str] = Header(default=None)) -> Dict[str, Any]:
        authorize(authorization)
        query = SearchQuery(
            # Either "query" or "text" may carry the search string.
            text=payload.get("query") or payload.get("text") or "",
            source=payload.get("source"),
            project_id=payload.get("project_id"),
            project_identifier=payload.get("project_identifier"),
            doc_type=payload.get("doc_type"),
            issue_id=payload.get("issue_id"),
            contact_id=payload.get("contact_id"),
            contact_email=payload.get("contact_email"),
            date_from=payload.get("date_from"),
            date_to=payload.get("date_to"),
            limit=int_field(payload, "limit", 10),
            include_snippets=bool(payload.get("include_snippets", True)),
        )
        results = get_services()["search"].search(query)
        return search_response(query, results)

    @app.get("/projects")
    def projects(authorization: Optional[str] = Header(default=None)) -> Dict[str, Any]:
        authorize(authorization)
        return {"projects": get_services()["store"].list_projects(source="redmine")}

    @app.get("/documents/{document_id}")
    def document(document_id: str, authorization: Optional[str] = Header(default=None)) -> Dict[str, Any]:
        authorize(authorization)
        found = get_services()["search"].get_document(document_id)
        if found is None:
            raise HTTPException(status_code=404, detail="not_found")
        return found

    return app
|
||||
|
||||
|
||||
class LazyASGIApp:
    """ASGI callable that defers ``create_app()`` until the first call.

    Importing this module therefore does not build the real application; it
    is created once, on first use, and reused afterwards.
    """

    def __init__(self) -> None:
        # Holds the real application once it has been created.
        self._app = None

    async def __call__(self, scope, receive, send):
        application = self._app
        if application is None:
            application = self._app = create_app()
        await application(scope, receive, send)


app = LazyASGIApp()
|
||||
@@ -0,0 +1,25 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import List
|
||||
|
||||
|
||||
def chunk_text(text: str, max_chars: int = 3500, overlap: int = 300) -> List[str]:
    """Split text into chunks of at most ``max_chars`` characters.

    The text is first normalised by stripping it and removing trailing
    whitespace from every line. A chunk ends early at the last paragraph
    break (blank line) or sentence end (``". "``) in its window when that
    boundary lies past the window's midpoint. Each following chunk starts
    ``overlap`` characters before the previous chunk's end.

    Args:
        text: Source text.
        max_chars: Maximum chunk length; must be positive.
        overlap: Characters repeated between consecutive chunks; must not be
            negative.

    Returns:
        The non-empty, stripped chunks in order; ``[]`` for blank input.

    Raises:
        ValueError: If ``max_chars`` is not positive or ``overlap`` is
            negative.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    if overlap < 0:
        raise ValueError("overlap must not be negative")

    cleaned = "\n".join(line.rstrip() for line in text.strip().splitlines()).strip()
    if not cleaned:
        return []
    if len(cleaned) <= max_chars:
        return [cleaned]

    total = len(cleaned)
    chunks: List[str] = []
    start = 0
    while start < total:
        end = min(start + max_chars, total)
        if end < total:
            boundary = max(cleaned.rfind("\n\n", start, end), cleaned.rfind(". ", start, end))
            # Only cut at a boundary in the second half of the window so
            # chunks do not become needlessly short.
            if boundary > start + int(max_chars * 0.5):
                end = boundary + 1
        chunks.append(cleaned[start:end].strip())
        if end >= total:
            break
        next_start = end - overlap
        # Guarantee forward progress: an overlap as large as the chunk just
        # emitted would otherwise restart at or before ``start`` and never
        # terminate.
        if next_start <= start:
            next_start = end
        start = next_start
    return [chunk for chunk in chunks if chunk]
|
||||
@@ -0,0 +1,72 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import urllib.request
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from .app import build_services
|
||||
from .models import SearchQuery, search_response
|
||||
|
||||
|
||||
class SemanticIndexClient:
|
||||
def __init__(
|
||||
self,
|
||||
base_url: Optional[str] = None,
|
||||
api_key: Optional[str] = None,
|
||||
search_service: Optional[Any] = None,
|
||||
) -> None:
|
||||
self.base_url = base_url.rstrip("/") if base_url else None
|
||||
self.api_key = api_key
|
||||
self.search_service = search_service
|
||||
|
||||
@classmethod
|
||||
def local(cls) -> "SemanticIndexClient":
|
||||
return cls(search_service=build_services()["search"])
|
||||
|
||||
def search(self, query: str, **filters: Any) -> Dict[str, Any]:
|
||||
if self.base_url:
|
||||
return self._post_json("/search", {"query": query, **filters})
|
||||
search_service = self.search_service or build_services()["search"]
|
||||
search_query = SearchQuery(
|
||||
text=query,
|
||||
source=filters.get("source"),
|
||||
project_id=filters.get("project_id"),
|
||||
project_identifier=filters.get("project_identifier"),
|
||||
doc_type=filters.get("doc_type"),
|
||||
issue_id=filters.get("issue_id"),
|
||||
contact_id=filters.get("contact_id"),
|
||||
contact_email=filters.get("contact_email"),
|
||||
date_from=filters.get("date_from"),
|
||||
date_to=filters.get("date_to"),
|
||||
limit=int(filters.get("limit", 10)),
|
||||
include_snippets=bool(filters.get("include_snippets", True)),
|
||||
)
|
||||
return search_response(search_query, search_service.search(search_query))
|
||||
|
||||
def get_document(self, document_id: str) -> Dict[str, Any]:
|
||||
if self.base_url:
|
||||
return self._get_json(f"/documents/{document_id}")
|
||||
search_service = self.search_service or build_services()["search"]
|
||||
return search_service.get_document(document_id) or {"error": "not_found", "id": document_id}
|
||||
|
||||
def _post_json(self, path: str, payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||
data = json.dumps(payload).encode("utf-8")
|
||||
request = urllib.request.Request(
|
||||
f"{self.base_url}{path}",
|
||||
data=data,
|
||||
headers=self._headers(),
|
||||
method="POST",
|
||||
)
|
||||
with urllib.request.urlopen(request, timeout=60) as response:
|
||||
return json.loads(response.read().decode("utf-8"))
|
||||
|
||||
def _get_json(self, path: str) -> Dict[str, Any]:
|
||||
request = urllib.request.Request(f"{self.base_url}{path}", headers=self._headers())
|
||||
with urllib.request.urlopen(request, timeout=60) as response:
|
||||
return json.loads(response.read().decode("utf-8"))
|
||||
|
||||
def _headers(self) -> Dict[str, str]:
|
||||
headers = {"Content-Type": "application/json"}
|
||||
if self.api_key:
|
||||
headers["Authorization"] = f"Bearer {self.api_key}"
|
||||
return headers
|
||||
@@ -0,0 +1,64 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Settings:
    """Immutable runtime configuration for the semantic index."""

    # Embedding provider credentials.
    openai_api_key: Optional[str]

    # Vector store connection.
    qdrant_url: str
    qdrant_api_key: Optional[str]
    qdrant_collection: str

    # Redmine source connection and default sampling.
    redmine_url: str
    redmine_api_key: Optional[str]
    redmine_project_identifier: Optional[str]
    sample_limit: int

    # HTTP service binding and optional bearer key.
    bind_host: str
    bind_port: int
    service_api_key: Optional[str]

    # Location of the rolling-refresh state file.
    refresh_state_path: Path
|
||||
|
||||
|
||||
def load_dotenv(path: str | Path = ".env") -> Dict[str, str]:
    """Parse a dotenv-style file into a ``{key: value}`` dictionary.

    Blank lines, ``#`` comment lines and lines without ``=`` are ignored.
    A leading ``export `` (shell syntax) is accepted and dropped. Keys and
    values are whitespace-trimmed, and one pair of *matching* surrounding
    quotes (single or double) is removed from the value; quote characters
    that are actually part of the value are left intact.

    Args:
        path: Location of the dotenv file.

    Returns:
        The parsed mapping; empty when the file does not exist.
    """
    values: Dict[str, str] = {}
    dotenv = Path(path)
    if not dotenv.exists():
        return values
    for raw_line in dotenv.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        # Accept shell-style "export KEY=value" so the same file can be sourced.
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        if "=" not in line:
            continue
        key, value = line.split("=", 1)
        value = value.strip()
        # Strip only a balanced pair of quotes; a lone or mismatched quote
        # belongs to the value (e.g. a secret ending in an apostrophe).
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
            value = value[1:-1]
        values[key.strip()] = value
    return values
|
||||
|
||||
|
||||
def resolve_dotenv_path(dotenv_path: str | Path = ".env") -> Path:
    """Pick the dotenv file to load.

    Prefers ``dotenv_path`` itself; when that does not exist, falls back to
    ``semantic_index/.env`` next to it. If neither exists the requested path
    is returned unchanged.
    """
    requested = Path(dotenv_path)
    package_fallback = requested.parent / "semantic_index" / ".env"
    for candidate in (requested, package_fallback):
        if candidate.exists():
            return candidate
    return requested
|
||||
|
||||
|
||||
def load_settings(dotenv_path: str | Path = ".env") -> Settings:
    """Build ``Settings`` from the dotenv file overlaid with ``os.environ``.

    Process environment variables take precedence over dotenv entries.
    Numeric settings that are present but blank (for example
    ``SEMANTIC_INDEX_PORT=`` left empty in an env template) fall back to
    their default instead of crashing.

    Args:
        dotenv_path: Preferred dotenv location, resolved through
            ``resolve_dotenv_path``.

    Raises:
        ValueError: if a numeric setting holds a non-integer value; the
            message names the offending variable.
    """
    env = {**load_dotenv(resolve_dotenv_path(dotenv_path)), **os.environ}

    def int_setting(name: str, default: int) -> int:
        # Blank or missing means "not configured"; anything else must parse.
        raw = (env.get(name) or "").strip()
        if not raw:
            return default
        try:
            return int(raw)
        except ValueError as exc:
            raise ValueError(f"{name} must be an integer, got {raw!r}") from exc

    return Settings(
        openai_api_key=env.get("OPENAI_API_KEY"),
        qdrant_url=env.get("QDRANT_URL", "http://localhost:6333"),
        qdrant_api_key=env.get("QDRANT_API_KEY"),
        qdrant_collection=env.get("QDRANT_COLLECTION", "redmine_semantic_sample"),
        redmine_url=env.get("REDMINE_URL", "http://localhost"),
        redmine_api_key=env.get("REDMINE_API_KEY"),
        redmine_project_identifier=env.get("REDMINE_PROJECT_IDENTIFIER"),
        sample_limit=int_setting("REDMINE_SAMPLE_LIMIT", 500),
        bind_host=env.get("SEMANTIC_INDEX_HOST", "127.0.0.1"),
        bind_port=int_setting("SEMANTIC_INDEX_PORT", 8787),
        service_api_key=env.get("SEMANTIC_INDEX_API_KEY"),
        refresh_state_path=Path(env.get("SEMANTIC_INDEX_REFRESH_STATE_PATH", ".cache/semantic_index/refresh_state.json")),
    )
|
||||
@@ -0,0 +1,64 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterable, List, Optional, Protocol, Sequence
|
||||
|
||||
from .models import IndexDocument
|
||||
|
||||
|
||||
class EmbeddingClient(Protocol):
    """Structural interface for a backend that turns texts into vectors."""

    def create_embeddings(self, model: str, inputs: Sequence[str], dimensions: Optional[int] = None) -> List[List[float]]:
        """Return embedding vectors for ``inputs`` using ``model``.

        ``dimensions`` optionally requests a specific vector size.
        NOTE(review): callers in this module concatenate the results batch by
        batch, so presumably one vector per input in input order - confirm
        against each implementation.
        """
        ...
|
||||
|
||||
|
||||
class OpenAIEmbeddingClient:
    """Embedding client that delegates to the ``openai`` SDK."""

    def __init__(self, api_key: Optional[str] = None) -> None:
        """Create the SDK client.

        Raises:
            RuntimeError: if the ``openai`` package cannot be imported.
        """
        # Imported here so this module loads even without the SDK installed.
        try:
            from openai import OpenAI as sdk_client_class
        except ImportError as exc:
            raise RuntimeError("Install openai to use live embeddings") from exc
        self.client = sdk_client_class(api_key=api_key)

    def create_embeddings(self, model: str, inputs: Sequence[str], dimensions: Optional[int] = None) -> List[List[float]]:
        """Request embeddings for ``inputs`` and return the vectors in response order.

        ``dimensions`` is forwarded to the API only when it is not ``None``.
        """
        request_args = dict(model=model, input=list(inputs))
        if dimensions is not None:
            request_args.update(dimensions=dimensions)
        reply = self.client.embeddings.create(**request_args)
        vectors = []
        for entry in reply.data:
            vectors.append(entry.embedding)
        return vectors
|
||||
|
||||
|
||||
class OpenAIEmbedder:
    """Validates texts and embeds them in batches through an ``EmbeddingClient``."""

    def __init__(
        self,
        client: EmbeddingClient,
        model: str = "text-embedding-3-small",
        dimensions: int = 1536,
        batch_size: int = 100,
        max_chars: int = 12000,
    ) -> None:
        """Store the embedding configuration.

        Args:
            client: Backend that performs the embedding calls.
            model: Model name forwarded to the client.
            dimensions: Vector size forwarded to the client.
            batch_size: Maximum number of texts per client call; must be >= 1.
            max_chars: Maximum accepted length of a single text.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1. A negative step
                would otherwise make ``embed_texts`` silently return no
                vectors at all.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        self.client = client
        self.model = model
        self.dimensions = dimensions
        self.batch_size = batch_size
        self.max_chars = max_chars

    def embed_documents(self, documents: Sequence[IndexDocument]) -> List[List[float]]:
        """Embed the ``text`` of every document, preserving order."""
        return self.embed_texts([document.text for document in documents])

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string and return its vector."""
        return self.embed_texts([text])[0]

    def embed_texts(self, texts: Iterable[str]) -> List[List[float]]:
        """Embed ``texts`` in batches of ``batch_size`` and return all vectors in order.

        Raises:
            ValueError: if a text is blank or longer than ``max_chars``, or if
                the client returns a different number of vectors than the
                number of texts in a batch.
        """
        values = list(texts)
        self._validate(values)
        vectors: List[List[float]] = []
        for start in range(0, len(values), self.batch_size):
            batch = values[start : start + self.batch_size]
            batch_vectors = list(self.client.create_embeddings(self.model, batch, dimensions=self.dimensions))
            # Fail here, where the cause is visible, rather than letting texts
            # and vectors drift out of alignment downstream.
            if len(batch_vectors) != len(batch):
                raise ValueError(
                    f"embedding client returned {len(batch_vectors)} vectors for {len(batch)} inputs"
                )
            vectors.extend(batch_vectors)
        return vectors

    def _validate(self, texts: Sequence[str]) -> None:
        """Reject blank texts and texts longer than ``max_chars``."""
        for text in texts:
            if not text.strip():
                raise ValueError("embedding text cannot be empty")
            if len(text) > self.max_chars:
                raise ValueError(f"embedding text exceeds {self.max_chars} characters")
|
||||
@@ -0,0 +1,100 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, Iterable, List, Protocol, Sequence
|
||||
|
||||
from .models import IndexDocument
|
||||
from .redmine import RedmineMapper
|
||||
|
||||
|
||||
class RedmineSource(Protocol):
    """Structural interface for the object that supplies Redmine issues."""

    # Project currently targeted by the source; BackfillService temporarily
    # overwrites and later restores this attribute during per-project runs.
    project_identifier: str | None

    def recent_helpdesk_issues(self, limit: int) -> Iterable[Dict[str, Any]]:
        """Yield issue dictionaries, given ``limit``.

        NOTE(review): presumably at most ``limit`` issues, most recent first -
        confirm against the concrete source.
        """
        ...
|
||||
|
||||
|
||||
class DocumentEmbedder(Protocol):
    """Structural interface for turning index documents into vectors."""

    def embed_documents(self, docs: Sequence[IndexDocument]) -> List[List[float]]:
        """Return embedding vectors for ``docs``.

        The result is passed to ``RebuildStore.rebuild_source`` alongside the
        same documents, so it is presumably one vector per document in the
        same order - TODO confirm.
        """
        ...
|
||||
|
||||
|
||||
class RebuildStore(Protocol):
    """Structural interface for a store that can replace one source's documents."""

    def rebuild_source(
        self,
        source: str,
        docs: Sequence[IndexDocument],
        vectors: Sequence[Sequence[float]],
        project_identifier: str | None = None,
    ) -> None:
        """Rebuild the stored documents of ``source`` from ``docs`` and ``vectors``.

        ``project_identifier`` optionally narrows the rebuild to one project.
        NOTE(review): the exact replace semantics are defined by the
        implementation - confirm there.
        """
        ...
|
||||
|
||||
|
||||
class BackfillService:
    """Fetches Redmine issues, maps and embeds them, and rebuilds the store."""

    def __init__(self, source: RedmineSource, embedder: DocumentEmbedder, store: RebuildStore, mapper: RedmineMapper | None = None) -> None:
        """Wire the collaborators; a default ``RedmineMapper`` is created when none is given."""
        self.source = source
        self.embedder = embedder
        self.store = store
        self.mapper = mapper or RedmineMapper(redmine_url="")

    def backfill_redmine_sample(self, limit: int = 500) -> Dict[str, int | str]:
        """Rebuild the ``"redmine"`` source from up to ``limit`` recent issues.

        The project passed to the store comes from ``_project_identifier()``.
        Returns a summary with the issue and document counts.
        """
        issues, documents, vectors = self._collect(limit)
        self.store.rebuild_source("redmine", documents, vectors, project_identifier=self._project_identifier())
        return {"source": "redmine", "issues": len(issues), "documents": len(documents)}

    def backfill_redmine_projects(self, projects: Sequence[str], per_project_limit: int = 500) -> Dict[str, object]:
        """Backfill each project in ``projects`` with the same issue limit."""
        uniform_limits = {project: per_project_limit for project in projects}
        return self.backfill_redmine_project_limits(uniform_limits)

    def backfill_redmine_project_limits(self, project_limits: Dict[str, int]) -> Dict[str, object]:
        """Backfill several projects, each with its own issue limit.

        For every project the source and mapper are pointed at that project
        (when they expose ``project_identifier``); their previous values are
        restored afterwards even if a project fails.

        Returns:
            Totals across all projects plus a per-project result list.
        """
        saved_source_project = getattr(self.source, "project_identifier", None)
        saved_mapper_project = getattr(self.mapper, "project_identifier", None)
        per_project: List[Dict[str, int | str]] = []
        try:
            for project, project_limit in project_limits.items():
                for target in (self.source, self.mapper):
                    if hasattr(target, "project_identifier"):
                        target.project_identifier = project
                issues, documents, vectors = self._collect(project_limit)
                self.store.rebuild_source("redmine", documents, vectors, project_identifier=project)
                per_project.append(
                    {"project_identifier": project, "issues": len(issues), "documents": len(documents)}
                )
        finally:
            restore_plan = ((self.source, saved_source_project), (self.mapper, saved_mapper_project))
            for target, saved_value in restore_plan:
                if hasattr(target, "project_identifier"):
                    target.project_identifier = saved_value
        return {
            "source": "redmine",
            "projects": len(project_limits),
            "issues": sum(entry["issues"] for entry in per_project),
            "documents": sum(entry["documents"] for entry in per_project),
            "project_results": per_project,
        }

    def _collect(self, limit: int):
        """Fetch issues, map them to de-duplicated documents and embed those.

        Returns ``(issues, documents, vectors)``; the embedder is skipped when
        there are no documents.
        """
        issues = list(self.source.recent_helpdesk_issues(limit))
        mapped = [document for issue in issues for document in self.mapper.issue_to_documents(issue)]
        documents = deduplicate_documents(mapped)
        vectors = self.embedder.embed_documents(documents) if documents else []
        return issues, documents, vectors

    def _project_identifier(self) -> str | None:
        """Return the mapper's project when truthy, otherwise the source's."""
        from_mapper = getattr(self.mapper, "project_identifier", None)
        return from_mapper if from_mapper else getattr(self.source, "project_identifier", None)
|
||||
|
||||
|
||||
def deduplicate_documents(documents: Sequence[IndexDocument]) -> List[IndexDocument]:
    """Collapse documents sharing an ``id`` into one entry.

    The last document seen for an id wins, while the result keeps the
    position at which that id first appeared.
    """
    latest_by_id: Dict[str, IndexDocument] = {entry.id: entry for entry in documents}
    return [*latest_by_id.values()]
|
||||
@@ -0,0 +1,292 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from collections import Counter
|
||||
from typing import Any, Dict, Iterable, List, Optional
|
||||
|
||||
from .models import SearchQuery, SearchResult
|
||||
from .redmine import RedmineMapper
|
||||
|
||||
|
||||
def print_count(store: Any, source: Optional[str], project: Optional[str], doc_type: Optional[str]) -> None:
    """Print how many stored documents match the given filters."""
    filters = {"source": source, "project_identifier": project, "doc_type": doc_type}
    print(store.count_documents(**filters))
|
||||
|
||||
|
||||
def print_list(store: Any, limit: int, source: Optional[str], project: Optional[str], doc_type: Optional[str], full_text: bool) -> None:
    """Print up to ``limit`` stored documents matching the filters.

    Each document is rendered by ``print_document``; ``full_text`` selects
    the whole text instead of a snippet.
    """
    filters = {"source": source, "project_identifier": project, "doc_type": doc_type}
    for entry in store.list_documents(limit=limit, **filters):
        print_document(entry, full_text=full_text)
|
||||
|
||||
|
||||
def print_search(search_service: Any, query_text: str, limit: int, source: Optional[str], project: Optional[str], doc_type: Optional[str], full_text: bool) -> None:
    """Run a filtered search and print every hit.

    Snippets are requested from the service only when ``full_text`` is off.
    """
    filters = dict(source=source, project_identifier=project, doc_type=doc_type)
    query = SearchQuery(text=query_text, limit=limit, include_snippets=not full_text, **filters)
    hits = search_service.search(query)
    for hit in hits:
        print_result(hit, full_text=full_text)
|
||||
|
||||
|
||||
def print_show(search_service: Any, document_id: str) -> None:
    """Print one document in full, or a ``not found`` line when the lookup returns ``None``."""
    found = search_service.get_document(document_id)
    if found is not None:
        print_document(found, full_text=True)
    else:
        print(f"not found: {document_id}")
|
||||
|
||||
|
||||
def print_preview_redmine(source: Any, redmine_url: str, project: Optional[str], limit: int, full_text: bool) -> None:
    """Print the documents that would be produced from recent Redmine issues.

    Fetching, mapping and the temporary switch (and restore) of the source's
    ``project_identifier`` are delegated to ``preview_redmine_documents`` so
    that logic exists in exactly one place; this function only renders.

    Args:
        source: Object providing ``recent_helpdesk_issues``.
        redmine_url: Base URL handed to the mapper.
        project: Optional project to preview.
        limit: Issue limit passed to the source.
        full_text: Print whole texts instead of snippets.
    """
    for document in preview_redmine_documents(source, redmine_url, project, limit):
        print_document(document, full_text=full_text)
|
||||
|
||||
|
||||
def print_audit(store: Any, limit: int, source: Optional[str], project: Optional[str], doc_type: Optional[str], as_json: bool) -> None:
    """Audit stored documents and print the report.

    With ``as_json`` the report is printed as one key-sorted JSON object;
    otherwise as one human-readable line per figure or offending document id.
    """
    listed = store.list_documents(limit=limit, source=source, project_identifier=project, doc_type=doc_type)
    report = audit_documents(listed)
    if as_json:
        print(json.dumps(report, sort_keys=True))
        return

    total = report["total_documents"]
    lines = [f"documents={total}"]
    lines.extend(f"doc_type {name}={count}" for name, count in sorted(report["doc_type_counts"].items()))
    lines.extend(f"project {name}={count}" for name, count in sorted(report["project_counts"].items()))
    lines.append(f"contact_metadata {report['contact_metadata_count']}/{total}")
    lines.append(f"helpdesk_contact_metadata {report['helpdesk_contact_metadata_count']}/{report['helpdesk_documents']}")
    lines.append(f"attachments={report['attachment_documents']}")
    lines.extend(f"missing_contact {doc_id}" for doc_id in report["missing_helpdesk_contact_metadata"])
    lines.extend(f"unexpected_attachment {doc_id}" for doc_id in report["unexpected_attachment_documents"])
    for line in lines:
        print(line)
|
||||
|
||||
|
||||
def print_compare_redmine(store: Any, source: Any, redmine_url: str, project: Optional[str], limit: int, as_json: bool) -> None:
    """Compare freshly mapped Redmine documents with the indexed ones and print the diff.

    The store is asked for up to ``max(5000, limit * 100)`` indexed Redmine
    documents of the project. Output is one key-sorted JSON object when
    ``as_json`` is set, otherwise plain text lines.
    """
    fresh = preview_redmine_documents(source, redmine_url, project, limit)
    index_cap = max(5000, limit * 100)
    stored = store.list_documents(limit=index_cap, source="redmine", project_identifier=project)
    report = compare_documents(fresh, stored)
    if as_json:
        print(json.dumps(report, sort_keys=True))
        return

    print(f"preview_documents={report['preview_documents']}")
    print(f"indexed_documents={report['indexed_documents']}")
    for label in ("missing", "stale"):
        for doc_id in report[label]:
            print(f"{label} {doc_id}")
    for entry in report["contact_mismatches"]:
        print(f"contact_mismatch {entry['id']}")
|
||||
|
||||
|
||||
def print_smoke_search(
    search_service: Any,
    project: Optional[str],
    email: str,
    issue_id: Optional[int],
    order_token: Optional[str],
    natural_query: str,
    as_json: bool,
) -> None:
    """Run the smoke-search checks and print their outcome.

    With ``as_json`` a single key-sorted JSON report is printed; otherwise a
    PASS/FAIL line per check followed by one line per returned hit.
    """
    checks = smoke_search(search_service, project, email, issue_id, order_token, natural_query)
    if as_json:
        print(json.dumps({"project_identifier": project, "checks": checks}, sort_keys=True))
        return

    for check in checks:
        verdict = "PASS" if check["passed"] else "FAIL"
        print(f"{verdict} {check['kind']} {check['query']}")
        for hit in check["results"]:
            meta = hit["payload"]
            doc_type = meta.get("doc_type")
            issue = meta.get("issue_id")
            contact = contact_display(meta)
            url = hit["citation"].get("url")
            print(f" {hit['id']} score={hit['score']:.4f} doc_type={doc_type} issue={issue} contact={contact} url={url}")
|
||||
|
||||
|
||||
def audit_documents(documents: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Summarise metadata quality across ``documents``.

    Counts documents per ``doc_type`` and per ``project_identifier``
    (``"unknown"`` when missing), counts documents carrying both
    ``contact_id`` and ``contact_email``, tracks helpdesk documents
    (``has_helpdesk_ticket`` truthy), lists helpdesk documents of type
    issue/journal/message/contact that lack contact metadata, and lists all
    documents of type ``attachment``.
    """
    contact_bearing_types = {"issue", "journal", "message", "contact"}
    per_doc_type: Counter = Counter()
    per_project: Counter = Counter()
    with_contact = 0
    helpdesk_total = 0
    helpdesk_with_contact = 0
    lacking_contact: List[str] = []
    helpdesk_lacking_contact: List[str] = []
    attachment_ids: List[str] = []

    for entry in documents:
        meta = entry.get("payload") or {}
        entry_id = str(entry.get("id"))
        kind = str(meta.get("doc_type") or "")
        per_doc_type[str(meta.get("doc_type") or "unknown")] += 1
        per_project[str(meta.get("project_identifier") or "unknown")] += 1

        contact_known = bool(meta.get("contact_id") and meta.get("contact_email"))
        from_helpdesk = bool(meta.get("has_helpdesk_ticket"))

        if contact_known:
            with_contact += 1
        if from_helpdesk:
            helpdesk_total += 1
            if contact_known:
                helpdesk_with_contact += 1
            elif kind in contact_bearing_types:
                # Both report lists are filled under the same condition.
                lacking_contact.append(entry_id)
                helpdesk_lacking_contact.append(entry_id)
        if kind == "attachment":
            attachment_ids.append(entry_id)

    return {
        "total_documents": len(documents),
        "doc_type_counts": dict(per_doc_type),
        "project_counts": dict(per_project),
        "contact_metadata_count": with_contact,
        "helpdesk_documents": helpdesk_total,
        "helpdesk_contact_metadata_count": helpdesk_with_contact,
        "missing_contact_metadata": lacking_contact,
        "missing_helpdesk_contact_metadata": helpdesk_lacking_contact,
        "attachment_documents": len(attachment_ids),
        "unexpected_attachment_documents": attachment_ids,
    }
|
||||
|
||||
|
||||
def preview_redmine_documents(source: Any, redmine_url: str, project: Optional[str], limit: int) -> List[Dict[str, Any]]:
    """Map recent Redmine issues to plain document dictionaries without indexing them.

    When ``project`` is given and the source exposes ``project_identifier``,
    the source is pointed at that project for the duration of the call; the
    previous value is restored afterwards, even on error.

    Returns:
        One ``{"id", "text", "payload"}`` dictionary per mapped document.
    """
    remembered_project = getattr(source, "project_identifier", None)
    if project and hasattr(source, "project_identifier"):
        source.project_identifier = project
    try:
        mapper = RedmineMapper(redmine_url=redmine_url, project_identifier=project)
        mapped = []
        for issue in source.recent_helpdesk_issues(limit):
            mapped += mapper.issue_to_documents(issue)
        preview = []
        for item in mapped:
            preview.append({"id": item.id, "text": item.text, "payload": item.payload})
        return preview
    finally:
        if hasattr(source, "project_identifier"):
            source.project_identifier = remembered_project
|
||||
|
||||
|
||||
def compare_documents(preview_documents: List[Dict[str, Any]], indexed_documents: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Diff freshly mapped documents against what is currently indexed.

    A preview document is *missing* when its id is not indexed, *stale* when
    the ``source_hash`` payload values differ, and a *contact mismatch* when
    any contact payload field differs. Stale and contact mismatch can both
    apply to the same document.
    """
    contact_fields = ("contact_id", "contact_name", "contact_email", "contact_company")
    indexed_lookup: Dict[str, Dict[str, Any]] = {}
    for stored in indexed_documents:
        indexed_lookup[str(stored.get("id"))] = stored

    missing: List[str] = []
    stale: List[str] = []
    contact_mismatches: List[Dict[str, str]] = []
    for fresh in preview_documents:
        key = str(fresh.get("id"))
        if key not in indexed_lookup:
            missing.append(key)
            continue
        fresh_meta = fresh.get("payload") or {}
        stored_meta = indexed_lookup[key].get("payload") or {}
        if fresh_meta.get("source_hash") != stored_meta.get("source_hash"):
            stale.append(key)
        for name in contact_fields:
            if fresh_meta.get(name) != stored_meta.get(name):
                contact_mismatches.append({"id": key})
                break

    return {
        "preview_documents": len(preview_documents),
        "indexed_documents": len(indexed_documents),
        "missing": missing,
        "stale": stale,
        "contact_mismatches": contact_mismatches,
    }
|
||||
|
||||
|
||||
def smoke_search(
    search_service: Any,
    project: Optional[str],
    email: str,
    issue_id: Optional[int],
    order_token: Optional[str],
    natural_query: str,
) -> List[Dict[str, Any]]:
    """Run the smoke checks in fixed order: email, issue, order token, natural query.

    The email check always runs; the issue check runs when ``issue_id`` is
    not ``None``; the order and natural-language checks run when their text
    is truthy.
    """
    outcomes = [run_smoke_query(search_service, "email", email, project, expected_email=email)]
    if issue_id is not None:
        outcomes.append(run_smoke_query(search_service, "issue", str(issue_id), project, expected_issue_id=issue_id))
    for kind, text in (("order", order_token), ("natural", natural_query)):
        if text:
            outcomes.append(run_smoke_query(search_service, kind, text, project))
    return outcomes
|
||||
|
||||
|
||||
def run_smoke_query(
    search_service: Any,
    kind: str,
    text: str,
    project: Optional[str],
    expected_email: Optional[str] = None,
    expected_issue_id: Optional[int] = None,
) -> Dict[str, Any]:
    """Run one Redmine-scoped search (limit 5) and judge whether it passes.

    A check passes when at least one result comes back and, if an expected
    email or issue id was given, some result's payload carries that value.

    Returns:
        ``{"kind", "query", "passed", "results"}`` with results as dictionaries.
    """
    query = SearchQuery(text=text, source="redmine", project_identifier=project, issue_id=expected_issue_id, limit=5)
    hits = [hit.to_dict(include_snippet=True) for hit in search_service.search(query)]

    def some_payload_has(field_name: str, wanted: Any) -> bool:
        return any((hit["payload"] or {}).get(field_name) == wanted for hit in hits)

    passed = bool(hits)
    if expected_email:
        passed = passed and some_payload_has("contact_email", expected_email)
    if expected_issue_id is not None:
        passed = passed and some_payload_has("issue_id", expected_issue_id)
    return {"kind": kind, "query": text, "passed": passed, "results": hits}
|
||||
|
||||
|
||||
def print_result(result: SearchResult, full_text: bool) -> None:
    """Print one search hit: id with score, metadata line, URL, text or snippet, blank line."""
    header = f"{result.id} score={result.score:.4f}"
    print(header)
    print_metadata(result.payload)
    link = result.citation.get("url")
    print(f"url={link}")
    if full_text:
        print(result.text)
    else:
        print(snippet(result.text))
    print()
|
||||
|
||||
|
||||
def print_document(document: Dict[str, Any], full_text: bool) -> None:
    """Print one stored document: id, metadata line, optional URL, text or snippet, blank line.

    The URL line is emitted only when the payload has a truthy ``redmine_url``.
    """
    meta = document.get("payload") or {}
    print(document.get("id"))
    print_metadata(meta)
    link = meta.get("redmine_url")
    if link:
        print(f"url={link}")
    if full_text:
        print(document.get("text", ""))
    else:
        print(snippet(document.get("text", "")))
    print()
|
||||
|
||||
|
||||
def print_metadata(payload: Dict[str, Any]) -> None:
    """Print the payload's key metadata as one ``name=value`` line.

    Fields whose value is ``None`` are omitted; the order is fixed.
    """
    shown = {
        "source": payload.get("source"),
        "doc_type": payload.get("doc_type"),
        "issue": payload.get("issue_id"),
        "project": payload.get("project_identifier"),
        "contact": contact_display(payload),
        "created": payload.get("created_on"),
        "updated": payload.get("updated_on"),
    }
    parts = []
    for label, value in shown.items():
        if value is not None:
            parts.append(f"{label}={value}")
    print(" ".join(parts))
|
||||
|
||||
|
||||
def contact_display(payload: Dict[str, Any]) -> Optional[str]:
    """Render contact metadata as ``#id | name | email | company``.

    The id is included whenever it is not ``None`` (so ``0`` is shown); the
    other parts only when truthy. Returns ``None`` when nothing is available.
    """
    parts = []
    if payload.get("contact_id") is not None:
        parts.append(f"#{payload.get('contact_id')}")
    for key in ("contact_name", "contact_email", "contact_company"):
        if payload.get(key):
            parts.append(str(payload[key]))
    if not parts:
        return None
    return " | ".join(parts)
|
||||
|
||||
|
||||
def snippet(text: str, max_chars: int = 240) -> str:
    """Collapse all whitespace runs to single spaces and shorten to ``max_chars``.

    Over-long text is cut to ``max_chars - 3`` characters, right-trimmed and
    suffixed with ``...``.
    """
    collapsed = " ".join(text.split())
    if len(collapsed) > max_chars:
        kept = collapsed[: max_chars - 3].rstrip()
        return kept + "..."
    return collapsed
|
||||
@@ -0,0 +1,80 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from .models import SearchQuery, search_response
|
||||
|
||||
|
||||
class SemanticMCP:
    """Exposes search, document lookup, project listing, backfill and refresh as named tools."""

    def __init__(self, search_service: Any, backfill_service: Optional[Any], store: Optional[Any] = None, refresh_service: Optional[Any] = None) -> None:
        """Keep references to the services; optional ones may be ``None``."""
        self.search_service = search_service
        self.backfill_service = backfill_service
        self.store = store
        self.refresh_service = refresh_service

    def tools(self) -> Dict[str, Dict[str, str]]:
        """Return the tool catalogue as ``{tool_name: {"description": ...}}``."""
        catalogue = (
            ("semantic_search", "Search the semantic index and return cited snippets."),
            ("semantic_get_document", "Fetch one indexed document by stable id."),
            ("semantic_list_projects", "List indexed project identifiers and document counts."),
            ("semantic_backfill_redmine_sample", "Rebuild the Redmine sample collection."),
            ("semantic_refresh_redmine", "Refresh recent Redmine issues without re-embedding unchanged documents."),
        )
        return {tool_name: {"description": summary} for tool_name, summary in catalogue}

    def call_tool(self, name: str, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Execute the tool called ``name`` with ``arguments``.

        Tools whose backing service is missing answer with an ``{"error": ...}``
        dictionary instead of raising.

        Raises:
            ValueError: if ``name`` is not a known tool.
        """
        if name == "semantic_search":
            return self._tool_search(arguments)
        if name == "semantic_get_document":
            return self._tool_get_document(arguments)
        if name == "semantic_list_projects":
            return self._tool_list_projects(arguments)
        if name == "semantic_backfill_redmine_sample":
            return self._tool_backfill(arguments)
        if name == "semantic_refresh_redmine":
            return self._tool_refresh(arguments)
        raise ValueError(f"unknown tool: {name}")

    def _tool_search(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Build a ``SearchQuery`` from the arguments and return the search response."""
        passthrough = (
            "source",
            "project_id",
            "project_identifier",
            "doc_type",
            "issue_id",
            "contact_id",
            "contact_email",
            "date_from",
            "date_to",
        )
        filters = {key: arguments.get(key) for key in passthrough}
        query = SearchQuery(
            text=arguments.get("query") or arguments.get("text") or "",
            limit=int(arguments.get("limit", 10)),
            include_snippets=bool(arguments.get("include_snippets", True)),
            **filters,
        )
        return search_response(query, self.search_service.search(query))

    def _tool_get_document(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Return the document for ``arguments["id"]`` or a not-found error payload."""
        found = self.search_service.get_document(arguments["id"])
        return found or {"error": "not_found", "id": arguments["id"]}

    def _tool_list_projects(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """List projects of the requested source (``"redmine"`` when not given)."""
        if self.store is None:
            return {"error": "project_listing_unavailable"}
        return {"projects": self.store.list_projects(source=arguments.get("source", "redmine"))}

    def _tool_backfill(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Run the Redmine sample backfill with the requested limit (default 500)."""
        if self.backfill_service is None:
            return {"error": "backfill_unavailable"}
        return self.backfill_service.backfill_redmine_sample(limit=int(arguments.get("limit", 500)))

    def _tool_refresh(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Refresh one or more Redmine projects.

        Uses ``project_limits`` when supplied; otherwise a single
        ``project_identifier`` with ``limit`` (default 500) is required.
        """
        if self.refresh_service is None:
            return {"error": "refresh_unavailable"}
        requested_limits = arguments.get("project_limits")
        if not requested_limits:
            single_project = arguments.get("project_identifier")
            if not single_project:
                return {"error": "project_required"}
            requested_limits = {single_project: int(arguments.get("limit", 500))}
        normalised = {str(project): int(limit) for project, limit in requested_limits.items()}
        return self.refresh_service.refresh_redmine_project_limits(
            normalised,
            dry_run=bool(arguments.get("dry_run", False)),
            force_rebuild=bool(arguments.get("force_rebuild", False)),
            overlap_minutes=int(arguments.get("overlap_minutes", 15)),
        )
|
||||
|
||||
|
||||
def serve_stdio(mcp: SemanticMCP) -> None:
    """Serve tool calls over stdin/stdout, one JSON object per line.

    Each request line looks like ``{"id": ..., "name": ..., "arguments": {...}}``
    and is answered with ``{"id": ..., "result": ...}`` or
    ``{"id": ..., "error": "<message>"}`` on its own output line.

    Blank lines are skipped. A line that is not valid JSON, or not a JSON
    object, is answered with an error response (``id`` is ``None``) instead
    of terminating the loop, so one bad request cannot take the server down.

    Args:
        mcp: Tool dispatcher whose ``call_tool`` executes each request.
    """
    for line in sys.stdin:
        if not line.strip():
            continue
        request_id = None
        try:
            # Parsing happens inside the try so malformed input is reported
            # back to the caller rather than crashing the loop.
            request = json.loads(line)
            if not isinstance(request, dict):
                raise ValueError("request must be a JSON object")
            request_id = request.get("id")
            result = mcp.call_tool(request["name"], request.get("arguments") or {})
            response = {"id": request_id, "result": result}
        except Exception as exc:
            response = {"id": request_id, "error": str(exc)}
        print(json.dumps(response), flush=True)
|
||||
@@ -0,0 +1,100 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
|
||||
Payload = Dict[str, Any]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class IndexDocument:
    """An immutable unit of indexable content: stable id, text, and metadata payload."""

    id: str
    text: str
    payload: Payload = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Reject documents whose id or text is empty or whitespace-only."""
        id_is_blank = self.id.strip() == ""
        if id_is_blank:
            raise ValueError("document id is required")
        text_is_blank = self.text.strip() == ""
        if text_is_blank:
            raise ValueError("document text is required")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SearchQuery:
    """An immutable search request: query text, optional filters, and result options."""

    text: str
    source: Optional[str] = None
    project_id: Optional[int] = None
    project_identifier: Optional[str] = None
    doc_type: Optional[str] = None
    issue_id: Optional[int] = None
    contact_id: Optional[int] = None
    contact_email: Optional[str] = None
    date_from: Optional[str] = None
    date_to: Optional[str] = None
    limit: int = 10
    include_snippets: bool = True

    def __post_init__(self) -> None:
        """Require non-blank text and a limit between 1 and 100 inclusive."""
        if self.text.strip() == "":
            raise ValueError("search text is required")
        too_small = self.limit < 1
        if too_small or self.limit > 100:
            raise ValueError("limit must be between 1 and 100")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SearchResult:
    """One search hit: document id, similarity score, full text, and payload."""

    id: str
    score: float
    text: str
    payload: Payload

    @property
    def snippet(self) -> str:
        """The first 500 characters of the text."""
        return self.text[:500]

    @property
    def citation(self) -> Payload:
        """Citation fields taken from the payload; absent fields are ``None``."""
        # Maps citation key -> payload key; only the last two are renamed.
        payload_key_for = {
            "source": "source",
            "doc_type": "doc_type",
            "issue_id": "issue_id",
            "project_identifier": "project_identifier",
            "contact_id": "contact_id",
            "contact_name": "contact_name",
            "contact_email": "contact_email",
            "url": "redmine_url",
            "record_id": "source_record_id",
        }
        cited: Payload = {"id": self.id}
        for citation_key, payload_key in payload_key_for.items():
            cited[citation_key] = self.payload.get(payload_key)
        return cited

    def to_dict(self, include_snippet: bool = True) -> Payload:
        """Serialise the hit as id, score, payload and citation, plus the snippet on request."""
        serialised: Payload = dict(id=self.id, score=self.score, payload=self.payload, citation=self.citation)
        if include_snippet:
            serialised["snippet"] = self.snippet
        return serialised
|
||||
|
||||
|
||||
def search_response(query: SearchQuery, results: list[SearchResult]) -> Payload:
    """Assemble the response for a search: query text, active filters, serialised results.

    Only filters (including ``limit``) whose value is not ``None`` are
    reported. Results carry a snippet when ``query.include_snippets`` is set.
    """
    filter_names = (
        "source",
        "project_id",
        "project_identifier",
        "doc_type",
        "issue_id",
        "contact_id",
        "contact_email",
        "date_from",
        "date_to",
        "limit",
    )
    active_filters = {}
    for name in filter_names:
        value = getattr(query, name)
        if value is not None:
            active_filters[name] = value
    serialised = []
    for hit in results:
        serialised.append(hit.to_dict(include_snippet=query.include_snippets))
    return {"query": query.text, "filters": active_filters, "results": serialised}
|
||||
@@ -0,0 +1,219 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Optional, Sequence
|
||||
from collections import Counter
|
||||
|
||||
from .models import IndexDocument, SearchQuery, SearchResult
|
||||
|
||||
|
||||
def point_id_for_document(document_id: str) -> str:
    """Derive the point id for a document: its UUIDv5 in the URL namespace, as a string.

    The same ``document_id`` always yields the same point id.
    """
    derived = uuid.uuid5(uuid.NAMESPACE_URL, document_id)
    return str(derived)
|
||||
|
||||
|
||||
def build_filter(query: SearchQuery) -> Dict[str, List[Dict[str, Any]]]:
    """Translate a query's filters into a ``{"must": [...]}`` condition list.

    Every equality field that is not ``None`` becomes a ``match`` condition.
    A truthy ``date_from`` / ``date_to`` adds one ``range`` condition on
    ``created_on`` with ``gte`` / ``lte`` bounds.
    """
    equality_names = (
        "source",
        "project_id",
        "project_identifier",
        "doc_type",
        "issue_id",
        "contact_id",
        "contact_email",
    )
    conditions: List[Dict[str, Any]] = [
        {"key": name, "match": {"value": getattr(query, name)}}
        for name in equality_names
        if getattr(query, name) is not None
    ]
    created_bounds: Dict[str, str] = {}
    if query.date_from:
        created_bounds["gte"] = query.date_from
    if query.date_to:
        created_bounds["lte"] = query.date_to
    if created_bounds:
        conditions.append({"key": "created_on", "range": created_bounds})
    return {"must": conditions}
|
||||
|
||||
|
||||
class QdrantStore:
|
||||
def __init__(self, url: str, api_key: Optional[str], collection: str, vector_size: int = 1536, upsert_batch_size: int = 64) -> None:
    """Create the Qdrant client and remember the collection settings.

    Args:
        url: Qdrant endpoint passed to ``QdrantClient``.
        api_key: Optional API key passed to ``QdrantClient``.
        collection: Name of the collection all operations target.
        vector_size: Vector size used when the collection has to be created.
        upsert_batch_size: Number of points sent per upsert call.

    Raises:
        RuntimeError: if the ``qdrant_client`` package cannot be imported.
    """
    # Imported here rather than at module level, so a missing package only
    # fails when a store is actually constructed.
    try:
        from qdrant_client import QdrantClient
        from qdrant_client.http import models as qmodels
    except ImportError as exc:
        raise RuntimeError("Install qdrant-client to use live Qdrant storage") from exc
    self.client = QdrantClient(url=url, api_key=api_key)
    self.collection = collection
    self.vector_size = vector_size
    self.upsert_batch_size = upsert_batch_size
    # Keep the models module on the instance; other methods build request
    # objects through self.qmodels.
    self.qmodels = qmodels
|
||||
|
||||
def ensure_collection(self) -> None:
    """Create the target collection (cosine distance, ``vector_size``) unless it already exists."""
    for existing in self.client.get_collections().collections:
        if existing.name == self.collection:
            return
    vector_params = self.qmodels.VectorParams(size=self.vector_size, distance=self.qmodels.Distance.COSINE)
    self.client.create_collection(collection_name=self.collection, vectors_config=vector_params)
|
||||
|
||||
def upsert(self, documents: Sequence[IndexDocument], vectors: Sequence[Sequence[float]]) -> None:
    """Write documents and their vectors to the collection in batches.

    Each point's payload is the document payload plus ``document_id`` and
    ``text``; the point id is derived from the document id.

    Raises:
        ValueError: if ``documents`` and ``vectors`` differ in length.
    """
    if len(documents) != len(vectors):
        raise ValueError("documents and vectors length mismatch")
    self.ensure_collection()

    points = []
    for document, vector in zip(documents, vectors):
        point_payload = {**document.payload, "document_id": document.id, "text": document.text}
        points.append(
            self.qmodels.PointStruct(
                id=point_id_for_document(document.id),
                vector=list(vector),
                payload=point_payload,
            )
        )

    step = self.upsert_batch_size
    for offset in range(0, len(points), step):
        chunk = points[offset : offset + step]
        if not chunk:
            continue
        self.client.upsert(collection_name=self.collection, points=chunk)
|
||||
|
||||
def delete_by_source(self, source: str, project_identifier: Optional[str] = None) -> None:
|
||||
self.ensure_collection()
|
||||
query = SearchQuery(text="*", source=source, project_identifier=project_identifier)
|
||||
self.client.delete(
|
||||
collection_name=self.collection,
|
||||
points_selector=self.qmodels.FilterSelector(
|
||||
filter=self._to_qdrant_filter(build_filter(query))
|
||||
),
|
||||
)
|
||||
|
||||
def delete_documents(self, document_ids: Sequence[str]) -> None:
|
||||
self.ensure_collection()
|
||||
if not document_ids:
|
||||
return
|
||||
self.client.delete(
|
||||
collection_name=self.collection,
|
||||
points_selector=self.qmodels.PointIdsList(
|
||||
points=[point_id_for_document(document_id) for document_id in document_ids]
|
||||
),
|
||||
)
|
||||
|
||||
def rebuild_source(
|
||||
self,
|
||||
source: str,
|
||||
documents: Sequence[IndexDocument],
|
||||
vectors: Sequence[Sequence[float]],
|
||||
project_identifier: Optional[str] = None,
|
||||
) -> None:
|
||||
self.delete_by_source(source, project_identifier=project_identifier)
|
||||
self.upsert(documents, vectors)
|
||||
|
||||
def search(self, vector: Sequence[float], query: SearchQuery, limit: int) -> List[SearchResult]:
|
||||
self.ensure_collection()
|
||||
qfilter = self._to_qdrant_filter(build_filter(query))
|
||||
if hasattr(self.client, "query_points"):
|
||||
response = self.client.query_points(
|
||||
collection_name=self.collection,
|
||||
query=list(vector),
|
||||
query_filter=qfilter,
|
||||
limit=limit,
|
||||
with_payload=True,
|
||||
)
|
||||
results = response.points
|
||||
else:
|
||||
results = self.client.search(
|
||||
collection_name=self.collection,
|
||||
query_vector=list(vector),
|
||||
query_filter=qfilter,
|
||||
limit=limit,
|
||||
with_payload=True,
|
||||
)
|
||||
return [self._point_to_result(point) for point in results]
|
||||
|
||||
def get_document(self, document_id: str) -> Optional[Dict[str, Any]]:
|
||||
self.ensure_collection()
|
||||
points = self.client.retrieve(collection_name=self.collection, ids=[point_id_for_document(document_id)], with_payload=True)
|
||||
if not points:
|
||||
return None
|
||||
payload = dict(points[0].payload or {})
|
||||
text = payload.pop("text", "")
|
||||
payload.pop("document_id", None)
|
||||
return {"id": document_id, "text": text, "payload": payload}
|
||||
|
||||
def count_documents(
|
||||
self,
|
||||
source: Optional[str] = None,
|
||||
project_identifier: Optional[str] = None,
|
||||
doc_type: Optional[str] = None,
|
||||
) -> int:
|
||||
self.ensure_collection()
|
||||
query = SearchQuery(text="*", source=source, project_identifier=project_identifier, doc_type=doc_type)
|
||||
result = self.client.count(
|
||||
collection_name=self.collection,
|
||||
count_filter=self._to_qdrant_filter(build_filter(query)),
|
||||
exact=True,
|
||||
)
|
||||
return int(result.count)
|
||||
|
||||
def list_documents(
|
||||
self,
|
||||
limit: int = 10,
|
||||
source: Optional[str] = None,
|
||||
project_identifier: Optional[str] = None,
|
||||
doc_type: Optional[str] = None,
|
||||
issue_id: Optional[int] = None,
|
||||
) -> List[Dict[str, Any]]:
|
||||
self.ensure_collection()
|
||||
query = SearchQuery(text="*", source=source, project_identifier=project_identifier, doc_type=doc_type, issue_id=issue_id)
|
||||
qfilter = self._to_qdrant_filter(build_filter(query))
|
||||
records = []
|
||||
offset = None
|
||||
while len(records) < limit:
|
||||
batch_limit = limit - len(records)
|
||||
batch, offset = self.client.scroll(
|
||||
collection_name=self.collection,
|
||||
scroll_filter=qfilter,
|
||||
limit=batch_limit,
|
||||
with_payload=True,
|
||||
with_vectors=False,
|
||||
offset=offset,
|
||||
)
|
||||
records.extend(batch[:batch_limit])
|
||||
if not offset or not batch:
|
||||
break
|
||||
return [self._record_to_document(record) for record in records]
|
||||
|
||||
def list_projects(self, source: Optional[str] = None, limit: int = 5000) -> List[Dict[str, Any]]:
|
||||
documents = self.list_documents(limit=limit, source=source)
|
||||
counts = Counter(
|
||||
str((document.get("payload") or {}).get("project_identifier"))
|
||||
for document in documents
|
||||
if (document.get("payload") or {}).get("project_identifier")
|
||||
)
|
||||
return [
|
||||
{"project_identifier": project, "document_count": count}
|
||||
for project, count in sorted(counts.items())
|
||||
]
|
||||
|
||||
def _to_qdrant_filter(self, raw_filter: Dict[str, List[Dict[str, Any]]]) -> Any:
|
||||
conditions = []
|
||||
for condition in raw_filter.get("must", []):
|
||||
if "match" in condition:
|
||||
conditions.append(
|
||||
self.qmodels.FieldCondition(
|
||||
key=condition["key"],
|
||||
match=self.qmodels.MatchValue(value=condition["match"]["value"]),
|
||||
)
|
||||
)
|
||||
elif "range" in condition:
|
||||
conditions.append(self.qmodels.FieldCondition(key=condition["key"], range=self.qmodels.DatetimeRange(**condition["range"])))
|
||||
return self.qmodels.Filter(must=conditions) if conditions else None
|
||||
|
||||
def _point_to_result(self, point: Any) -> SearchResult:
|
||||
payload = dict(point.payload or {})
|
||||
text = payload.pop("text", "")
|
||||
document_id = payload.pop("document_id", str(point.id))
|
||||
return SearchResult(id=document_id, score=float(point.score), text=text, payload=payload)
|
||||
|
||||
def _record_to_document(self, record: Any) -> Dict[str, Any]:
|
||||
payload = dict(record.payload or {})
|
||||
text = payload.pop("text", "")
|
||||
document_id = payload.pop("document_id", str(record.id))
|
||||
return {"id": document_id, "text": text, "payload": payload}
|
||||
@@ -0,0 +1,243 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from typing import Any, Dict, Iterable, List, Optional
|
||||
|
||||
from .chunking import chunk_text
|
||||
from .models import IndexDocument, Payload
|
||||
|
||||
|
||||
Issue = Dict[str, Any]
|
||||
|
||||
|
||||
class RedmineMapper:
    """Turn Redmine issue dicts into chunked ``IndexDocument`` records.

    One issue yields documents for the issue itself, each journal note, each
    message and the associated contact. Every record is split with
    ``chunk_text`` and each chunk carries a shared metadata payload.
    """

    def __init__(self, redmine_url: str, chunk_chars: int = 3500, project_identifier: Optional[str] = None) -> None:
        """Store mapping settings.

        Args:
            redmine_url: Base URL used to build issue links; a trailing slash is removed.
            chunk_chars: ``max_chars`` passed to ``chunk_text``.
            project_identifier: Fallback used when an issue's project has no identifier.
        """
        self.redmine_url = redmine_url.rstrip("/")
        self.chunk_chars = chunk_chars
        self.project_identifier = project_identifier

    def issue_to_documents(self, issue: Issue) -> List[IndexDocument]:
        """Return all documents for one issue: issue, journals, messages, contact."""
        docs: List[IndexDocument] = []
        docs.extend(self._issue_documents(issue))
        docs.extend(self._journal_documents(issue))
        docs.extend(self._message_documents(issue))
        docs.extend(self._contact_documents(issue))
        return docs

    def _issue_documents(self, issue: Issue) -> List[IndexDocument]:
        """Build documents from the issue subject, description and contact text."""
        issue_id = int(issue["id"])
        subject = issue.get("subject") or ""
        description = issue.get("description") or ""
        contact = self._issue_contact(issue)
        contact_text = self._contact_text(contact)
        text = f"Issue #{issue_id}: {subject}\n\n{description}\n\n{contact_text}".strip()
        return self._documents_for_record(
            base_id=f"redmine:issue:{issue_id}",
            text=text,
            issue=issue,
            doc_type="issue",
            source_record_id=f"issue:{issue_id}",
            record=issue,
        )

    def _journal_documents(self, issue: Issue) -> List[IndexDocument]:
        """Build documents from journal entries that have non-blank notes."""
        docs: List[IndexDocument] = []
        issue_id = int(issue["id"])
        for journal in issue.get("journals") or []:
            notes = journal.get("notes") or ""
            if not notes.strip():
                continue
            docs.extend(
                self._documents_for_record(
                    base_id=f"redmine:issue:{issue_id}:journal:{journal['id']}",
                    text=notes,
                    issue=issue,
                    doc_type="journal",
                    source_record_id=f"journal:{journal['id']}",
                    record=journal,
                    extra={
                        "journal_id": journal.get("id"),
                        # Overrides the "public" default from the base payload.
                        "visibility": "private" if journal.get("private_notes") else "public",
                        "created_on": journal.get("created_on") or issue.get("updated_on"),
                    },
                )
            )
        return docs

    def _message_documents(self, issue: Issue) -> List[IndexDocument]:
        """Build documents from ``messages`` (or ``journal_messages``) with a body.

        The body is read from the first non-empty of ``body``, ``content`` and
        ``message``.
        """
        docs: List[IndexDocument] = []
        issue_id = int(issue["id"])
        for message in issue.get("messages") or issue.get("journal_messages") or []:
            body = message.get("body") or message.get("content") or message.get("message") or ""
            if not body.strip():
                continue
            docs.extend(
                self._documents_for_record(
                    base_id=f"redmine:issue:{issue_id}:message:{message['id']}",
                    text=body,
                    issue=issue,
                    doc_type="message",
                    source_record_id=f"message:{message['id']}",
                    record=message,
                    extra={
                        "message_id": message.get("id"),
                        "direction": message.get("direction"),
                        "created_on": message.get("created_on") or issue.get("updated_on"),
                    },
                )
            )
        return docs

    def _contact_documents(self, issue: Issue) -> List[IndexDocument]:
        """Build documents for the issue's contact; empty without an id or text."""
        contact = self._issue_contact(issue)
        contact_id = contact.get("id")
        if not contact_id:
            return []
        text = self._contact_text(contact)
        if not text.strip():
            return []
        return self._documents_for_record(
            base_id=f"redmine:contact:{contact_id}:issue:{issue['id']}",
            text=text,
            issue=issue,
            doc_type="contact",
            source_record_id=f"contact:{contact_id}",
            record=contact,
        )

    def _documents_for_record(
        self,
        base_id: str,
        text: str,
        issue: Issue,
        doc_type: str,
        source_record_id: str,
        record: Dict[str, Any],
        extra: Optional[Payload] = None,
    ) -> List[IndexDocument]:
        """Chunk ``text`` and wrap each chunk in an ``IndexDocument``.

        Document ids are ``{base_id}:chunk:{index}``. ``extra`` entries that
        are not ``None`` override the base payload.
        """
        chunks = chunk_text(text, max_chars=self.chunk_chars)
        payload = self._base_payload(issue, doc_type, source_record_id, record)
        if extra:
            payload.update({key: value for key, value in extra.items() if value is not None})
        return [
            IndexDocument(id=f"{base_id}:chunk:{index}", text=chunk, payload={**payload, "chunk_index": index})
            for index, chunk in enumerate(chunks)
        ]

    def _base_payload(self, issue: Issue, doc_type: str, source_record_id: str, record: Dict[str, Any]) -> Payload:
        """Build the metadata shared by every chunk of one record.

        Timestamps come from ``record`` first and fall back to the issue.
        ``source_hash`` is ``stable_hash(record)``.
        """
        project = issue.get("project") or {}
        helpdesk_ticket = issue.get("helpdesk_ticket") or {}
        contact = self._issue_contact(issue)
        issue_id = int(issue["id"])
        redmine_url = issue.get("url") or f"{self.redmine_url}/issues/{issue_id}"
        created_on = record.get("created_on") or issue.get("created_on")
        updated_on = record.get("updated_on") or issue.get("updated_on")
        return {
            "source": "redmine",
            "doc_type": doc_type,
            "issue_id": issue_id,
            "project_id": project.get("id"),
            "project_identifier": project.get("identifier") or self.project_identifier,
            "project_name": project.get("name"),
            "has_helpdesk_ticket": bool(helpdesk_ticket.get("id")),
            "helpdesk_ticket_id": helpdesk_ticket.get("id"),
            "contact_id": contact.get("id"),
            "contact_email": contact.get("email"),
            "contact_name": contact.get("name"),
            "contact_company": contact.get("company"),
            "created_on": created_on,
            "updated_on": updated_on,
            "visibility": "public",
            "redmine_url": redmine_url,
            "source_record_id": source_record_id,
            "source_hash": stable_hash(record),
        }

    def _issue_contact(self, issue: Issue) -> Payload:
        """Merge the contact details available on an issue.

        Precedence per field: the issue's ``contact`` (or ``customer``), then
        ``helpdesk_ticket.contact``, then the flat ``contact_*`` /
        ``from_address`` fields of the helpdesk ticket. Fields that end up
        ``None`` or empty are omitted from the result.
        """
        def present(mapping: Dict[str, Any]) -> Dict[str, Any]:
            # Drop empty fields so they cannot mask a populated value from a
            # lower-precedence source during the merge below.
            return {key: value for key, value in mapping.items() if value not in (None, "")}

        contact = issue.get("contact") or issue.get("customer") or {}
        helpdesk_ticket = issue.get("helpdesk_ticket") or {}
        helpdesk_contact = helpdesk_ticket.get("contact") or {}
        merged = {**present(helpdesk_contact), **present(contact)}
        if not merged.get("id"):
            merged["id"] = helpdesk_ticket.get("contact_id")
        if not merged.get("email"):
            merged["email"] = helpdesk_ticket.get("contact_email") or helpdesk_ticket.get("from_address")
        if not merged.get("name"):
            merged["name"] = helpdesk_ticket.get("contact_name")
        if not merged.get("company"):
            merged["company"] = helpdesk_ticket.get("contact_company")
        return present(merged)

    def _contact_text(self, contact: Payload) -> str:
        """Join name, email, phone and company (those present) with newlines."""
        text_parts = [
            contact.get("name"),
            contact.get("email"),
            contact.get("phone"),
            contact.get("company"),
        ]
        return "\n".join(str(part) for part in text_parts if part)
|
||||
|
||||
|
||||
class RedmineApiSource:
    """Read issues from a Redmine server over its JSON REST API."""

    def __init__(self, redmine_url: str, api_key: str, project_identifier: Optional[str] = None) -> None:
        """Remember the base URL (without trailing slash), API key and project."""
        self.project_identifier = project_identifier
        self.api_key = api_key
        self.redmine_url = redmine_url.rstrip("/")

    def recent_helpdesk_issues(self, limit: int) -> Iterable[Issue]:
        """Yield full issue details for the ``limit`` most recently updated issues."""
        for summary in self.recent_issue_summaries(limit):
            yield self.issue_detail(int(summary["id"]), fallback=summary)

    def recent_issue_summaries(self, limit: int) -> Iterable[Issue]:
        """Yield up to ``limit`` distinct issue summaries, newest update first.

        Pages through ``/issues.json`` 100 rows at a time. When a project is
        configured the listing is restricted to it and excludes subprojects.
        Issue ids already yielded are skipped, and each summary gets a ``url``
        key if it has none.
        """
        max_page = 100
        emitted = 0
        cursor = 0
        already_seen = set()
        while emitted < limit:
            query = {
                "limit": str(min(max_page, limit - emitted)),
                "offset": str(cursor),
                "sort": "updated_on:desc,id:desc",
                "include": "journals",
                "status_id": "*",
            }
            if self.project_identifier:
                query.update(project_id=self.project_identifier, subproject_id="!*")
            listing = self._get_json(f"{self.redmine_url}/issues.json?{urllib.parse.urlencode(query)}")
            page = listing.get("issues", [])
            if not page:
                return
            for summary in page:
                key = summary["id"]
                if key in already_seen:
                    continue
                already_seen.add(key)
                summary.setdefault("url", f"{self.redmine_url}/issues/{key}")
                yield summary
                emitted += 1
                if emitted >= limit:
                    break
            # Advance by the raw page length, duplicates included.
            cursor += len(page)

    def issue_detail(self, issue_id: int, fallback: Optional[Issue] = None) -> Issue:
        """Fetch one issue with journals and helpdesk data.

        Keys from the fetched issue override those in ``fallback``; a ``url``
        key is added when neither provides one.
        """
        query_string = urllib.parse.urlencode({"include": "journals,helpdesk"})
        response = self._get_json(f"{self.redmine_url}/issues/{issue_id}.json?{query_string}")
        combined = dict(fallback or {})
        combined.update(response.get("issue", {}))
        combined.setdefault("url", f"{self.redmine_url}/issues/{issue_id}")
        return combined

    def _get_json(self, url: str) -> Dict[str, Any]:
        """GET ``url`` with the API key header (30 s timeout) and decode the JSON body."""
        headers = {"X-Redmine-API-Key": self.api_key, "Accept": "application/json"}
        request = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(request, timeout=30) as response:
            body = response.read()
        return json.loads(body.decode("utf-8"))
|
||||
|
||||
|
||||
def stable_hash(record: Dict[str, Any]) -> str:
    """Return the SHA-256 hex digest of ``record`` serialized as canonical JSON.

    Keys are sorted and separators are compact, so key order does not affect
    the digest. Values JSON cannot encode are converted with ``str``.
    """
    encoded = json.dumps(record, default=str, separators=(",", ":"), sort_keys=True).encode("utf-8")
    digest = hashlib.sha256()
    digest.update(encoded)
    return digest.hexdigest()
|
||||
@@ -0,0 +1,225 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, Optional, Protocol, Sequence
|
||||
|
||||
from .ingest import deduplicate_documents
|
||||
from .models import IndexDocument
|
||||
from .redmine import RedmineMapper
|
||||
|
||||
|
||||
class RedmineRefreshSource(Protocol):
    """Structural type for the issue source handed to ``RedmineRefreshService``."""

    # Project currently being read; the refresh service reassigns it per project.
    project_identifier: str | None

    def recent_helpdesk_issues(self, limit: int) -> Iterable[Dict[str, Any]]:
        """Return up to ``limit`` issue dicts."""
|
||||
|
||||
|
||||
class RefreshEmbedder(Protocol):
    """Structural type for the embedder handed to ``RedmineRefreshService``."""

    def embed_documents(self, docs: Sequence[IndexDocument]) -> List[List[float]]:
        """Return one vector per document in ``docs``."""
|
||||
|
||||
|
||||
class RefreshStore(Protocol):
    """Structural type for the document store handed to ``RedmineRefreshService``."""

    def list_documents(
        self,
        limit: int = 10,
        source: Optional[str] = None,
        project_identifier: Optional[str] = None,
        doc_type: Optional[str] = None,
        issue_id: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """Return stored documents matching the given filters."""

    def upsert(self, docs: Sequence[IndexDocument], vectors: Sequence[Sequence[float]]) -> None:
        """Store ``docs`` together with their ``vectors``."""

    def delete_documents(self, document_ids: Sequence[str]) -> None:
        """Remove the documents with the given ids."""
|
||||
|
||||
|
||||
class FileRefreshState:
    """JSON file that records the last successful refresh time per project.

    File layout: ``{"projects": {<project>: {"last_successful_refresh_at": <iso>}}}``.
    """

    def __init__(self, path: Path) -> None:
        """Remember the location of the state file; nothing is read or written yet."""
        self.path = path

    def load(self) -> Dict[str, Any]:
        """Return the stored state, or ``{}`` when the file is missing or blank.

        Raises:
            json.JSONDecodeError: If the file has content that is not valid JSON.
        """
        if not self.path.exists():
            return {}
        raw = self.path.read_text(encoding="utf-8")
        # A blank file (e.g. one pre-created empty) means "no state yet".
        if not raw.strip():
            return {}
        return json.loads(raw)

    def mark_success(self, project_identifier: str, timestamp: Optional[str] = None) -> None:
        """Record a successful refresh for ``project_identifier``.

        Args:
            project_identifier: Project whose entry is replaced.
            timestamp: ISO timestamp to store; defaults to the current UTC time.

        The file is written to a sibling ``.tmp`` file and then renamed over
        the target, so an interrupted write cannot leave truncated JSON behind.
        """
        payload = self.load()
        projects = payload.get("projects")
        if not isinstance(projects, dict):
            # Missing or malformed "projects" entry: start a fresh mapping.
            projects = {}
            payload["projects"] = projects
        projects[project_identifier] = {
            "last_successful_refresh_at": timestamp or datetime.now(timezone.utc).isoformat()
        }
        self.path.parent.mkdir(parents=True, exist_ok=True)
        serialized = json.dumps(payload, indent=2, sort_keys=True) + "\n"
        scratch = self.path.with_name(self.path.name + ".tmp")
        scratch.write_text(serialized, encoding="utf-8")
        scratch.replace(self.path)
|
||||
|
||||
|
||||
class RedmineRefreshService:
    """Incrementally re-index recently updated Redmine issues.

    For each project the service lists recent issues, skips those outside the
    refresh window, maps the rest to documents, compares ``source_hash``
    against what the store holds, then deletes stale documents and re-embeds
    new or changed ones.
    """

    # Per-project counters; the same keys are summed into the overall totals.
    _COUNTER_KEYS = (
        "issues",
        "scanned_issues",
        "detail_fetched_issues",
        "skipped_issues",
        "documents",
        "unchanged_documents",
        "changed_documents",
        "new_documents",
        "stale_documents",
        "force_rebuilt_documents",
        "would_embed_documents",
        "embedded_documents",
    )

    def __init__(
        self,
        source: RedmineRefreshSource,
        embedder: RefreshEmbedder,
        store: RefreshStore,
        mapper: Optional[RedmineMapper] = None,
        state: Optional[FileRefreshState] = None,
    ) -> None:
        """Wire up collaborators.

        Args:
            source: Provider of issue summaries and details.
            embedder: Produces vectors for documents that need (re-)embedding.
            store: Document store that is compared against and updated.
            mapper: Issue-to-document mapper; defaults to ``RedmineMapper(redmine_url="")``.
            state: Optional refresh-state file. Without it no cutoff is applied
                and no success is recorded.
        """
        self.source = source
        self.embedder = embedder
        self.store = store
        self.mapper = mapper or RedmineMapper(redmine_url="")
        self.state = state

    def refresh_redmine_project_limits(
        self,
        project_limits: Dict[str, int],
        dry_run: bool = False,
        force_rebuild: bool = False,
        overlap_minutes: int = 15,
    ) -> Dict[str, Any]:
        """Refresh several projects and return aggregated counters.

        Args:
            project_limits: Mapping of project identifier to the number of
                recent issues to scan.
            dry_run: Compute counters only; nothing is written to the store or state.
            force_rebuild: Ignore the refresh window and re-embed existing documents.
            overlap_minutes: Minutes subtracted from the last success time to
                form the cutoff.

        Returns:
            A dict with the run settings, summed counters and a
            ``project_results`` list holding one result dict per project.
        """
        previous_source_project = getattr(self.source, "project_identifier", None)
        previous_mapper_project = getattr(self.mapper, "project_identifier", None)
        project_results: List[Dict[str, Any]] = []
        totals = dict.fromkeys(self._COUNTER_KEYS, 0)
        try:
            for project, limit in project_limits.items():
                if hasattr(self.source, "project_identifier"):
                    self.source.project_identifier = project
                if hasattr(self.mapper, "project_identifier"):
                    self.mapper.project_identifier = project
                # Record the start time, not the completion time. An issue
                # updated while this project is being processed must land
                # inside the next run's window even when the refresh takes
                # longer than the overlap.
                started_at = datetime.now(timezone.utc).isoformat()
                project_result = self._refresh_project(project, limit, dry_run, force_rebuild, overlap_minutes)
                project_results.append(project_result)
                for key in totals:
                    totals[key] += int(project_result.get(key, 0))
                if not dry_run and self.state is not None:
                    self.state.mark_success(project, started_at)
        finally:
            # Restore whatever project the collaborators pointed at before the run.
            if hasattr(self.source, "project_identifier"):
                self.source.project_identifier = previous_source_project
            if hasattr(self.mapper, "project_identifier"):
                self.mapper.project_identifier = previous_mapper_project
        return {
            "source": "redmine",
            "projects": len(project_limits),
            "dry_run": dry_run,
            "force_rebuild": force_rebuild,
            "overlap_minutes": overlap_minutes,
            **totals,
            "project_results": project_results,
        }

    def _refresh_project(self, project: str, limit: int, dry_run: bool, force_rebuild: bool, overlap_minutes: int) -> Dict[str, Any]:
        """Refresh one project and return its counters.

        Issues older than the cutoff are skipped unless ``force_rebuild`` is
        set. In a dry run the counters are returned before any store write.
        """
        summaries = list(self._recent_issue_summaries(limit))
        result: Dict[str, Any] = {"project_identifier": project, **dict.fromkeys(self._COUNTER_KEYS, 0)}
        result["issues"] = len(summaries)
        result["scanned_issues"] = len(summaries)
        cutoff = self._cutoff_for_project(project, overlap_minutes)
        docs_to_embed: List[IndexDocument] = []
        stale_ids: List[str] = []
        for summary in summaries:
            if cutoff is not None and not force_rebuild and not self._issue_is_in_refresh_window(summary, cutoff):
                result["skipped_issues"] += 1
                continue
            issue = self._issue_detail(summary)
            result["detail_fetched_issues"] += 1
            candidates = deduplicate_documents(self.mapper.issue_to_documents(issue))
            result["documents"] += len(candidates)
            existing = self.store.list_documents(
                limit=5000,
                source="redmine",
                project_identifier=project,
                issue_id=int(issue["id"]),
            )
            existing_by_id = {document["id"]: document for document in existing}
            candidate_by_id = {document.id: document for document in candidates}
            # Stored documents the issue no longer produces are stale.
            for stale_id in sorted(set(existing_by_id) - set(candidate_by_id)):
                stale_ids.append(stale_id)
                result["stale_documents"] += 1
            for document in candidates:
                existing_document = existing_by_id.get(document.id)
                if existing_document is None:
                    result["new_documents"] += 1
                    docs_to_embed.append(document)
                    continue
                existing_hash = (existing_document.get("payload") or {}).get("source_hash")
                document_hash = document.payload.get("source_hash")
                if force_rebuild:
                    result["force_rebuilt_documents"] += 1
                    docs_to_embed.append(document)
                elif existing_hash != document_hash:
                    result["changed_documents"] += 1
                    docs_to_embed.append(document)
                else:
                    result["unchanged_documents"] += 1
        result["would_embed_documents"] = len(docs_to_embed)
        if dry_run:
            return result
        if stale_ids:
            self.store.delete_documents(stale_ids)
        if docs_to_embed:
            vectors = self.embedder.embed_documents(docs_to_embed)
            self.store.upsert(docs_to_embed, vectors)
        result["embedded_documents"] = len(docs_to_embed)
        return result

    def _recent_issue_summaries(self, limit: int) -> Iterable[Dict[str, Any]]:
        """Use the source's ``recent_issue_summaries`` if it has one, else ``recent_helpdesk_issues``."""
        if hasattr(self.source, "recent_issue_summaries"):
            return self.source.recent_issue_summaries(limit)  # type: ignore[attr-defined]
        return self.source.recent_helpdesk_issues(limit)

    def _issue_detail(self, summary: Dict[str, Any]) -> Dict[str, Any]:
        """Fetch the full issue when the source supports it; otherwise return the summary."""
        if hasattr(self.source, "issue_detail"):
            return self.source.issue_detail(int(summary["id"]))  # type: ignore[attr-defined]
        return summary

    def _cutoff_for_project(self, project: str, overlap_minutes: int) -> Optional[datetime]:
        """Return last success time minus the overlap, or ``None`` without prior state."""
        if self.state is None:
            return None
        timestamp = ((self.state.load().get("projects") or {}).get(project) or {}).get("last_successful_refresh_at")
        if not timestamp:
            return None
        parsed = parse_redmine_datetime(timestamp)
        return parsed - timedelta(minutes=overlap_minutes)

    def _issue_is_in_refresh_window(self, issue: Dict[str, Any], cutoff: datetime) -> bool:
        """Return True when the issue was updated at or after ``cutoff``.

        Issues without ``updated_on`` are always treated as in the window.
        """
        updated_on = issue.get("updated_on")
        if not updated_on:
            return True
        return parse_redmine_datetime(str(updated_on)) >= cutoff
|
||||
|
||||
|
||||
def parse_redmine_datetime(raw: str) -> datetime:
    """Parse an ISO-8601 timestamp and return it as an aware UTC ``datetime``.

    ``Z`` is rewritten to ``+00:00`` before parsing. A timestamp without an
    offset is taken to be UTC; one with an offset is converted to UTC.
    """
    moment = datetime.fromisoformat(raw.replace("Z", "+00:00"))
    if moment.tzinfo is not None:
        return moment.astimezone(timezone.utc)
    return moment.replace(tzinfo=timezone.utc)
|
||||
Executable
+107
@@ -0,0 +1,107 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Print the refresh.sh help text to stderr.
usage() {
  printf '%s\n' \
    'Usage:' \
    'semantic_index/refresh.sh [--apply] [--dry-run]' \
    '' \
    'Examples:' \
    'semantic_index/refresh.sh' \
    'semantic_index/refresh.sh --apply' \
    '' \
    'Environment:' \
    'SEMANTIC_INDEX_PROJECT_LIMITS comma-separated project=limit pairs' \
    'SEMANTIC_INDEX_LOG_DIR default: .cache/semantic_index/logs' \
    'SEMANTIC_INDEX_STATE_PATH default: .cache/semantic_index/refresh_state.json' \
    'SEMANTIC_INDEX_OVERLAP_MINUTES default: 15' \
    'PYTHON default: <install-root>/.venv/bin/python' \
    '' \
    'This wrapper never passes --force-rebuild. Run force rebuilds manually.' \
    >&2
}
|
||||
|
||||
# Absolute directory holding this script, and its parent (the install root).
script_dir="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
install_root="$(
  cd "${script_dir}/.." && pwd
)"
|
||||
|
||||
#######################################
# Export KEY=value pairs from an env file as defaults.
# A key that is already set in the shell (even to an empty value) is kept.
# Blank lines, lines starting with '#', lines without '=' and lines whose key
# is not a valid shell identifier are skipped. An optional leading "export "
# is accepted. Surrounding whitespace and one layer of double and single
# quotes are stripped from the value.
# Arguments: $1 - path of the env file; an unreadable or missing file is a no-op
# Outputs:   none
# Returns:   0
#######################################
load_env_defaults() {
  local file=$1
  # 'line' is local too, so parsing does not clobber a caller's variable.
  local line key value
  local -r name_re='^[A-Za-z_][A-Za-z0-9_]*$'
  [[ -r "$file" ]] || return 0
  # The '|| [[ -n ... ]]' keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Trim leading, then trailing, whitespace.
    line=${line#"${line%%[![:space:]]*}"}
    line=${line%"${line##*[![:space:]]}"}
    [[ -z "$line" || "$line" == \#* || "$line" != *=* ]] && continue
    key=${line%%=*}
    value=${line#*=}
    key=${key%"${key##*[![:space:]]}"}
    if [[ "$key" == export[[:space:]]* ]]; then
      key=${key#export}
      key=${key#"${key%%[![:space:]]*}"}
    fi
    # An invalid name would make the indirect expansion below a hard error.
    [[ "$key" =~ $name_re ]] || continue
    value=${value#"${value%%[![:space:]]*}"}
    value=${value%"${value##*[![:space:]]}"}
    value=${value%\"}
    value=${value#\"}
    value=${value%\'}
    value=${value#\'}
    if [[ -z "${!key+x}" ]]; then
      export "$key=$value"
    fi
  done < "$file"
  return 0
}
|
||||
|
||||
# Seed unset variables from the system-wide env file before any setting is read.
load_env_defaults /etc/semantic-index.env

# Dry-run unless --apply is given; when both flags appear the last one wins.
mode=dry-run
for arg in "$@"; do
  case "$arg" in
    --apply) mode=apply ;;
    --dry-run) mode=dry-run ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      usage
      exit 2
      ;;
  esac
done

# Settings: environment first, built-in default second.
project_limits="${SEMANTIC_INDEX_PROJECT_LIMITS:-customer-service=500,hiring=200,todo-jason=200,sales-inbox=100,business-development=100,dock-scheduling=100,prep-standardization=100}"
log_dir="${SEMANTIC_INDEX_LOG_DIR:-.cache/semantic_index/logs}"
state_path="${SEMANTIC_INDEX_STATE_PATH:-.cache/semantic_index/refresh_state.json}"
overlap_minutes="${SEMANTIC_INDEX_OVERLAP_MINUTES:-15}"
python_bin="${PYTHON:-$install_root/.venv/bin/python}"

mkdir -p "$log_dir" "$(dirname "$state_path")"
timestamp="$(date -u +"%Y%m%dT%H%M%SZ")"
log_file="$log_dir/redmine-refresh-$timestamp.log"

args=(-m semantic_index --refresh-redmine-projects)
args+=(--project-limits "$project_limits")
args+=(--state-path "$state_path")
args+=(--overlap-minutes "$overlap_minutes")
case "$mode" in
  dry-run) args+=(--dry-run) ;;
esac

# Run header, Python output and footer all go to the terminal and the log file.
{
  printf 'started_at=%s\nmode=%s\nproject_limits=%s\nstate_path=%s\noverlap_minutes=%s\n' \
    "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" \
    "$mode" \
    "$project_limits" \
    "$state_path" \
    "$overlap_minutes"
  cd "$install_root"
  "$python_bin" "${args[@]}"
  printf '\nfinished_at=%s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
} 2>&1 | tee "$log_file"

printf 'log_file=%s\n' "$log_file"
|
||||
@@ -0,0 +1,61 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, Protocol
|
||||
|
||||
from .models import SearchQuery, SearchResult
|
||||
|
||||
|
||||
class QueryEmbedder(Protocol):
    """Structural type for the embedder handed to ``HybridSearchService``."""

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding vector for ``text``."""
|
||||
|
||||
|
||||
class SearchStore(Protocol):
    """Structural type for the store handed to ``HybridSearchService``."""

    def search(self, vector: List[float], query: SearchQuery, limit: int) -> List[SearchResult]:
        """Return up to ``limit`` results for ``vector`` under the query's filters."""

    def get_document(self, document_id: str) -> Optional[Dict[str, Any]]:
        """Return the stored document for ``document_id``, or ``None``."""
|
||||
|
||||
|
||||
class HybridSearchService:
    """Vector search whose scores are raised by literal keyword matches."""

    def __init__(self, embedder: QueryEmbedder, store: SearchStore) -> None:
        """Keep references to the query embedder and the search store."""
        self.store = store
        self.embedder = embedder

    def search(self, query: SearchQuery) -> List[SearchResult]:
        """Embed the query text, fetch ``query.limit`` candidates and re-rank them.

        Each candidate's score becomes its store score plus
        ``keyword_boost(query.text, candidate)``. Results are returned in
        descending score order, capped at ``query.limit``.
        """
        query_vector = self.embedder.embed_query(query.text)
        reranked = []
        for hit in self.store.search(query_vector, query, limit=query.limit):
            boosted = hit.score + keyword_boost(query.text, hit)
            reranked.append(SearchResult(id=hit.id, score=boosted, text=hit.text, payload=hit.payload))
        reranked.sort(key=lambda item: item.score, reverse=True)
        return reranked[: query.limit]

    def get_document(self, document_id: str) -> Optional[Dict[str, Any]]:
        """Delegate the lookup to the store."""
        return self.store.get_document(document_id)
|
||||
|
||||
|
||||
def keyword_boost(query_text: str, result: SearchResult) -> float:
    """Return a score bonus for literal query terms found in a result.

    The result text and all non-``None`` payload values are joined and
    lower-cased, then searched for terms taken from ``query_text``:

    * ``+0.35`` per double-quoted phrase,
    * ``+0.30`` per e-mail address,
    * ``+0.25`` per identifier-like token (numbers, ``ABC-123`` style codes),
      matched with or without a leading ``#``,
    * ``+0.03`` per ordinary word of three or more characters.
    """
    payload_text = " ".join(str(item) for item in result.payload.values() if item is not None)
    corpus = " ".join((result.text, payload_text)).lower()

    def present(needle: str) -> bool:
        return needle.lower() in corpus

    bonus = 0.0
    for quoted in re.findall(r'"([^"]+)"', query_text):
        if present(quoted):
            bonus += 0.35
    for address in re.findall(r"[\w.+-]+@[\w.-]+\.[A-Za-z]{2,}", query_text):
        if present(address):
            bonus += 0.3
    for code in re.findall(r"\b(?:#?\d{2,}|[A-Z]{2,}[-_]\d{2,}|[A-Z0-9]{4,}-[A-Z0-9-]{2,})\b", query_text):
        lowered = code.lower()
        if lowered in corpus or lowered.lstrip("#") in corpus:
            bonus += 0.25
    for term in re.findall(r"\b[A-Za-z][\w.-]{2,}\b", query_text):
        if present(term):
            bonus += 0.03
    return bonus
|
||||
Executable
+71
@@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Print the search.sh help text to stderr.
usage() {
  printf '%s\n' \
    'Usage:' \
    'semantic_index/search.sh "query text" [project_identifier] [limit]' \
    '' \
    'Examples:' \
    'semantic_index/search.sh "goods return" customer-service 3' \
    "semantic_index/search.sh \"candidate follow up\" hiring 5 | jq '.results[] | {id, score, citation}'" \
    '' \
    'Environment:' \
    'SEMANTIC_INDEX_URL default: http://127.0.0.1:8787' \
    'SEMANTIC_INDEX_API_KEY optional; falls back to semantic_index/.env or .env' \
    >&2
}
|
||||
|
||||
# A query is mandatory; project and limit are optional positionals.
if [[ $# -lt 1 ]]; then
  usage
  exit 2
fi

# Accept the same help flags as the sibling scripts.
if [[ "$1" == "-h" || "$1" == "--help" ]]; then
  usage
  exit 0
fi

query=$1
project=${2:-}
limit=${3:-10}

# The limit is written into the JSON body as a bare number, so it must be
# plain digits. Anything else would produce malformed JSON or let a caller
# inject extra fields.
if [[ ! "$limit" =~ ^[0-9]+$ ]]; then
  printf 'limit must be a non-negative integer, got: %s\n' "$limit" >&2
  exit 2
fi
# Force base 10 so leading zeros (invalid in JSON numbers) are dropped.
limit=$((10#$limit))

base_url=${SEMANTIC_INDEX_URL:-http://127.0.0.1:8787}
script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
install_root=$(cd "$script_dir/.." && pwd)
|
||||
|
||||
#######################################
# Look up KEY in the known env files and print its value.
# Files are tried in order and the first file that defines the key wins; a
# file that exists but does not define the key no longer ends the search.
# "KEY=value", "KEY = value" and "export KEY=value" are recognised.
# Surrounding whitespace and quotes are stripped from the value.
# Globals:   install_root (read)
# Arguments: $1 - variable name to look up
# Outputs:   the value on stdout; nothing when the key is not defined anywhere
# Returns:   0 (also when nothing is found, so callers under 'set -e' continue)
#######################################
read_env_value() {
  local key=$1
  local file value
  for file in /etc/semantic-index.env "$install_root/semantic_index/.env" "$install_root/.env" semantic_index/.env .env; do
    [[ -f "$file" ]] || continue
    # awk exits 0 only when the key was found, which separates "defined but
    # empty" from "not defined in this file".
    if value=$(
      awk -v key="$key" '
        {
          eq = index($0, "=")
          if (eq == 0) next
          name = substr($0, 1, eq - 1)
          sub(/^[ \t]*(export[ \t]+)?/, "", name)
          sub(/[ \t]+$/, "", name)
          if (name != key) next
          value = substr($0, eq + 1)
          gsub(/^[ \t"'\'']+|[ \t"'\'']+$/, "", value)
          print value
          found = 1
          exit
        }
        END { exit !found }
      ' "$file" 2>/dev/null
    ); then
      printf '%s\n' "$value"
      return 0
    fi
  done
  return 0
}
|
||||
|
||||
#######################################
# Escape stdin for use inside a JSON string literal.
# Escapes backslash, double quote, newline, carriage return, tab, backspace
# and form feed across the whole input. Done with shell parameter expansion,
# so the result does not depend on which sed implementation is installed.
# Arguments: none (reads stdin)
# Outputs:   escaped text on stdout, without a trailing newline
# Returns:   0
#######################################
json_escape() {
  local text=''
  # With an empty delimiter read consumes all of stdin. It reports failure at
  # end of input but still fills the variable, hence the '|| true'.
  IFS= read -r -d '' text || true
  # Backslashes go first so the escapes added below are not doubled.
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  text=${text//$'\b'/\\b}
  text=${text//$'\f'/\\f}
  printf '%s' "$text"
}
|
||||
|
||||
# Assemble the JSON request body; project_identifier is added only when given.
query_json=$(printf '%s' "$query" | json_escape)
printf -v payload '{"query":"%s","limit":%s' "$query_json" "$limit"
if [[ -n "$project" ]]; then
  project_json=$(printf '%s' "$project" | json_escape)
  printf -v payload '%s,"project_identifier":"%s"' "$payload" "$project_json"
fi
payload+='}'

# API key: environment first, env files second.
api_key=${SEMANTIC_INDEX_API_KEY:-}
if [[ -z "$api_key" ]]; then
  api_key=$(read_env_value SEMANTIC_INDEX_API_KEY)
fi

args=(-sS)
args+=(-H "Content-Type: application/json")
args+=(-d "$payload")
if [[ -n "${api_key:-}" ]]; then
  args+=(-H "Authorization: Bearer $api_key")
fi

curl "${args[@]}" "$base_url/search"
|
||||
@@ -0,0 +1,31 @@
|
||||
• Added semantic_index/search.sh. It outputs raw JSON from the running HTTP service.
|
||||
|
||||
Usage:
|
||||
|
||||
semantic_index/search.sh "goods return" customer-service 3
|
||||
semantic_index/search.sh "candidate follow up" hiring 5 | jq '.results[] | {id, score, citation}'
|
||||
|
||||
It defaults to http://127.0.0.1:8787, reads SEMANTIC_INDEX_API_KEY from the environment or semantic_index/.env, and accepts:
|
||||
|
||||
semantic_index/search.sh "query text" [project_identifier] [limit]
|
||||
|
||||
Verified:
|
||||
|
||||
- Script works against the local service.
|
||||
- Semantic tests still pass: 49 tests, OK.
|
||||
- Python compile check passed.
|
||||
|
||||
|
||||
› Okay yeah, that's nice and fast
|
||||
|
||||
|
||||
• Good. That script is the right shape for quick manual probing: fast because it avoids Python startup,
|
||||
plain JSON so it pipes cleanly, and thin enough that it won’t hide what the API is returning.
|
||||
|
||||
A couple useful examples:
|
||||
|
||||
semantic_index/search.sh "goods return" customer-service 3 | jq '.results[].citation'
|
||||
semantic_index/search.sh "candidate follow up" hiring 5 | jq -r '.results[] | "\(.score) \(.citation.url) \(.snippet)"'
|
||||
semantic_index/search.sh "callum@safetagtracking.com" customer-service 3 | jq '.results[0]'
|
||||
@@ -0,0 +1,115 @@
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from semantic_index.app import create_app
|
||||
from semantic_index.config import Settings
|
||||
from semantic_index.models import SearchResult
|
||||
|
||||
|
||||
class FakeSearchService:
    """Search-service stand-in that returns fixed data for the app tests."""

    def search(self, query):
        """Return one canned Redmine issue hit, whatever the query is."""
        return [
            SearchResult(
                id="redmine:issue:1:chunk:0",
                score=0.8,
                text="Snippet text",
                payload={
                    "source": "redmine",
                    "project_identifier": "customer-service",
                    "doc_type": "issue",
                    "issue_id": 1,
                    "redmine_url": "http://redmine/issues/1",
                    "source_record_id": "issue:1",
                },
            )
        ]

    def get_document(self, document_id):
        """Return a minimal document dict echoing the requested id."""
        return {"id": document_id, "text": "Full text", "payload": {}}
|
||||
|
||||
|
||||
class FakeStore:
    """Store stand-in exposing a single indexed project."""

    def list_projects(self, source=None, limit=1000):
        """Return one fixed project row; source and limit are ignored."""
        return [{"project_identifier": "customer-service", "document_count": 10}]
|
||||
|
||||
|
||||
class FakeRefreshService:
    """Refresh-service stand-in that records every call it receives."""

    def __init__(self):
        # One tuple per call: (project_limits, dry_run, force_rebuild, overlap_minutes).
        self.calls = []

    def refresh_redmine_project_limits(self, project_limits, dry_run=False, force_rebuild=False, overlap_minutes=15):
        """Record the arguments and return a small summary dict."""
        self.calls.append((project_limits, dry_run, force_rebuild, overlap_minutes))
        return {"source": "redmine", "projects": len(project_limits), "dry_run": dry_run}
|
||||
|
||||
|
||||
def fake_services():
    """Build the services dict handed to create_app, wired entirely to fakes."""
    refresh = FakeRefreshService()
    return {
        "settings": Settings(
            openai_api_key="",
            qdrant_url="http://qdrant",
            qdrant_api_key=None,
            qdrant_collection="semantic",
            redmine_url="http://redmine",
            redmine_api_key="",
            redmine_project_identifier=None,
            sample_limit=50,
            bind_host="127.0.0.1",
            bind_port=8787,
            service_api_key=None,
            refresh_state_path=Path(".cache/semantic_index/refresh_state.json"),
        ),
        "search": FakeSearchService(),
        "store": FakeStore(),
        "refresh": refresh,
    }
|
||||
|
||||
|
||||
class SemanticIndexAppTest(unittest.TestCase):
    """Exercise the app's route endpoints by calling them directly."""

    def test_health_does_not_build_live_services(self):
        # The builder raises if invoked, so /health must answer without it.
        def broken_builder():
            raise AssertionError("health should not build live clients")

        app = create_app(service_builder=broken_builder)
        routes = {route.path: route.endpoint for route in app.routes}

        self.assertEqual({"status": "ok"}, routes["/health"]())

    def test_search_endpoint_returns_normalized_agent_response(self):
        app = create_app(service_builder=fake_services)
        routes = {route.path: route.endpoint for route in app.routes}

        response = routes["/search"]({"query": "printer", "project_identifier": "customer-service", "limit": 3})

        self.assertEqual("printer", response["query"])
        self.assertEqual("customer-service", response["filters"]["project_identifier"])
        self.assertEqual("customer-service", response["results"][0]["citation"]["project_identifier"])

    def test_projects_endpoint_lists_indexed_projects(self):
        app = create_app(service_builder=fake_services)
        routes = {route.path: route.endpoint for route in app.routes}

        response = routes["/projects"]()

        self.assertEqual("customer-service", response["projects"][0]["project_identifier"])

    def test_refresh_endpoint_passes_project_limits_and_cost_flags(self):
        # Keep a handle on the services so the recorded refresh call can be inspected.
        services = fake_services()
        app = create_app(service_builder=lambda: services)
        routes = {route.path: route.endpoint for route in app.routes}

        response = routes["/sources/redmine/refresh"](
            {
                "project_limits": {"customer-service": 5},
                "dry_run": True,
                "force_rebuild": False,
                "overlap_minutes": 30,
            }
        )

        self.assertTrue(response["dry_run"])
        self.assertEqual(({"customer-service": 5}, True, False, 30), services["refresh"].calls[0])
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
@@ -0,0 +1,182 @@
|
||||
import unittest
|
||||
|
||||
from semantic_index.ingest import BackfillService
|
||||
from semantic_index.mcp import SemanticMCP
|
||||
from semantic_index.models import SearchQuery, SearchResult
|
||||
from semantic_index.redmine import RedmineMapper
|
||||
|
||||
|
||||
class FakeRedmineSource:
    """Redmine source stand-in yielding two fixed issues."""

    # Stamped onto every returned issue's project; None until something sets it.
    project_identifier = None

    def recent_helpdesk_issues(self, limit):
        """Return at most `limit` of the two canned issues."""
        return [
            {
                "id": 1,
                "subject": "First",
                "description": "First body",
                "project": {"identifier": self.project_identifier},
            },
            {
                "id": 2,
                "subject": "Second",
                "description": "Second body",
                "project": {"identifier": self.project_identifier},
            },
        ][:limit]
|
||||
|
||||
|
||||
class DuplicateDocumentRedmineSource:
    """Redmine source stand-in that returns the same issue id twice."""

    project_identifier = "customer-service"

    def recent_helpdesk_issues(self, limit):
        """Return at most `limit` of two issues that share id 1."""
        return [
            {"id": 1, "subject": "First", "description": "First body", "project": {"identifier": "customer-service"}},
            {"id": 1, "subject": "First duplicate", "description": "Duplicate body", "project": {"identifier": "customer-service"}},
        ][:limit]
|
||||
|
||||
|
||||
class FakeEmbedder:
    """Embedder stand-in producing deterministic 3-dimensional vectors."""

    def embed_documents(self, docs):
        """Return one vector per document: [position, 0.0, 0.0], 1-based."""
        return [[float(i), 0.0, 0.0] for i, _ in enumerate(docs, start=1)]

    def embed_query(self, text):
        """Return the same fixed query vector for any text."""
        return [0.1, 0.0, 0.0]
|
||||
|
||||
|
||||
class FakeStore:
    """Vector-store stand-in that records rebuild calls."""

    def __init__(self):
        self.deleted = []  # (source, project_identifier) per rebuild_source call
        self.upserts = []  # (docs, vectors) per rebuild_source call

    def rebuild_source(self, source, docs, vectors, project_identifier=None):
        """Record the scope that would be replaced and the data written."""
        self.deleted.append((source, project_identifier))
        self.upserts.append((docs, vectors))

    def list_projects(self, source=None, limit=1000):
        """Return two fixed project rows; source and limit are ignored."""
        return [
            {"project_identifier": "customer-service", "document_count": 1684},
            {"project_identifier": "hiring", "document_count": 409},
        ]
|
||||
|
||||
|
||||
class FakeRefreshService:
    """Refresh-service stand-in that records every call it receives."""

    def __init__(self):
        # One tuple per call: (project_limits, dry_run, force_rebuild, overlap_minutes).
        self.calls = []

    def refresh_redmine_project_limits(self, project_limits, dry_run=False, force_rebuild=False, overlap_minutes=15):
        """Record the arguments and return a small summary dict."""
        self.calls.append((project_limits, dry_run, force_rebuild, overlap_minutes))
        return {"source": "redmine", "projects": len(project_limits), "dry_run": dry_run}
|
||||
|
||||
|
||||
class FakeSearchService:
    """Search-service stand-in that records queries and returns one hit."""

    def __init__(self):
        self.queries = []  # every query object passed to search(), in order

    def search(self, query):
        """Remember the query and return a single canned SearchResult."""
        self.queries.append(query)
        return [SearchResult(id="doc1", score=0.5, text="Snippet", payload={"redmine_url": "http://redmine/issues/1"})]

    def get_document(self, document_id):
        """Return a minimal document dict echoing the requested id."""
        return {"id": document_id, "text": "Snippet"}
|
||||
|
||||
|
||||
class BackfillAndMCPTest(unittest.TestCase):
    """Cover BackfillService rebuild scoping and the SemanticMCP tool calls."""

    def test_sample_backfill_rebuilds_redmine_source(self):
        service = BackfillService(source=FakeRedmineSource(), embedder=FakeEmbedder(), store=FakeStore())

        result = service.backfill_redmine_sample(limit=2)

        self.assertEqual({"source": "redmine", "issues": 2, "documents": 2}, result)
        self.assertEqual([("redmine", None)], service.store.deleted)
        docs, vectors = service.store.upserts[0]
        self.assertEqual(["redmine:issue:1:chunk:0", "redmine:issue:2:chunk:0"], [doc.id for doc in docs])
        self.assertEqual(2, len(vectors))

    def test_sample_backfill_rebuilds_only_the_configured_project_scope(self):
        store = FakeStore()
        service = BackfillService(
            source=FakeRedmineSource(),
            embedder=FakeEmbedder(),
            store=store,
            mapper=RedmineMapper(redmine_url="", project_identifier="customer-service"),
        )

        service.backfill_redmine_sample(limit=1)

        self.assertEqual([("redmine", "customer-service")], store.deleted)

    def test_multi_project_backfill_rebuilds_each_project_scope(self):
        store = FakeStore()
        service = BackfillService(source=FakeRedmineSource(), embedder=FakeEmbedder(), store=store)

        result = service.backfill_redmine_projects(["customer-service", "hiring"], per_project_limit=1)

        self.assertEqual(
            {
                "source": "redmine",
                "projects": 2,
                "issues": 2,
                "documents": 2,
                "project_results": [
                    {"project_identifier": "customer-service", "issues": 1, "documents": 1},
                    {"project_identifier": "hiring", "issues": 1, "documents": 1},
                ],
            },
            result,
        )
        self.assertEqual([("redmine", "customer-service"), ("redmine", "hiring")], store.deleted)
        self.assertEqual("customer-service", store.upserts[0][0][0].payload["project_identifier"])
        self.assertEqual("hiring", store.upserts[1][0][0].payload["project_identifier"])

    def test_multi_project_backfill_accepts_per_project_limits(self):
        store = FakeStore()
        service = BackfillService(source=FakeRedmineSource(), embedder=FakeEmbedder(), store=store)

        result = service.backfill_redmine_project_limits({"customer-service": 2, "hiring": 1})

        self.assertEqual(3, result["issues"])
        self.assertEqual(
            [
                {"project_identifier": "customer-service", "issues": 2, "documents": 2},
                {"project_identifier": "hiring", "issues": 1, "documents": 1},
            ],
            result["project_results"],
        )

    def test_backfill_deduplicates_documents_by_stable_id_before_embedding(self):
        store = FakeStore()
        service = BackfillService(source=DuplicateDocumentRedmineSource(), embedder=FakeEmbedder(), store=store)

        result = service.backfill_redmine_sample(limit=2)

        # Two issues come in, but they share an id, so only one document is stored.
        self.assertEqual({"source": "redmine", "issues": 2, "documents": 1}, result)
        docs, vectors = store.upserts[0]
        self.assertEqual(["redmine:issue:1:chunk:0"], [doc.id for doc in docs])
        self.assertEqual(1, len(vectors))

    def test_mcp_tools_return_json_ready_results(self):
        search = FakeSearchService()
        refresh = FakeRefreshService()
        mcp = SemanticMCP(search_service=search, backfill_service=None, store=FakeStore(), refresh_service=refresh)

        response = mcp.call_tool("semantic_search", {"query": "printer", "source": "redmine", "project_identifier": "hiring", "limit": 3})
        document = mcp.call_tool("semantic_get_document", {"id": "doc1"})
        projects = mcp.call_tool("semantic_list_projects", {"source": "redmine"})
        refresh_response = mcp.call_tool("semantic_refresh_redmine", {"project_identifier": "customer-service", "limit": 5, "dry_run": True})

        self.assertEqual("printer", response["query"])
        self.assertEqual("hiring", response["filters"]["project_identifier"])
        self.assertEqual("doc1", response["results"][0]["id"])
        self.assertEqual("http://redmine/issues/1", response["results"][0]["citation"]["url"])
        self.assertIsInstance(search.queries[0], SearchQuery)
        self.assertEqual("redmine", search.queries[0].source)
        self.assertEqual("hiring", search.queries[0].project_identifier)
        self.assertEqual({"id": "doc1", "text": "Snippet"}, document)
        self.assertEqual("customer-service", projects["projects"][0]["project_identifier"])
        self.assertTrue(refresh_response["dry_run"])
        self.assertEqual(({"customer-service": 5}, True, False, 15), refresh.calls[0])
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
@@ -0,0 +1,37 @@
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
import unittest
|
||||
|
||||
from semantic_index.config import load_settings
|
||||
|
||||
|
||||
class SemanticIndexCliTest(unittest.TestCase):
    """Check the package CLI entry point and .env fallback loading."""

    def test_help_does_not_require_http_runtime_dependencies(self):
        # Run the module in a child interpreter so import-time failures surface on stderr.
        result = subprocess.run(
            [sys.executable, "-m", "semantic_index", "--help"],
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )

        self.assertEqual("", result.stderr)
        self.assertEqual(0, result.returncode)
        self.assertIn("--mcp-stdio", result.stdout)

    def test_settings_load_from_package_env_when_root_env_missing(self):
        with TemporaryDirectory() as tmp:
            # Only <tmp>/semantic_index/.env exists; <tmp>/.env is deliberately absent.
            env_path = Path(tmp) / "semantic_index" / ".env"
            env_path.parent.mkdir()
            env_path.write_text("QDRANT_URL=http://qdrant.example:6333\nREDMINE_SAMPLE_LIMIT=7\n", encoding="utf-8")

            settings = load_settings(Path(tmp) / ".env")

        self.assertEqual("http://qdrant.example:6333", settings.qdrant_url)
        self.assertEqual(7, settings.sample_limit)
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
@@ -0,0 +1,87 @@
|
||||
import json
|
||||
import unittest
|
||||
from unittest.mock import patch
|
||||
|
||||
from semantic_index.client import SemanticIndexClient
|
||||
from semantic_index.models import SearchResult
|
||||
|
||||
|
||||
class FakeSearchService:
    """Search-service stand-in for the client tests; records queries."""

    def __init__(self):
        self.queries = []  # every query object passed to search(), in order

    def search(self, query):
        """Remember the query and return one canned hit in the hiring project."""
        self.queries.append(query)
        return [
            SearchResult(
                id="redmine:issue:1:chunk:0",
                score=0.7,
                text="Candidate follow up",
                payload={
                    "source": "redmine",
                    "project_identifier": "hiring",
                    "doc_type": "issue",
                    "issue_id": 1,
                    "redmine_url": "http://redmine/issues/1",
                    "source_record_id": "issue:1",
                },
            )
        ]

    def get_document(self, document_id):
        """Return a fixed document dict echoing the requested id."""
        return {"id": document_id, "text": "Full text", "payload": {"project_identifier": "hiring"}}
|
||||
|
||||
|
||||
class SemanticIndexClientTest(unittest.TestCase):
    """Cover SemanticIndexClient in both in-process and HTTP modes."""

    def test_in_process_client_returns_normalized_search_response(self):
        search = FakeSearchService()
        client = SemanticIndexClient(search_service=search)

        response = client.search("candidate follow up", project_identifier="hiring", limit=3)

        self.assertEqual("candidate follow up", response["query"])
        self.assertEqual({"project_identifier": "hiring", "limit": 3}, response["filters"])
        self.assertEqual("redmine:issue:1:chunk:0", response["results"][0]["id"])
        self.assertEqual("hiring", response["results"][0]["citation"]["project_identifier"])
        self.assertEqual("hiring", search.queries[0].project_identifier)

    def test_in_process_client_get_document(self):
        client = SemanticIndexClient(search_service=FakeSearchService())

        document = client.get_document("redmine:issue:1:chunk:0")

        self.assertEqual("Full text", document["text"])

    def test_http_client_sends_auth_header_and_parses_search_response(self):
        body = json.dumps({"query": "printer", "filters": {}, "results": []}).encode()

        class FakeResponse:
            # Minimal context-manager response object returned by fake_urlopen.
            def __enter__(self):
                return self

            def __exit__(self, exc_type, exc, tb):
                return False

            def read(self):
                return body

        captured = {}

        def fake_urlopen(request, timeout):
            # Capture what the client sent so it can be asserted on afterwards.
            captured["url"] = request.full_url
            captured["authorization"] = request.headers.get("Authorization")
            captured["body"] = json.loads(request.data.decode())
            return FakeResponse()

        with patch("urllib.request.urlopen", fake_urlopen):
            client = SemanticIndexClient(base_url="http://semantic.local", api_key="secret")
            response = client.search("printer", project_identifier="customer-service")

        self.assertEqual("http://semantic.local/search", captured["url"])
        self.assertEqual("Bearer secret", captured["authorization"])
        self.assertEqual("customer-service", captured["body"]["project_identifier"])
        self.assertEqual("printer", response["query"])
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
@@ -0,0 +1,138 @@
|
||||
import unittest
|
||||
|
||||
from semantic_index.models import IndexDocument
|
||||
from semantic_index.redmine import RedmineMapper
|
||||
|
||||
|
||||
class RedmineMapperTest(unittest.TestCase):
    """Check how RedmineMapper turns issue dicts into index documents."""

    def test_issue_chunks_have_stable_ids_and_metadata(self):
        issue = {
            "id": 42,
            "subject": "Widget order ORD-12345 cannot ship",
            "description": "Customer reports that widget order ORD-12345 is blocked.",
            "project": {"id": 7, "identifier": "fud-helpdesk"},
            "contact": {"id": 9, "email": "ada@example.com", "name": "Ada Lovelace"},
            "created_on": "2026-04-01T10:00:00Z",
            "updated_on": "2026-04-02T10:00:00Z",
            "url": "http://redmine.local/issues/42",
        }

        # Map the same issue twice with fresh mappers; ids must not vary between runs.
        first = RedmineMapper(redmine_url="http://redmine.local").issue_to_documents(issue)
        second = RedmineMapper(redmine_url="http://redmine.local").issue_to_documents(issue)

        self.assertEqual([doc.id for doc in first], [doc.id for doc in second])
        self.assertEqual("redmine:issue:42:chunk:0", first[0].id)
        self.assertEqual("issue", first[0].payload["doc_type"])
        self.assertEqual(42, first[0].payload["issue_id"])
        self.assertEqual("fud-helpdesk", first[0].payload["project_identifier"])
        self.assertIsNone(first[0].payload["project_name"])
        self.assertFalse(first[0].payload["has_helpdesk_ticket"])
        self.assertEqual("ada@example.com", first[0].payload["contact_email"])
        self.assertEqual("Ada Lovelace", first[0].payload["contact_name"])
        self.assertEqual("http://redmine.local/issues/42", first[0].payload["redmine_url"])
        self.assertIn("source_hash", first[0].payload)

    def test_helpdesk_ticket_contact_is_mapped_to_all_issue_chunks(self):
        issue = {
            "id": 39779,
            "subject": "Goods return",
            "description": "Please arrange to return these goods.",
            "project": {"id": 1, "identifier": "customer-service"},
            "helpdesk_ticket": {
                "id": 35159,
                "contact_id": 1890,
                "from_address": "callum@safetagtracking.com",
                "contact": {
                    "id": 1890,
                    "name": "Callum Mackeonis",
                    "company": "SafeTag Tracking",
                    "email": "callum@safetagtracking.com",
                },
            },
            "journals": [
                {"id": 71570, "notes": "Hello, yes we can arrange this today.", "created_on": "2026-04-14T14:29:49Z"}
            ],
        }

        docs = RedmineMapper(redmine_url="http://redmine.local").issue_to_documents(issue)
        issue_doc = next(doc for doc in docs if doc.payload["doc_type"] == "issue")
        journal_doc = next(doc for doc in docs if doc.payload["doc_type"] == "journal")
        contact_doc = next(doc for doc in docs if doc.payload["doc_type"] == "contact")

        # Every document type derived from the issue carries the same contact metadata.
        for doc in (issue_doc, journal_doc, contact_doc):
            self.assertEqual(35159, doc.payload["helpdesk_ticket_id"])
            self.assertTrue(doc.payload["has_helpdesk_ticket"])
            self.assertEqual(1890, doc.payload["contact_id"])
            self.assertEqual("Callum Mackeonis", doc.payload["contact_name"])
            self.assertEqual("SafeTag Tracking", doc.payload["contact_company"])
            self.assertEqual("callum@safetagtracking.com", doc.payload["contact_email"])
        self.assertIn("Callum Mackeonis", issue_doc.text)
        self.assertIn("callum@safetagtracking.com", contact_doc.text)

    def test_configured_project_identifier_is_used_when_issue_payload_omits_identifier(self):
        issue = {
            "id": 42,
            "subject": "Widget order",
            "description": "Body",
            "project": {"id": 1, "name": "Customer Service"},
        }

        docs = RedmineMapper(
            redmine_url="http://redmine.local",
            project_identifier="customer-service",
        ).issue_to_documents(issue)

        self.assertEqual("customer-service", docs[0].payload["project_identifier"])
        self.assertEqual("Customer Service", docs[0].payload["project_name"])

    def test_internal_non_helpdesk_issue_keeps_project_metadata_without_contact(self):
        issue = {
            "id": 55,
            "subject": "Internal hiring task",
            "description": "Follow up with candidate.",
            "project": {"id": 68, "identifier": "hiring", "name": "Hiring"},
        }

        docs = RedmineMapper(redmine_url="http://redmine.local").issue_to_documents(issue)

        self.assertEqual(1, len(docs))
        self.assertEqual("hiring", docs[0].payload["project_identifier"])
        self.assertEqual("Hiring", docs[0].payload["project_name"])
        self.assertFalse(docs[0].payload["has_helpdesk_ticket"])
        self.assertIsNone(docs[0].payload["contact_id"])

    def test_issue_journals_messages_and_contact_are_mapped(self):
        issue = {
            "id": 42,
            "subject": "Widget order",
            "description": "Ticket envelope",
            "project": {"id": 7, "identifier": "fud-helpdesk"},
            "contact": {"id": 9, "email": "ada@example.com", "name": "Ada Lovelace"},
            "journals": [
                {"id": 5, "notes": "Private escalation note", "private_notes": True, "created_on": "2026-04-03T10:00:00Z"}
            ],
            "messages": [
                {"id": 6, "body": "Customer reply body", "direction": "incoming", "created_on": "2026-04-03T11:00:00Z"}
            ],
        }

        docs = RedmineMapper(redmine_url="http://redmine.local").issue_to_documents(issue)
        ids = {doc.id for doc in docs}
        types = {doc.payload["doc_type"] for doc in docs}

        self.assertIn("redmine:issue:42:journal:5:chunk:0", ids)
        self.assertIn("redmine:issue:42:message:6:chunk:0", ids)
        self.assertIn("redmine:contact:9:issue:42:chunk:0", ids)
        self.assertEqual({"issue", "journal", "message", "contact"}, types)
        journal = next(doc for doc in docs if doc.payload["doc_type"] == "journal")
        message = next(doc for doc in docs if doc.payload["doc_type"] == "message")
        self.assertEqual("private", journal.payload["visibility"])
        self.assertEqual("incoming", message.payload["direction"])

    def test_empty_documents_are_rejected(self):
        # Whitespace-only text must be refused at construction time.
        with self.assertRaises(ValueError):
            IndexDocument(id="x", text=" ", payload={})
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
@@ -0,0 +1,46 @@
|
||||
import unittest
|
||||
|
||||
from semantic_index.embeddings import OpenAIEmbedder
|
||||
from semantic_index.models import IndexDocument
|
||||
|
||||
|
||||
class FakeOpenAIClient:
    """Embeddings-client stand-in that records requests and returns fixed vectors."""

    def __init__(self):
        self.calls = []  # one dict per create_embeddings call

    def create_embeddings(self, model, inputs, dimensions=None):
        """Record the request and return one 3-element vector per input.

        The vector for the i-th input (1-based, counted within this call) is
        [i, i, i] as floats.
        """
        self.calls.append({"model": model, "inputs": list(inputs), "dimensions": dimensions})
        return [[float(i)] * 3 for i, _ in enumerate(inputs, start=1)]
|
||||
|
||||
|
||||
class OpenAIEmbedderTest(unittest.TestCase):
    """Check OpenAIEmbedder batching and input validation against a fake client."""

    def test_batches_embedding_requests(self):
        client = FakeOpenAIClient()
        embedder = OpenAIEmbedder(client=client, batch_size=2, dimensions=1536)
        docs = [
            IndexDocument(id="a", text="alpha", payload={}),
            IndexDocument(id="b", text="bravo", payload={}),
            IndexDocument(id="c", text="charlie", payload={}),
        ]

        vectors = embedder.embed_documents(docs)

        # The fake numbers vectors per call, so the third doc (second batch) restarts at 1.0.
        self.assertEqual([[1.0, 1.0, 1.0], [2.0, 2.0, 2.0], [1.0, 1.0, 1.0]], vectors)
        self.assertEqual(2, len(client.calls))
        self.assertEqual(["alpha", "bravo"], client.calls[0]["inputs"])
        self.assertEqual("text-embedding-3-small", client.calls[0]["model"])
        self.assertEqual(1536, client.calls[0]["dimensions"])

    def test_rejects_empty_or_oversized_chunks_before_api_call(self):
        client = FakeOpenAIClient()
        embedder = OpenAIEmbedder(client=client, max_chars=5)

        with self.assertRaises(ValueError):
            embedder.embed_texts(["ok", " "])
        with self.assertRaises(ValueError):
            embedder.embed_texts(["toolong"])
        # Validation must fail before anything is sent to the client.
        self.assertEqual([], client.calls)
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
@@ -0,0 +1,394 @@
|
||||
import io
|
||||
import json
|
||||
import unittest
|
||||
from contextlib import redirect_stdout
|
||||
from pathlib import Path
|
||||
|
||||
from semantic_index.__main__ import main
|
||||
from semantic_index.config import Settings
|
||||
from semantic_index.models import SearchResult
|
||||
|
||||
|
||||
class FakeSearchService:
    """Search-service stand-in for the inspect CLI tests; records queries."""

    def __init__(self):
        self.queries = []  # every query object passed to search(), in order

    def search(self, query):
        """Return a canned hit whose shape depends on the query text.

        A query containing "missing@example.test" yields no results. A query
        containing "callum" yields a contact document; anything else yields
        an issue document.
        """
        self.queries.append(query)
        if "missing@example.test" in query.text:
            return []
        return [
            SearchResult(
                id="redmine:contact:1890:issue:39779:chunk:0" if "callum" in query.text else "redmine:issue:39779:chunk:0",
                score=0.58,
                text="Callum Mackeonis callum@safetagtracking.com SafeTag Tracking",
                payload={
                    "source": "redmine",
                    "doc_type": "contact" if "callum" in query.text else "issue",
                    "issue_id": 39779,
                    "project_identifier": "customer-service",
                    "contact_id": 1890,
                    "contact_name": "Callum Mackeonis",
                    "contact_email": "callum@safetagtracking.com",
                    "contact_company": "SafeTag Tracking",
                    "redmine_url": "http://redmine/issues/39779",
                },
            )
        ]

    def get_document(self, document_id):
        """Return a fixed journal document echoing the requested id."""
        return {
            "id": document_id,
            "text": "Full indexed text",
            "payload": {
                "source": "redmine",
                "doc_type": "journal",
                "issue_id": 39778,
                "project_identifier": "customer-service",
                "contact_id": 1890,
                "contact_name": "Callum Mackeonis",
                "contact_email": "callum@safetagtracking.com",
                "redmine_url": "http://redmine/issues/39778",
            },
        }
|
||||
|
||||
|
||||
class FakeStore:
    """Store stand-in for the inspect CLI tests with four canned documents."""

    def __init__(self):
        self.list_limits = []  # the limit passed to each list_documents call

    def count_documents(self, source=None, project_identifier=None, doc_type=None):
        """Return a fixed count; all filters are ignored."""
        return 12

    def list_documents(self, limit=10, source=None, project_identifier=None, doc_type=None):
        """Record the limit and return all four canned documents.

        The list is not truncated to `limit` and the filters are ignored.
        """
        self.list_limits.append(limit)
        return [
            {
                "id": "redmine:issue:39779:chunk:0",
                "text": "Issue #39779: Goods return\nPlease return our goods.",
                "payload": {
                    "source": "redmine",
                    "doc_type": "issue",
                    "issue_id": 39779,
                    "project_identifier": "customer-service",
                    "project_name": "Customer Service",
                    "has_helpdesk_ticket": True,
                    "contact_id": 1890,
                    "contact_name": "Callum Mackeonis",
                    "contact_email": "callum@safetagtracking.com",
                    "contact_company": "SafeTag Tracking",
                    "source_hash": "issue-hash",
                    "redmine_url": "http://redmine/issues/39779",
                },
            },
            {
                "id": "redmine:issue:39779:journal:71570:chunk:0",
                "text": "Hello, we can arrange this today.",
                "payload": {
                    "source": "redmine",
                    "doc_type": "journal",
                    "issue_id": 39779,
                    "project_identifier": "customer-service",
                    "project_name": "Customer Service",
                    "has_helpdesk_ticket": True,
                    "contact_id": 1890,
                    "contact_name": "Callum Mackeonis",
                    "contact_email": "callum@safetagtracking.com",
                    "contact_company": "SafeTag Tracking",
                    "source_hash": "journal-hash",
                    "redmine_url": "http://redmine/issues/39779",
                },
            },
            {
                "id": "redmine:contact:1890:issue:39779:chunk:0",
                "text": "Callum Mackeonis callum@safetagtracking.com SafeTag Tracking",
                "payload": {
                    "source": "redmine",
                    "doc_type": "contact",
                    "issue_id": 39779,
                    "project_identifier": "customer-service",
                    "project_name": "Customer Service",
                    "has_helpdesk_ticket": True,
                    "contact_id": 1890,
                    "contact_name": "Callum Mackeonis",
                    "contact_email": "callum@safetagtracking.com",
                    "contact_company": "SafeTag Tracking",
                    "source_hash": "contact-hash",
                    "redmine_url": "http://redmine/issues/39779",
                },
            },
            {
                "id": "redmine:issue:39800:chunk:0",
                "text": "Ordinary issue with no helpdesk contact.",
                "payload": {
                    "source": "redmine",
                    "doc_type": "issue",
                    "issue_id": 39800,
                    "project_identifier": "hiring",
                    "project_name": "Hiring",
                    "has_helpdesk_ticket": False,
                    "source_hash": "ordinary-hash",
                    "redmine_url": "http://redmine/issues/39800",
                },
            },
        ]
|
||||
|
||||
|
||||
class FakeRedmineSource:
    """Redmine source stand-in yielding one helpdesk issue with a contact."""

    def recent_helpdesk_issues(self, limit):
        """Return the single canned issue, or an empty list when limit is 0."""
        return [
            {
                "id": 39779,
                "subject": "Goods return",
                "description": "Please return our goods.",
                "project": {"id": 1, "identifier": "customer-service"},
                "helpdesk_ticket": {
                    "id": 35159,
                    "contact_id": 1890,
                    "contact": {
                        "id": 1890,
                        "name": "Callum Mackeonis",
                        "email": "callum@safetagtracking.com",
                        "company": "SafeTag Tracking",
                    },
                },
            }
        ][:limit]
|
||||
|
||||
|
||||
def fake_services(store=None, search=None):
    """Build the services dict for the CLI, wired entirely to fakes.

    Args:
        store: optional store to use; a fresh FakeStore when omitted or falsy.
        search: optional search service; a fresh FakeSearchService when
            omitted or falsy.
    """
    settings = Settings(
        openai_api_key="",
        qdrant_url="http://qdrant",
        qdrant_api_key=None,
        qdrant_collection="semantic",
        redmine_url="http://redmine",
        redmine_api_key="",
        redmine_project_identifier="customer-service",
        sample_limit=50,
        bind_host="127.0.0.1",
        bind_port=8787,
        service_api_key=None,
        refresh_state_path=Path(".cache/semantic_index/refresh_state.json"),
    )
    return {
        "settings": settings,
        "search": search or FakeSearchService(),
        "store": store or FakeStore(),
        "redmine_source": FakeRedmineSource(),
        "backfill": FakeBackfillService(),
    }
|
||||
|
||||
|
||||
class FakeBackfillService:
    """Backfill-service stand-in that records calls and fabricates summaries."""

    def __init__(self):
        self.calls = []  # one tuple per call, tagged with the kind of backfill

    def backfill_redmine_sample(self, limit):
        """Record a sample backfill and report `limit` issues and documents."""
        self.calls.append(("sample", limit))
        return {"source": "redmine", "issues": limit, "documents": limit}

    def backfill_redmine_projects(self, projects, per_project_limit):
        """Record a multi-project backfill using one shared per-project limit."""
        self.calls.append(("projects", projects, per_project_limit))
        return {
            "source": "redmine",
            "projects": len(projects),
            "issues": len(projects) * per_project_limit,
            "documents": len(projects) * per_project_limit,
            "project_results": [
                {"project_identifier": project, "issues": per_project_limit, "documents": per_project_limit}
                for project in projects
            ],
        }

    def backfill_redmine_project_limits(self, project_limits):
        """Record a multi-project backfill driven by a {project: limit} mapping."""
        self.calls.append(("project_limits", project_limits))
        return {
            "source": "redmine",
            "projects": len(project_limits),
            "issues": sum(project_limits.values()),
            "documents": sum(project_limits.values()),
            "project_results": [
                {"project_identifier": project, "issues": limit, "documents": limit}
                for project, limit in project_limits.items()
            ],
        }
|
||||
|
||||
|
||||
class InspectCliTest(unittest.TestCase):
    """Drives the CLI entry point ``main`` against in-memory fakes and checks its stdout."""

    def _capture(self, argv, **main_kwargs):
        """Call ``main(argv, **main_kwargs)`` and return everything it printed to stdout."""
        captured = io.StringIO()
        with redirect_stdout(captured):
            main(argv, **main_kwargs)
        return captured.getvalue()

    def run_cli(self, args):
        """Run ``main`` with the default ``fake_services`` builder and return captured stdout."""
        return self._capture(args, service_builder=fake_services)

    def test_no_args_prints_help_without_building_services(self):
        # The builder fails loudly if it is ever invoked.
        def broken_services():
            raise AssertionError("help should not build live services")

        printed = self._capture([], service_builder=broken_services)

        self.assertIn("inspect", printed)

    def test_count_lists_matching_document_count(self):
        argv = ["inspect", "count", "--source", "redmine", "--project", "customer-service"]

        self.assertIn("12", self.run_cli(argv))

    def test_list_shows_snippet_and_metadata_by_default(self):
        argv = ["inspect", "list", "--limit", "5", "--source", "redmine", "--project", "customer-service"]
        printed = self.run_cli(argv)

        self.assertIn("redmine:issue:39779:chunk:0", printed)
        self.assertIn("issue #39779", printed.lower())
        for expected in ("customer-service", "contact=#1890", "Callum Mackeonis", "callum@safetagtracking.com"):
            self.assertIn(expected, printed)
        self.assertNotIn("Full indexed text", printed)

    def test_search_runs_query_and_prints_citation(self):
        argv = ["inspect", "search", "order status", "--limit", "3", "--project", "customer-service"]
        printed = self.run_cli(argv)

        for expected in ("score=0.5800", "http://redmine/issues/39779"):
            self.assertIn(expected, printed)

    def test_show_prints_full_document_text(self):
        printed = self.run_cli(["inspect", "show", "redmine:issue:39778:chunk:0"])

        for expected in ("Full indexed text", "doc_type=journal"):
            self.assertIn(expected, printed)

    def test_preview_redmine_maps_documents_without_writing(self):
        argv = ["inspect", "preview-redmine", "--limit", "1", "--project", "customer-service"]
        printed = self.run_cli(argv)

        for expected in ("redmine:issue:39779:chunk:0", "project=customer-service", "Please return our goods"):
            self.assertIn(expected, printed)

    def test_preview_redmine_uses_minimal_service_builder(self):
        built_for = []

        def minimal_builder(settings):
            built_for.append(settings.redmine_project_identifier)
            return {"settings": settings, "redmine_source": FakeRedmineSource()}

        def full_builder():
            raise AssertionError("full services should not be built")

        def load_settings():
            return fake_services()["settings"]

        printed = self._capture(
            ["inspect", "preview-redmine", "--limit", "1", "--project", "customer-service"],
            service_builder=full_builder,
            preview_service_builder=minimal_builder,
            settings_loader=load_settings,
        )

        self.assertEqual(["customer-service"], built_for)
        self.assertIn("redmine:issue:39779:chunk:0", printed)

    def test_audit_prints_doc_type_counts_contact_coverage_and_attachment_check(self):
        argv = ["inspect", "audit", "--limit", "10", "--source", "redmine", "--project", "customer-service"]
        printed = self.run_cli(argv)

        expected_fragments = (
            "documents=4",
            "doc_type issue=2",
            "doc_type journal=1",
            "doc_type contact=1",
            "contact_metadata 3/4",
            "helpdesk_contact_metadata 3/3",
            "project customer-service=3",
            "project hiring=1",
            "attachments=0",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, printed)
        self.assertNotIn("missing_contact redmine:issue:39800:chunk:0", printed)

    def test_audit_json_returns_machine_readable_summary(self):
        argv = ["inspect", "audit", "--limit", "10", "--project", "customer-service", "--json"]
        summary = json.loads(self.run_cli(argv))

        self.assertEqual(4, summary["total_documents"])
        self.assertEqual(2, summary["doc_type_counts"]["issue"])
        project_counts = summary["project_counts"]
        self.assertEqual(3, project_counts["customer-service"])
        self.assertEqual(1, project_counts["hiring"])
        self.assertEqual([], summary["missing_helpdesk_contact_metadata"])

    def test_compare_redmine_reports_missing_stale_and_contact_mismatches(self):
        argv = ["inspect", "compare-redmine", "--limit", "1", "--project", "customer-service"]
        printed = self.run_cli(argv)

        for fragment in ("preview_documents=2", "indexed_documents=4", "stale", "redmine:issue:39779:chunk:0"):
            self.assertIn(fragment, printed)

    def test_compare_redmine_fetches_a_large_index_window_to_avoid_false_missing_results(self):
        store = FakeStore()

        def build_with_store():
            return fake_services(store=store)

        self._capture(
            ["inspect", "compare-redmine", "--limit", "3", "--project", "customer-service"],
            service_builder=build_with_store,
        )

        # The first store listing must ask for 5000 documents even though --limit is 3.
        self.assertEqual(5000, store.list_limits[0])

    def test_smoke_search_prints_pass_fail_for_known_queries(self):
        argv = [
            "inspect",
            "smoke-search",
            "--project",
            "customer-service",
            "--email",
            "callum@safetagtracking.com",
            "--issue-id",
            "39779",
        ]
        printed = self.run_cli(argv)

        expected_fragments = (
            "PASS email callum@safetagtracking.com",
            "PASS issue 39779",
            "redmine:contact:1890:issue:39779:chunk:0",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, printed)

    def test_smoke_search_uses_issue_id_filter_for_issue_checks(self):
        search = FakeSearchService()

        def build_with_search():
            return fake_services(search=search)

        self._capture(
            ["inspect", "smoke-search", "--project", "customer-service", "--issue-id", "39779"],
            service_builder=build_with_search,
        )

        issue_queries = []
        for recorded in search.queries:
            if recorded.text == "39779":
                issue_queries.append(recorded)
        self.assertEqual(39779, issue_queries[0].issue_id)

    def test_smoke_search_json_returns_check_results(self):
        argv = ["inspect", "smoke-search", "--project", "customer-service", "--email", "missing@example.test", "--json"]
        summary = json.loads(self.run_cli(argv))

        first_check = summary["checks"][0]
        self.assertFalse(first_check["passed"])
        self.assertEqual("email", summary["checks"][0]["kind"])

    def test_backfill_redmine_projects_cli_parses_comma_separated_projects(self):
        backfill = FakeBackfillService()
        services = fake_services()
        services["backfill"] = backfill
        argv = [
            "--backfill-redmine-projects",
            "--projects",
            "customer-service,hiring",
            "--per-project-limit",
            "25",
        ]

        printed = self._capture(argv, service_builder=lambda: services)

        self.assertEqual(("projects", ["customer-service", "hiring"], 25), backfill.calls[0])
        self.assertIn("'projects': 2", printed)

    def test_backfill_redmine_projects_cli_parses_project_specific_limits(self):
        backfill = FakeBackfillService()
        services = fake_services()
        services["backfill"] = backfill
        argv = [
            "--backfill-redmine-projects",
            "--project-limits",
            "customer-service=500,hiring=200",
        ]

        printed = self._capture(argv, service_builder=lambda: services)

        self.assertEqual(("project_limits", {"customer-service": 500, "hiring": 200}), backfill.calls[0])
        self.assertIn("'issues': 700", printed)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -0,0 +1,58 @@
|
||||
import subprocess
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Directory two levels above the directory that holds this test module.
ROOT = Path(__file__).resolve().parents[2]
# Installer script exercised by the tests below, located relative to ROOT.
INSTALLER = ROOT.joinpath("deploy", "semantic-index", "install.sh")
|
||||
|
||||
|
||||
class SemanticIndexInstallerTest(unittest.TestCase):
    """Runs the installer script as a subprocess and checks its exit code and output."""

    def run_installer(self, *args, env=None):
        """Execute the installer with ``args`` from ``ROOT`` and return the completed process.

        stdout and stderr are captured as text. A non-zero exit status does not
        raise, so callers inspect ``returncode`` themselves. ``env`` is passed
        straight to ``subprocess.run``.
        """
        command = [str(INSTALLER)]
        command.extend(args)
        return subprocess.run(
            command,
            cwd=ROOT,
            env=env,
            check=False,
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

    def test_default_mode_is_dry_run(self):
        outcome = self.run_installer()

        self.assertEqual(0, outcome.returncode, outcome.stderr)
        expected_fragments = (
            "mode=dry-run",
            "would run: sudo mkdir -p /opt/semantic-index",
            "would run: sudo rsync",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, outcome.stdout)
        self.assertNotIn("Semantic Index installed, but deployment is not complete.", outcome.stdout)

    def test_apply_prints_manual_next_step_warning(self):
        # Every installer destination is redirected into a throwaway directory.
        locations = {
            "SEMANTIC_INDEX_INSTALL_DIR": ("opt", "semantic-index"),
            "SEMANTIC_INDEX_ENV_FILE": ("etc", "semantic-index.env"),
            "SEMANTIC_INDEX_STATE_DIR": ("var", "lib", "semantic-index"),
            "SEMANTIC_INDEX_LOG_DIR": ("var", "log", "semantic-index"),
            "SEMANTIC_INDEX_SYSTEMD_DIR": ("etc", "systemd", "system"),
        }
        with tempfile.TemporaryDirectory() as tmp:
            sandbox = Path(tmp)
            env = {"PATH": "/usr/bin:/bin"}
            for variable, parts in locations.items():
                env[variable] = str(sandbox.joinpath(*parts))
            outcome = self.run_installer("--apply", "--no-system", "--skip-deps", env=env)

        self.assertEqual(0, outcome.returncode, outcome.stderr)
        expected_fragments = (
            "Semantic Index installed, but deployment is not complete.",
            "The refresh timer was NOT enabled automatically.",
            "Do not use --force-rebuild",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, outcome.stdout)

    def test_invalid_argument_fails_with_usage(self):
        outcome = self.run_installer("--force-rebuild")

        self.assertEqual(2, outcome.returncode)
        self.assertIn("Usage:", outcome.stderr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -0,0 +1,187 @@
|
||||
import unittest
|
||||
|
||||
from semantic_index.models import IndexDocument
|
||||
from semantic_index.qdrant_store import QdrantStore
|
||||
|
||||
|
||||
class FakeMatchValue:
    """Test double that simply remembers the ``value`` it was built with."""

    def __init__(self, value):
        # Kept verbatim so tests can read back what the store asked to match.
        setattr(self, "value", value)
|
||||
|
||||
|
||||
class FakeFieldCondition:
    """Test double holding a payload ``key`` plus optional ``match`` and ``range`` criteria."""

    def __init__(self, key, match=None, range=None):
        # Stored untouched; tests inspect these attributes directly.
        self.key, self.match, self.range = key, match, range
|
||||
|
||||
|
||||
class FakeFilter:
    """Test double exposing the ``must`` conditions it was constructed with."""

    def __init__(self, must):
        # The same object is kept (no copy), so identity is preserved for assertions.
        setattr(self, "must", must)
|
||||
|
||||
|
||||
class FakeFilterSelector:
    """Test double wrapping a ``filter`` object and exposing it as an attribute."""

    def __init__(self, filter):
        # Stored as-is so tests can reach the wrapped filter through ``.filter``.
        setattr(self, "filter", filter)
|
||||
|
||||
|
||||
class FakePointIdsList:
    """Test double carrying the ``points`` it was constructed with."""

    def __init__(self, points):
        # No copy is made; the caller's sequence is exposed directly.
        setattr(self, "points", points)
|
||||
|
||||
|
||||
class FakeQModels:
    """Namespace of fake model classes, assigned to ``store.qmodels`` by the tests below."""

    class PointStruct:
        """Fake point carrying an ``id``, a ``vector`` and a ``payload``."""

        def __init__(self, id, vector, payload):
            self.id, self.vector, self.payload = id, vector, payload

    # Aliases for the module-level fakes, exposed under un-prefixed names.
    PointIdsList = FakePointIdsList
    FilterSelector = FakeFilterSelector
    Filter = FakeFilter
    FieldCondition = FakeFieldCondition
    MatchValue = FakeMatchValue
|
||||
|
||||
|
||||
class FakeCountResult:
    """Canned count result whose ``count`` is always 7."""

    # Class-level constant; instances read the same value.
    count = 7
|
||||
|
||||
|
||||
class FakeRecord:
    """Canned record with a fixed point id and a small payload."""

    def __init__(self):
        self.id = "point-id"
        # A fresh payload dict per instance so tests may overwrite it independently.
        self.payload = dict(
            document_id="redmine:issue:1:chunk:0",
            text="Indexed text",
            source="redmine",
            project_identifier="customer-service",
        )
|
||||
|
||||
|
||||
class FakeClient:
    """Client stand-in that records the arguments of the last count/scroll/delete and every upsert."""

    def __init__(self):
        self.upsert_batches = []
        # Filled in by the corresponding method calls; None until then.
        self.count_filter = self.scroll_filter = None
        self.delete_filter = self.delete_selector = None

    def get_collections(self):
        """Return an object whose ``collections`` holds one entry named ``"semantic"``."""
        collection_cls = type("Collection", (), {"name": "semantic"})
        listing_cls = type("Collections", (), {"collections": [collection_cls()]})
        return listing_cls()

    def count(self, collection_name, count_filter, exact):
        """Remember ``count_filter`` and answer with a ``FakeCountResult``."""
        self.count_filter = count_filter
        return FakeCountResult()

    def scroll(self, collection_name, scroll_filter, limit, with_payload, with_vectors, offset=None):
        """Remember ``scroll_filter`` and return a single-record page with no next offset."""
        self.scroll_filter = scroll_filter
        page = [FakeRecord()]
        return page, None

    def delete(self, collection_name, points_selector):
        """Remember the selector and, when it has one, its ``filter`` attribute."""
        self.delete_selector = points_selector
        # Selectors without a ``filter`` attribute leave delete_filter as None.
        self.delete_filter = getattr(points_selector, "filter", None)

    def upsert(self, collection_name, points):
        """Append ``points`` as one batch to ``upsert_batches``."""
        self.upsert_batches.append(points)
|
||||
|
||||
|
||||
class QdrantStoreReadTest(unittest.TestCase):
    """Checks how ``QdrantStore`` talks to its client, using fakes in place of the real client and models."""

    def make_store(self):
        """Build a ``QdrantStore`` without running ``__init__`` and wire it to fakes."""
        store = object.__new__(QdrantStore)
        wiring = {
            "client": FakeClient(),
            "collection": "semantic",
            "vector_size": 1536,
            "qmodels": FakeQModels,
            "upsert_batch_size": 2,
        }
        for attribute, value in wiring.items():
            setattr(store, attribute, value)
        return store

    def test_count_documents_builds_metadata_filter(self):
        store = self.make_store()

        total = store.count_documents(source="redmine", project_identifier="customer-service", doc_type="issue")

        self.assertEqual(7, total)
        must = store.client.count_filter.must
        keys = [condition.key for condition in must]
        self.assertEqual(["source", "project_identifier", "doc_type"], keys)
        self.assertEqual("customer-service", must[1].match.value)

    def test_list_documents_strips_internal_payload_fields(self):
        store = self.make_store()

        listed = store.list_documents(limit=5, source="redmine", project_identifier="customer-service")

        self.assertEqual("redmine:issue:1:chunk:0", listed[0]["id"])
        self.assertEqual("Indexed text", listed[0]["text"])
        for internal_key in ("document_id", "text"):
            self.assertNotIn(internal_key, listed[0]["payload"])

    def test_delete_by_source_can_be_limited_to_project_scope(self):
        store = self.make_store()

        store.delete_by_source("redmine", project_identifier="customer-service")

        must = store.client.delete_filter.must
        keys = [condition.key for condition in must]
        self.assertEqual(["source", "project_identifier"], keys)
        self.assertEqual("redmine", must[0].match.value)
        self.assertEqual("customer-service", must[1].match.value)

    def test_list_documents_can_be_limited_to_issue_scope(self):
        store = self.make_store()

        store.list_documents(limit=5, source="redmine", project_identifier="customer-service", issue_id=39779)

        must = store.client.scroll_filter.must
        keys = [condition.key for condition in must]
        self.assertEqual(["source", "project_identifier", "issue_id"], keys)
        self.assertEqual(39779, must[2].match.value)

    def test_delete_documents_deletes_stable_document_point_ids(self):
        store = self.make_store()
        document_id = "redmine:issue:39779:chunk:0"

        store.delete_documents([document_id])

        deleted_points = store.client.delete_selector.points
        self.assertEqual(1, len(deleted_points))
        # The point id sent to the client must differ from the raw document id.
        self.assertNotEqual("redmine:issue:39779:chunk:0", store.client.delete_selector.points[0])

    def test_upsert_sends_points_in_batches(self):
        store = self.make_store()
        documents = []
        for issue_id in range(5):
            documents.append(
                IndexDocument(
                    id=f"redmine:issue:{issue_id}:chunk:0",
                    text=f"Issue {issue_id}",
                    payload={"source": "redmine"},
                )
            )
        vectors = [[0.1, 0.2, 0.3] for _ in documents]

        store.upsert(documents, vectors)

        # Five documents with a batch size of two give batches of 2, 2 and 1.
        batch_sizes = [len(batch) for batch in store.client.upsert_batches]
        self.assertEqual([2, 2, 1], batch_sizes)
        self.assertEqual("Issue 0", store.client.upsert_batches[0][0].payload["text"])

    def test_list_documents_paginates_qdrant_scroll_until_requested_limit(self):
        class PagedClient(FakeClient):
            """Returns two records per page; only the first page advertises a next offset."""

            def __init__(self):
                super().__init__()
                self.offsets = []

            def scroll(self, collection_name, scroll_filter, limit, with_payload, with_vectors, offset=None):
                self.offsets.append(offset)
                page_number = len(self.offsets)
                page = []
                for suffix in ("a", "b"):
                    record = FakeRecord()
                    record.payload = {**record.payload, "document_id": f"doc:{page_number}{suffix}"}
                    page.append(record)
                next_offset = "next" if offset is None else None
                return page, next_offset

        store = self.make_store()
        store.client = PagedClient()

        listed = store.list_documents(limit=3, source="redmine")

        listed_ids = [document["id"] for document in listed]
        self.assertEqual(["doc:1a", "doc:1b", "doc:2a"], listed_ids)
        self.assertEqual([None, "next"], store.client.offsets)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -0,0 +1,102 @@
|
||||
import unittest
|
||||
|
||||
from semantic_index.redmine import RedmineApiSource
|
||||
|
||||
|
||||
class RecordingRedmineSource(RedmineApiSource):
    """Redmine source whose ``_get_json`` records each URL and answers with canned data."""

    def __init__(self):
        super().__init__(
            redmine_url="http://redmine.local",
            api_key="secret",
            project_identifier="customer-service",
        )
        # Every URL passed to _get_json, in call order.
        self.urls = []

    def _get_json(self, url):
        """Record ``url`` and return a listing payload for issue-list URLs, else a single issue."""
        self.urls.append(url)
        is_listing = url.startswith("http://redmine.local/issues.json")
        if not is_listing:
            return {"issue": {"id": 39779, "subject": "Goods return"}}
        return {"issues": [{"id": 39779}]}
|
||||
|
||||
|
||||
class PagedRedmineSource(RedmineApiSource):
    """Redmine source that synthesizes issue pages from the ``offset``/``limit`` query parameters."""

    def __init__(self):
        super().__init__(
            redmine_url="http://redmine.local",
            api_key="secret",
            project_identifier="customer-service",
        )
        # Every URL passed to _get_json, in call order.
        self.urls = []

    def _get_json(self, url):
        """Record ``url`` and fabricate either a page of issue ids or a single issue detail."""
        self.urls.append(url)

        if not url.startswith("http://redmine.local/issues.json"):
            # Detail URL: the id sits between "/issues/" and the first "." after it.
            tail = url.split("/issues/", 1)[1]
            issue_id = int(tail.split(".", 1)[0])
            return {"issue": {"id": issue_id, "subject": f"Issue {issue_id}"}}

        # Listing URL: parse the raw query string by hand (no URL decoding).
        params = {}
        for pair in url.split("?", 1)[1].split("&"):
            name, value = pair.split("=", 1)
            params[name] = value
        offset = int(params.get("offset", "0"))
        limit = int(params.get("limit", "0"))

        # Ids are 1-based and contiguous: offset+1 .. offset+limit.
        page = []
        for issue_id in range(offset + 1, offset + limit + 1):
            page.append({"id": issue_id})
        return {"issues": page}
|
||||
|
||||
|
||||
class DuplicatePagedRedmineSource(RedmineApiSource):
    """Redmine source whose second listing page repeats an issue id from the first page."""

    def __init__(self):
        super().__init__(
            redmine_url="http://redmine.local",
            api_key="secret",
            project_identifier="customer-service",
        )

    def _get_json(self, url):
        """Serve two overlapping listing pages (ids 1,2 then 2,3) or a fabricated issue detail."""
        if not url.startswith("http://redmine.local/issues.json"):
            tail = url.split("/issues/", 1)[1]
            issue_id = int(tail.split(".", 1)[0])
            return {"issue": {"id": issue_id, "subject": f"Issue {issue_id}"}}

        params = {}
        for pair in url.split("?", 1)[1].split("&"):
            name, value = pair.split("=", 1)
            params[name] = value
        offset = int(params.get("offset", "0"))

        # Issue 2 appears on both pages; any other offset yields an empty page.
        pages = {
            0: [{"id": 1}, {"id": 2}],
            2: [{"id": 2}, {"id": 3}],
        }
        return {"issues": pages.get(offset, [])}
|
||||
|
||||
|
||||
class RedmineApiSourceTest(unittest.TestCase):
    """Checks which URLs ``RedmineApiSource`` requests, via subclasses that stub ``_get_json``."""

    LISTING_PREFIX = "http://redmine.local/issues.json"

    def test_recent_issue_summaries_do_not_fetch_issue_details(self):
        source = RecordingRedmineSource()

        summaries = list(source.recent_issue_summaries(limit=1))

        self.assertEqual(39779, summaries[0]["id"])
        # Exactly one request, and it is the listing request.
        self.assertEqual(1, len(source.urls))
        self.assertTrue(source.urls[0].startswith(self.LISTING_PREFIX))

    def test_issue_detail_fetches_journals_and_helpdesk(self):
        source = RecordingRedmineSource()

        detail = source.issue_detail(39779)

        self.assertEqual(39779, detail["id"])
        first_url = source.urls[0]
        self.assertIn("include=journals%2Chelpdesk", first_url)

    def test_recent_helpdesk_issues_requests_helpdesk_include_with_journals(self):
        source = RecordingRedmineSource()

        fetched = list(source.recent_helpdesk_issues(limit=1))

        self.assertEqual(39779, fetched[0]["id"])
        self.assertIn("include=journals%2Chelpdesk", source.urls[1])
        self.assertIn("subproject_id=%21%2A", source.urls[0])

    def test_recent_helpdesk_issues_paginates_past_redmine_page_limit(self):
        source = PagedRedmineSource()

        fetched = list(source.recent_helpdesk_issues(limit=250))

        self.assertEqual(250, len(fetched))
        list_urls = []
        for url in source.urls:
            if url.startswith(self.LISTING_PREFIX):
                list_urls.append(url)
        self.assertEqual(3, len(list_urls))
        expectations = (
            (0, "limit=100"),
            (0, "offset=0"),
            (1, "offset=100"),
            (2, "offset=200"),
        )
        for page_index, fragment in expectations:
            self.assertIn(fragment, list_urls[page_index])

    def test_recent_helpdesk_issues_skips_duplicate_issue_ids_across_pages(self):
        source = DuplicatePagedRedmineSource()

        fetched = list(source.recent_helpdesk_issues(limit=3))

        fetched_ids = [entry["id"] for entry in fetched]
        self.assertEqual([1, 2, 3], fetched_ids)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -0,0 +1,277 @@
|
||||
import io
|
||||
import json
|
||||
import tempfile
|
||||
import unittest
|
||||
from contextlib import redirect_stdout
|
||||
from pathlib import Path
|
||||
|
||||
from semantic_index.__main__ import main
|
||||
from semantic_index.models import IndexDocument
|
||||
from semantic_index.refresh import FileRefreshState, RedmineRefreshService
|
||||
|
||||
|
||||
def issue(updated_on="2026-04-25T12:00:00Z"):
    """Return a fresh issue dict for issue 39779 with the given ``updated_on`` timestamp."""
    project = dict(id=1, identifier="customer-service", name="Customer Service")
    record = {"id": 39779}
    record["subject"] = "Goods return"
    record["description"] = "Please return our goods."
    record["updated_on"] = updated_on
    record["project"] = project
    return record
|
||||
|
||||
|
||||
class FakeRedmineSource:
    """In-memory Redmine source that serves a fixed list of issues and logs each request."""

    # Class-level default; ``recent_helpdesk_issues`` logs whatever value the
    # attribute holds at call time.
    project_identifier = None

    def __init__(self, issues=None):
        """Store ``issues``; only an omitted (``None``) argument falls back to one default issue.

        An explicitly passed empty list is kept as an empty list. A truthiness
        check here would silently swap it for the default issue, so a caller
        asking for "no issues" would still get one.
        """
        self.issues = [issue()] if issues is None else issues
        # One ``(project_identifier, limit)`` tuple per recent_helpdesk_issues call.
        self.calls = []

    def recent_helpdesk_issues(self, limit):
        """Log the request and return at most ``limit`` of the stored issues."""
        self.calls.append((self.project_identifier, limit))
        return self.issues[:limit]
|
||||
|
||||
|
||||
class SummaryDetailRedmineSource(FakeRedmineSource):
    """Fake source that serves issue summaries and per-issue details separately, logging both."""

    def __init__(self, summaries, details):
        # The base initializer is handed an empty issue list.
        super().__init__([])
        self.summaries, self.details = summaries, details
        self.summary_calls, self.detail_calls = [], []

    def recent_issue_summaries(self, limit):
        """Log ``(project_identifier, limit)`` and return at most ``limit`` summaries."""
        request = (self.project_identifier, limit)
        self.summary_calls.append(request)
        return self.summaries[:limit]

    def issue_detail(self, issue_id):
        """Log ``issue_id`` and return its detail; an unknown id raises ``KeyError`` for a dict."""
        self.detail_calls.append(issue_id)
        detail = self.details[issue_id]
        return detail
|
||||
|
||||
|
||||
class RecordingEmbedder:
    """Embedder stand-in that logs each batch and returns one fixed vector per document."""

    def __init__(self):
        # One list per embed_documents call, holding a copy of the documents passed in.
        self.calls = []

    def embed_documents(self, docs):
        """Record a list copy of ``docs`` and return ``[0.1, 0.2, 0.3]`` for each item in ``docs``."""
        snapshot = list(docs)
        self.calls.append(snapshot)
        # Iterates ``docs`` itself (not the snapshot), exactly as the recording step left it.
        vectors = []
        for _ in docs:
            vectors.append([0.1, 0.2, 0.3])
        return vectors
|
||||
|
||||
|
||||
class RefreshStore:
    """In-memory store stand-in: serves ``existing`` documents and logs upserts and deletions."""

    def __init__(self, existing=None):
        # A falsy argument (None or an empty mapping) yields a fresh empty dict.
        self.existing = existing if existing else {}
        self.upserts, self.deleted_ids = [], []

    def list_documents(self, limit=10, source=None, project_identifier=None, doc_type=None, issue_id=None):
        """Return up to ``limit`` stored documents; the filter arguments are accepted but ignored."""
        stored = list(self.existing.values())
        return stored[:limit]

    def upsert(self, docs, vectors):
        """Log one ``(docs, vectors)`` pair, each materialized as a list."""
        entry = (list(docs), list(vectors))
        self.upserts.append(entry)

    def delete_documents(self, document_ids):
        """Append every id in ``document_ids`` to ``deleted_ids``."""
        self.deleted_ids.extend(document_ids)
|
||||
|
||||
|
||||
class RedmineRefreshServiceTest(unittest.TestCase):
    """Exercises ``RedmineRefreshService`` and ``FileRefreshState`` with in-memory fakes."""

    @staticmethod
    def _make_service(source, embedder, **extra):
        """Build a service around a fresh ``RefreshStore``; ``extra`` is forwarded (e.g. ``state``)."""
        return RedmineRefreshService(source=source, embedder=embedder, store=RefreshStore(), **extra)

    @staticmethod
    def _indexed_copy(document):
        """Shape ``document`` like a store listing entry, with a copied payload."""
        return {"id": document.id, "text": document.text, "payload": dict(document.payload)}

    def test_refresh_skips_embeddings_when_source_hash_matches_existing_document(self):
        embedder = RecordingEmbedder()
        service = self._make_service(FakeRedmineSource(), embedder)
        candidate = service.mapper.issue_to_documents(issue())[0]
        service.store.existing[candidate.id] = self._indexed_copy(candidate)

        result = service.refresh_redmine_project_limits({"customer-service": 1})

        self.assertEqual(1, result["unchanged_documents"])
        self.assertEqual(0, result["embedded_documents"])
        self.assertEqual([], embedder.calls)
        self.assertEqual([], service.store.upserts)

    def test_refresh_embeds_only_changed_and_new_documents(self):
        embedder = RecordingEmbedder()
        service = self._make_service(FakeRedmineSource(), embedder)
        candidate = service.mapper.issue_to_documents(issue())[0]
        # Same id as the candidate, but different text and a different source hash.
        outdated_payload = {**candidate.payload, "source_hash": "old-hash"}
        service.store.existing[candidate.id] = {
            "id": candidate.id,
            "text": "Old text",
            "payload": outdated_payload,
        }

        result = service.refresh_redmine_project_limits({"customer-service": 1})

        self.assertEqual(1, result["changed_documents"])
        self.assertEqual(1, result["embedded_documents"])
        self.assertEqual([[candidate]], embedder.calls)
        upserted_docs = service.store.upserts[0][0]
        self.assertEqual([candidate.id], [doc.id for doc in upserted_docs])

    def test_refresh_deletes_stale_issue_documents_without_embedding(self):
        embedder = RecordingEmbedder()
        service = self._make_service(FakeRedmineSource(), embedder)
        candidate = service.mapper.issue_to_documents(issue())[0]
        service.store.existing[candidate.id] = self._indexed_copy(candidate)
        stale_id = "redmine:issue:39779:journal:1:chunk:0"
        service.store.existing[stale_id] = {
            "id": "redmine:issue:39779:journal:1:chunk:0",
            "text": "Deleted note",
            "payload": {"source_hash": "gone", "issue_id": 39779},
        }

        result = service.refresh_redmine_project_limits({"customer-service": 1})

        self.assertEqual(1, result["stale_documents"])
        self.assertEqual(["redmine:issue:39779:journal:1:chunk:0"], service.store.deleted_ids)
        self.assertEqual([], embedder.calls)

    def test_dry_run_reports_planned_embeddings_without_embedding_or_mutating(self):
        embedder = RecordingEmbedder()
        service = self._make_service(FakeRedmineSource(), embedder)

        result = service.refresh_redmine_project_limits({"customer-service": 1}, dry_run=True)

        self.assertEqual(1, result["new_documents"])
        self.assertEqual(1, result["would_embed_documents"])
        self.assertEqual(0, result["embedded_documents"])
        # Nothing was embedded, written or deleted.
        self.assertEqual([], embedder.calls)
        self.assertEqual([], service.store.upserts)
        self.assertEqual([], service.store.deleted_ids)

    def test_force_rebuild_embeds_unchanged_documents(self):
        embedder = RecordingEmbedder()
        service = self._make_service(FakeRedmineSource(), embedder)
        candidate = service.mapper.issue_to_documents(issue())[0]
        service.store.existing[candidate.id] = self._indexed_copy(candidate)

        result = service.refresh_redmine_project_limits({"customer-service": 1}, force_rebuild=True)

        self.assertEqual(1, result["force_rebuilt_documents"])
        self.assertEqual(1, result["embedded_documents"])
        self.assertEqual([[candidate]], embedder.calls)

    def test_force_rebuild_ignores_refresh_state_window_for_fetched_candidates(self):
        # The issue was updated two hours before the recorded successful refresh.
        source = FakeRedmineSource([issue(updated_on="2026-04-25T10:00:00Z")])
        embedder = RecordingEmbedder()
        with tempfile.TemporaryDirectory() as tmp:
            state = FileRefreshState(Path(tmp) / "refresh.json")
            state.mark_success("customer-service", "2026-04-25T12:00:00Z")
            service = self._make_service(source, embedder, state=state)

            result = service.refresh_redmine_project_limits(
                {"customer-service": 1},
                force_rebuild=True,
                overlap_minutes=15,
            )

        self.assertEqual(0, result["skipped_issues"])
        self.assertEqual(1, result["embedded_documents"])

    def test_file_refresh_state_updates_only_when_called(self):
        with tempfile.TemporaryDirectory() as tmp:
            state_file = Path(tmp) / "refresh.json"
            state = FileRefreshState(state_file)
            self.assertEqual({}, state.load())

            state.mark_success("customer-service", "2026-04-25T12:00:00Z")

            expected = {"projects": {"customer-service": {"last_successful_refresh_at": "2026-04-25T12:00:00Z"}}}
            written = json.loads((Path(tmp) / "refresh.json").read_text(encoding="utf-8"))
            self.assertEqual(expected, written)

    def test_refresh_state_skips_issues_older_than_overlap_window(self):
        source = FakeRedmineSource([issue(updated_on="2026-04-25T10:00:00Z")])
        embedder = RecordingEmbedder()
        with tempfile.TemporaryDirectory() as tmp:
            state = FileRefreshState(Path(tmp) / "refresh.json")
            state.mark_success("customer-service", "2026-04-25T12:00:00Z")
            service = self._make_service(source, embedder, state=state)

            result = service.refresh_redmine_project_limits(
                {"customer-service": 1},
                dry_run=True,
                overlap_minutes=15,
            )

        self.assertEqual(1, result["issues"])
        self.assertEqual(1, result["skipped_issues"])
        self.assertEqual(0, result["documents"])
        self.assertEqual([], embedder.calls)

    def test_refresh_skips_old_summaries_without_fetching_issue_detail(self):
        old_summary = {"id": 39779, "updated_on": "2026-04-25T10:00:00Z"}
        new_summary = {"id": 39780, "updated_on": "2026-04-25T11:50:00Z"}
        # Only the newer issue has a detail entry, so fetching the old one would raise KeyError.
        new_detail = {**issue("2026-04-25T11:50:00Z"), "id": 39780}
        source = SummaryDetailRedmineSource(
            summaries=[old_summary, new_summary],
            details={39780: new_detail},
        )
        embedder = RecordingEmbedder()
        with tempfile.TemporaryDirectory() as tmp:
            state = FileRefreshState(Path(tmp) / "refresh.json")
            state.mark_success("customer-service", "2026-04-25T12:00:00Z")
            service = self._make_service(source, embedder, state=state)

            result = service.refresh_redmine_project_limits(
                {"customer-service": 2},
                dry_run=True,
                overlap_minutes=15,
            )

        self.assertEqual(2, result["scanned_issues"])
        self.assertEqual(1, result["skipped_issues"])
        self.assertEqual(1, result["detail_fetched_issues"])
        self.assertEqual([39780], source.detail_calls)
|
||||
|
||||
|
||||
class RefreshCliTest(unittest.TestCase):
    """Checks how ``main`` parses the ``--refresh-redmine-projects`` flags, using a fake refresh service."""

    @staticmethod
    def _capture(argv, services):
        """Run ``main(argv)`` with a builder returning ``services``; return captured stdout."""
        captured = io.StringIO()
        with redirect_stdout(captured):
            main(argv, service_builder=lambda: services)
        return captured.getvalue()

    def test_refresh_redmine_projects_cli_parses_project_limits_and_dry_run(self):
        class FakeRefresh:
            """Logs the arguments of each refresh call and returns a small summary."""

            def __init__(self):
                self.calls = []

            def refresh_redmine_project_limits(self, project_limits, dry_run=False, force_rebuild=False, overlap_minutes=15):
                received = (project_limits, dry_run, force_rebuild, overlap_minutes)
                self.calls.append(received)
                total = sum(project_limits.values())
                return {"source": "redmine", "projects": len(project_limits), "issues": total}

        refresh = FakeRefresh()
        argv = [
            "--refresh-redmine-projects",
            "--project-limits",
            "customer-service=5,hiring=2",
            "--dry-run",
            "--overlap-minutes",
            "30",
        ]

        printed = self._capture(argv, {"refresh": refresh})

        expected_call = ({"customer-service": 5, "hiring": 2}, True, False, 30)
        self.assertEqual(expected_call, refresh.calls[0])
        self.assertIn("'projects': 2", printed)

    def test_refresh_redmine_projects_cli_can_override_state_path(self):
        class FakeRefresh:
            """Reports the path of whatever object ends up in its ``state`` attribute."""

            def __init__(self):
                # Starts as None; the test passes only if it is replaced before the refresh call.
                self.state = None

            def refresh_redmine_project_limits(self, project_limits, dry_run=False, force_rebuild=False, overlap_minutes=15):
                reported = str(self.state.path)
                return {"state_path": reported}

        refresh = FakeRefresh()
        argv = [
            "--refresh-redmine-projects",
            "--project-limits",
            "customer-service=1",
            "--state-path",
            "/tmp/semantic-refresh-state.json",
        ]

        printed = self._capture(argv, {"refresh": refresh})

        self.assertIn("/tmp/semantic-refresh-state.json", printed)
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
@@ -0,0 +1,85 @@
|
||||
import unittest
|
||||
|
||||
from semantic_index.models import IndexDocument, SearchQuery, SearchResult
|
||||
from semantic_index.qdrant_store import build_filter, point_id_for_document
|
||||
from semantic_index.search import HybridSearchService, keyword_boost
|
||||
|
||||
|
||||
class FakeEmbedder:
    """Embedder stub that returns the same fixed vector for any query text."""

    def embed_query(self, text):
        """Return a constant three-element vector; ``text`` is ignored."""
        return [0.1, 0.2, 0.3]
|
||||
|
||||
|
||||
class FakeStore:
    """Vector-store stub that returns two canned results and records the query.

    The canned list is ordered with the higher-scored ``"weak"`` hit first and
    the lower-scored ``"strong"`` hit (whose text contains an e-mail address and
    an order number) second.
    """

    def __init__(self):
        # Last query object passed to ``search``; None until the first call.
        self.query = None

    def search(self, vector, query, limit):
        """Remember ``query`` and return at most ``limit`` canned results.

        ``vector`` is accepted for interface compatibility and ignored.
        """
        self.query = query
        canned = [
            SearchResult(
                id="weak",
                score=0.7,
                text="general support text",
                payload={"redmine_url": "http://redmine/issues/1"},
            ),
            SearchResult(
                id="strong",
                score=0.6,
                text="Customer ada@example.com asked about ORD-12345",
                payload={"redmine_url": "http://redmine/issues/2"},
            ),
        ]
        return canned[:limit]
|
||||
|
||||
|
||||
class SearchTest(unittest.TestCase):
    """Tests for point-id derivation, filter building, and keyword boosting."""

    def test_qdrant_point_id_is_deterministic_uuid_for_stable_document_id(self):
        """The same document id always yields the same UUID-formatted point id."""
        import uuid

        first = point_id_for_document("redmine:issue:42:journal:5:chunk:0")
        second = point_id_for_document("redmine:issue:42:journal:5:chunk:0")

        self.assertEqual(first, second)
        self.assertRegex(first, r"^[0-9a-f-]{36}$")
        # The regex alone also accepts non-UUID strings (e.g. 36 hyphens);
        # round-tripping through uuid.UUID pins the canonical UUID form.
        self.assertEqual(first, str(uuid.UUID(first)))

    def test_filter_maps_supported_metadata(self):
        """Every populated ``SearchQuery`` field becomes one ``must`` condition."""
        query = SearchQuery(
            text="printer",
            source="redmine",
            project_identifier="fud-helpdesk",
            doc_type="message",
            issue_id=42,
            contact_email="ada@example.com",
            date_from="2026-04-01T00:00:00Z",
            date_to="2026-04-30T23:59:59Z",
        )

        qfilter = build_filter(query)

        # date_from/date_to are expected to merge into one range on created_on.
        expected_must = [
            {"key": "source", "match": {"value": "redmine"}},
            {"key": "project_identifier", "match": {"value": "fud-helpdesk"}},
            {"key": "doc_type", "match": {"value": "message"}},
            {"key": "issue_id", "match": {"value": 42}},
            {"key": "contact_email", "match": {"value": "ada@example.com"}},
            {"key": "created_on", "range": {"gte": "2026-04-01T00:00:00Z", "lte": "2026-04-30T23:59:59Z"}},
        ]
        self.assertEqual(expected_must, qfilter["must"])

    def test_keyword_boost_prioritizes_exact_email_and_order_matches(self):
        """Exact e-mail / order-number hits outrank a higher raw vector score."""
        weak = SearchResult(id="weak", score=0.7, text="general support text", payload={})
        strong = SearchResult(id="strong", score=0.6, text="Customer ada@example.com asked about ORD-12345", payload={})

        self.assertGreater(
            keyword_boost('ada@example.com "ORD-12345"', strong),
            keyword_boost('ada@example.com "ORD-12345"', weak),
        )

        # End to end: the store returns "weak" first (score 0.7 vs 0.6), so
        # "strong" being on top shows the service re-ranked the results.
        service = HybridSearchService(embedder=FakeEmbedder(), store=FakeStore())
        results = service.search(SearchQuery(text='ada@example.com "ORD-12345"', limit=2))
        self.assertEqual("strong", results[0].id)
        self.assertEqual("http://redmine/issues/2", results[0].citation["url"])
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
@@ -0,0 +1,41 @@
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Directory two levels above the one containing this test file.
ROOT = Path(__file__).resolve().parents[2]
# Shell wrapper under test, located relative to ROOT.
REFRESH = ROOT / "semantic_index" / "refresh.sh"
|
||||
|
||||
|
||||
class SemanticIndexShellWrapperTest(unittest.TestCase):
    """Runs the ``refresh.sh`` wrapper as a subprocess and inspects its output."""

    def test_refresh_wrapper_is_self_locating_when_called_from_another_directory(self):
        """The wrapper succeeds when the working directory is an unrelated temp dir."""
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            env = {
                **os.environ,
                # /bin/echo stands in for the interpreter; presumably the
                # wrapper runs "$PYTHON" with its arguments, so they are
                # printed to stdout instead of executed.
                "PYTHON": "/bin/echo",
                "SEMANTIC_INDEX_PROJECT_LIMITS": "customer-service=5",
                # Log and state locations are redirected into the temp dir.
                "SEMANTIC_INDEX_LOG_DIR": str(tmp_path / "logs"),
                "SEMANTIC_INDEX_STATE_PATH": str(tmp_path / "state" / "refresh_state.json"),
            }

            result = subprocess.run(
                [str(REFRESH)],
                cwd=tmp,
                env=env,
                text=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
                # Fail with TimeoutExpired instead of hanging the whole suite
                # if the wrapper ever blocks.
                timeout=30,
            )

        # stderr is the failure message so a broken wrapper is easy to diagnose.
        self.assertEqual(0, result.returncode, result.stderr)
        self.assertIn("-m semantic_index --refresh-redmine-projects", result.stdout)
        self.assertIn("--project-limits customer-service=5", result.stdout)
        self.assertIn("log_file=", result.stdout)
|
||||
|
||||
|
||||
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user