Add semantic-index service, deployment assets, and tests

This commit is contained in:
Jason Thistlethwaite
2026-05-04 09:50:03 -04:00
parent faad70872b
commit b305544f63
42 changed files with 5059 additions and 0 deletions
+183
View File
@@ -0,0 +1,183 @@
#!/usr/bin/env bash
set -euo pipefail
# Print the command-line help text to stderr.
# The heredoc delimiter is quoted ('EOF'), so the text below is emitted
# literally with no variable or command expansion.
usage() {
cat >&2 <<'EOF'
Usage:
deploy/semantic-index/install.sh [--dry-run] [--apply] [--start] [--no-system] [--skip-deps]
Modes:
--dry-run Print commands that would run. This is the default.
--apply Install files, venv, dependencies, env template, and systemd units.
--start With --apply, reload systemd and start only semantic-index.service.
--no-system Skip sudo/systemd operations. Useful for tests and local validation.
--skip-deps Skip venv creation and dependency install.
The installer never runs backfill, never enables the refresh timer, and never
passes --force-rebuild.
EOF
}
# Option defaults: dry-run, do not start the service, use sudo/systemd, and
# install dependencies.
mode=dry-run
start_service=0
system_ops=1
skip_deps=0

# Consume flags one at a time. --help and unknown arguments exit immediately,
# so the shared `shift` after the case only runs for recognised flags.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --dry-run)   mode=dry-run ;;
        --apply)     mode=apply ;;
        --start)     start_service=1 ;;
        --no-system) system_ops=0 ;;
        --skip-deps) skip_deps=1 ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            # Name the offending argument so a typo is not buried in the
            # usage text.
            echo "unknown argument: $1" >&2
            usage
            exit 2
            ;;
    esac
    shift
done

# --start is only meaningful once files have actually been installed.
if [[ "$start_service" -eq 1 && "$mode" != "apply" ]]; then
    echo "--start requires --apply" >&2
    exit 2
fi
# Repository root, resolved as two directories above this script.
repo_root=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)
# Install locations; each can be overridden through the named environment
# variable.
# NOTE(review): the systemd unit templates in deploy/semantic-index hard-code
# /opt/semantic-index and /etc/semantic-index.env, so overriding install_dir
# or env_file is not reflected in the installed units — confirm intended.
install_dir=${SEMANTIC_INDEX_INSTALL_DIR:-/opt/semantic-index}
env_file=${SEMANTIC_INDEX_ENV_FILE:-/etc/semantic-index.env}
state_dir=${SEMANTIC_INDEX_STATE_DIR:-/var/lib/semantic-index}
log_dir=${SEMANTIC_INDEX_LOG_DIR:-/var/log/semantic-index}
systemd_dir=${SEMANTIC_INDEX_SYSTEMD_DIR:-/etc/systemd/system}
# Interpreter used to create the virtualenv (overridable via PYTHON).
python_bin=${PYTHON:-python3}
# Execute the given command, or in dry-run mode print it instead.
# The dry-run line is "would run:" followed by each argument shell-quoted
# with %q, so the printed command can be copied back into a shell.
run() {
    case "$mode" in
        dry-run)
            printf 'would run:'
            printf ' %q' "$@"
            printf '\n'
            ;;
        *)
            "$@"
            ;;
    esac
}
# Run a command through `run`, prefixed with sudo unless --no-system cleared
# system_ops.
run_sudo() {
    if (( system_ops != 0 )); then
        run sudo "$@"
    else
        run "$@"
    fi
}
# Install the env template at $env_file unless a file already exists there.
#
# Reads globals: mode, env_file, system_ops, repo_root.
# In dry-run mode this only describes the action. An existing file is never
# overwritten, so edited secrets survive a re-install. The file is created
# with mode 0640 in both the sudo and --no-system paths because it is meant
# to hold API keys; a plain `cp` would leave it with umask-default
# permissions.
install_env_template() {
    local template="$repo_root/deploy/semantic-index/semantic-index.env.example"

    if [[ "$mode" == "dry-run" ]]; then
        echo "would copy env template only if missing: $env_file"
        return
    fi
    if [[ -e "$env_file" ]]; then
        echo "keeping existing $env_file"
        return
    fi
    if [[ "$system_ops" -eq 0 ]]; then
        mkdir -p "$(dirname "$env_file")"
        install -m 0640 "$template" "$env_file"
    else
        sudo install -m 0640 "$template" "$env_file"
    fi
}
# Print the post-install checklist to stdout.
# The heredoc delimiter is unquoted, so $env_file and $install_dir are
# expanded to the paths this run actually used.
print_next_steps_warning() {
cat <<EOF
Semantic Index installed, but deployment is not complete.
Required manual steps:
1. Edit $env_file and fill real secrets/URLs.
2. Start or restart the HTTP service:
sudo systemctl daemon-reload
sudo systemctl start semantic-index.service
3. Validate:
curl -sS http://127.0.0.1:8787/health
$install_dir/semantic_index/search.sh "goods return" customer-service 3
4. Before enabling scheduled refresh, run:
SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=5' $install_dir/semantic_index/refresh.sh
$install_dir/semantic_index/refresh.sh --apply
5. Create/confirm a Qdrant snapshot before any production-scale backfill.
The refresh timer was NOT enabled automatically.
Do not use --force-rebuild unless you intentionally want to pay to re-embed unchanged documents.
EOF
}
# Echo the effective configuration so dry-run output shows what would be
# touched.
echo "mode=$mode"
echo "install_dir=$install_dir"
echo "env_file=$env_file"
echo "state_dir=$state_dir"
echo "log_dir=$log_dir"
# Create the install, state, log, and systemd unit directories.
run_sudo mkdir -p "$install_dir" "$state_dir" "$log_dir" "$systemd_dir"
# Copy the deployable trees into the install dir, leaving out local .env
# files and Python bytecode caches.
run_sudo rsync -a \
--exclude ".env" \
--exclude "__pycache__/" \
--exclude "*.pyc" \
"$repo_root/semantic_index" \
"$repo_root/tests" \
"$repo_root/docs" \
"$repo_root/deploy" \
"$repo_root/dist" \
"$install_dir/"
# Create the virtualenv and install runtime dependencies unless --skip-deps.
# NOTE(review): run_sudo already falls back to plain `run` when --no-system is
# set, so the elif and else branches appear to behave the same — confirm
# before simplifying.
if [[ "$skip_deps" -eq 1 ]]; then
echo "skipping venv/dependency install because --skip-deps was used"
elif [[ "$mode" == "apply" && "$system_ops" -eq 0 ]]; then
run "$python_bin" -m venv "$install_dir/.venv"
run "$install_dir/.venv/bin/pip" install openai qdrant-client fastapi uvicorn
else
run_sudo "$python_bin" -m venv "$install_dir/.venv"
run_sudo "$install_dir/.venv/bin/pip" install openai qdrant-client fastapi uvicorn
fi
# Env template is only written when no env file exists yet.
install_env_template
# Install the HTTP service, refresh service, and refresh timer unit files.
run_sudo install -m 0644 "$repo_root/deploy/semantic-index/semantic-index.service" "$systemd_dir/semantic-index.service"
run_sudo install -m 0644 "$repo_root/deploy/semantic-index/semantic-index-refresh.service" "$systemd_dir/semantic-index-refresh.service"
run_sudo install -m 0644 "$repo_root/deploy/semantic-index/semantic-index-refresh.timer" "$systemd_dir/semantic-index-refresh.timer"
# Validate the installed copy with the venv interpreter: byte-compile the
# package, run the unit tests, and syntax-check the refresh wrapper. These are
# invoked directly rather than through `run`, so they execute only in apply
# mode and need the venv created above.
if [[ "$mode" == "apply" && "$skip_deps" -eq 0 ]]; then
"$install_dir/.venv/bin/python" -m py_compile "$install_dir"/semantic_index/*.py
"$install_dir/.venv/bin/python" -m unittest discover -s "$install_dir/tests/semantic_index"
bash -n "$install_dir/semantic_index/refresh.sh"
elif [[ "$mode" == "apply" ]]; then
echo "skipping installed-code validation because --skip-deps was used"
fi
# Start only the HTTP service when --start was requested; the refresh timer is
# not enabled or started here.
if [[ "$mode" == "apply" && "$start_service" -eq 1 ]]; then
if [[ "$system_ops" -eq 0 ]]; then
echo "skipping systemctl start because --no-system was used"
else
sudo systemctl daemon-reload
sudo systemctl start semantic-index.service
fi
fi
# Remind the operator of the remaining manual steps after a real install.
if [[ "$mode" == "apply" ]]; then
print_next_steps_warning
fi
@@ -0,0 +1,12 @@
[Unit]
Description=Redmine Semantic Index Rolling Refresh
After=network-online.target
Wants=network-online.target
[Service]
Type=oneshot
WorkingDirectory=/opt/semantic-index
EnvironmentFile=/etc/semantic-index.env
ExecStart=/bin/bash -lc 'exec /opt/semantic-index/semantic_index/refresh.sh --apply'
NoNewPrivileges=true
PrivateTmp=true
@@ -0,0 +1,10 @@
[Unit]
Description=Run Redmine Semantic Index Rolling Refresh
[Timer]
OnBootSec=10min
OnUnitActiveSec=30min
Unit=semantic-index-refresh.service
[Install]
WantedBy=timers.target
@@ -0,0 +1,22 @@
# Copy to /etc/semantic-index.env and fill secrets on the target host.
# Do not commit real values.
OPENAI_API_KEY=
QDRANT_URL=http://qdrant-host:6333
QDRANT_API_KEY=
QDRANT_COLLECTION=redmine_semantic_sample
REDMINE_URL=http://redmine-host
REDMINE_API_KEY=
REDMINE_PROJECT_IDENTIFIER=
REDMINE_SAMPLE_LIMIT=500
SEMANTIC_INDEX_HOST=127.0.0.1
SEMANTIC_INDEX_PORT=8787
SEMANTIC_INDEX_API_KEY=
SEMANTIC_INDEX_REFRESH_STATE_PATH=/var/lib/semantic-index/refresh_state.json
SEMANTIC_INDEX_PROJECT_LIMITS=customer-service=500,hiring=200,todo-jason=200,sales-inbox=100,business-development=100,dock-scheduling=100,prep-standardization=100
SEMANTIC_INDEX_LOG_DIR=/var/log/semantic-index
SEMANTIC_INDEX_STATE_PATH=/var/lib/semantic-index/refresh_state.json
SEMANTIC_INDEX_OVERLAP_MINUTES=15
@@ -0,0 +1,17 @@
[Unit]
Description=Redmine Semantic Index HTTP API
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
WorkingDirectory=/opt/semantic-index
EnvironmentFile=/etc/semantic-index.env
ExecStart=/bin/bash -lc 'exec /opt/semantic-index/.venv/bin/uvicorn semantic_index.app:app --host "${SEMANTIC_INDEX_HOST}" --port "${SEMANTIC_INDEX_PORT}"'
Restart=on-failure
RestartSec=5
NoNewPrivileges=true
PrivateTmp=true
[Install]
WantedBy=multi-user.target
@@ -0,0 +1,64 @@
# Semantic Index V1 Pre-Deployment Manifest
- Patch set: `semantic-index-v1-predeployment-20260425T150000Z`
- Created: `2026-04-25T15:00:00Z`
- Purpose: deployment manifest for the Redmine semantic index service and its
LAN/production preparation docs.
## Files To Install
```text
semantic_index/
tests/semantic_index/
deploy/semantic-index/
docs/semantic_index_deployment_runbook.md
docs/semantic_index_production_notes.md
docs/semantic_index_predeployment_validation.md
docs/redmine_issue_api_helpdesk_include.md
dist/semantic-index-v1-predeployment-20260425T150000Z.MANIFEST.md
```
## Files Not To Install
```text
semantic_index/.env
.cache/
.venv/
__pycache__/
*.pyc
```
Keep runtime secrets in `semantic_index/.env` or in the service manager
environment on the target host. Do not commit or copy local secrets into a
source bundle.
## External Dependencies
- Redmine Helpdesk API patch documented in
`docs/redmine_issue_api_helpdesk_include.md`
- Qdrant reachable through `QDRANT_URL`
- OpenAI API key for `text-embedding-3-small`
- Python packages: `openai`, `qdrant-client`, `fastapi`, `uvicorn`
## Validation Commands
```sh
deploy/semantic-index/install.sh
.venv/bin/python -m py_compile semantic_index/*.py
.venv/bin/python -m unittest discover -s tests/semantic_index
bash -n semantic_index/refresh.sh
SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=5' semantic_index/refresh.sh
```
Before any production backfill, follow
`docs/semantic_index_deployment_runbook.md` and confirm Qdrant snapshot or
volume rollback is available.
## Operational Rules
- Run `semantic_index/refresh.sh` in dry-run mode before `--apply`.
- Do not schedule `--force-rebuild`; keep it manual-only.
- Review refresh logs for `detail_fetched_issues`, `would_embed_documents`, and
`embedded_documents`.
- Bind HTTP to localhost unless LAN access is explicitly required and protected
with `SEMANTIC_INDEX_API_KEY`.
+336
View File
@@ -0,0 +1,336 @@
# Semantic Index Deployment Runbook
This runbook captures the current deployment shape for the Redmine semantic
index. It is written for the LAN test server first, with the same steps intended
to carry forward to production after paths and secrets are adjusted.
The latest LAN validation record is in
`docs/semantic_index_predeployment_validation.md`.
## Deployable Files
Copy or update these tracked paths together:
- `semantic_index/`
- `tests/semantic_index/`
- `deploy/semantic-index/`
- `docs/semantic_index_production_notes.md`
- `docs/semantic_index_deployment_runbook.md`
- `docs/semantic_index_predeployment_validation.md`
- `docs/redmine_issue_api_helpdesk_include.md`
The Helpdesk contact metadata dependency is the Redmine plugin API patch
documented in `docs/redmine_issue_api_helpdesk_include.md`. Deploy that plugin
patch before expecting Helpdesk contact fields in indexed results.
Do not copy local-only runtime files:
- `semantic_index/.env`
- `.cache/`
- `.venv/`
- `__pycache__/`
- Qdrant storage snapshots or rollback tarballs unless deliberately restoring
## Runtime Prerequisites
Python runtime dependencies:
```sh
pip install openai qdrant-client fastapi uvicorn
```
Qdrant is expected to run on the larger host and be reachable from the semantic
index host through `QDRANT_URL`. The current collection default is
`redmine_semantic_sample`.
Qdrant Docker example:
```sh
docker run -p 6333:6333 -p 6334:6334 \
-v qdrant_storage:/qdrant/storage \
qdrant/qdrant
```
Before destructive maintenance, create a Qdrant snapshot or preserve the Docker
volume.
## Environment
For a production-style install, use:
- code: `/opt/semantic-index`
- environment file: `/etc/semantic-index.env`
- refresh state: `/var/lib/semantic-index/refresh_state.json`
- refresh logs: `/var/log/semantic-index`
Create `/etc/semantic-index.env` from
`deploy/semantic-index/semantic-index.env.example` and fill secrets on the
target host:
```sh
OPENAI_API_KEY=
QDRANT_URL=http://qdrant-host:6333
QDRANT_API_KEY=
QDRANT_COLLECTION=redmine_semantic_sample
REDMINE_URL=http://redmine-host
REDMINE_API_KEY=
REDMINE_PROJECT_IDENTIFIER=
REDMINE_SAMPLE_LIMIT=500
SEMANTIC_INDEX_HOST=127.0.0.1
SEMANTIC_INDEX_PORT=8787
SEMANTIC_INDEX_API_KEY=
SEMANTIC_INDEX_REFRESH_STATE_PATH=/var/lib/semantic-index/refresh_state.json
```
Recommended production-style refresh overrides:
```sh
SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=500,hiring=200,todo-jason=200,sales-inbox=100,business-development=100,dock-scheduling=100,prep-standardization=100'
SEMANTIC_INDEX_LOG_DIR=/var/log/semantic-index
SEMANTIC_INDEX_STATE_PATH=/var/lib/semantic-index/refresh_state.json
SEMANTIC_INDEX_OVERLAP_MINUTES=15
```
Keep `SEMANTIC_INDEX_API_KEY` set when binding outside localhost. Do not commit
API keys or `.env` files.
## Systemd Templates
Templates live in `deploy/semantic-index/`:
```text
install.sh
semantic-index.service
semantic-index-refresh.service
semantic-index-refresh.timer
semantic-index.env.example
```
Use the installer first. It defaults to dry-run:
```sh
deploy/semantic-index/install.sh
```
Apply the install:
```sh
deploy/semantic-index/install.sh --apply
```
Optionally start only the HTTP service after installing:
```sh
deploy/semantic-index/install.sh --apply --start
```
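The installer creates `/opt/semantic-index`, `/var/lib/semantic-index`, and
`/var/log/semantic-index`; copies the deployable files (`semantic_index/`,
`tests/`, `docs/`, `deploy/`, and `dist/`) into `/opt/semantic-index`; creates
a virtualenv at `/opt/semantic-index/.venv` and installs the Python
dependencies unless `--skip-deps` is given; creates `/etc/semantic-index.env`
only if it does not already exist; installs systemd unit files; and runs local
validation. It does not run backfill, does not enable the refresh timer, and
never passes `--force-rebuild`.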
Manual install shape, if the installer cannot be used:
```sh
sudo mkdir -p /opt/semantic-index /var/lib/semantic-index /var/log/semantic-index
sudo rsync -a \
--exclude '.env' \
--exclude '__pycache__/' \
--exclude '*.pyc' \
semantic_index tests docs deploy dist /opt/semantic-index/
[ -e /etc/semantic-index.env ] || sudo install -m 0640 deploy/semantic-index/semantic-index.env.example /etc/semantic-index.env
sudo install -m 0644 deploy/semantic-index/semantic-index.service /etc/systemd/system/semantic-index.service
sudo install -m 0644 deploy/semantic-index/semantic-index-refresh.service /etc/systemd/system/semantic-index-refresh.service
sudo install -m 0644 deploy/semantic-index/semantic-index-refresh.timer /etc/systemd/system/semantic-index-refresh.timer
```
After editing `/etc/semantic-index.env`, validate manually before enabling the
timer:
```sh
sudo systemctl daemon-reload
sudo systemctl start semantic-index.service
sudo systemctl status semantic-index.service
sudo systemctl start semantic-index-refresh.service
sudo journalctl -u semantic-index-refresh.service -n 100 --no-pager
```
Enable the timer only after manual dry-run and `--apply` logs look normal:
```sh
sudo systemctl enable --now semantic-index-refresh.timer
```
## Initial Validation
Run syntax and test checks after copying code:
```sh
.venv/bin/python -m py_compile semantic_index/*.py
.venv/bin/python -m unittest discover -s tests/semantic_index
bash -n semantic_index/refresh.sh
```
Confirm service startup:
```sh
uvicorn semantic_index.app:app --host 127.0.0.1 --port 8787
curl -sS http://127.0.0.1:8787/health
```
If `SEMANTIC_INDEX_API_KEY` is set:
```sh
curl -sS -H "Authorization: Bearer $SEMANTIC_INDEX_API_KEY" \
http://127.0.0.1:8787/projects
```
## Initial Backfill
Preview Redmine mapping before writing to Qdrant:
```sh
.venv/bin/python -m semantic_index inspect preview-redmine \
--project customer-service \
--limit 5
```
Backfill the current balanced sample:
```sh
.venv/bin/python -m semantic_index --backfill-redmine-projects \
--project-limits customer-service=500,hiring=200,todo-jason=200,sales-inbox=100,business-development=100,dock-scheduling=100,prep-standardization=100
```
Audit the result:
```sh
.venv/bin/python -m semantic_index inspect audit --source redmine --limit 5000
.venv/bin/python -m semantic_index inspect smoke-search --project customer-service
```
Expected broad shape for the current LAN sample is roughly:
- Customer Service is the largest project.
- Helpdesk tickets have contact metadata.
- Internal projects may have no Helpdesk contact metadata.
- `attachments=0`.
## Routine Refresh
Use the wrapper for production-style refresh. It defaults to dry-run:
```sh
semantic_index/refresh.sh
```
Small smoke check:
```sh
SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=5' semantic_index/refresh.sh
```
Apply refresh manually:
```sh
semantic_index/refresh.sh --apply
```
Installed wrappers can also be called by absolute path, for example
`/opt/semantic-index/semantic_index/refresh.sh`. The wrapper uses its own
install root as the working directory and reads defaults from
`/etc/semantic-index.env` when that file is readable.
Review the log path printed by the wrapper. For a healthy routine run after
state exists, expect:
- `scanned_issues` greater than or equal to `detail_fetched_issues`
- old issues counted under `skipped_issues`
- `would_embed_documents` and `embedded_documents` near zero when Redmine has
not changed
- no scheduled use of `--force-rebuild`
Only schedule the wrapper after manual dry-run and apply logs look normal.
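Cron shape, when ready (replace `/home/iadnah/redmine` with the checkout or
install root on the target host, for example `/opt/semantic-index`):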
```cron
*/30 * * * * cd /home/iadnah/redmine && semantic_index/refresh.sh --apply
```
## Search Validation
HTTP search:
```sh
semantic_index/search.sh "goods return" customer-service 3
semantic_index/search.sh "candidate follow up" hiring 5
```
CLI inspection:
```sh
.venv/bin/python -m semantic_index inspect search "goods return" \
--project customer-service \
--limit 3
.venv/bin/python -m semantic_index inspect list \
--source redmine \
--project customer-service \
--limit 10
```
MCP stdio:
```sh
.venv/bin/python -m semantic_index --mcp-stdio
```
Available tools:
- `semantic_search`
- `semantic_get_document`
- `semantic_list_projects`
- `semantic_backfill_redmine_sample`
- `semantic_refresh_redmine`
## Rollback
Code rollback:
- Stop `uvicorn` or the service manager unit.
- Restore the previous `semantic_index/` code.
- Restore the previous Redmine Helpdesk plugin patch if contact metadata broke.
- Restart the service.
Index rollback options:
- Restore a Qdrant snapshot or preserved Docker volume.
- Or rebuild from Redmine with the known-good code using the multi-project
backfill command above.
Refresh rollback:
- Disable cron/systemd schedule if enabled.
- Preserve the failing log file for diagnosis.
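- If the refresh state is wrong, move the state file aside rather than editing
it in place. The path below is the local default; a production-style install
keeps the state at `/var/lib/semantic-index/refresh_state.json`: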
```sh
mv .cache/semantic_index/refresh_state.json .cache/semantic_index/refresh_state.json.bad
```
The next refresh will behave like a first refresh for state purposes, while the
`source_hash` guard still prevents embedding unchanged documents.
## Production Readiness Checklist
- Redmine API key is scoped appropriately and stored outside git.
- Qdrant URL and collection are confirmed.
- Qdrant snapshot/export path is known.
- Helpdesk API patch is deployed and validated.
- HTTP service is bound only to trusted localhost/LAN as intended.
- `SEMANTIC_INDEX_API_KEY` is set for non-localhost use.
- Initial backfill audit and smoke searches pass.
- Refresh dry-run and apply logs show expected low embedding counts.
- `--force-rebuild` is documented as manual-only.
@@ -0,0 +1,182 @@
# Semantic Index Pre-Deployment Validation
Validation date: `2026-04-25`
This records the current LAN pre-deployment checks for the semantic index. It
does not include secrets.
## Deploy Unit
Semantic-index deployable files are documented in:
- `dist/semantic-index-v1-predeployment-20260425T150000Z.MANIFEST.md`
- `docs/semantic_index_deployment_runbook.md`
Current known unrelated worktree changes are outside the semantic-index deploy
unit and should not be mixed into the semantic-index release package:
- `redMCP/README.md`
- `redMCP/app/McpDispatcher.php`
- `redMCP/app/RedmineClient.php`
- `redMCP/composer.json`
- `redMCP/bin/test-redmine-structure.php`
- `TODO.md`
## Local Verification
Passed:
```sh
.venv/bin/python -m py_compile semantic_index/*.py
.venv/bin/python -m unittest discover -s tests/semantic_index
bash -n semantic_index/refresh.sh
```
Observed semantic test result:
```text
Ran 65 tests in 1.041s
OK
```
## LAN Redmine Preview
Passed:
```sh
.venv/bin/python -m semantic_index inspect preview-redmine \
--project customer-service \
--limit 5
```
Observed:
- Helpdesk issue chunks include contact id, name, email, and company metadata.
- Issue `39779` includes Callum Mackeonis and `callum@safetagtracking.com`.
- Journals are present as separate indexed documents.
- Contact documents are present as separate indexed documents.
## Qdrant Audit
Passed:
```sh
.venv/bin/python -m semantic_index inspect audit --source redmine --limit 5000 --json
```
Observed:
```text
total_documents=2947
doc_type contact=714
doc_type issue=1208
doc_type journal=1025
project business-development=66
project customer-service=1684
project dock-scheduling=63
project hiring=409
project prep-standardization=25
project sales-inbox=192
project todo-jason=508
contact_metadata=2232
helpdesk_contact_metadata=2232/2232
attachments=0
```
## HTTP Validation
Passed:
```sh
curl -sS http://127.0.0.1:8787/health
```
Observed:
```json
{"status":"ok"}
```
Unauthenticated `/projects` correctly returned unauthorized when
`SEMANTIC_INDEX_API_KEY` was configured.
Authenticated `/projects` passed and returned the expected seven projects:
```text
business-development
customer-service
dock-scheduling
hiring
prep-standardization
sales-inbox
todo-jason
```
HTTP search passed:
```sh
semantic_index/search.sh "goods return" customer-service 3
```
Observed:
- Top result was `redmine:issue:39779:chunk:0`.
- Citation included project `customer-service`.
- Citation included contact id `1890`, contact name, contact email, and Redmine
URL.
## Refresh Validation
Passed safe dry-run smoke check:
```sh
SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=5' semantic_index/refresh.sh
```
Observed:
```text
mode=dry-run
issues=5
scanned_issues=5
detail_fetched_issues=0
skipped_issues=5
would_embed_documents=0
embedded_documents=0
```
This confirms the refresh state prefilter skips old issues before Redmine detail
fetch and before embedding.
## Qdrant Validation
Read-only collection check passed:
```text
collection=redmine_semantic_sample
status=green
vector_size=1536
distance=Cosine
points_count=2947
update_queue.length=0
```
Read-only snapshot listing endpoint responded successfully:
```text
/collections/redmine_semantic_sample/snapshots
result=[]
```
No snapshot was created during this validation.
## Remaining Pre-Deployment Items
- Decide final target host paths for logs and refresh state.
- Decide service manager shape: manual `uvicorn`, systemd service, or another
supervisor.
- Create or confirm a Qdrant snapshot immediately before production backfill.
- Package only the semantic-index deploy unit, keeping unrelated `redMCP`
worktree changes out of the release.
- Keep scheduled refresh disabled until manual dry-run and `--apply` logs are
reviewed on the target host.
+76
View File
@@ -0,0 +1,76 @@
# Semantic Index Production Notes
These notes capture the current production direction for the Redmine semantic
index. The service is still local-agent oriented, but the refresh command is now
shaped so it can later be run by cron or systemd without changing the command.
Use `docs/semantic_index_deployment_runbook.md` for the full deploy, validation,
and rollback checklist.
## Routine Refresh
Use the wrapper from the repository root:
```sh
semantic_index/refresh.sh
```
By default this is a dry-run. It does not call OpenAI for document embeddings
and does not write to Qdrant. To apply a rolling refresh:
```sh
semantic_index/refresh.sh --apply
```
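By default the wrapper writes a timestamped log under
`.cache/semantic_index/logs` and uses
`.cache/semantic_index/refresh_state.json` for rolling refresh state. Both
locations can be changed with the environment variables listed under
Production Overrides.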
## Production Overrides
Use environment variables rather than editing the script:
```sh
SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=500,hiring=200,todo-jason=200,sales-inbox=100,business-development=100,dock-scheduling=100,prep-standardization=100'
SEMANTIC_INDEX_LOG_DIR=/var/log/semantic-index
SEMANTIC_INDEX_STATE_PATH=/var/lib/semantic-index/refresh_state.json
SEMANTIC_INDEX_OVERLAP_MINUTES=15
```
Keep `OPENAI_API_KEY`, `QDRANT_URL`, `REDMINE_URL`, and `REDMINE_API_KEY` in the
existing `.env` workflow or in the service manager environment.
For production-style deployment, use `/opt/semantic-index` for code,
`/etc/semantic-index.env` for service environment, `/var/lib/semantic-index`
for refresh state, and `/var/log/semantic-index` for refresh logs. Systemd
templates live in `deploy/semantic-index/`.
## Embedding Cost Guard
Normal refresh embeds only documents that are new or whose Redmine-derived
`source_hash` changed. Unchanged documents are left alone. Stale indexed
documents for refreshed issues are deleted without embedding.
Do not schedule `--force-rebuild`. Use it only as a manual maintenance action
when intentionally re-embedding unchanged documents.
## Cron Shape
A later cron entry can call the same wrapper (adjust the `cd` path to the
checkout or install root on the target host):
```cron
*/30 * * * * cd /home/iadnah/redmine && semantic_index/refresh.sh --apply
```
Before adding a real schedule, run the wrapper manually and confirm the log
shows expected `embedded_documents`, `unchanged_documents`, and
`skipped_issues` counts.
For a quick wrapper smoke check, reduce the project limits:
```sh
SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=5' semantic_index/refresh.sh
```
After refresh state exists, routine dry-runs should show old issues as
`skipped_issues` without matching `detail_fetched_issues`. That indicates the
refresh is avoiding unnecessary Redmine detail requests before it reaches the
embedding cost guard.
+12
View File
@@ -0,0 +1,12 @@
OPENAI_API_KEY=
QDRANT_URL=http://localhost:6333
QDRANT_API_KEY=
QDRANT_COLLECTION=redmine_semantic_sample
REDMINE_URL=http://192.168.50.170
REDMINE_API_KEY=
REDMINE_PROJECT_IDENTIFIER=fud-helpdesk
REDMINE_SAMPLE_LIMIT=500
SEMANTIC_INDEX_HOST=127.0.0.1
SEMANTIC_INDEX_PORT=8787
SEMANTIC_INDEX_API_KEY=
SEMANTIC_INDEX_REFRESH_STATE_PATH=.cache/semantic_index/refresh_state.json
+271
View File
@@ -0,0 +1,271 @@
# Redmine Semantic Index
Local semantic index service for a recent Redmine Helpdesk sample. V1 uses
OpenAI `text-embedding-3-small` embeddings and Qdrant vectors, with Redmine as
the first source adapter.
For deploy, validation, and rollback steps, see
`docs/semantic_index_deployment_runbook.md`.
## Configuration
Copy `.env.example` to `.env` and set local secrets there. Do not commit `.env`.
Required for live use:
- `OPENAI_API_KEY`
- `QDRANT_URL`
- `REDMINE_URL`
- `REDMINE_API_KEY`
Optional:
- `QDRANT_API_KEY`
- `QDRANT_COLLECTION`
- `REDMINE_PROJECT_IDENTIFIER`
- `REDMINE_SAMPLE_LIMIT`
- `SEMANTIC_INDEX_API_KEY`
## HTTP
Install runtime dependencies in your chosen environment:
```sh
pip install openai qdrant-client fastapi uvicorn
```
Run:
```sh
uvicorn semantic_index.app:app --host 127.0.0.1 --port 8787
```
Endpoints:
- `GET /health`
- `POST /sources/redmine/backfill-sample`
- `POST /sources/redmine/refresh`
- `POST /search`
- `GET /documents/{id}`
- `GET /projects`
If `SEMANTIC_INDEX_API_KEY` is set, pass `Authorization: Bearer <key>`.
Search response shape is shared by HTTP, MCP, and the Python client:
```json
{
"query": "candidate follow up",
"filters": {"project_identifier": "hiring", "limit": 5},
"results": [
{
"id": "redmine:issue:123:chunk:0",
"score": 0.72,
"snippet": "Candidate follow up...",
"payload": {},
"citation": {
"source": "redmine",
"doc_type": "issue",
"issue_id": 123,
"project_identifier": "hiring",
"url": "http://redmine/issues/123"
}
}
]
}
```
HTTP examples:
```sh
curl -sS -H "Authorization: Bearer $SEMANTIC_INDEX_API_KEY" \
-H "Content-Type: application/json" \
-d '{"query":"candidate follow up","project_identifier":"hiring","limit":5}' \
http://127.0.0.1:8787/search
curl -sS -H "Authorization: Bearer $SEMANTIC_INDEX_API_KEY" \
http://127.0.0.1:8787/projects
```
## Python Client
Use the client in-process when running from this repo/environment:
```python
from semantic_index.client import SemanticIndexClient
client = SemanticIndexClient.local()
results = client.search("callum@safetagtracking.com", project_identifier="customer-service", limit=5)
document = client.get_document(results["results"][0]["id"])
```
Use HTTP mode from another local program:
```python
from semantic_index.client import SemanticIndexClient
client = SemanticIndexClient(base_url="http://127.0.0.1:8787", api_key="...")
results = client.search("candidate follow up", project_identifier="hiring", limit=5)
```
## Backfill
Refresh the configured Redmine sample from the command line:
```sh
python3 -m semantic_index --backfill-redmine-sample --limit 50
```
When `REDMINE_PROJECT_IDENTIFIER` is set, the rebuild deletes and replaces only
indexed Redmine documents for that project. Without a project identifier, it
rebuilds the Redmine source sample for the collection.
Refresh a balanced multi-project sample:
```sh
python3 -m semantic_index --backfill-redmine-projects \
--projects customer-service,hiring,todo-jason,sales-inbox,business-development,dock-scheduling,prep-standardization \
--per-project-limit 100
```
Use project-specific limits when Customer Service should stay larger than the
internal project sample:
```sh
python3 -m semantic_index --backfill-redmine-projects \
--project-limits customer-service=500,hiring=200,todo-jason=200,sales-inbox=100,business-development=100,dock-scheduling=100,prep-standardization=100
```
Multi-project backfill rebuilds each project scope independently. Non-Helpdesk
projects are indexed as ordinary Redmine issues and journals; they are not
expected to have Helpdesk contact metadata.
## Rolling Refresh
Use rolling refresh for routine updates after an initial backfill:
```sh
python3 -m semantic_index --refresh-redmine-projects \
--project-limits customer-service=500,hiring=200,todo-jason=200,sales-inbox=100,business-development=100,dock-scheduling=100,prep-standardization=100 \
--dry-run
```
Dry-run reports what would change without calling OpenAI or writing to Qdrant.
Remove `--dry-run` to apply the refresh.
The refresh maps each recent Redmine issue to stable document IDs, reads the
existing Qdrant payloads for that issue, and compares `source_hash` values.
Only new or changed documents are embedded and upserted. Unchanged documents
are left alone, and stale documents for refreshed issues are deleted without
embedding. Use `--force-rebuild` only when you explicitly want to re-embed
matching documents.
The default local state file is `.cache/semantic_index/refresh_state.json`.
After a successful refresh, later runs skip issues older than the previous
success timestamp minus `--overlap-minutes` unless `--force-rebuild` is used.
Override the state file path with:
```sh
python3 -m semantic_index --refresh-redmine-projects \
--project-limits customer-service=500 \
--state-path /tmp/semantic-refresh-state.json
```
The HTTP endpoint exposes the same behavior:
```sh
curl -sS -X POST http://127.0.0.1:8787/sources/redmine/refresh \
-H 'Content-Type: application/json' \
-d '{"project_limits":{"customer-service":500},"dry_run":true}'
```
For production-style operation, use the wrapper script. It defaults to dry-run
and writes timestamped logs under `.cache/semantic_index/logs`:
```sh
semantic_index/refresh.sh
semantic_index/refresh.sh --apply
```
For a quick smoke check of the wrapper path:
```sh
SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=5' semantic_index/refresh.sh
```
Override project limits, state path, or log location through environment
variables:
```sh
SEMANTIC_INDEX_PROJECT_LIMITS='customer-service=500,hiring=200' \
SEMANTIC_INDEX_LOG_DIR=/var/log/semantic-index \
SEMANTIC_INDEX_STATE_PATH=/var/lib/semantic-index/refresh_state.json \
semantic_index/refresh.sh --apply
```
Do not schedule `--force-rebuild`. Force rebuilds should stay manual because
they intentionally re-embed unchanged documents.
## MCP Stdio
```sh
python3 -m semantic_index --mcp-stdio
```
Tools:
- `semantic_search`
- `semantic_get_document`
- `semantic_list_projects`
- `semantic_backfill_redmine_sample`
- `semantic_refresh_redmine`
For agent workflows, list projects first when the user has not named a project,
search broadly or with `project_identifier` when known, then call
`semantic_get_document` for any promising result. Treat returned citations and
Redmine URLs as the authoritative references. Backfill tools are operational and
should not be part of normal search behavior.
## Inspection CLI
Use the inspect commands before larger backfills to see what is already indexed
or preview what Redmine would produce without writing to Qdrant.
```sh
python3 -m semantic_index inspect count --source redmine --project customer-service
python3 -m semantic_index inspect list --limit 20 --source redmine --project customer-service
python3 -m semantic_index inspect search "order status" --limit 5 --project customer-service
python3 -m semantic_index inspect search "customer@example.com" --limit 5 --project customer-service
python3 -m semantic_index inspect show redmine:issue:39778:chunk:0
python3 -m semantic_index inspect preview-redmine --limit 10 --project customer-service
python3 -m semantic_index inspect audit --source redmine --project customer-service --limit 500
python3 -m semantic_index inspect compare-redmine --project customer-service --limit 20
python3 -m semantic_index inspect smoke-search --project customer-service
```
`count`, `list`, `show`, and `preview-redmine` do not call OpenAI.
`search` embeds the query text. List/search output shows snippets by default;
pass `--full-text` when you need the full indexed text.
`audit` summarizes indexed document coverage without calling OpenAI.
`compare-redmine` previews live Redmine chunks and compares them to indexed
Qdrant documents without writing to Qdrant. `smoke-search` runs known search
checks and calls OpenAI for query embeddings. Pass `--json` to `audit`,
`compare-redmine`, or `smoke-search` for machine-readable output.
For mixed project samples, run `audit` without `--project` to see project-level
counts and Helpdesk-contact coverage separately from ordinary internal issues.
For Helpdesk tickets, Redmine issue ingestion expects
`/issues/:id.json?include=journals,helpdesk` to return `helpdesk_ticket`
metadata with an expanded contact. See
`docs/redmine_issue_api_helpdesk_include.md` for the Redmine API patch notes.
## Qdrant
For local Docker-hosted Qdrant:
```sh
docker run -p 6333:6333 -p 6334:6334 -v qdrant_storage:/qdrant/storage qdrant/qdrant
```
Create snapshots with Qdrant's snapshot API or mounted storage tooling before
destructive maintenance. The default collection name is
`redmine_semantic_sample`.
+12
View File
@@ -0,0 +1,12 @@
"""Local semantic index service for Redmine and future source adapters."""
# Submodule names exported by ``from semantic_index import *``.
__all__ = [
    "config",
    "embeddings",
    "ingest",
    "mcp",
    "models",
    "qdrant_store",
    "redmine",
    "search",
]
+206
View File
@@ -0,0 +1,206 @@
from __future__ import annotations
import argparse
from pathlib import Path
from typing import Callable, Dict, List, Optional
from .app import build_services
from .config import Settings, load_settings
from .inspect import (
print_audit,
print_compare_redmine,
print_count,
print_list,
print_preview_redmine,
print_search,
print_show,
print_smoke_search,
)
from .mcp import SemanticMCP, serve_stdio
from .refresh import FileRefreshState
from .redmine import RedmineApiSource
def build_preview_services(settings: Settings) -> Dict[str, object]:
    """Assemble the minimal service mapping used to preview Redmine chunks.

    Only the settings and a ``RedmineApiSource`` built from them are returned;
    no embedding client or vector store is constructed here.
    """
    source = RedmineApiSource(
        redmine_url=settings.redmine_url,
        api_key=settings.redmine_api_key or "",
        project_identifier=settings.redmine_project_identifier,
    )
    return {"settings": settings, "redmine_source": source}
def parse_projects(raw: str) -> List[str]:
    """Split a comma-separated project list, trimming and dropping blanks."""
    trimmed = (piece.strip() for piece in raw.split(","))
    return [name for name in trimmed if name]
def parse_project_limits(raw: str) -> Dict[str, int]:
    """Parse comma-separated ``project=limit`` pairs into a mapping.

    Blank entries are skipped and whitespace around names and limits is
    ignored. Later entries for the same project overwrite earlier ones.

    Raises:
        ValueError: if an entry has no ``=`` or its limit is not an integer.
            The message names the offending entry instead of surfacing a
            bare unpacking or ``int()`` error.
    """
    project_limits: Dict[str, int] = {}
    for item in raw.split(","):
        entry = item.strip()
        if not entry:
            continue
        project, separator, limit = entry.partition("=")
        if not separator:
            raise ValueError(f"expected project=limit, got {entry!r}")
        project = project.strip()
        limit = limit.strip()
        try:
            project_limits[project] = int(limit)
        except ValueError:
            raise ValueError(f"invalid limit {limit!r} for project {project!r}") from None
    return project_limits
def main(
    argv: Optional[List[str]] = None,
    service_builder: Callable[[], Dict[str, object]] = build_services,
    preview_service_builder: Optional[Callable[[Settings], Dict[str, object]]] = None,
    settings_loader: Callable[[], Settings] = load_settings,
) -> None:
    """Parse command-line arguments and dispatch to the selected operation.

    Args:
        argv: Argument list handed to argparse; ``None`` uses argparse's default.
        service_builder: Zero-argument factory returning the service mapping.
        preview_service_builder: Optional factory used only for
            ``inspect preview-redmine``; it receives the loaded settings.
        settings_loader: Zero-argument settings factory, called only on the
            ``inspect preview-redmine`` path.
    """
    parser = argparse.ArgumentParser(description="Semantic index helper", allow_abbrev=False)
    parser.add_argument("--mcp-stdio", action="store_true", help="Run the MCP-compatible stdio tool server")
    parser.add_argument("--backfill-redmine-sample", action="store_true", help="Backfill the configured Redmine sample")
    parser.add_argument("--backfill-redmine-projects", action="store_true", help="Backfill multiple Redmine projects")
    parser.add_argument("--refresh-redmine-projects", action="store_true", help="Refresh recent Redmine issues without re-embedding unchanged documents")
    parser.add_argument("--projects", help="Comma-separated Redmine project identifiers for multi-project backfill")
    parser.add_argument("--project-limits", help="Comma-separated project=limit pairs for multi-project backfill")
    parser.add_argument("--per-project-limit", type=int, default=500)
    parser.add_argument("--limit", type=int, default=500)
    parser.add_argument("--dry-run", action="store_true", help="Report planned refresh work without embeddings or writes")
    parser.add_argument("--force-rebuild", action="store_true", help="Embed and upsert refresh candidates even when source hashes match")
    parser.add_argument("--overlap-minutes", type=int, default=15, help="Refresh overlap window for rolling update state")
    parser.add_argument("--state-path", help="Override rolling refresh state file path")

    commands = parser.add_subparsers(dest="command")
    inspect_cmd = commands.add_parser("inspect", help="Inspect indexed documents and preview Redmine chunks")
    inspect_commands = inspect_cmd.add_subparsers(dest="inspect_command", required=True)

    def add_filters(command_parser: argparse.ArgumentParser) -> None:
        # Filters shared by the commands that read from the index.
        command_parser.add_argument("--source", default="redmine")
        command_parser.add_argument("--project", dest="project_identifier")
        command_parser.add_argument("--doc-type")

    count_cmd = inspect_commands.add_parser("count", help="Count indexed documents")
    add_filters(count_cmd)

    list_cmd = inspect_commands.add_parser("list", help="List indexed documents")
    add_filters(list_cmd)
    list_cmd.add_argument("--limit", type=int, default=20)
    list_cmd.add_argument("--full-text", action="store_true")

    search_cmd = inspect_commands.add_parser("search", help="Search indexed documents")
    search_cmd.add_argument("query")
    add_filters(search_cmd)
    search_cmd.add_argument("--limit", type=int, default=10)
    search_cmd.add_argument("--full-text", action="store_true")

    show_cmd = inspect_commands.add_parser("show", help="Show one indexed document")
    show_cmd.add_argument("document_id")

    preview_cmd = inspect_commands.add_parser("preview-redmine", help="Preview Redmine chunks without writing to Qdrant")
    preview_cmd.add_argument("--limit", type=int, default=10)
    preview_cmd.add_argument("--project", dest="project_identifier")
    preview_cmd.add_argument("--full-text", action="store_true")

    audit_cmd = inspect_commands.add_parser("audit", help="Audit indexed documents for trust-check coverage")
    add_filters(audit_cmd)
    audit_cmd.add_argument("--limit", type=int, default=500)
    audit_cmd.add_argument("--json", action="store_true")

    compare_cmd = inspect_commands.add_parser("compare-redmine", help="Compare live Redmine preview chunks with indexed documents")
    compare_cmd.add_argument("--limit", type=int, default=20)
    compare_cmd.add_argument("--project", dest="project_identifier")
    compare_cmd.add_argument("--json", action="store_true")

    smoke_cmd = inspect_commands.add_parser("smoke-search", help="Run repeatable search checks against indexed documents")
    smoke_cmd.add_argument("--project", dest="project_identifier")
    smoke_cmd.add_argument("--email", default="callum@safetagtracking.com")
    smoke_cmd.add_argument("--issue-id", type=int, default=39779)
    smoke_cmd.add_argument("--order-token")
    smoke_cmd.add_argument("--natural-query", default="customer needs goods returned")
    smoke_cmd.add_argument("--json", action="store_true")

    args = parser.parse_args(argv)

    # Nothing was requested: show usage instead of building any services.
    requested = (
        args.command,
        args.backfill_redmine_sample,
        args.backfill_redmine_projects,
        args.refresh_redmine_projects,
        args.mcp_stdio,
    )
    if not any(requested):
        parser.print_help()
        return

    inspecting = args.command == "inspect"
    # ``inspect_command`` is only read when the inspect sub-command was chosen.
    action = args.inspect_command if inspecting else None

    if action == "preview-redmine":
        # Preview is handled before the full service mapping is built.
        if preview_service_builder is not None:
            services = preview_service_builder(settings_loader())
        elif service_builder is build_services:
            services = build_preview_services(settings_loader())
        else:
            services = service_builder()
        project = args.project_identifier or services["settings"].redmine_project_identifier
        print_preview_redmine(services["redmine_source"], services["settings"].redmine_url, project, args.limit, args.full_text)
        return

    services = service_builder()
    if args.state_path and "refresh" in services and hasattr(services["refresh"], "state"):
        services["refresh"].state = FileRefreshState(Path(args.state_path))

    if args.backfill_redmine_sample:
        print(services["backfill"].backfill_redmine_sample(limit=args.limit))
        return

    if args.backfill_redmine_projects:
        if args.project_limits:
            print(services["backfill"].backfill_redmine_project_limits(parse_project_limits(args.project_limits)))
            return
        projects = parse_projects(args.projects or "")
        if not projects:
            parser.error("--projects or --project-limits is required with --backfill-redmine-projects")
        print(services["backfill"].backfill_redmine_projects(projects, per_project_limit=args.per_project_limit))
        return

    if args.refresh_redmine_projects:
        if args.project_limits:
            project_limits = parse_project_limits(args.project_limits)
        else:
            projects = parse_projects(args.projects or "")
            if not projects:
                parser.error("--projects or --project-limits is required with --refresh-redmine-projects")
            project_limits = dict.fromkeys(projects, args.per_project_limit)
        summary = services["refresh"].refresh_redmine_project_limits(
            project_limits,
            dry_run=args.dry_run,
            force_rebuild=args.force_rebuild,
            overlap_minutes=args.overlap_minutes,
        )
        print(summary)
        return

    if args.mcp_stdio:
        server = SemanticMCP(
            search_service=services["search"],
            backfill_service=services["backfill"],
            store=services["store"],
            refresh_service=services.get("refresh"),
        )
        serve_stdio(server)
        return

    if not inspecting:
        parser.print_help()
        return

    if action == "count":
        print_count(services["store"], args.source, args.project_identifier, args.doc_type)
    elif action == "list":
        print_list(services["store"], args.limit, args.source, args.project_identifier, args.doc_type, args.full_text)
    elif action == "search":
        print_search(services["search"], args.query, args.limit, args.source, args.project_identifier, args.doc_type, args.full_text)
    elif action == "show":
        print_show(services["search"], args.document_id)
    elif action == "audit":
        print_audit(services["store"], args.limit, args.source, args.project_identifier, args.doc_type, args.json)
    elif action == "compare-redmine":
        project = args.project_identifier or services["settings"].redmine_project_identifier
        print_compare_redmine(services["store"], services["redmine_source"], services["settings"].redmine_url, project, args.limit, args.json)
    elif action == "smoke-search":
        project = args.project_identifier or services["settings"].redmine_project_identifier
        print_smoke_search(
            services["search"],
            project,
            args.email,
            args.issue_id,
            args.order_token,
            args.natural_query,
            args.json,
        )
    else:
        parser.print_help()
if __name__ == "__main__":
main()
+153
View File
@@ -0,0 +1,153 @@
from __future__ import annotations
from typing import Any, Callable, Dict, Optional
from .config import Settings, load_settings
from .embeddings import OpenAIEmbedder, OpenAIEmbeddingClient
from .ingest import BackfillService
from .models import SearchQuery, search_response
from .qdrant_store import QdrantStore
from .refresh import FileRefreshState, RedmineRefreshService
from .redmine import RedmineApiSource, RedmineMapper
from .search import HybridSearchService
def build_services(settings: Optional[Settings] = None) -> Dict[str, Any]:
    """Construct the full service mapping from ``settings``.

    When ``settings`` is omitted they are loaded with ``load_settings()``.
    The returned dict holds the settings plus the search, backfill, refresh,
    store and Redmine source objects, all sharing one embedder and one store.
    """
    if not settings:
        settings = load_settings()

    def new_mapper() -> RedmineMapper:
        # Backfill and refresh each get their own mapper instance.
        return RedmineMapper(
            redmine_url=settings.redmine_url,
            project_identifier=settings.redmine_project_identifier,
        )

    embedder = OpenAIEmbedder(client=OpenAIEmbeddingClient(api_key=settings.openai_api_key))
    store = QdrantStore(
        url=settings.qdrant_url,
        api_key=settings.qdrant_api_key,
        collection=settings.qdrant_collection,
    )
    redmine_source = RedmineApiSource(
        redmine_url=settings.redmine_url,
        api_key=settings.redmine_api_key or "",
        project_identifier=settings.redmine_project_identifier,
    )
    search_service = HybridSearchService(embedder=embedder, store=store)
    backfill_service = BackfillService(
        source=redmine_source,
        embedder=embedder,
        store=store,
        mapper=new_mapper(),
    )
    refresh_service = RedmineRefreshService(
        source=redmine_source,
        embedder=embedder,
        store=store,
        mapper=new_mapper(),
        state=FileRefreshState(settings.refresh_state_path),
    )
    return {
        "settings": settings,
        "search": search_service,
        "backfill": backfill_service,
        "refresh": refresh_service,
        "store": store,
        "redmine_source": redmine_source,
    }
def create_app(settings: Optional[Settings] = None, service_builder: Optional[Callable[[], Dict[str, Any]]] = None):
    """Build the FastAPI application exposing the semantic-index HTTP API.

    Args:
        settings: Settings forwarded to ``build_services`` when no
            ``service_builder`` is supplied.
        service_builder: Optional zero-argument factory for the service
            mapping; takes precedence over ``settings``.

    Returns:
        The configured FastAPI application.

    Raises:
        RuntimeError: if FastAPI cannot be imported.

    Services are built lazily on the first request that needs them, so
    ``/health`` responds without touching any backend.
    """
    import hmac

    try:
        from fastapi import FastAPI, Header, HTTPException
    except ImportError as exc:
        raise RuntimeError("Install fastapi and uvicorn to run the HTTP service") from exc

    services: Optional[Dict[str, Any]] = None
    app = FastAPI(title="Redmine Semantic Index", version="0.1.0")

    def get_services() -> Dict[str, Any]:
        # Build the service mapping once and reuse it for later requests.
        nonlocal services
        if services is None:
            if service_builder is not None:
                services = service_builder()
            else:
                services = build_services(settings)
        return services

    def authorize(authorization: Optional[str]) -> None:
        # No configured service key means the API is left open.
        api_key = get_services()["settings"].service_api_key
        if not api_key:
            return
        expected = f"Bearer {api_key}".encode("utf-8")
        supplied = (authorization or "").encode("utf-8")
        # Constant-time comparison avoids leaking how much of the token
        # matched; bytes are used because compare_digest rejects non-ASCII str.
        if not hmac.compare_digest(supplied, expected):
            raise HTTPException(status_code=401, detail="unauthorized")

    @app.get("/health")
    def health() -> Dict[str, str]:
        return {"status": "ok"}

    @app.post("/sources/redmine/backfill-sample")
    def backfill(payload: Dict[str, Any] | None = None, authorization: Optional[str] = Header(default=None)) -> Dict[str, Any]:
        authorize(authorization)
        active_services = get_services()
        # Fall back to the configured sample limit when the body omits one.
        limit = int((payload or {}).get("limit", active_services["settings"].sample_limit))
        return active_services["backfill"].backfill_redmine_sample(limit=limit)

    @app.post("/sources/redmine/refresh")
    def refresh(payload: Dict[str, Any] | None = None, authorization: Optional[str] = Header(default=None)) -> Dict[str, Any]:
        authorize(authorization)
        payload = payload or {}
        project_limits = payload.get("project_limits")
        if not project_limits:
            # Single-project form: explicit identifier, else the configured one.
            project = payload.get("project_identifier") or get_services()["settings"].redmine_project_identifier
            if not project:
                raise HTTPException(status_code=400, detail="project_limits or project_identifier is required")
            project_limits = {project: int(payload.get("limit", get_services()["settings"].sample_limit))}
        return get_services()["refresh"].refresh_redmine_project_limits(
            {str(project): int(limit) for project, limit in project_limits.items()},
            dry_run=bool(payload.get("dry_run", False)),
            force_rebuild=bool(payload.get("force_rebuild", False)),
            overlap_minutes=int(payload.get("overlap_minutes", 15)),
        )

    @app.post("/search")
    def search(payload: Dict[str, Any], authorization: Optional[str] = Header(default=None)) -> Dict[str, Any]:
        authorize(authorization)
        query = SearchQuery(
            # Accept either "query" or "text" as the search string key.
            text=payload.get("query") or payload.get("text") or "",
            source=payload.get("source"),
            project_id=payload.get("project_id"),
            project_identifier=payload.get("project_identifier"),
            doc_type=payload.get("doc_type"),
            issue_id=payload.get("issue_id"),
            contact_id=payload.get("contact_id"),
            contact_email=payload.get("contact_email"),
            date_from=payload.get("date_from"),
            date_to=payload.get("date_to"),
            limit=int(payload.get("limit", 10)),
            include_snippets=bool(payload.get("include_snippets", True)),
        )
        results = get_services()["search"].search(query)
        return search_response(query, results)

    @app.get("/projects")
    def projects(authorization: Optional[str] = Header(default=None)) -> Dict[str, Any]:
        authorize(authorization)
        return {"projects": get_services()["store"].list_projects(source="redmine")}

    @app.get("/documents/{document_id}")
    def document(document_id: str, authorization: Optional[str] = Header(default=None)) -> Dict[str, Any]:
        authorize(authorization)
        found = get_services()["search"].get_document(document_id)
        if found is None:
            raise HTTPException(status_code=404, detail="not_found")
        return found

    return app
class LazyASGIApp:
    """ASGI callable that defers ``create_app()`` until the first call."""

    def __init__(self) -> None:
        self._app = None

    async def __call__(self, scope, receive, send):
        """Forward the ASGI call, building the wrapped app on first use."""
        target = self._app
        if target is None:
            target = self._app = create_app()
        await target(scope, receive, send)
app = LazyASGIApp()
+25
View File
@@ -0,0 +1,25 @@
from __future__ import annotations
from typing import List
def chunk_text(text: str, max_chars: int = 3500, overlap: int = 300) -> List[str]:
    """Split ``text`` into overlapping chunks of at most ``max_chars`` characters.

    Trailing whitespace is removed from every line first. A chunk is cut at
    the last paragraph break or sentence end found in its second half when one
    exists; otherwise it is cut at ``max_chars``. Consecutive chunks share up
    to ``overlap`` characters.

    Args:
        text: Input text.
        max_chars: Maximum chunk length; must be positive.
        overlap: Number of characters repeated at the start of the next chunk.

    Returns:
        The non-empty chunks in order; ``[]`` for blank input.

    Raises:
        ValueError: if ``max_chars`` is not positive and the text is non-empty.
    """
    cleaned = "\n".join(line.rstrip() for line in text.strip().splitlines()).strip()
    if not cleaned:
        return []
    if max_chars <= 0:
        # A non-positive window can never consume the text.
        raise ValueError("max_chars must be positive")
    if len(cleaned) <= max_chars:
        return [cleaned]

    total = len(cleaned)
    chunks: List[str] = []
    start = 0
    while start < total:
        end = min(start + max_chars, total)
        if end < total:
            boundary = max(cleaned.rfind("\n\n", start, end), cleaned.rfind(". ", start, end))
            # Only accept a natural boundary in the second half of the window
            # so chunks do not become too short.
            if boundary > start + int(max_chars * 0.5):
                end = boundary + 1
        chunks.append(cleaned[start:end].strip())
        if end >= total:
            break
        # Step back by the overlap, but always advance by at least one
        # character so a large overlap cannot stall or reverse the loop.
        start = max(end - overlap, start + 1)
    return [chunk for chunk in chunks if chunk]
+72
View File
@@ -0,0 +1,72 @@
from __future__ import annotations
import json
import urllib.request
from typing import Any, Dict, Optional
from .app import build_services
from .models import SearchQuery, search_response
class SemanticIndexClient:
    """Client for the semantic index, over HTTP or in-process.

    When ``base_url`` is set, calls go to the HTTP service. Otherwise they use
    ``search_service`` directly, building local services on demand if none
    was supplied.
    """

    def __init__(
        self,
        base_url: Optional[str] = None,
        api_key: Optional[str] = None,
        search_service: Optional[Any] = None,
    ) -> None:
        # Trailing slashes are removed so paths can be appended verbatim.
        self.base_url = base_url.rstrip("/") if base_url else None
        self.api_key = api_key
        self.search_service = search_service

    @classmethod
    def local(cls) -> "SemanticIndexClient":
        """Return a client bound to a freshly built in-process search service."""
        return cls(search_service=build_services()["search"])

    def search(self, query: str, **filters: Any) -> Dict[str, Any]:
        """Run a search and return the response dictionary.

        In HTTP mode ``filters`` are posted as-is alongside ``query``; in
        local mode the recognised filter keys populate a ``SearchQuery``.
        """
        if self.base_url:
            return self._post_json("/search", {"query": query, **filters})
        search_service = self.search_service or build_services()["search"]
        search_query = SearchQuery(
            text=query,
            source=filters.get("source"),
            project_id=filters.get("project_id"),
            project_identifier=filters.get("project_identifier"),
            doc_type=filters.get("doc_type"),
            issue_id=filters.get("issue_id"),
            contact_id=filters.get("contact_id"),
            contact_email=filters.get("contact_email"),
            date_from=filters.get("date_from"),
            date_to=filters.get("date_to"),
            limit=int(filters.get("limit", 10)),
            include_snippets=bool(filters.get("include_snippets", True)),
        )
        return search_response(search_query, search_service.search(search_query))

    def get_document(self, document_id: str) -> Dict[str, Any]:
        """Fetch one document by id.

        In local mode a missing document yields
        ``{"error": "not_found", "id": document_id}``.
        """
        if self.base_url:
            from urllib.parse import quote

            # Percent-encode the id so spaces, "/", "?" and similar characters
            # cannot break or redirect the request path. ":" is kept literal
            # so typical ids produce the same URL as before.
            return self._get_json(f"/documents/{quote(document_id, safe=':')}")
        search_service = self.search_service or build_services()["search"]
        return search_service.get_document(document_id) or {"error": "not_found", "id": document_id}

    def _post_json(self, path: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        """POST ``payload`` as JSON to ``path`` and decode the JSON reply."""
        data = json.dumps(payload).encode("utf-8")
        request = urllib.request.Request(
            f"{self.base_url}{path}",
            data=data,
            headers=self._headers(),
            method="POST",
        )
        with urllib.request.urlopen(request, timeout=60) as response:
            return json.loads(response.read().decode("utf-8"))

    def _get_json(self, path: str) -> Dict[str, Any]:
        """GET ``path`` and decode the JSON reply."""
        request = urllib.request.Request(f"{self.base_url}{path}", headers=self._headers())
        with urllib.request.urlopen(request, timeout=60) as response:
            return json.loads(response.read().decode("utf-8"))

    def _headers(self) -> Dict[str, str]:
        """Return request headers, adding a bearer token when a key is set."""
        headers = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        return headers
+64
View File
@@ -0,0 +1,64 @@
from __future__ import annotations
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional
@dataclass(frozen=True)
class Settings:
    """Immutable runtime configuration for the semantic index service.

    ``load_settings`` in this module fills every field from the process
    environment layered over an optional dotenv file.
    """

    # OPENAI_API_KEY
    openai_api_key: Optional[str]
    # QDRANT_URL
    qdrant_url: str
    # QDRANT_API_KEY
    qdrant_api_key: Optional[str]
    # QDRANT_COLLECTION
    qdrant_collection: str
    # REDMINE_URL
    redmine_url: str
    # REDMINE_API_KEY
    redmine_api_key: Optional[str]
    # REDMINE_PROJECT_IDENTIFIER
    redmine_project_identifier: Optional[str]
    # REDMINE_SAMPLE_LIMIT
    sample_limit: int
    # SEMANTIC_INDEX_HOST
    bind_host: str
    # SEMANTIC_INDEX_PORT
    bind_port: int
    # SEMANTIC_INDEX_API_KEY
    service_api_key: Optional[str]
    # SEMANTIC_INDEX_REFRESH_STATE_PATH
    refresh_state_path: Path
def load_dotenv(path: str | Path = ".env") -> Dict[str, str]:
    """Read ``KEY=VALUE`` pairs from a dotenv-style file.

    Blank lines, ``#`` comment lines and lines without ``=`` are ignored.
    Keys and values are whitespace-trimmed. A value wrapped in one matching
    pair of single or double quotes has that pair removed; unmatched or inner
    quote characters are kept, so quotes that belong to the value survive.

    Args:
        path: Location of the dotenv file.

    Returns:
        The parsed mapping; empty when the file does not exist.
    """
    values: Dict[str, str] = {}
    dotenv = Path(path)
    if not dotenv.exists():
        return values
    for raw_line in dotenv.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        value = value.strip()
        # Remove only a single matching pair of surrounding quotes.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        values[key.strip()] = value
    return values
def resolve_dotenv_path(dotenv_path: str | Path = ".env") -> Path:
    """Pick the dotenv file to load.

    The requested path wins when it exists. Otherwise a ``semantic_index/.env``
    next to it is used when present, and the requested path is returned
    unchanged as the last resort.
    """
    requested = Path(dotenv_path)
    fallback = requested.parent / "semantic_index" / ".env"
    for candidate in (requested, fallback):
        if candidate.exists():
            return candidate
    return requested
def load_settings(dotenv_path: str | Path = ".env") -> Settings:
    """Build ``Settings`` from the dotenv file overlaid with ``os.environ``.

    Process environment variables take precedence over dotenv values.
    Unset optional keys become ``None``; the remaining keys use the defaults
    written below.
    """
    env: Dict[str, str] = dict(load_dotenv(resolve_dotenv_path(dotenv_path)))
    env.update(os.environ)
    lookup = env.get
    state_path = lookup("SEMANTIC_INDEX_REFRESH_STATE_PATH", ".cache/semantic_index/refresh_state.json")
    return Settings(
        openai_api_key=lookup("OPENAI_API_KEY"),
        qdrant_url=lookup("QDRANT_URL", "http://localhost:6333"),
        qdrant_api_key=lookup("QDRANT_API_KEY"),
        qdrant_collection=lookup("QDRANT_COLLECTION", "redmine_semantic_sample"),
        redmine_url=lookup("REDMINE_URL", "http://localhost"),
        redmine_api_key=lookup("REDMINE_API_KEY"),
        redmine_project_identifier=lookup("REDMINE_PROJECT_IDENTIFIER"),
        sample_limit=int(lookup("REDMINE_SAMPLE_LIMIT", "500")),
        bind_host=lookup("SEMANTIC_INDEX_HOST", "127.0.0.1"),
        bind_port=int(lookup("SEMANTIC_INDEX_PORT", "8787")),
        service_api_key=lookup("SEMANTIC_INDEX_API_KEY"),
        refresh_state_path=Path(state_path),
    )
+64
View File
@@ -0,0 +1,64 @@
from __future__ import annotations
from typing import Iterable, List, Optional, Protocol, Sequence
from .models import IndexDocument
class EmbeddingClient(Protocol):
    """Structural interface for a backend that computes embedding vectors."""

    def create_embeddings(self, model: str, inputs: Sequence[str], dimensions: Optional[int] = None) -> List[List[float]]:
        """Return the embedding vectors computed for ``inputs`` with ``model``."""
        ...
class OpenAIEmbeddingClient:
    """Embedding client backed by the ``openai`` package."""

    def __init__(self, api_key: Optional[str] = None) -> None:
        # Imported here so the module loads without ``openai`` installed.
        try:
            from openai import OpenAI
        except ImportError as exc:
            raise RuntimeError("Install openai to use live embeddings") from exc
        self.client = OpenAI(api_key=api_key)

    def create_embeddings(self, model: str, inputs: Sequence[str], dimensions: Optional[int] = None) -> List[List[float]]:
        """Request embeddings for ``inputs`` and return them in response order.

        ``dimensions`` is sent only when it is not ``None``.
        """
        request = {"model": model, "input": list(inputs)}
        if dimensions is not None:
            request["dimensions"] = dimensions
        items = self.client.embeddings.create(**request).data
        return [item.embedding for item in items]
class OpenAIEmbedder:
    """Validate texts and embed them in batches through an ``EmbeddingClient``."""

    def __init__(
        self,
        client: EmbeddingClient,
        model: str = "text-embedding-3-small",
        dimensions: int = 1536,
        batch_size: int = 100,
        max_chars: int = 12000,
    ) -> None:
        self.client = client
        self.model = model
        self.dimensions = dimensions
        self.batch_size = batch_size
        self.max_chars = max_chars

    def embed_documents(self, documents: Sequence[IndexDocument]) -> List[List[float]]:
        """Embed the ``text`` of each document, preserving order."""
        texts = [document.text for document in documents]
        return self.embed_texts(texts)

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string and return its vector."""
        vectors = self.embed_texts([text])
        return vectors[0]

    def embed_texts(self, texts: Iterable[str]) -> List[List[float]]:
        """Embed ``texts`` in slices of ``batch_size`` after validating them all.

        Raises:
            ValueError: if any text is blank or longer than ``max_chars``.
        """
        pending = list(texts)
        self._validate(pending)
        step = self.batch_size
        vectors: List[List[float]] = []
        for offset in range(0, len(pending), step):
            vectors += self.client.create_embeddings(
                self.model,
                pending[offset : offset + step],
                dimensions=self.dimensions,
            )
        return vectors

    def _validate(self, texts: Sequence[str]) -> None:
        """Reject blank texts and texts exceeding ``max_chars``."""
        limit = self.max_chars
        for candidate in texts:
            if candidate.strip() == "":
                raise ValueError("embedding text cannot be empty")
            if len(candidate) > limit:
                raise ValueError(f"embedding text exceeds {self.max_chars} characters")
+100
View File
@@ -0,0 +1,100 @@
from __future__ import annotations
from typing import Any, Dict, Iterable, List, Protocol, Sequence
from .models import IndexDocument
from .redmine import RedmineMapper
class RedmineSource(Protocol):
    """Structural interface for a provider of Redmine issues."""

    # Project the source is scoped to; BackfillService reassigns this while
    # iterating over several projects.
    project_identifier: str | None

    def recent_helpdesk_issues(self, limit: int) -> Iterable[Dict[str, Any]]:
        """Yield issue dictionaries; ``limit`` is the requested maximum count."""
        ...
class DocumentEmbedder(Protocol):
    """Structural interface for an object that embeds index documents."""

    def embed_documents(self, docs: Sequence[IndexDocument]) -> List[List[float]]:
        """Return the embedding vectors computed for ``docs``."""
        ...
class RebuildStore(Protocol):
    """Structural interface for a store that can rebuild one source's documents."""

    def rebuild_source(
        self,
        source: str,
        docs: Sequence[IndexDocument],
        vectors: Sequence[Sequence[float]],
        project_identifier: str | None = None,
    ) -> None:
        """Store ``docs`` with their ``vectors`` for ``source``.

        NOTE(review): presumably replaces the existing documents for the given
        source/project scope; confirm against the concrete store.
        """
        ...
class BackfillService:
    """Fetch Redmine issues, map them to documents, embed and store them."""

    def __init__(self, source: RedmineSource, embedder: DocumentEmbedder, store: RebuildStore, mapper: RedmineMapper | None = None) -> None:
        self.source = source
        self.embedder = embedder
        self.store = store
        self.mapper = mapper or RedmineMapper(redmine_url="")

    def _prepare(self, limit: int):
        """Fetch up to ``limit`` issues and return (issues, documents, vectors)."""
        issues = list(self.source.recent_helpdesk_issues(limit))
        mapped: List[IndexDocument] = []
        for issue in issues:
            mapped.extend(self.mapper.issue_to_documents(issue))
        documents = deduplicate_documents(mapped)
        # The embedder is skipped entirely when nothing was mapped.
        vectors = self.embedder.embed_documents(documents) if documents else []
        return issues, documents, vectors

    def backfill_redmine_sample(self, limit: int = 500) -> Dict[str, int | str]:
        """Rebuild the index for the currently configured project."""
        issues, documents, vectors = self._prepare(limit)
        self.store.rebuild_source("redmine", documents, vectors, project_identifier=self._project_identifier())
        return {"source": "redmine", "issues": len(issues), "documents": len(documents)}

    def backfill_redmine_projects(self, projects: Sequence[str], per_project_limit: int = 500) -> Dict[str, object]:
        """Rebuild several projects, applying the same limit to each."""
        return self.backfill_redmine_project_limits(dict.fromkeys(projects, per_project_limit))

    def backfill_redmine_project_limits(self, project_limits: Dict[str, int]) -> Dict[str, object]:
        """Rebuild each project in ``project_limits`` with its own issue limit.

        The source and mapper are temporarily pointed at each project; their
        original project identifiers are restored even if a project fails.
        """
        saved_source_project = getattr(self.source, "project_identifier", None)
        saved_mapper_project = getattr(self.mapper, "project_identifier", None)
        per_project: List[Dict[str, int | str]] = []
        issue_total = 0
        document_total = 0
        try:
            for project, project_limit in project_limits.items():
                for target in (self.source, self.mapper):
                    if hasattr(target, "project_identifier"):
                        target.project_identifier = project
                issues, documents, vectors = self._prepare(project_limit)
                self.store.rebuild_source("redmine", documents, vectors, project_identifier=project)
                per_project.append(
                    {"project_identifier": project, "issues": len(issues), "documents": len(documents)}
                )
                issue_total += len(issues)
                document_total += len(documents)
        finally:
            if hasattr(self.source, "project_identifier"):
                self.source.project_identifier = saved_source_project
            if hasattr(self.mapper, "project_identifier"):
                self.mapper.project_identifier = saved_mapper_project
        return {
            "source": "redmine",
            "projects": len(project_limits),
            "issues": issue_total,
            "documents": document_total,
            "project_results": per_project,
        }

    def _project_identifier(self) -> str | None:
        """Return the mapper's project when set, else the source's project."""
        from_mapper = getattr(self.mapper, "project_identifier", None)
        return from_mapper if from_mapper else getattr(self.source, "project_identifier", None)
def deduplicate_documents(documents: Sequence[IndexDocument]) -> List[IndexDocument]:
    """Keep one document per ``id``.

    The last document seen for an id wins, while output order follows the
    first appearance of each id.
    """
    latest_by_id = {document.id: document for document in documents}
    return list(latest_by_id.values())
+292
View File
@@ -0,0 +1,292 @@
from __future__ import annotations
import json
from collections import Counter
from typing import Any, Dict, Iterable, List, Optional
from .models import SearchQuery, SearchResult
from .redmine import RedmineMapper
def print_count(store: Any, source: Optional[str], project: Optional[str], doc_type: Optional[str]) -> None:
    """Print the number of indexed documents matching the filters."""
    print(store.count_documents(source=source, project_identifier=project, doc_type=doc_type))
def print_list(store: Any, limit: int, source: Optional[str], project: Optional[str], doc_type: Optional[str], full_text: bool) -> None:
    """Print up to ``limit`` indexed documents matching the filters."""
    filters = {"source": source, "project_identifier": project, "doc_type": doc_type}
    for entry in store.list_documents(limit=limit, **filters):
        print_document(entry, full_text=full_text)
def print_search(search_service: Any, query_text: str, limit: int, source: Optional[str], project: Optional[str], doc_type: Optional[str], full_text: bool) -> None:
    """Run a search and print each result.

    Snippets are requested unless ``full_text`` is set.
    """
    want_snippets = not full_text
    request = SearchQuery(
        text=query_text,
        source=source,
        project_identifier=project,
        doc_type=doc_type,
        limit=limit,
        include_snippets=want_snippets,
    )
    for hit in search_service.search(request):
        print_result(hit, full_text=full_text)
def print_show(search_service: Any, document_id: str) -> None:
    """Print one indexed document in full, or a not-found line."""
    found = search_service.get_document(document_id)
    if found is not None:
        print_document(found, full_text=True)
        return
    print(f"not found: {document_id}")
def print_preview_redmine(source: Any, redmine_url: str, project: Optional[str], limit: int, full_text: bool) -> None:
    """Print the documents that live Redmine issues would map to.

    The fetch-and-map step is delegated to ``preview_redmine_documents`` so
    the project scoping and restoration logic lives in one place. Nothing is
    embedded or written to the index.

    Args:
        source: Redmine issue source; its ``project_identifier`` is switched
            to ``project`` for the duration of the fetch when both exist.
        redmine_url: Base URL handed to the mapper.
        project: Project to preview, or ``None`` to keep the source's own.
        limit: Number of issues requested from the source.
        full_text: Forwarded to ``print_document``.
    """
    for document in preview_redmine_documents(source, redmine_url, project, limit):
        print_document(document, full_text=full_text)
def print_audit(store: Any, limit: int, source: Optional[str], project: Optional[str], doc_type: Optional[str], as_json: bool) -> None:
    """Audit up to ``limit`` indexed documents and print the report.

    With ``as_json`` the report is emitted as one JSON object with sorted
    keys; otherwise it is printed as one line per fact.
    """
    listed = store.list_documents(limit=limit, source=source, project_identifier=project, doc_type=doc_type)
    report = audit_documents(listed)
    if as_json:
        print(json.dumps(report, sort_keys=True))
        return

    total = report["total_documents"]
    lines = [f"documents={total}"]
    lines.extend(f"doc_type {name}={count}" for name, count in sorted(report["doc_type_counts"].items()))
    lines.extend(f"project {name}={count}" for name, count in sorted(report["project_counts"].items()))
    lines.append(f"contact_metadata {report['contact_metadata_count']}/{report['total_documents']}")
    lines.append(f"helpdesk_contact_metadata {report['helpdesk_contact_metadata_count']}/{report['helpdesk_documents']}")
    lines.append(f"attachments={report['attachment_documents']}")
    lines.extend(f"missing_contact {document_id}" for document_id in report["missing_helpdesk_contact_metadata"])
    lines.extend(f"unexpected_attachment {document_id}" for document_id in report["unexpected_attachment_documents"])
    for line in lines:
        print(line)
def print_compare_redmine(store: Any, source: Any, redmine_url: str, project: Optional[str], limit: int, as_json: bool) -> None:
    """Compare live Redmine preview documents with indexed ones and print the diff.

    The index listing is requested with a cap of ``max(5000, limit * 100)``
    documents for the ``redmine`` source.
    """
    preview = preview_redmine_documents(source, redmine_url, project, limit)
    index_cap = max(5000, limit * 100)
    indexed = store.list_documents(limit=index_cap, source="redmine", project_identifier=project)
    report = compare_documents(preview, indexed)
    if as_json:
        print(json.dumps(report, sort_keys=True))
        return

    lines = [
        f"preview_documents={report['preview_documents']}",
        f"indexed_documents={report['indexed_documents']}",
    ]
    lines.extend(f"missing {document_id}" for document_id in report["missing"])
    lines.extend(f"stale {document_id}" for document_id in report["stale"])
    lines.extend(f"contact_mismatch {mismatch['id']}" for mismatch in report["contact_mismatches"])
    for line in lines:
        print(line)
def print_smoke_search(
    search_service: Any,
    project: Optional[str],
    email: str,
    issue_id: Optional[int],
    order_token: Optional[str],
    natural_query: str,
    as_json: bool,
) -> None:
    """Run the smoke-search checks and print their outcome.

    JSON mode prints one object holding the project and all checks. Text mode
    prints a PASS/FAIL line per check followed by one line per result.
    """
    checks = smoke_search(search_service, project, email, issue_id, order_token, natural_query)
    if as_json:
        print(json.dumps({"project_identifier": project, "checks": checks}, sort_keys=True))
        return

    for check in checks:
        verdict = "PASS" if check["passed"] else "FAIL"
        print(f"{verdict} {check['kind']} {check['query']}")
        for hit in check["results"]:
            payload = hit["payload"]
            summary = (
                f" {hit['id']} score={hit['score']:.4f} "
                f"doc_type={payload.get('doc_type')} issue={payload.get('issue_id')} "
                f"contact={contact_display(payload)} url={hit['citation'].get('url')}"
            )
            print(summary)
def audit_documents(documents: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Summarise coverage facts about a list of indexed documents.

    A document counts as having contact metadata only when its payload holds
    both ``contact_id`` and ``contact_email``. Helpdesk documents of type
    issue, journal, message or contact that lack it are listed as missing.
    Every document of type ``attachment`` is listed as unexpected.
    """
    contact_bearing_types = {"issue", "journal", "message", "contact"}
    doc_type_counts: Counter = Counter()
    project_counts: Counter = Counter()
    missing_contact: List[str] = []
    missing_helpdesk_contact: List[str] = []
    unexpected_attachments: List[str] = []
    contact_metadata_count = 0
    helpdesk_documents = 0
    helpdesk_contact_metadata_count = 0

    for document in documents:
        payload = document.get("payload") or {}
        doc_type_counts[str(payload.get("doc_type") or "unknown")] += 1
        project_counts[str(payload.get("project_identifier") or "unknown")] += 1

        doc_type = str(payload.get("doc_type") or "")
        has_contact = bool(payload.get("contact_id") and payload.get("contact_email"))
        is_helpdesk = bool(payload.get("has_helpdesk_ticket"))

        if has_contact:
            contact_metadata_count += 1
        if is_helpdesk:
            helpdesk_documents += 1
            if has_contact:
                helpdesk_contact_metadata_count += 1
        if is_helpdesk and not has_contact and doc_type in contact_bearing_types:
            # Both report keys receive the same ids, kept as separate lists.
            missing_contact.append(str(document.get("id")))
            missing_helpdesk_contact.append(str(document.get("id")))
        if doc_type == "attachment":
            unexpected_attachments.append(str(document.get("id")))

    return {
        "total_documents": len(documents),
        "doc_type_counts": dict(doc_type_counts),
        "project_counts": dict(project_counts),
        "contact_metadata_count": contact_metadata_count,
        "helpdesk_documents": helpdesk_documents,
        "helpdesk_contact_metadata_count": helpdesk_contact_metadata_count,
        "missing_contact_metadata": missing_contact,
        "missing_helpdesk_contact_metadata": missing_helpdesk_contact,
        "attachment_documents": len(unexpected_attachments),
        "unexpected_attachment_documents": unexpected_attachments,
    }
def preview_redmine_documents(source: Any, redmine_url: str, project: Optional[str], limit: int) -> List[Dict[str, Any]]:
    """Map the most recent helpdesk issues to plain document dicts.

    When ``project`` is given and ``source`` exposes ``project_identifier``,
    the source is temporarily pointed at that project. The previous value is
    always restored afterwards, including when fetching or mapping raises.
    Each returned dict has the keys ``id``, ``text`` and ``payload``.
    """
    scoped = hasattr(source, "project_identifier")
    saved_project = getattr(source, "project_identifier", None)
    if project and scoped:
        source.project_identifier = project
    try:
        mapper = RedmineMapper(redmine_url=redmine_url, project_identifier=project)
        mapped = [
            document
            for issue in source.recent_helpdesk_issues(limit)
            for document in mapper.issue_to_documents(issue)
        ]
        return [{"id": item.id, "text": item.text, "payload": item.payload} for item in mapped]
    finally:
        if hasattr(source, "project_identifier"):
            source.project_identifier = saved_project
def compare_documents(preview_documents: List[Dict[str, Any]], indexed_documents: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Diff freshly mapped documents against what the index currently holds.

    Documents are matched by ``str(id)``. A preview document is reported as
    ``missing`` when the index has no entry for it, as ``stale`` when the
    ``source_hash`` values differ, and as a contact mismatch when any of the
    contact payload fields differ. Both input sizes are included in the report.
    """
    contact_fields = ("contact_id", "contact_name", "contact_email", "contact_company")
    index_lookup = {str(entry.get("id")): entry for entry in indexed_documents}

    absent = []
    outdated = []
    contact_diffs = []
    for candidate in preview_documents:
        key = str(candidate.get("id"))
        stored = index_lookup.get(key)
        if stored is None:
            absent.append(key)
            continue

        fresh_meta = candidate.get("payload") or {}
        stored_meta = stored.get("payload") or {}
        if fresh_meta.get("source_hash") != stored_meta.get("source_hash"):
            outdated.append(key)
        if any(fresh_meta.get(name) != stored_meta.get(name) for name in contact_fields):
            contact_diffs.append({"id": key})

    return {
        "preview_documents": len(preview_documents),
        "indexed_documents": len(indexed_documents),
        "missing": absent,
        "stale": outdated,
        "contact_mismatches": contact_diffs,
    }
def smoke_search(
    search_service: Any,
    project: Optional[str],
    email: str,
    issue_id: Optional[int],
    order_token: Optional[str],
    natural_query: str,
) -> List[Dict[str, Any]]:
    """Run the smoke-test queries and return one check dict per query.

    The email query always runs. The issue, order-token and natural-language
    queries run only when their inputs are provided, in that order.
    """
    outcomes = [run_smoke_query(search_service, "email", email, project, expected_email=email)]
    if issue_id is not None:
        outcomes.append(
            run_smoke_query(search_service, "issue", str(issue_id), project, expected_issue_id=issue_id)
        )
    for kind, text in (("order", order_token), ("natural", natural_query)):
        if text:
            outcomes.append(run_smoke_query(search_service, kind, text, project))
    return outcomes
def run_smoke_query(
    search_service: Any,
    kind: str,
    text: str,
    project: Optional[str],
    expected_email: Optional[str] = None,
    expected_issue_id: Optional[int] = None,
) -> Dict[str, Any]:
    """Execute one Redmine-scoped smoke query and evaluate it.

    The query asks for at most five results and is filtered by
    ``expected_issue_id`` when given. A check passes when at least one result
    comes back and, for each expectation supplied, some result's payload
    carries the expected ``contact_email`` / ``issue_id``.
    """
    hits = search_service.search(
        SearchQuery(
            text=text,
            source="redmine",
            project_identifier=project,
            issue_id=expected_issue_id,
            limit=5,
        )
    )
    rows = [hit.to_dict(include_snippet=True) for hit in hits]

    def payload_has(field: str, wanted: Any) -> bool:
        return any((row["payload"] or {}).get(field) == wanted for row in rows)

    ok = bool(rows)
    if expected_email:
        ok = ok and payload_has("contact_email", expected_email)
    if expected_issue_id is not None:
        ok = ok and payload_has("issue_id", expected_issue_id)
    return {"kind": kind, "query": text, "passed": ok, "results": rows}
def print_result(result: SearchResult, full_text: bool) -> None:
    """Print one search hit: id and score, metadata, URL, then text.

    The text is printed whole when ``full_text`` is true; otherwise it is
    passed through ``snippet`` first. A trailing blank line separates hits.
    """
    header = f"{result.id} score={result.score:.4f}"
    print(header)
    print_metadata(result.payload)
    print(f"url={result.citation.get('url')}")
    if full_text:
        print(result.text)
    else:
        print(snippet(result.text))
    print()
def print_document(document: Dict[str, Any], full_text: bool) -> None:
    """Print one stored document: id, metadata, optional URL, then text.

    The URL line is printed only when the payload has a truthy
    ``redmine_url``. The text is printed whole when ``full_text`` is true and
    shortened with ``snippet`` otherwise. A trailing blank line follows.
    """
    meta = document.get("payload") or {}
    print(document.get("id"))
    print_metadata(meta)
    link = meta.get("redmine_url")
    if link:
        print(f"url={link}")
    body = document.get("text", "")
    if full_text:
        print(body)
    else:
        print(snippet(body))
    print()
def print_metadata(payload: Dict[str, Any]) -> None:
    """Print a single ``name=value`` line for the payload's key metadata.

    Fields whose value is ``None`` are omitted; the contact field is the
    combined string produced by ``contact_display``.
    """
    labelled = {
        "source": payload.get("source"),
        "doc_type": payload.get("doc_type"),
        "issue": payload.get("issue_id"),
        "project": payload.get("project_identifier"),
        "contact": contact_display(payload),
        "created": payload.get("created_on"),
        "updated": payload.get("updated_on"),
    }
    parts = [f"{label}={value}" for label, value in labelled.items() if value is not None]
    print(" ".join(parts))
def contact_display(payload: Dict[str, Any]) -> Optional[str]:
    """Render the payload's contact fields as ``#id | name | email | company``.

    The id is included whenever it is not ``None`` (so ``0`` is shown); the
    other fields are included only when truthy. Returns ``None`` when no
    contact information is present at all.
    """
    parts = []
    identifier = payload.get("contact_id")
    if identifier is not None:
        parts.append(f"#{identifier}")
    parts.extend(
        str(payload[key])
        for key in ("contact_name", "contact_email", "contact_company")
        if payload.get(key)
    )
    if not parts:
        return None
    return " | ".join(parts)
def snippet(text: str, max_chars: int = 240) -> str:
    """Collapse whitespace in *text* and shorten it to at most *max_chars*.

    Runs of whitespace (including newlines) become single spaces. Text that
    already fits is returned unchanged; longer text is cut and suffixed with
    ``"..."`` so the total length stays within ``max_chars``.

    When ``max_chars`` is smaller than the ellipsis itself there is no room
    for a suffix, so the text is hard-truncated instead. Previously the
    negative slice bound made the result longer than requested.
    """
    ellipsis = "..."
    compact = " ".join(text.split())
    if len(compact) <= max_chars:
        return compact
    if max_chars < len(ellipsis):
        # No room for the ellipsis: return a plain prefix (empty for <= 0).
        return compact[: max(max_chars, 0)]
    return compact[: max_chars - len(ellipsis)].rstrip() + ellipsis
+80
View File
@@ -0,0 +1,80 @@
from __future__ import annotations
import json
import sys
from typing import Any, Dict, Optional
from .models import SearchQuery, search_response
class SemanticMCP:
    """Tool facade that routes named tool calls to the semantic-index services.

    ``backfill_service``, ``store`` and ``refresh_service`` may be ``None``;
    the tools that need them then return an ``{"error": ...}`` dict instead
    of raising.
    """

    def __init__(self, search_service: Any, backfill_service: Optional[Any], store: Optional[Any] = None, refresh_service: Optional[Any] = None) -> None:
        self.search_service = search_service
        self.backfill_service = backfill_service
        self.store = store
        self.refresh_service = refresh_service

    def tools(self) -> Dict[str, Dict[str, str]]:
        """Return the tool names handled by ``call_tool`` with descriptions."""
        catalogue = (
            ("semantic_search", "Search the semantic index and return cited snippets."),
            ("semantic_get_document", "Fetch one indexed document by stable id."),
            ("semantic_list_projects", "List indexed project identifiers and document counts."),
            ("semantic_backfill_redmine_sample", "Rebuild the Redmine sample collection."),
            ("semantic_refresh_redmine", "Refresh recent Redmine issues without re-embedding unchanged documents."),
        )
        return {tool: {"description": summary} for tool, summary in catalogue}

    def call_tool(self, name: str, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Dispatch one tool call by name.

        Raises:
            ValueError: if ``name`` is not one of the known tools.
        """
        if name == "semantic_search":
            return self._search(arguments)
        if name == "semantic_get_document":
            return self._get_document(arguments)
        if name == "semantic_list_projects":
            return self._list_projects(arguments)
        if name == "semantic_backfill_redmine_sample":
            return self._backfill_sample(arguments)
        if name == "semantic_refresh_redmine":
            return self._refresh(arguments)
        raise ValueError(f"unknown tool: {name}")

    def _search(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Build a ``SearchQuery`` from the arguments and run it."""
        query = SearchQuery(
            # ``query`` is preferred, ``text`` is accepted as an alias.
            text=arguments.get("query") or arguments.get("text") or "",
            source=arguments.get("source"),
            project_id=arguments.get("project_id"),
            project_identifier=arguments.get("project_identifier"),
            doc_type=arguments.get("doc_type"),
            issue_id=arguments.get("issue_id"),
            contact_id=arguments.get("contact_id"),
            contact_email=arguments.get("contact_email"),
            date_from=arguments.get("date_from"),
            date_to=arguments.get("date_to"),
            limit=int(arguments.get("limit", 10)),
            include_snippets=bool(arguments.get("include_snippets", True)),
        )
        hits = self.search_service.search(query)
        return search_response(query, hits)

    def _get_document(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Fetch a document by ``id`` or return a ``not_found`` error dict."""
        document_id = arguments["id"]
        found = self.search_service.get_document(document_id)
        return found or {"error": "not_found", "id": document_id}

    def _list_projects(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """List projects from the store; ``source`` defaults to ``redmine``."""
        if self.store is None:
            return {"error": "project_listing_unavailable"}
        source = arguments.get("source", "redmine")
        return {"projects": self.store.list_projects(source=source)}

    def _backfill_sample(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Run the Redmine sample backfill; ``limit`` defaults to 500."""
        if self.backfill_service is None:
            return {"error": "backfill_unavailable"}
        sample_size = int(arguments.get("limit", 500))
        return self.backfill_service.backfill_redmine_sample(limit=sample_size)

    def _refresh(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
        """Refresh one or more projects.

        ``project_limits`` (project -> limit) wins when present and
        non-empty; otherwise a single ``project_identifier`` plus ``limit``
        (default 500) is used, and its absence yields ``project_required``.
        """
        if self.refresh_service is None:
            return {"error": "refresh_unavailable"}
        limits = arguments.get("project_limits")
        if not limits:
            single_project = arguments.get("project_identifier")
            if not single_project:
                return {"error": "project_required"}
            limits = {single_project: int(arguments.get("limit", 500))}
        normalized = {str(project): int(limit) for project, limit in limits.items()}
        return self.refresh_service.refresh_redmine_project_limits(
            normalized,
            dry_run=bool(arguments.get("dry_run", False)),
            force_rebuild=bool(arguments.get("force_rebuild", False)),
            overlap_minutes=int(arguments.get("overlap_minutes", 15)),
        )
def serve_stdio(mcp: SemanticMCP) -> None:
    """Serve newline-delimited JSON tool requests from stdin to stdout.

    Each non-blank input line must be a JSON object with a ``name`` and
    optional ``id`` / ``arguments``. One JSON line is written (and flushed)
    per request: ``{"id", "result"}`` on success or ``{"id", "error"}`` on
    failure.

    A bad request never terminates the loop: malformed JSON, a non-object
    request, a failing tool, and a result that cannot be serialised are all
    reported as an error response for that line. Blank lines are ignored.
    """
    for line in sys.stdin:
        if not line.strip():
            continue
        request_id = None
        try:
            request = json.loads(line)
            if not isinstance(request, dict):
                raise ValueError("request must be a JSON object")
            request_id = request.get("id")
            result = mcp.call_tool(request["name"], request.get("arguments") or {})
            # Serialise inside the try so an unserialisable result becomes
            # an error response instead of crashing the server.
            encoded = json.dumps({"id": request_id, "result": result})
        except Exception as exc:
            # Per-request boundary: report the failure and keep serving.
            encoded = json.dumps({"id": request_id, "error": str(exc)})
        print(encoded, flush=True)
+100
View File
@@ -0,0 +1,100 @@
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Dict, Optional
# Free-form metadata mapping (string keys, arbitrary values) attached to
# documents, search results, and citations in this module.
Payload = Dict[str, Any]
@dataclass(frozen=True)
class IndexDocument:
    """Immutable unit of indexable content: an id, its text, and a payload.

    Raises:
        ValueError: on construction when ``id`` or ``text`` is empty or
            whitespace-only.
    """

    id: str
    text: str
    payload: Payload = field(default_factory=dict)

    def __post_init__(self) -> None:
        id_is_blank = not self.id.strip()
        if id_is_blank:
            raise ValueError("document id is required")
        text_is_blank = not self.text.strip()
        if text_is_blank:
            raise ValueError("document text is required")
@dataclass(frozen=True)
class SearchQuery:
    """Immutable search request: free text plus optional metadata filters.

    Filter fields left as ``None`` are not applied. ``limit`` must be within
    1..100 inclusive.

    Raises:
        ValueError: on construction when ``text`` is blank or ``limit`` is
            out of range.
    """

    text: str
    source: Optional[str] = None
    project_id: Optional[int] = None
    project_identifier: Optional[str] = None
    doc_type: Optional[str] = None
    issue_id: Optional[int] = None
    contact_id: Optional[int] = None
    contact_email: Optional[str] = None
    date_from: Optional[str] = None
    date_to: Optional[str] = None
    limit: int = 10
    include_snippets: bool = True

    def __post_init__(self) -> None:
        text_is_blank = not self.text.strip()
        if text_is_blank:
            raise ValueError("search text is required")
        limit_out_of_range = self.limit < 1 or self.limit > 100
        if limit_out_of_range:
            raise ValueError("limit must be between 1 and 100")
@dataclass(frozen=True)
class SearchResult:
    """One scored hit from the index, with its text and stored payload."""

    id: str
    score: float
    text: str
    payload: Payload

    @property
    def snippet(self) -> str:
        """The first 500 characters of the hit's text."""
        return self.text[:500]

    @property
    def citation(self) -> Payload:
        """Citation fields lifted from the payload, keyed for responses.

        ``url`` comes from ``redmine_url`` and ``record_id`` from
        ``source_record_id``; absent payload keys yield ``None``.
        """
        meta = self.payload
        cited: Payload = {"id": self.id}
        same_named = (
            "source",
            "doc_type",
            "issue_id",
            "project_identifier",
            "contact_id",
            "contact_name",
            "contact_email",
        )
        for key in same_named:
            cited[key] = meta.get(key)
        cited["url"] = meta.get("redmine_url")
        cited["record_id"] = meta.get("source_record_id")
        return cited

    def to_dict(self, include_snippet: bool = True) -> Payload:
        """Serialise the hit; the ``snippet`` key is added only on request."""
        serialised: Payload = {
            "id": self.id,
            "score": self.score,
            "payload": self.payload,
            "citation": self.citation,
        }
        if not include_snippet:
            return serialised
        serialised["snippet"] = self.snippet
        return serialised
def search_response(query: SearchQuery, results: list[SearchResult]) -> Payload:
    """Build the response envelope for a completed search.

    The envelope echoes the query text, the filters that were actually set
    (``None`` values are dropped; ``limit`` is always present), and each
    result serialised with or without snippets per ``query.include_snippets``.
    """
    filter_names = (
        "source",
        "project_id",
        "project_identifier",
        "doc_type",
        "issue_id",
        "contact_id",
        "contact_email",
        "date_from",
        "date_to",
        "limit",
    )
    applied = {}
    for name in filter_names:
        value = getattr(query, name)
        if value is not None:
            applied[name] = value
    return {
        "query": query.text,
        "filters": applied,
        "results": [hit.to_dict(include_snippet=query.include_snippets) for hit in results],
    }
+219
View File
@@ -0,0 +1,219 @@
from __future__ import annotations
import uuid
from typing import Any, Dict, List, Optional, Sequence
from collections import Counter
from .models import IndexDocument, SearchQuery, SearchResult
def point_id_for_document(document_id: str) -> str:
    """Derive the storage point id for a document id.

    The id is the string form of a version-5 UUID computed from
    ``document_id`` in the URL namespace, so the same document id always
    maps to the same point id.
    """
    derived = uuid.uuid5(uuid.NAMESPACE_URL, document_id)
    return str(derived)
def build_filter(query: SearchQuery) -> Dict[str, List[Dict[str, Any]]]:
    """Translate a query's filters into a backend-neutral ``must`` list.

    Every equality field that is not ``None`` becomes a
    ``{"key", "match": {"value"}}`` condition. ``date_from`` / ``date_to``
    (when truthy) become a single ``created_on`` range condition with
    ``gte`` / ``lte`` bounds.
    """
    equality_names = (
        "source",
        "project_id",
        "project_identifier",
        "doc_type",
        "issue_id",
        "contact_id",
        "contact_email",
    )
    conditions: List[Dict[str, Any]] = [
        {"key": name, "match": {"value": getattr(query, name)}}
        for name in equality_names
        if getattr(query, name) is not None
    ]

    bounds: Dict[str, str] = {}
    if query.date_from:
        bounds["gte"] = query.date_from
    if query.date_to:
        bounds["lte"] = query.date_to
    if bounds:
        conditions.append({"key": "created_on", "range": bounds})

    return {"must": conditions}
class QdrantStore:
    """Document store backed by a Qdrant collection.

    Documents are stored as points whose id comes from
    ``point_id_for_document``. The original document id and text are kept in
    the point payload under ``document_id`` and ``text`` and are stripped
    back out when documents are read. Every operation first makes sure the
    collection exists.
    """

    def __init__(self, url: str, api_key: Optional[str], collection: str, vector_size: int = 1536, upsert_batch_size: int = 64) -> None:
        # Imported lazily so the module can be loaded without qdrant-client.
        try:
            from qdrant_client import QdrantClient
            from qdrant_client.http import models as qmodels
        except ImportError as exc:
            raise RuntimeError("Install qdrant-client to use live Qdrant storage") from exc
        self.client = QdrantClient(url=url, api_key=api_key)
        self.collection = collection
        self.vector_size = vector_size
        self.upsert_batch_size = upsert_batch_size
        self.qmodels = qmodels

    def ensure_collection(self) -> None:
        """Create the collection (cosine distance) unless it already exists."""
        for known in self.client.get_collections().collections:
            if known.name == self.collection:
                return
        vector_params = self.qmodels.VectorParams(
            size=self.vector_size,
            distance=self.qmodels.Distance.COSINE,
        )
        self.client.create_collection(collection_name=self.collection, vectors_config=vector_params)

    def upsert(self, documents: Sequence[IndexDocument], vectors: Sequence[Sequence[float]]) -> None:
        """Write documents and their vectors in batches of ``upsert_batch_size``.

        Raises:
            ValueError: if the two sequences differ in length.
        """
        if len(documents) != len(vectors):
            raise ValueError("documents and vectors length mismatch")
        self.ensure_collection()
        points = []
        for document, vector in zip(documents, vectors):
            stored_payload = {**document.payload, "document_id": document.id, "text": document.text}
            points.append(
                self.qmodels.PointStruct(
                    id=point_id_for_document(document.id),
                    vector=list(vector),
                    payload=stored_payload,
                )
            )
        batch_size = self.upsert_batch_size
        for start in range(0, len(points), batch_size):
            self.client.upsert(collection_name=self.collection, points=points[start : start + batch_size])

    def delete_by_source(self, source: str, project_identifier: Optional[str] = None) -> None:
        """Delete every point matching the source (and project, if given)."""
        self.ensure_collection()
        # SearchQuery requires non-blank text; "*" is a placeholder because
        # only the filter fields are used here.
        criteria = SearchQuery(text="*", source=source, project_identifier=project_identifier)
        selector = self.qmodels.FilterSelector(filter=self._to_qdrant_filter(build_filter(criteria)))
        self.client.delete(collection_name=self.collection, points_selector=selector)

    def delete_documents(self, document_ids: Sequence[str]) -> None:
        """Delete the points for the given document ids; no-op when empty."""
        self.ensure_collection()
        if not document_ids:
            return
        point_ids = [point_id_for_document(document_id) for document_id in document_ids]
        self.client.delete(
            collection_name=self.collection,
            points_selector=self.qmodels.PointIdsList(points=point_ids),
        )

    def rebuild_source(
        self,
        source: str,
        documents: Sequence[IndexDocument],
        vectors: Sequence[Sequence[float]],
        project_identifier: Optional[str] = None,
    ) -> None:
        """Replace a source's documents: delete by source, then upsert."""
        self.delete_by_source(source, project_identifier=project_identifier)
        self.upsert(documents, vectors)

    def search(self, vector: Sequence[float], query: SearchQuery, limit: int) -> List[SearchResult]:
        """Run a filtered vector search and return the hits as results.

        Uses the client's ``query_points`` when it is available and falls
        back to ``search`` otherwise.
        """
        self.ensure_collection()
        shared = {
            "collection_name": self.collection,
            "query_filter": self._to_qdrant_filter(build_filter(query)),
            "limit": limit,
            "with_payload": True,
        }
        if hasattr(self.client, "query_points"):
            hits = self.client.query_points(query=list(vector), **shared).points
        else:
            hits = self.client.search(query_vector=list(vector), **shared)
        return [self._point_to_result(hit) for hit in hits]

    def get_document(self, document_id: str) -> Optional[Dict[str, Any]]:
        """Fetch one document by its document id, or ``None`` if absent."""
        self.ensure_collection()
        found = self.client.retrieve(
            collection_name=self.collection,
            ids=[point_id_for_document(document_id)],
            with_payload=True,
        )
        if not found:
            return None
        meta = dict(found[0].payload or {})
        body = meta.pop("text", "")
        meta.pop("document_id", None)
        return {"id": document_id, "text": body, "payload": meta}

    def count_documents(
        self,
        source: Optional[str] = None,
        project_identifier: Optional[str] = None,
        doc_type: Optional[str] = None,
    ) -> int:
        """Return the exact number of points matching the given filters."""
        self.ensure_collection()
        criteria = SearchQuery(text="*", source=source, project_identifier=project_identifier, doc_type=doc_type)
        tally = self.client.count(
            collection_name=self.collection,
            count_filter=self._to_qdrant_filter(build_filter(criteria)),
            exact=True,
        )
        return int(tally.count)

    def list_documents(
        self,
        limit: int = 10,
        source: Optional[str] = None,
        project_identifier: Optional[str] = None,
        doc_type: Optional[str] = None,
        issue_id: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """Scroll through matching points and return up to ``limit`` documents."""
        self.ensure_collection()
        criteria = SearchQuery(
            text="*",
            source=source,
            project_identifier=project_identifier,
            doc_type=doc_type,
            issue_id=issue_id,
        )
        scroll_filter = self._to_qdrant_filter(build_filter(criteria))
        collected = []
        cursor = None
        while len(collected) < limit:
            remaining = limit - len(collected)
            page, cursor = self.client.scroll(
                collection_name=self.collection,
                scroll_filter=scroll_filter,
                limit=remaining,
                with_payload=True,
                with_vectors=False,
                offset=cursor,
            )
            collected.extend(page[:remaining])
            # Stop when the server reports no further page or returned nothing.
            if not cursor or not page:
                break
        return [self._record_to_document(record) for record in collected]

    def list_projects(self, source: Optional[str] = None, limit: int = 5000) -> List[Dict[str, Any]]:
        """Count documents per project identifier, sorted by identifier.

        Only the first ``limit`` documents are examined; documents without a
        project identifier are ignored.
        """
        per_project = Counter()
        for document in self.list_documents(limit=limit, source=source):
            identifier = (document.get("payload") or {}).get("project_identifier")
            if identifier:
                per_project[str(identifier)] += 1
        return [
            {"project_identifier": identifier, "document_count": total}
            for identifier, total in sorted(per_project.items())
        ]

    def _to_qdrant_filter(self, raw_filter: Dict[str, List[Dict[str, Any]]]) -> Any:
        """Convert a ``build_filter`` dict to a Qdrant filter, or ``None`` if empty."""
        models = self.qmodels
        converted = []
        for condition in raw_filter.get("must", []):
            if "match" in condition:
                matcher = models.MatchValue(value=condition["match"]["value"])
                converted.append(models.FieldCondition(key=condition["key"], match=matcher))
            elif "range" in condition:
                window = models.DatetimeRange(**condition["range"])
                converted.append(models.FieldCondition(key=condition["key"], range=window))
        if not converted:
            return None
        return models.Filter(must=converted)

    @staticmethod
    def _split_payload(raw_payload: Any, fallback_id: Any) -> Any:
        """Separate the stored id and text from the rest of a point payload."""
        meta = dict(raw_payload or {})
        body = meta.pop("text", "")
        document_id = meta.pop("document_id", str(fallback_id))
        return document_id, body, meta

    def _point_to_result(self, point: Any) -> SearchResult:
        """Turn a scored point into a ``SearchResult``."""
        document_id, body, meta = self._split_payload(point.payload, point.id)
        return SearchResult(id=document_id, score=float(point.score), text=body, payload=meta)

    def _record_to_document(self, record: Any) -> Dict[str, Any]:
        """Turn a scrolled record into an ``id`` / ``text`` / ``payload`` dict."""
        document_id, body, meta = self._split_payload(record.payload, record.id)
        return {"id": document_id, "text": body, "payload": meta}
+243
View File
@@ -0,0 +1,243 @@
from __future__ import annotations
import hashlib
import json
import urllib.parse
import urllib.request
from typing import Any, Dict, Iterable, List, Optional
from .chunking import chunk_text
from .models import IndexDocument, Payload
# A Redmine issue as a decoded JSON object (string keys, arbitrary values).
Issue = Dict[str, Any]
class RedmineMapper:
    """Convert Redmine issue dicts into chunked ``IndexDocument`` objects.

    One issue yields documents for the issue itself, each non-empty journal
    note, each non-empty message, and the issue's contact. Every record is
    split with ``chunk_text`` and each chunk becomes one document whose id
    ends in ``:chunk:<index>``.
    """

    def __init__(self, redmine_url: str, chunk_chars: int = 3500, project_identifier: Optional[str] = None) -> None:
        self.redmine_url = redmine_url.rstrip("/")
        self.chunk_chars = chunk_chars
        self.project_identifier = project_identifier

    def issue_to_documents(self, issue: Issue) -> List[IndexDocument]:
        """Return all documents for an issue: issue, journals, messages, contact."""
        builders = (
            self._issue_documents,
            self._journal_documents,
            self._message_documents,
            self._contact_documents,
        )
        return [document for build in builders for document in build(issue)]

    def _issue_documents(self, issue: Issue) -> List[IndexDocument]:
        """Documents for the issue's subject, description and contact text."""
        number = int(issue["id"])
        title = issue.get("subject") or ""
        details = issue.get("description") or ""
        contact_block = self._contact_text(self._issue_contact(issue))
        body = f"Issue #{number}: {title}\n\n{details}\n\n{contact_block}".strip()
        return self._documents_for_record(
            base_id=f"redmine:issue:{number}",
            text=body,
            issue=issue,
            doc_type="issue",
            source_record_id=f"issue:{number}",
            record=issue,
        )

    def _journal_documents(self, issue: Issue) -> List[IndexDocument]:
        """Documents for every journal entry that has non-blank notes."""
        number = int(issue["id"])
        collected: List[IndexDocument] = []
        for entry in issue.get("journals") or []:
            note_text = entry.get("notes") or ""
            if not note_text.strip():
                continue
            details = {
                "journal_id": entry.get("id"),
                "visibility": "private" if entry.get("private_notes") else "public",
                # Fall back to the issue's last update when the journal has
                # no timestamp of its own.
                "created_on": entry.get("created_on") or issue.get("updated_on"),
            }
            collected += self._documents_for_record(
                base_id=f"redmine:issue:{number}:journal:{entry['id']}",
                text=note_text,
                issue=issue,
                doc_type="journal",
                source_record_id=f"journal:{entry['id']}",
                record=entry,
                extra=details,
            )
        return collected

    def _message_documents(self, issue: Issue) -> List[IndexDocument]:
        """Documents for every message that has a non-blank body."""
        number = int(issue["id"])
        collected: List[IndexDocument] = []
        messages = issue.get("messages") or issue.get("journal_messages") or []
        for item in messages:
            content = item.get("body") or item.get("content") or item.get("message") or ""
            if not content.strip():
                continue
            details = {
                "message_id": item.get("id"),
                "direction": item.get("direction"),
                "created_on": item.get("created_on") or issue.get("updated_on"),
            }
            collected += self._documents_for_record(
                base_id=f"redmine:issue:{number}:message:{item['id']}",
                text=content,
                issue=issue,
                doc_type="message",
                source_record_id=f"message:{item['id']}",
                record=item,
                extra=details,
            )
        return collected

    def _contact_documents(self, issue: Issue) -> List[IndexDocument]:
        """Documents for the issue's contact; empty without an id or any text."""
        person = self._issue_contact(issue)
        person_id = person.get("id")
        if not person_id:
            return []
        summary = self._contact_text(person)
        if not summary.strip():
            return []
        return self._documents_for_record(
            base_id=f"redmine:contact:{person_id}:issue:{issue['id']}",
            text=summary,
            issue=issue,
            doc_type="contact",
            source_record_id=f"contact:{person_id}",
            record=person,
        )

    def _documents_for_record(
        self,
        base_id: str,
        text: str,
        issue: Issue,
        doc_type: str,
        source_record_id: str,
        record: Dict[str, Any],
        extra: Optional[Payload] = None,
    ) -> List[IndexDocument]:
        """Chunk ``text`` and wrap each chunk with the record's payload.

        ``extra`` entries override the base payload, except those whose
        value is ``None``, which are ignored.
        """
        pieces = chunk_text(text, max_chars=self.chunk_chars)
        shared = self._base_payload(issue, doc_type, source_record_id, record)
        if extra:
            for key, value in extra.items():
                if value is not None:
                    shared[key] = value
        documents: List[IndexDocument] = []
        for position, piece in enumerate(pieces):
            documents.append(
                IndexDocument(
                    id=f"{base_id}:chunk:{position}",
                    text=piece,
                    payload={**shared, "chunk_index": position},
                )
            )
        return documents

    def _base_payload(self, issue: Issue, doc_type: str, source_record_id: str, record: Dict[str, Any]) -> Payload:
        """Payload fields shared by every document derived from ``record``.

        Timestamps prefer the record's own values over the issue's, and
        ``source_hash`` is the ``stable_hash`` of the record.
        """
        project_info = issue.get("project") or {}
        ticket = issue.get("helpdesk_ticket") or {}
        person = self._issue_contact(issue)
        number = int(issue["id"])
        link = issue.get("url") or f"{self.redmine_url}/issues/{number}"
        return {
            "source": "redmine",
            "doc_type": doc_type,
            "issue_id": number,
            "project_id": project_info.get("id"),
            "project_identifier": project_info.get("identifier") or self.project_identifier,
            "project_name": project_info.get("name"),
            "has_helpdesk_ticket": bool(ticket.get("id")),
            "helpdesk_ticket_id": ticket.get("id"),
            "contact_id": person.get("id"),
            "contact_email": person.get("email"),
            "contact_name": person.get("name"),
            "contact_company": person.get("company"),
            "created_on": record.get("created_on") or issue.get("created_on"),
            "updated_on": record.get("updated_on") or issue.get("updated_on"),
            "visibility": "public",
            "redmine_url": link,
            "source_record_id": source_record_id,
            "source_hash": stable_hash(record),
        }

    def _issue_contact(self, issue: Issue) -> Payload:
        """Merge contact details from the issue and its helpdesk ticket.

        The issue's own ``contact`` / ``customer`` fields win over the
        ticket's nested contact; flat ``contact_*`` / ``from_address`` ticket
        fields fill whatever is still missing. Keys whose value is ``None``
        or ``""`` are dropped from the result.
        """
        ticket = issue.get("helpdesk_ticket") or {}
        direct = issue.get("contact") or issue.get("customer") or {}
        combined = {**(ticket.get("contact") or {}), **direct}
        fallbacks = (
            ("id", ticket.get("contact_id")),
            ("email", ticket.get("contact_email") or ticket.get("from_address")),
            ("name", ticket.get("contact_name")),
            ("company", ticket.get("contact_company")),
        )
        for field_name, fallback in fallbacks:
            if not combined.get(field_name):
                combined[field_name] = fallback
        return {key: value for key, value in combined.items() if value not in (None, "")}

    def _contact_text(self, contact: Payload) -> str:
        """Join the contact's name, email, phone and company, one per line."""
        lines = [
            str(contact.get(key))
            for key in ("name", "email", "phone", "company")
            if contact.get(key)
        ]
        return "\n".join(lines)
class RedmineApiSource:
    """Read issues from a Redmine server over its JSON REST endpoints.

    Requests carry the API key in the ``X-Redmine-API-Key`` header. When
    ``project_identifier`` is set, listings are restricted to that project
    with ``subproject_id`` set to ``!*``.
    """

    def __init__(self, redmine_url: str, api_key: str, project_identifier: Optional[str] = None) -> None:
        self.redmine_url = redmine_url.rstrip("/")
        self.api_key = api_key
        self.project_identifier = project_identifier

    def recent_helpdesk_issues(self, limit: int) -> Iterable[Issue]:
        """Yield full issue details for the most recently updated issues."""
        for summary in self.recent_issue_summaries(limit):
            yield self.issue_detail(int(summary["id"]), fallback=summary)

    def recent_issue_summaries(self, limit: int) -> Iterable[Issue]:
        """Yield up to ``limit`` issue summaries, most recently updated first.

        Pages of at most 100 issues are fetched until enough issues have
        been yielded or a page comes back empty. Issue ids already yielded
        are skipped, and each issue gets a ``url`` if it has none.
        """
        page_size = 100
        emitted = 0
        offset = 0
        already_seen = set()
        while emitted < limit:
            params = {
                "limit": str(min(page_size, limit - emitted)),
                "offset": str(offset),
                "sort": "updated_on:desc,id:desc",
                "include": "journals",
                "status_id": "*",
            }
            if self.project_identifier:
                params.update(project_id=self.project_identifier, subproject_id="!*")
            listing = self._get_json(f"{self.redmine_url}/issues.json?{urllib.parse.urlencode(params)}")
            page = listing.get("issues", [])
            if not page:
                break
            for issue in page:
                identifier = issue["id"]
                if identifier in already_seen:
                    continue
                already_seen.add(identifier)
                issue.setdefault("url", f"{self.redmine_url}/issues/{identifier}")
                yield issue
                emitted += 1
                if emitted >= limit:
                    break
            # Advance by the page's raw size, duplicates included.
            offset += len(page)

    def issue_detail(self, issue_id: int, fallback: Optional[Issue] = None) -> Issue:
        """Fetch one issue with journals and helpdesk data.

        Fields from the fetched issue override those in ``fallback``; a
        ``url`` is added when neither provides one.
        """
        query_string = urllib.parse.urlencode({"include": "journals,helpdesk"})
        fetched = self._get_json(f"{self.redmine_url}/issues/{issue_id}.json?{query_string}")
        combined = dict(fallback or {})
        combined.update(fetched.get("issue", {}))
        combined.setdefault("url", f"{self.redmine_url}/issues/{issue_id}")
        return combined

    def _get_json(self, url: str) -> Dict[str, Any]:
        """GET ``url`` (30 second timeout) and decode the UTF-8 JSON body."""
        headers = {"X-Redmine-API-Key": self.api_key, "Accept": "application/json"}
        outgoing = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(outgoing, timeout=30) as response:
            raw_body = response.read()
        return json.loads(raw_body.decode("utf-8"))
def stable_hash(record: Dict[str, Any]) -> str:
    """Return a SHA-256 hex digest of the record's canonical JSON form.

    Keys are sorted and separators are compact, so key order does not affect
    the digest. Values JSON cannot encode natively are serialised via
    ``str``.
    """
    encoded = json.dumps(
        record,
        sort_keys=True,
        separators=(",", ":"),
        default=str,
    ).encode("utf-8")
    digest = hashlib.sha256()
    digest.update(encoded)
    return digest.hexdigest()
+225
View File
@@ -0,0 +1,225 @@
from __future__ import annotations
import json
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Protocol, Sequence
from .ingest import deduplicate_documents
from .models import IndexDocument
from .redmine import RedmineMapper
class RedmineRefreshSource(Protocol):
    """Structural type for an issue source the refresh service can read."""

    # Project the source is currently scoped to; may be unset.
    project_identifier: str | None

    def recent_helpdesk_issues(self, limit: int) -> Iterable[Dict[str, Any]]:
        """Return up to ``limit`` recent issues as dicts."""
class RefreshEmbedder(Protocol):
    """Structural type for an embedder of index documents."""

    def embed_documents(self, docs: Sequence[IndexDocument]) -> List[List[float]]:
        """Return one vector per document."""
class RefreshStore(Protocol):
    """Structural type for the document store used during a refresh."""

    def list_documents(
        self,
        limit: int = 10,
        source: Optional[str] = None,
        project_identifier: Optional[str] = None,
        doc_type: Optional[str] = None,
        issue_id: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """Return stored documents matching the given filters."""

    def upsert(self, docs: Sequence[IndexDocument], vectors: Sequence[Sequence[float]]) -> None:
        """Store the documents together with their vectors."""

    def delete_documents(self, document_ids: Sequence[str]) -> None:
        """Remove the documents with the given ids."""
class FileRefreshState:
    """JSON-file record of each project's last successful refresh time.

    The file holds ``{"projects": {<project>: {"last_successful_refresh_at":
    <ISO timestamp>}}}``.
    """

    def __init__(self, path: Path) -> None:
        self.path = path

    def load(self) -> Dict[str, Any]:
        """Return the stored state, or ``{}`` when the file does not exist.

        Raises:
            json.JSONDecodeError: if the file exists but is not valid JSON.
        """
        # Read directly instead of checking exists() first, so a file that
        # disappears between the check and the read cannot raise.
        try:
            raw = self.path.read_text(encoding="utf-8")
        except (FileNotFoundError, NotADirectoryError):
            return {}
        return json.loads(raw)

    def mark_success(self, project_identifier: str, timestamp: Optional[str] = None) -> None:
        """Record a successful refresh for one project.

        Args:
            project_identifier: project whose entry is replaced.
            timestamp: ISO timestamp to store; defaults to the current UTC
                time.

        Other projects' entries are preserved. The file is written to a
        sibling temporary file and then moved into place, so an interrupted
        write cannot leave a truncated state file behind.
        """
        payload = self.load()
        projects = payload.get("projects")
        if not isinstance(projects, dict):
            # Tolerate a missing or null "projects" entry.
            projects = {}
            payload["projects"] = projects
        projects[project_identifier] = {
            "last_successful_refresh_at": timestamp or datetime.now(timezone.utc).isoformat()
        }
        self.path.parent.mkdir(parents=True, exist_ok=True)
        serialized = json.dumps(payload, indent=2, sort_keys=True) + "\n"
        staging = self.path.with_name(self.path.name + ".tmp")
        staging.write_text(serialized, encoding="utf-8")
        staging.replace(self.path)
class RedmineRefreshService:
    """Incrementally refresh indexed Redmine documents, project by project.

    For each project the service scans recent issues, maps them to
    documents, compares each document's ``source_hash`` with the stored
    copy, embeds and upserts only new or changed documents, and deletes
    stored documents that the issue no longer produces.
    """

    # Counter keys reported per project and summed into the overall totals.
    _COUNTER_KEYS = (
        "issues",
        "scanned_issues",
        "detail_fetched_issues",
        "skipped_issues",
        "documents",
        "unchanged_documents",
        "changed_documents",
        "new_documents",
        "stale_documents",
        "force_rebuilt_documents",
        "would_embed_documents",
        "embedded_documents",
    )

    def __init__(
        self,
        source: RedmineRefreshSource,
        embedder: RefreshEmbedder,
        store: RefreshStore,
        mapper: Optional[RedmineMapper] = None,
        state: Optional[FileRefreshState] = None,
    ) -> None:
        self.source = source
        self.embedder = embedder
        self.store = store
        self.mapper = mapper or RedmineMapper(redmine_url="")
        self.state = state

    def refresh_redmine_project_limits(
        self,
        project_limits: Dict[str, int],
        dry_run: bool = False,
        force_rebuild: bool = False,
        overlap_minutes: int = 15,
    ) -> Dict[str, Any]:
        """Refresh each project in ``project_limits`` and return a summary.

        Args:
            project_limits: maps project identifier to the number of recent
                issues to scan.
            dry_run: when true, nothing is embedded, written, or deleted and
                no refresh state is recorded.
            force_rebuild: when true, issues are not skipped by the refresh
                window and already-stored documents are re-embedded.
            overlap_minutes: how far before the last recorded refresh the
                window reaches back.

        Returns:
            The run parameters, counters summed over all projects, and the
            per-project results under ``project_results``.

        The source's and mapper's ``project_identifier`` are pointed at each
        project in turn and restored afterwards, even if a refresh raises.
        """
        previous_source_project = getattr(self.source, "project_identifier", None)
        previous_mapper_project = getattr(self.mapper, "project_identifier", None)
        project_results: List[Dict[str, Any]] = []
        totals = dict.fromkeys(self._COUNTER_KEYS, 0)
        try:
            for project, limit in project_limits.items():
                if hasattr(self.source, "project_identifier"):
                    self.source.project_identifier = project
                if hasattr(self.mapper, "project_identifier"):
                    self.mapper.project_identifier = project
                # Record when the scan STARTED, not when it finished. An
                # issue updated while this run is in progress must still
                # fall inside the next run's refresh window, even when the
                # run takes longer than the overlap.
                started_at = datetime.now(timezone.utc).isoformat()
                project_result = self._refresh_project(project, limit, dry_run, force_rebuild, overlap_minutes)
                project_results.append(project_result)
                for key in totals:
                    totals[key] += int(project_result.get(key, 0))
                if not dry_run and self.state is not None:
                    self.state.mark_success(project, started_at)
        finally:
            if hasattr(self.source, "project_identifier"):
                self.source.project_identifier = previous_source_project
            if hasattr(self.mapper, "project_identifier"):
                self.mapper.project_identifier = previous_mapper_project
        return {
            "source": "redmine",
            "projects": len(project_limits),
            "dry_run": dry_run,
            "force_rebuild": force_rebuild,
            "overlap_minutes": overlap_minutes,
            **totals,
            "project_results": project_results,
        }

    def _refresh_project(self, project: str, limit: int, dry_run: bool, force_rebuild: bool, overlap_minutes: int) -> Dict[str, Any]:
        """Refresh one project and return its counters.

        Issues last updated before the cutoff are skipped unless
        ``force_rebuild`` is set or no cutoff is known. In a dry run the
        counters are computed but the store is not modified.
        """
        summaries = list(self._recent_issue_summaries(limit))
        result: Dict[str, Any] = {"project_identifier": project, **dict.fromkeys(self._COUNTER_KEYS, 0)}
        result["issues"] = len(summaries)
        result["scanned_issues"] = len(summaries)
        cutoff = self._cutoff_for_project(project, overlap_minutes)
        docs_to_embed: List[IndexDocument] = []
        stale_ids: List[str] = []
        for summary in summaries:
            if cutoff is not None and not force_rebuild and not self._issue_is_in_refresh_window(summary, cutoff):
                result["skipped_issues"] += 1
                continue
            issue = self._issue_detail(summary)
            result["detail_fetched_issues"] += 1
            candidates = deduplicate_documents(self.mapper.issue_to_documents(issue))
            result["documents"] += len(candidates)
            existing = self.store.list_documents(
                limit=5000,
                source="redmine",
                project_identifier=project,
                issue_id=int(issue["id"]),
            )
            existing_by_id = {document["id"]: document for document in existing}
            candidate_by_id = {document.id: document for document in candidates}
            # Stored documents the issue no longer produces are stale.
            for stale_id in sorted(set(existing_by_id) - set(candidate_by_id)):
                stale_ids.append(stale_id)
                result["stale_documents"] += 1
            for document in candidates:
                existing_document = existing_by_id.get(document.id)
                if existing_document is None:
                    result["new_documents"] += 1
                    docs_to_embed.append(document)
                    continue
                existing_hash = (existing_document.get("payload") or {}).get("source_hash")
                document_hash = document.payload.get("source_hash")
                if force_rebuild:
                    result["force_rebuilt_documents"] += 1
                    docs_to_embed.append(document)
                elif existing_hash != document_hash:
                    result["changed_documents"] += 1
                    docs_to_embed.append(document)
                else:
                    result["unchanged_documents"] += 1
        result["would_embed_documents"] = len(docs_to_embed)
        if dry_run:
            return result
        if stale_ids:
            self.store.delete_documents(stale_ids)
        if docs_to_embed:
            vectors = self.embedder.embed_documents(docs_to_embed)
            self.store.upsert(docs_to_embed, vectors)
        result["embedded_documents"] = len(docs_to_embed)
        return result

    def _recent_issue_summaries(self, limit: int) -> Iterable[Dict[str, Any]]:
        """Use the source's summary listing when it offers one."""
        if hasattr(self.source, "recent_issue_summaries"):
            return self.source.recent_issue_summaries(limit)  # type: ignore[attr-defined]
        return self.source.recent_helpdesk_issues(limit)

    def _issue_detail(self, summary: Dict[str, Any]) -> Dict[str, Any]:
        """Fetch full issue detail when the source supports it, else reuse the summary."""
        if hasattr(self.source, "issue_detail"):
            return self.source.issue_detail(int(summary["id"]))  # type: ignore[attr-defined]
        return summary

    def _cutoff_for_project(self, project: str, overlap_minutes: int) -> Optional[datetime]:
        """Return the last recorded refresh time minus the overlap, or ``None``.

        ``None`` means no state is configured or nothing has been recorded
        for the project, in which case no issue is skipped.
        """
        if self.state is None:
            return None
        timestamp = ((self.state.load().get("projects") or {}).get(project) or {}).get("last_successful_refresh_at")
        if not timestamp:
            return None
        parsed = parse_redmine_datetime(timestamp)
        return parsed - timedelta(minutes=overlap_minutes)

    def _issue_is_in_refresh_window(self, issue: Dict[str, Any], cutoff: datetime) -> bool:
        """True when the issue was updated at or after ``cutoff``.

        Issues without an ``updated_on`` value are always included.
        """
        updated_on = issue.get("updated_on")
        if not updated_on:
            return True
        return parse_redmine_datetime(str(updated_on)) >= cutoff
def parse_redmine_datetime(raw: str) -> datetime:
    """Parse an ISO-8601 timestamp string into a timezone-aware UTC datetime.

    ``Z`` is rewritten to ``+00:00`` before parsing. Values without an offset
    are labelled as UTC; values with an offset are converted to UTC.
    """
    moment = datetime.fromisoformat(raw.replace("Z", "+00:00"))
    if moment.tzinfo is not None:
        return moment.astimezone(timezone.utc)
    return moment.replace(tzinfo=timezone.utc)
+107
View File
@@ -0,0 +1,107 @@
#!/usr/bin/env bash
set -euo pipefail
# Print the refresh wrapper's help text (usage, examples, environment
# variables and their defaults) to stderr.
usage() {
cat >&2 <<'EOF'
Usage:
semantic_index/refresh.sh [--apply] [--dry-run]
Examples:
semantic_index/refresh.sh
semantic_index/refresh.sh --apply
Environment:
SEMANTIC_INDEX_PROJECT_LIMITS comma-separated project=limit pairs
SEMANTIC_INDEX_LOG_DIR default: .cache/semantic_index/logs
SEMANTIC_INDEX_STATE_PATH default: .cache/semantic_index/refresh_state.json
SEMANTIC_INDEX_OVERLAP_MINUTES default: 15
PYTHON default: <install-root>/.venv/bin/python
This wrapper never passes --force-rebuild. Run force rebuilds manually.
EOF
}
# Absolute directory containing this script, and its parent directory, which
# the rest of the script treats as the install root.
script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
install_root=$(cd "$script_dir/.." && pwd)
# Export KEY=VALUE pairs from an env file as defaults.
#
# Arguments:
#   $1  path of the env file; an unreadable or missing file is ignored.
#
# Behaviour:
#   - blank lines, lines starting with '#', and lines without '=' are skipped
#   - an optional leading "export " before the key is accepted
#   - lines whose key is not a valid shell identifier are skipped instead of
#     aborting the script on the indirect expansion below
#   - one pair of matching surrounding quotes ("..." or '...') is removed
#   - a variable that is already set in the environment is never overridden
load_env_defaults() {
  local file=$1
  local line key value
  [[ -r "$file" ]] || return 0
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Trim leading and trailing whitespace from the whole line.
    line=${line#"${line%%[![:space:]]*}"}
    line=${line%"${line##*[![:space:]]}"}
    [[ -z "$line" || "$line" == \#* || "$line" != *=* ]] && continue
    key=${line%%=*}
    value=${line#*=}
    key=${key%"${key##*[![:space:]]}"}
    # Accept the "export KEY=value" form commonly found in env files.
    if [[ "$key" == export[[:space:]]* ]]; then
      key=${key#export}
      key=${key#"${key%%[![:space:]]*}"}
    fi
    # ${!key+x} is a fatal error for names that are not identifiers.
    [[ "$key" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || continue
    value=${value#"${value%%[![:space:]]*}"}
    value=${value%"${value##*[![:space:]]}"}
    # Strip quotes only when they come as a matching pair around the value.
    if [[ ${#value} -ge 2 ]]; then
      case "$value" in
        \"*\" | \'*\') value=${value:1:${#value}-2} ;;
      esac
    fi
    if [[ -z "${!key+x}" ]]; then
      export "$key=$value"
    fi
  done < "$file"
}
# Load defaults from the system env file; values already present in the
# environment take precedence.
load_env_defaults /etc/semantic-index.env

# Dry-run is the default; when both mode flags are given the last one wins.
mode=dry-run
while [[ $# -gt 0 ]]; do
  case "$1" in
    --apply)
      mode=apply
      shift
      ;;
    --dry-run)
      mode=dry-run
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      usage
      exit 2
      ;;
  esac
done

project_limits=${SEMANTIC_INDEX_PROJECT_LIMITS:-customer-service=500,hiring=200,todo-jason=200,sales-inbox=100,business-development=100,dock-scheduling=100,prep-standardization=100}
log_dir=${SEMANTIC_INDEX_LOG_DIR:-.cache/semantic_index/logs}
state_path=${SEMANTIC_INDEX_STATE_PATH:-.cache/semantic_index/refresh_state.json}
overlap_minutes=${SEMANTIC_INDEX_OVERLAP_MINUTES:-15}
python_bin=${PYTHON:-$install_root/.venv/bin/python}

# The Python process below runs after `cd "$install_root"`, while the state
# directory is created from the caller's working directory. Anchor a relative
# state path to the current directory so both refer to the same file.
if [[ "$state_path" != /* ]]; then
  state_path="$PWD/$state_path"
fi

mkdir -p "$log_dir" "$(dirname "$state_path")"

timestamp=$(date -u +"%Y%m%dT%H%M%SZ")
log_file="$log_dir/redmine-refresh-$timestamp.log"

args=(
  -m semantic_index
  --refresh-redmine-projects
  --project-limits "$project_limits"
  --state-path "$state_path"
  --overlap-minutes "$overlap_minutes"
)
if [[ "$mode" == "dry-run" ]]; then
  args+=(--dry-run)
fi

# Everything inside the braces is mirrored to the terminal and the log file.
# The group is one side of a pipeline, so its `cd` does not affect this shell.
{
  printf 'started_at=%s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
  printf 'mode=%s\n' "$mode"
  printf 'project_limits=%s\n' "$project_limits"
  printf 'state_path=%s\n' "$state_path"
  printf 'overlap_minutes=%s\n' "$overlap_minutes"
  cd "$install_root"
  "$python_bin" "${args[@]}"
  printf '\nfinished_at=%s\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
} 2>&1 | tee "$log_file"
printf 'log_file=%s\n' "$log_file"
+61
View File
@@ -0,0 +1,61 @@
from __future__ import annotations
import re
from typing import Any, Dict, List, Optional, Protocol
from .models import SearchQuery, SearchResult
class QueryEmbedder(Protocol):
    """Structural type for objects that can embed a single query string."""

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding vector for ``text``."""
        ...
class SearchStore(Protocol):
    """Structural type for the vector store used by the search service."""

    def search(self, vector: List[float], query: SearchQuery, limit: int) -> List[SearchResult]:
        """Return at most ``limit`` results for ``vector`` under ``query``."""
        ...

    def get_document(self, document_id: str) -> Optional[Dict[str, Any]]:
        """Return the stored document for ``document_id``, or ``None``."""
        ...
class HybridSearchService:
    """Vector search whose scores are adjusted by a keyword-based boost."""

    def __init__(self, embedder: QueryEmbedder, store: SearchStore) -> None:
        self.embedder = embedder
        self.store = store

    def search(self, query: SearchQuery) -> List[SearchResult]:
        """Embed the query, fetch candidates, and return them re-ranked.

        Each candidate's score is increased by ``keyword_boost`` for the query
        text; the result is ordered by descending score and capped at
        ``query.limit`` entries.
        """
        query_vector = self.embedder.embed_query(query.text)
        rescored = []
        for hit in self.store.search(query_vector, query, limit=query.limit):
            boosted_score = hit.score + keyword_boost(query.text, hit)
            rescored.append(
                SearchResult(id=hit.id, score=boosted_score, text=hit.text, payload=hit.payload)
            )
        # list.sort is stable, exactly like sorted(), so ties keep store order.
        rescored.sort(key=lambda hit: hit.score, reverse=True)
        return rescored[: query.limit]

    def get_document(self, document_id: str) -> Optional[Dict[str, Any]]:
        """Look up one document in the underlying store."""
        return self.store.get_document(document_id)
def keyword_boost(query_text: str, result: SearchResult) -> float:
    """Return an additive lexical bonus for ``result`` given the raw query text.

    The result text and all non-``None`` payload values are joined into one
    lower-cased haystack. Terms extracted from ``query_text`` that occur in
    the haystack (substring match) add to the bonus:

    * double-quoted phrase: 0.35
    * e-mail address: 0.3
    * identifier-like token (digit runs, ``ABC-123``, ``AB12-CD34``): 0.25
    * ordinary word of three or more characters: 0.03

    Each distinct term (compared case-insensitively) is counted once per
    category, so repeating a word in the query cannot inflate the bonus.
    """
    payload_text = " ".join(str(value) for value in result.payload.values() if value is not None)
    haystack = " ".join([result.text, payload_text]).lower()

    def distinct(pattern):
        # dict.fromkeys drops case-insensitive repeats and keeps first-seen order.
        return list(dict.fromkeys(match.lower() for match in re.findall(pattern, query_text)))

    boost = 0.0
    for phrase in distinct(r'"([^"]+)"'):
        if phrase in haystack:
            boost += 0.35
    for email in distinct(r"[\w.+-]+@[\w.-]+\.[A-Za-z]{2,}"):
        if email in haystack:
            boost += 0.3
    for token in distinct(r"\b(?:#?\d{2,}|[A-Z]{2,}[-_]\d{2,}|[A-Z0-9]{4,}-[A-Z0-9-]{2,})\b"):
        # Also try the token without a leading '#', e.g. "#123" against "123".
        if token in haystack or token.lstrip("#") in haystack:
            boost += 0.25
    for word in distinct(r"\b[A-Za-z][\w.-]{2,}\b"):
        if word in haystack:
            boost += 0.03
    return boost
+71
View File
@@ -0,0 +1,71 @@
#!/usr/bin/env bash
set -euo pipefail
# Print the search helper's help text (usage, examples, environment
# variables) to stderr.
usage() {
cat >&2 <<'EOF'
Usage:
semantic_index/search.sh "query text" [project_identifier] [limit]
Examples:
semantic_index/search.sh "goods return" customer-service 3
semantic_index/search.sh "candidate follow up" hiring 5 | jq '.results[] | {id, score, citation}'
Environment:
SEMANTIC_INDEX_URL default: http://127.0.0.1:8787
SEMANTIC_INDEX_API_KEY optional; falls back to semantic_index/.env or .env
EOF
}
# A query string is mandatory; project and limit are optional positionals.
if [[ $# -lt 1 ]]; then
  usage
  exit 2
fi

query=$1
project=${2:-}
limit=${3:-10}

# The limit is written into the JSON request body without quotes, so anything
# other than a plain non-negative integer would produce a malformed (or
# attacker-shaped) payload. Reject it up front.
if [[ ! "$limit" =~ ^[0-9]+$ ]]; then
  echo "limit must be a non-negative integer: $limit" >&2
  usage
  exit 2
fi

base_url=${SEMANTIC_INDEX_URL:-http://127.0.0.1:8787}

# Absolute directory of this script and its parent (used to locate .env files).
script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
install_root=$(cd "$script_dir/.." && pwd)
# Print the value of KEY from the first env file that defines it.
#
# Arguments:
#   $1  variable name to look up
#
# Files are consulted in order: /etc/semantic-index.env, the .env files under
# $install_root, then the .env files relative to the current directory.
# Unreadable files and files that do not define a non-empty value for the key
# are skipped, so a root-only system env file no longer aborts the script and
# later files still act as fallbacks. Leading whitespace and an "export "
# prefix before the key are tolerated. Surrounding spaces, tabs and quotes are
# removed from the value. Prints nothing when no file defines the key; always
# returns 0.
read_env_value() {
  local key=$1
  local file value
  for file in /etc/semantic-index.env "$install_root/semantic_index/.env" "$install_root/.env" semantic_index/.env .env; do
    [[ -r "$file" ]] || continue
    value=$(awk -v key="$key" '
      {
        line = $0
        sub(/^[ \t]+/, "", line)
        sub(/^export[ \t]+/, "", line)
        eq = index(line, "=")
        if (eq == 0) next
        name = substr(line, 1, eq - 1)
        sub(/[ \t]+$/, "", name)
        if (name != key) next
        value = substr(line, eq + 1)
        gsub(/^[ \t"'\''"]+|[ \t"'\''"]+$/, "", value)
        print value
        exit
      }
    ' "$file") || continue
    if [[ -n "$value" ]]; then
      printf '%s\n' "$value"
      return 0
    fi
  done
  return 0
}
# Read all of stdin and print it escaped for use inside a JSON string literal
# (without the surrounding quotes).
#
# Backslash and double quote are escaped first, then newline, carriage return,
# tab, backspace and form feed are written as their two-character JSON
# escapes. Implemented with bash parameter expansion so it does not depend on
# GNU-specific sed label syntax.
# NOTE(review): other control characters below U+0020 are passed through
# unchanged; extend this if such input becomes possible.
json_escape() {
  local input
  # read returns non-zero at end of input when no NUL delimiter is found;
  # that is the normal case here, hence the "|| true".
  IFS= read -r -d '' input || true
  input=${input//\\/\\\\}
  input=${input//\"/\\\"}
  input=${input//$'\n'/\\n}
  input=${input//$'\r'/\\r}
  input=${input//$'\t'/\\t}
  input=${input//$'\b'/\\b}
  input=${input//$'\f'/\\f}
  printf '%s' "$input"
}
# Assemble the JSON request body piece by piece; string values go through
# json_escape, the limit is inserted as a bare number.
escaped_query=$(printf '%s' "$query" | json_escape)
payload='{"query":"'"$escaped_query"'","limit":'"$limit"
if [[ -n "$project" ]]; then
  escaped_project=$(printf '%s' "$project" | json_escape)
  payload+=',"project_identifier":"'"$escaped_project"'"'
fi
payload+='}'

# Prefer the key from the environment, otherwise look it up in the env files.
api_key=${SEMANTIC_INDEX_API_KEY:-$(read_env_value SEMANTIC_INDEX_API_KEY)}

args=(-sS -H "Content-Type: application/json" -d "$payload")
# Only send an Authorization header when a key is available.
[[ -z "${api_key:-}" ]] || args+=(-H "Authorization: Bearer $api_key")

curl "${args[@]}" "$base_url/search"
+31
View File
@@ -0,0 +1,31 @@
• Added semantic_index/search.sh. It outputs raw JSON from the running HTTP service.
Usage:
semantic_index/search.sh "goods return" customer-service 3
semantic_index/search.sh "candidate follow up" hiring 5 | jq '.results[] | {id, score, citation}'
It defaults to http://127.0.0.1:8787, reads SEMANTIC_INDEX_API_KEY from the environment or
semantic_index/.env, and accepts:
semantic_index/search.sh "query text" [project_identifier] [limit]
Verified:
- Script works against the local service.
- Semantic tests still pass: 49 tests, OK.
- Python compile check passed.
Okay yeah, that's nice and fast
• Good. That script is the right shape for quick manual probing: fast because it avoids Python startup,
plain JSON so it pipes cleanly, and thin enough that it won't hide what the API is returning.
A couple useful examples:
semantic_index/search.sh "goods return" customer-service 3 | jq '.results[].citation'
semantic_index/search.sh "candidate follow up" hiring 5 | jq -r '.results[] | "\(.score) \(.citation.url) \(.snippet)"'
semantic_index/search.sh "callum@safetagtracking.com" customer-service 3 | jq '.results[0]'
+115
View File
@@ -0,0 +1,115 @@
import unittest
from pathlib import Path
from semantic_index.app import create_app
from semantic_index.config import Settings
from semantic_index.models import SearchResult
class FakeSearchService:
    """Search double that always yields one canned customer-service hit."""

    def search(self, query):
        """Return a single fixed ``SearchResult``; ``query`` is ignored."""
        payload = {
            "source": "redmine",
            "project_identifier": "customer-service",
            "doc_type": "issue",
            "issue_id": 1,
            "redmine_url": "http://redmine/issues/1",
            "source_record_id": "issue:1",
        }
        hit = SearchResult(
            id="redmine:issue:1:chunk:0",
            score=0.8,
            text="Snippet text",
            payload=payload,
        )
        return [hit]

    def get_document(self, document_id):
        """Return a minimal document echoing ``document_id``."""
        return dict(id=document_id, text="Full text", payload={})
class FakeStore:
    """Store double exposing one indexed project."""

    def list_projects(self, source=None, limit=1000):
        """Return the fixed project listing; the arguments are ignored."""
        only_project = {"project_identifier": "customer-service", "document_count": 10}
        return [only_project]
class FakeRefreshService:
    """Refresh double that records every call it receives."""

    def __init__(self):
        # One tuple per call: (project_limits, dry_run, force_rebuild, overlap_minutes).
        self.calls = []

    def refresh_redmine_project_limits(self, project_limits, dry_run=False, force_rebuild=False, overlap_minutes=15):
        """Record the arguments and return a small summary dict."""
        call = (project_limits, dry_run, force_rebuild, overlap_minutes)
        self.calls.append(call)
        summary = {"source": "redmine"}
        summary["projects"] = len(project_limits)
        summary["dry_run"] = dry_run
        return summary
def fake_services():
    """Build the service mapping handed to ``create_app`` in these tests.

    Every entry is an in-memory double, and the settings carry placeholder
    values only.
    """
    settings = Settings(
        openai_api_key="",
        qdrant_url="http://qdrant",
        qdrant_api_key=None,
        qdrant_collection="semantic",
        redmine_url="http://redmine",
        redmine_api_key="",
        redmine_project_identifier=None,
        sample_limit=50,
        bind_host="127.0.0.1",
        bind_port=8787,
        service_api_key=None,
        refresh_state_path=Path(".cache/semantic_index/refresh_state.json"),
    )
    services = {"settings": settings}
    services["search"] = FakeSearchService()
    services["store"] = FakeStore()
    services["refresh"] = FakeRefreshService()
    return services
class SemanticIndexAppTest(unittest.TestCase):
    """Calls the app's route endpoints directly, backed by in-memory fakes."""

    @staticmethod
    def _endpoints(app):
        """Map each route path of ``app`` to its endpoint callable."""
        return {route.path: route.endpoint for route in app.routes}

    def test_health_does_not_build_live_services(self):
        def broken_builder():
            raise AssertionError("health should not build live clients")

        endpoints = self._endpoints(create_app(service_builder=broken_builder))

        self.assertEqual({"status": "ok"}, endpoints["/health"]())

    def test_search_endpoint_returns_normalized_agent_response(self):
        endpoints = self._endpoints(create_app(service_builder=fake_services))
        request = {"query": "printer", "project_identifier": "customer-service", "limit": 3}

        response = endpoints["/search"](request)

        self.assertEqual("printer", response["query"])
        self.assertEqual("customer-service", response["filters"]["project_identifier"])
        self.assertEqual("customer-service", response["results"][0]["citation"]["project_identifier"])

    def test_projects_endpoint_lists_indexed_projects(self):
        endpoints = self._endpoints(create_app(service_builder=fake_services))

        response = endpoints["/projects"]()

        self.assertEqual("customer-service", response["projects"][0]["project_identifier"])

    def test_refresh_endpoint_passes_project_limits_and_cost_flags(self):
        # Build the services once so the recorded refresh calls can be inspected.
        services = fake_services()
        endpoints = self._endpoints(create_app(service_builder=lambda: services))
        request = {
            "project_limits": {"customer-service": 5},
            "dry_run": True,
            "force_rebuild": False,
            "overlap_minutes": 30,
        }

        response = endpoints["/sources/redmine/refresh"](request)

        self.assertTrue(response["dry_run"])
        self.assertEqual(({"customer-service": 5}, True, False, 30), services["refresh"].calls[0])
if __name__ == "__main__":
unittest.main()
+182
View File
@@ -0,0 +1,182 @@
import unittest
from semantic_index.ingest import BackfillService
from semantic_index.mcp import SemanticMCP
from semantic_index.models import SearchQuery, SearchResult
from semantic_index.redmine import RedmineMapper
class FakeRedmineSource:
    """Redmine source double with two fixed issues."""

    # Read at call time and stamped onto every returned issue's project.
    project_identifier = None

    def recent_helpdesk_issues(self, limit):
        """Return at most ``limit`` of the two canned issues."""
        rows = ((1, "First", "First body"), (2, "Second", "Second body"))
        issues = [
            {
                "id": issue_id,
                "subject": subject,
                "description": description,
                "project": {"identifier": self.project_identifier},
            }
            for issue_id, subject, description in rows
        ]
        return issues[:limit]
class DuplicateDocumentRedmineSource:
    """Redmine source double that returns the same issue id twice."""

    project_identifier = "customer-service"

    def recent_helpdesk_issues(self, limit):
        """Return at most ``limit`` of two issues that share id 1."""
        original = {
            "id": 1,
            "subject": "First",
            "description": "First body",
            "project": {"identifier": "customer-service"},
        }
        duplicate = {
            "id": 1,
            "subject": "First duplicate",
            "description": "Duplicate body",
            "project": {"identifier": "customer-service"},
        }
        return [original, duplicate][:limit]
class FakeEmbedder:
    """Embedder double producing deterministic three-component vectors."""

    def embed_documents(self, docs):
        """Return one vector per document; the first component is its 1-based position."""
        vectors = []
        for position, _doc in enumerate(docs, start=1):
            vectors.append([float(position), 0.0, 0.0])
        return vectors

    def embed_query(self, text):
        """Return the same fixed vector for every query."""
        return [0.1, 0.0, 0.0]
class FakeStore:
    """Store double that records rebuild requests."""

    def __init__(self):
        self.deleted = []  # (source, project_identifier) per rebuild
        self.upserts = []  # (docs, vectors) per rebuild

    def rebuild_source(self, source, docs, vectors, project_identifier=None):
        """Record which scope was rebuilt and what was written to it."""
        scope = (source, project_identifier)
        self.deleted.append(scope)
        written = (docs, vectors)
        self.upserts.append(written)

    def list_projects(self, source=None, limit=1000):
        """Return two fixed projects with their document counts."""
        counts = (("customer-service", 1684), ("hiring", 409))
        return [
            {"project_identifier": identifier, "document_count": count}
            for identifier, count in counts
        ]
class FakeRefreshService:
    """Refresh double that records every call it receives."""

    def __init__(self):
        # One tuple per call: (project_limits, dry_run, force_rebuild, overlap_minutes).
        self.calls = []

    def refresh_redmine_project_limits(self, project_limits, dry_run=False, force_rebuild=False, overlap_minutes=15):
        """Record the arguments and return a small summary dict."""
        call = (project_limits, dry_run, force_rebuild, overlap_minutes)
        self.calls.append(call)
        summary = {"source": "redmine"}
        summary["projects"] = len(project_limits)
        summary["dry_run"] = dry_run
        return summary
class FakeSearchService:
    """Search double that records queries and returns one canned hit."""

    def __init__(self):
        self.queries = []

    def search(self, query):
        """Remember ``query`` and return a single fixed result."""
        self.queries.append(query)
        hit = SearchResult(
            id="doc1",
            score=0.5,
            text="Snippet",
            payload={"redmine_url": "http://redmine/issues/1"},
        )
        return [hit]

    def get_document(self, document_id):
        """Return a minimal document echoing ``document_id``."""
        return dict(id=document_id, text="Snippet")
class BackfillAndMCPTest(unittest.TestCase):
    """Backfill and MCP tool behaviour, exercised against in-memory fakes."""

    def test_sample_backfill_rebuilds_redmine_source(self):
        service = BackfillService(source=FakeRedmineSource(), embedder=FakeEmbedder(), store=FakeStore())

        result = service.backfill_redmine_sample(limit=2)

        self.assertEqual({"source": "redmine", "issues": 2, "documents": 2}, result)
        self.assertEqual([("redmine", None)], service.store.deleted)
        docs, vectors = service.store.upserts[0]
        document_ids = [doc.id for doc in docs]
        self.assertEqual(["redmine:issue:1:chunk:0", "redmine:issue:2:chunk:0"], document_ids)
        self.assertEqual(2, len(vectors))

    def test_sample_backfill_rebuilds_only_the_configured_project_scope(self):
        store = FakeStore()
        scoped_mapper = RedmineMapper(redmine_url="", project_identifier="customer-service")
        service = BackfillService(
            source=FakeRedmineSource(),
            embedder=FakeEmbedder(),
            store=store,
            mapper=scoped_mapper,
        )

        service.backfill_redmine_sample(limit=1)

        self.assertEqual([("redmine", "customer-service")], store.deleted)

    def test_multi_project_backfill_rebuilds_each_project_scope(self):
        store = FakeStore()
        service = BackfillService(source=FakeRedmineSource(), embedder=FakeEmbedder(), store=store)

        result = service.backfill_redmine_projects(["customer-service", "hiring"], per_project_limit=1)

        expected = {
            "source": "redmine",
            "projects": 2,
            "issues": 2,
            "documents": 2,
            "project_results": [
                {"project_identifier": "customer-service", "issues": 1, "documents": 1},
                {"project_identifier": "hiring", "issues": 1, "documents": 1},
            ],
        }
        self.assertEqual(expected, result)
        self.assertEqual([("redmine", "customer-service"), ("redmine", "hiring")], store.deleted)
        # First document of each rebuild carries that rebuild's project.
        self.assertEqual("customer-service", store.upserts[0][0][0].payload["project_identifier"])
        self.assertEqual("hiring", store.upserts[1][0][0].payload["project_identifier"])

    def test_multi_project_backfill_accepts_per_project_limits(self):
        store = FakeStore()
        service = BackfillService(source=FakeRedmineSource(), embedder=FakeEmbedder(), store=store)

        result = service.backfill_redmine_project_limits({"customer-service": 2, "hiring": 1})

        self.assertEqual(3, result["issues"])
        expected_projects = [
            {"project_identifier": "customer-service", "issues": 2, "documents": 2},
            {"project_identifier": "hiring", "issues": 1, "documents": 1},
        ]
        self.assertEqual(expected_projects, result["project_results"])

    def test_backfill_deduplicates_documents_by_stable_id_before_embedding(self):
        store = FakeStore()
        service = BackfillService(source=DuplicateDocumentRedmineSource(), embedder=FakeEmbedder(), store=store)

        result = service.backfill_redmine_sample(limit=2)

        self.assertEqual({"source": "redmine", "issues": 2, "documents": 1}, result)
        docs, vectors = store.upserts[0]
        self.assertEqual(["redmine:issue:1:chunk:0"], [doc.id for doc in docs])
        self.assertEqual(1, len(vectors))

    def test_mcp_tools_return_json_ready_results(self):
        search = FakeSearchService()
        refresh = FakeRefreshService()
        mcp = SemanticMCP(search_service=search, backfill_service=None, store=FakeStore(), refresh_service=refresh)

        search_arguments = {"query": "printer", "source": "redmine", "project_identifier": "hiring", "limit": 3}
        response = mcp.call_tool("semantic_search", search_arguments)
        document = mcp.call_tool("semantic_get_document", {"id": "doc1"})
        projects = mcp.call_tool("semantic_list_projects", {"source": "redmine"})
        refresh_arguments = {"project_identifier": "customer-service", "limit": 5, "dry_run": True}
        refresh_response = mcp.call_tool("semantic_refresh_redmine", refresh_arguments)

        self.assertEqual("printer", response["query"])
        self.assertEqual("hiring", response["filters"]["project_identifier"])
        self.assertEqual("doc1", response["results"][0]["id"])
        self.assertEqual("http://redmine/issues/1", response["results"][0]["citation"]["url"])
        self.assertIsInstance(search.queries[0], SearchQuery)
        self.assertEqual("redmine", search.queries[0].source)
        self.assertEqual("hiring", search.queries[0].project_identifier)
        self.assertEqual({"id": "doc1", "text": "Snippet"}, document)
        self.assertEqual("customer-service", projects["projects"][0]["project_identifier"])
        self.assertTrue(refresh_response["dry_run"])
        self.assertEqual(({"customer-service": 5}, True, False, 15), refresh.calls[0])
if __name__ == "__main__":
unittest.main()
+37
View File
@@ -0,0 +1,37 @@
import subprocess
import sys
from pathlib import Path
from tempfile import TemporaryDirectory
import unittest
from semantic_index.config import load_settings
class SemanticIndexCliTest(unittest.TestCase):
    """Checks on the module entry point and on settings loading."""

    def test_help_does_not_require_http_runtime_dependencies(self):
        command = [sys.executable, "-m", "semantic_index", "--help"]

        completed = subprocess.run(
            command,
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )

        self.assertEqual("", completed.stderr)
        self.assertEqual(0, completed.returncode)
        self.assertIn("--mcp-stdio", completed.stdout)

    def test_settings_load_from_package_env_when_root_env_missing(self):
        with TemporaryDirectory() as tmp:
            root = Path(tmp)
            package_env = root / "semantic_index" / ".env"
            package_env.parent.mkdir()
            package_env.write_text(
                "QDRANT_URL=http://qdrant.example:6333\nREDMINE_SAMPLE_LIMIT=7\n",
                encoding="utf-8",
            )
            # The root-level .env passed here does not exist.
            settings = load_settings(root / ".env")

        self.assertEqual("http://qdrant.example:6333", settings.qdrant_url)
        self.assertEqual(7, settings.sample_limit)
if __name__ == "__main__":
unittest.main()
+87
View File
@@ -0,0 +1,87 @@
import json
import unittest
from unittest.mock import patch
from semantic_index.client import SemanticIndexClient
from semantic_index.models import SearchResult
class FakeSearchService:
    """Search double that records queries and returns one hiring issue hit."""

    def __init__(self):
        self.queries = []

    def search(self, query):
        """Remember ``query`` and return a single fixed result."""
        self.queries.append(query)
        payload = {
            "source": "redmine",
            "project_identifier": "hiring",
            "doc_type": "issue",
            "issue_id": 1,
            "redmine_url": "http://redmine/issues/1",
            "source_record_id": "issue:1",
        }
        hit = SearchResult(
            id="redmine:issue:1:chunk:0",
            score=0.7,
            text="Candidate follow up",
            payload=payload,
        )
        return [hit]

    def get_document(self, document_id):
        """Return a fixed hiring document echoing ``document_id``."""
        return dict(id=document_id, text="Full text", payload={"project_identifier": "hiring"})
class SemanticIndexClientTest(unittest.TestCase):
    """Covers the client in in-process mode and in HTTP mode."""

    def test_in_process_client_returns_normalized_search_response(self):
        search = FakeSearchService()
        client = SemanticIndexClient(search_service=search)

        response = client.search("candidate follow up", project_identifier="hiring", limit=3)

        first_result = response["results"][0]
        self.assertEqual("candidate follow up", response["query"])
        self.assertEqual({"project_identifier": "hiring", "limit": 3}, response["filters"])
        self.assertEqual("redmine:issue:1:chunk:0", first_result["id"])
        self.assertEqual("hiring", first_result["citation"]["project_identifier"])
        self.assertEqual("hiring", search.queries[0].project_identifier)

    def test_in_process_client_get_document(self):
        client = SemanticIndexClient(search_service=FakeSearchService())

        document = client.get_document("redmine:issue:1:chunk:0")

        self.assertEqual("Full text", document["text"])

    def test_http_client_sends_auth_header_and_parses_search_response(self):
        body = json.dumps({"query": "printer", "filters": {}, "results": []}).encode()

        class FakeResponse:
            """Context-manager stand-in for the object ``urlopen`` returns."""

            def __enter__(self):
                return self

            def __exit__(self, exc_type, exc, tb):
                return False

            def read(self):
                return body

        captured = {}

        def fake_urlopen(request, timeout):
            # Keep what the client sent so it can be asserted on afterwards.
            captured["url"] = request.full_url
            captured["authorization"] = request.headers.get("Authorization")
            captured["body"] = json.loads(request.data.decode())
            return FakeResponse()

        with patch("urllib.request.urlopen", fake_urlopen):
            client = SemanticIndexClient(base_url="http://semantic.local", api_key="secret")
            response = client.search("printer", project_identifier="customer-service")

        self.assertEqual("http://semantic.local/search", captured["url"])
        self.assertEqual("Bearer secret", captured["authorization"])
        self.assertEqual("customer-service", captured["body"]["project_identifier"])
        self.assertEqual("printer", response["query"])
if __name__ == "__main__":
unittest.main()
+138
View File
@@ -0,0 +1,138 @@
import unittest
from semantic_index.models import IndexDocument
from semantic_index.redmine import RedmineMapper
class RedmineMapperTest(unittest.TestCase):
    """Mapping of Redmine issue payloads to index documents."""

    @staticmethod
    def _map(issue, **mapper_options):
        """Map ``issue`` with a fresh mapper bound to the local test URL."""
        mapper = RedmineMapper(redmine_url="http://redmine.local", **mapper_options)
        return mapper.issue_to_documents(issue)

    def test_issue_chunks_have_stable_ids_and_metadata(self):
        issue = {
            "id": 42,
            "subject": "Widget order ORD-12345 cannot ship",
            "description": "Customer reports that widget order ORD-12345 is blocked.",
            "project": {"id": 7, "identifier": "fud-helpdesk"},
            "contact": {"id": 9, "email": "ada@example.com", "name": "Ada Lovelace"},
            "created_on": "2026-04-01T10:00:00Z",
            "updated_on": "2026-04-02T10:00:00Z",
            "url": "http://redmine.local/issues/42",
        }

        # Two independent mappers must agree on the generated ids.
        first = self._map(issue)
        second = self._map(issue)

        self.assertEqual([doc.id for doc in first], [doc.id for doc in second])
        self.assertEqual("redmine:issue:42:chunk:0", first[0].id)
        self.assertEqual("issue", first[0].payload["doc_type"])
        self.assertEqual(42, first[0].payload["issue_id"])
        self.assertEqual("fud-helpdesk", first[0].payload["project_identifier"])
        self.assertIsNone(first[0].payload["project_name"])
        self.assertFalse(first[0].payload["has_helpdesk_ticket"])
        self.assertEqual("ada@example.com", first[0].payload["contact_email"])
        self.assertEqual("Ada Lovelace", first[0].payload["contact_name"])
        self.assertEqual("http://redmine.local/issues/42", first[0].payload["redmine_url"])
        self.assertIn("source_hash", first[0].payload)

    def test_helpdesk_ticket_contact_is_mapped_to_all_issue_chunks(self):
        issue = {
            "id": 39779,
            "subject": "Goods return",
            "description": "Please arrange to return these goods.",
            "project": {"id": 1, "identifier": "customer-service"},
            "helpdesk_ticket": {
                "id": 35159,
                "contact_id": 1890,
                "from_address": "callum@safetagtracking.com",
                "contact": {
                    "id": 1890,
                    "name": "Callum Mackeonis",
                    "company": "SafeTag Tracking",
                    "email": "callum@safetagtracking.com",
                },
            },
            "journals": [
                {"id": 71570, "notes": "Hello, yes we can arrange this today.", "created_on": "2026-04-14T14:29:49Z"}
            ],
        }

        docs = self._map(issue)

        issue_doc = next(doc for doc in docs if doc.payload["doc_type"] == "issue")
        journal_doc = next(doc for doc in docs if doc.payload["doc_type"] == "journal")
        contact_doc = next(doc for doc in docs if doc.payload["doc_type"] == "contact")
        for doc in (issue_doc, journal_doc, contact_doc):
            self.assertEqual(35159, doc.payload["helpdesk_ticket_id"])
            self.assertTrue(doc.payload["has_helpdesk_ticket"])
            self.assertEqual(1890, doc.payload["contact_id"])
            self.assertEqual("Callum Mackeonis", doc.payload["contact_name"])
            self.assertEqual("SafeTag Tracking", doc.payload["contact_company"])
            self.assertEqual("callum@safetagtracking.com", doc.payload["contact_email"])
        self.assertIn("Callum Mackeonis", issue_doc.text)
        self.assertIn("callum@safetagtracking.com", contact_doc.text)

    def test_configured_project_identifier_is_used_when_issue_payload_omits_identifier(self):
        issue = {
            "id": 42,
            "subject": "Widget order",
            "description": "Body",
            "project": {"id": 1, "name": "Customer Service"},
        }

        docs = self._map(issue, project_identifier="customer-service")

        self.assertEqual("customer-service", docs[0].payload["project_identifier"])
        self.assertEqual("Customer Service", docs[0].payload["project_name"])

    def test_internal_non_helpdesk_issue_keeps_project_metadata_without_contact(self):
        issue = {
            "id": 55,
            "subject": "Internal hiring task",
            "description": "Follow up with candidate.",
            "project": {"id": 68, "identifier": "hiring", "name": "Hiring"},
        }

        docs = self._map(issue)

        self.assertEqual(1, len(docs))
        self.assertEqual("hiring", docs[0].payload["project_identifier"])
        self.assertEqual("Hiring", docs[0].payload["project_name"])
        self.assertFalse(docs[0].payload["has_helpdesk_ticket"])
        self.assertIsNone(docs[0].payload["contact_id"])

    def test_issue_journals_messages_and_contact_are_mapped(self):
        issue = {
            "id": 42,
            "subject": "Widget order",
            "description": "Ticket envelope",
            "project": {"id": 7, "identifier": "fud-helpdesk"},
            "contact": {"id": 9, "email": "ada@example.com", "name": "Ada Lovelace"},
            "journals": [
                {"id": 5, "notes": "Private escalation note", "private_notes": True, "created_on": "2026-04-03T10:00:00Z"}
            ],
            "messages": [
                {"id": 6, "body": "Customer reply body", "direction": "incoming", "created_on": "2026-04-03T11:00:00Z"}
            ],
        }

        docs = self._map(issue)

        ids = {doc.id for doc in docs}
        types = {doc.payload["doc_type"] for doc in docs}
        self.assertIn("redmine:issue:42:journal:5:chunk:0", ids)
        self.assertIn("redmine:issue:42:message:6:chunk:0", ids)
        self.assertIn("redmine:contact:9:issue:42:chunk:0", ids)
        self.assertEqual({"issue", "journal", "message", "contact"}, types)
        journal = next(doc for doc in docs if doc.payload["doc_type"] == "journal")
        message = next(doc for doc in docs if doc.payload["doc_type"] == "message")
        self.assertEqual("private", journal.payload["visibility"])
        self.assertEqual("incoming", message.payload["direction"])

    def test_empty_documents_are_rejected(self):
        with self.assertRaises(ValueError):
            IndexDocument(id="x", text=" ", payload={})
if __name__ == "__main__":
unittest.main()
+46
View File
@@ -0,0 +1,46 @@
import unittest
from semantic_index.embeddings import OpenAIEmbedder
from semantic_index.models import IndexDocument
class FakeOpenAIClient:
    """Embeddings client double that records each request."""

    def __init__(self):
        self.calls = []

    def create_embeddings(self, model, inputs, dimensions=None):
        """Record the request and return one three-component vector per input.

        Each vector repeats the input's 1-based position within this call.
        """
        request = {"model": model, "inputs": list(inputs), "dimensions": dimensions}
        self.calls.append(request)
        vectors = []
        # Iterate the original ``inputs`` object, exactly as the recording copy did.
        for position, _text in enumerate(inputs, start=1):
            vectors.append([float(position)] * 3)
        return vectors
class OpenAIEmbedderTest(unittest.TestCase):
    """Batching and input validation of ``OpenAIEmbedder``."""

    def test_batches_embedding_requests(self):
        client = FakeOpenAIClient()
        embedder = OpenAIEmbedder(client=client, batch_size=2, dimensions=1536)
        docs = [
            IndexDocument(id=doc_id, text=text, payload={})
            for doc_id, text in (("a", "alpha"), ("b", "bravo"), ("c", "charlie"))
        ]

        vectors = embedder.embed_documents(docs)

        # The fake numbers vectors per request, so the third document (alone in
        # the second batch) gets position 1 again.
        self.assertEqual([[1.0, 1.0, 1.0], [2.0, 2.0, 2.0], [1.0, 1.0, 1.0]], vectors)
        self.assertEqual(2, len(client.calls))
        first_call = client.calls[0]
        self.assertEqual(["alpha", "bravo"], first_call["inputs"])
        self.assertEqual("text-embedding-3-small", first_call["model"])
        self.assertEqual(1536, first_call["dimensions"])

    def test_rejects_empty_or_oversized_chunks_before_api_call(self):
        client = FakeOpenAIClient()
        embedder = OpenAIEmbedder(client=client, max_chars=5)

        with self.assertRaises(ValueError):
            embedder.embed_texts(["ok", " "])
        with self.assertRaises(ValueError):
            embedder.embed_texts(["toolong"])

        self.assertEqual([], client.calls)
if __name__ == "__main__":
unittest.main()
+394
View File
@@ -0,0 +1,394 @@
import io
import json
import unittest
from contextlib import redirect_stdout
from pathlib import Path
from semantic_index.__main__ import main
from semantic_index.config import Settings
from semantic_index.models import SearchResult
class FakeSearchService:
    """Search double whose single hit depends on the query text."""

    def __init__(self):
        self.queries = []

    def search(self, query):
        """Record ``query`` and return zero or one canned result.

        Queries containing ``missing@example.test`` find nothing. Queries
        containing ``callum`` get the contact document; all others get the
        issue document.
        """
        self.queries.append(query)
        if "missing@example.test" in query.text:
            return []
        if "callum" in query.text:
            document_id, kind = "redmine:contact:1890:issue:39779:chunk:0", "contact"
        else:
            document_id, kind = "redmine:issue:39779:chunk:0", "issue"
        payload = {
            "source": "redmine",
            "doc_type": kind,
            "issue_id": 39779,
            "project_identifier": "customer-service",
            "contact_id": 1890,
            "contact_name": "Callum Mackeonis",
            "contact_email": "callum@safetagtracking.com",
            "contact_company": "SafeTag Tracking",
            "redmine_url": "http://redmine/issues/39779",
        }
        hit = SearchResult(
            id=document_id,
            score=0.58,
            text="Callum Mackeonis callum@safetagtracking.com SafeTag Tracking",
            payload=payload,
        )
        return [hit]

    def get_document(self, document_id):
        """Return a fixed journal document echoing ``document_id``."""
        payload = {
            "source": "redmine",
            "doc_type": "journal",
            "issue_id": 39778,
            "project_identifier": "customer-service",
            "contact_id": 1890,
            "contact_name": "Callum Mackeonis",
            "contact_email": "callum@safetagtracking.com",
            "redmine_url": "http://redmine/issues/39778",
        }
        return {"id": document_id, "text": "Full indexed text", "payload": payload}
class FakeStore:
    """Store double with a fixed document count and four canned documents."""

    def __init__(self):
        # Every ``limit`` passed to list_documents, in call order.
        self.list_limits = []

    def count_documents(self, source=None, project_identifier=None, doc_type=None):
        """Return the same count whatever filters are given."""
        return 12

    def list_documents(self, limit=10, source=None, project_identifier=None, doc_type=None):
        """Record ``limit`` and return the four canned documents unfiltered."""
        self.list_limits.append(limit)

        def helpdesk_payload(kind, source_hash):
            # Payload shared by the three documents that belong to issue 39779.
            return {
                "source": "redmine",
                "doc_type": kind,
                "issue_id": 39779,
                "project_identifier": "customer-service",
                "project_name": "Customer Service",
                "has_helpdesk_ticket": True,
                "contact_id": 1890,
                "contact_name": "Callum Mackeonis",
                "contact_email": "callum@safetagtracking.com",
                "contact_company": "SafeTag Tracking",
                "source_hash": source_hash,
                "redmine_url": "http://redmine/issues/39779",
            }

        issue_document = {
            "id": "redmine:issue:39779:chunk:0",
            "text": "Issue #39779: Goods return\nPlease return our goods.",
            "payload": helpdesk_payload("issue", "issue-hash"),
        }
        journal_document = {
            "id": "redmine:issue:39779:journal:71570:chunk:0",
            "text": "Hello, we can arrange this today.",
            "payload": helpdesk_payload("journal", "journal-hash"),
        }
        contact_document = {
            "id": "redmine:contact:1890:issue:39779:chunk:0",
            "text": "Callum Mackeonis callum@safetagtracking.com SafeTag Tracking",
            "payload": helpdesk_payload("contact", "contact-hash"),
        }
        # Plain issue without any helpdesk contact fields.
        ordinary_document = {
            "id": "redmine:issue:39800:chunk:0",
            "text": "Ordinary issue with no helpdesk contact.",
            "payload": {
                "source": "redmine",
                "doc_type": "issue",
                "issue_id": 39800,
                "project_identifier": "hiring",
                "project_name": "Hiring",
                "has_helpdesk_ticket": False,
                "source_hash": "ordinary-hash",
                "redmine_url": "http://redmine/issues/39800",
            },
        }
        return [issue_document, journal_document, contact_document, ordinary_document]
class FakeRedmineSource:
    """Redmine source double with one helpdesk issue."""

    def recent_helpdesk_issues(self, limit):
        """Return the canned issue, or nothing when ``limit`` is 0."""
        contact = {
            "id": 1890,
            "name": "Callum Mackeonis",
            "email": "callum@safetagtracking.com",
            "company": "SafeTag Tracking",
        }
        ticket = {"id": 35159, "contact_id": 1890, "contact": contact}
        issue = {
            "id": 39779,
            "subject": "Goods return",
            "description": "Please return our goods.",
            "project": {"id": 1, "identifier": "customer-service"},
            "helpdesk_ticket": ticket,
        }
        return [issue][:limit]
def fake_services(store=None, search=None):
    """Assemble the service mapping handed to ``main`` by these tests.

    Args:
        store: Optional store double; a falsy value selects a new ``FakeStore``.
        search: Optional search double; a falsy value selects a new
            ``FakeSearchService``.

    Returns:
        A dict with the keys ``settings``, ``search``, ``store``,
        ``redmine_source`` and ``backfill``.
    """
    settings_kwargs = {
        "openai_api_key": "",
        "qdrant_url": "http://qdrant",
        "qdrant_api_key": None,
        "qdrant_collection": "semantic",
        "redmine_url": "http://redmine",
        "redmine_api_key": "",
        "redmine_project_identifier": "customer-service",
        "sample_limit": 50,
        "bind_host": "127.0.0.1",
        "bind_port": 8787,
        "service_api_key": None,
        "refresh_state_path": Path(".cache/semantic_index/refresh_state.json"),
    }
    services = {"settings": Settings(**settings_kwargs)}
    # Injected doubles win; otherwise each call gets its own fresh fake.
    services["search"] = search if search else FakeSearchService()
    services["store"] = store if store else FakeStore()
    services["redmine_source"] = FakeRedmineSource()
    services["backfill"] = FakeBackfillService()
    return services
class FakeBackfillService:
    """Backfill double that logs every call and answers with synthetic totals."""

    def __init__(self):
        # Chronological log of ``(kind, *arguments)`` tuples, one per call.
        self.calls = []

    def backfill_redmine_sample(self, limit):
        """Record a sample backfill and report ``limit`` issues and documents."""
        self.calls.append(("sample", limit))
        return dict(source="redmine", issues=limit, documents=limit)

    def backfill_redmine_projects(self, projects, per_project_limit):
        """Record a multi-project backfill with one shared per-project limit."""
        self.calls.append(("projects", projects, per_project_limit))
        project_count = len(projects)
        total = project_count * per_project_limit
        breakdown = []
        for project in projects:
            breakdown.append(
                {
                    "project_identifier": project,
                    "issues": per_project_limit,
                    "documents": per_project_limit,
                }
            )
        return {
            "source": "redmine",
            "projects": project_count,
            "issues": total,
            "documents": total,
            "project_results": breakdown,
        }

    def backfill_redmine_project_limits(self, project_limits):
        """Record a backfill driven by a ``{project: limit}`` mapping."""
        self.calls.append(("project_limits", project_limits))
        total = sum(project_limits.values())
        breakdown = []
        for project, limit in project_limits.items():
            breakdown.append({"project_identifier": project, "issues": limit, "documents": limit})
        return {
            "source": "redmine",
            "projects": len(project_limits),
            "issues": total,
            "documents": total,
            "project_results": breakdown,
        }
class InspectCliTest(unittest.TestCase):
    """Drives the ``inspect`` and backfill CLI commands against in-memory fakes."""

    def _invoke(self, argv, **main_kwargs):
        """Call ``main(argv, **main_kwargs)`` and return everything it printed."""
        captured = io.StringIO()
        with redirect_stdout(captured):
            main(argv, **main_kwargs)
        return captured.getvalue()

    def run_cli(self, args):
        """Run the CLI with the default fake services and return its stdout."""
        return self._invoke(args, service_builder=fake_services)

    def test_no_args_prints_help_without_building_services(self):
        def broken_services():
            raise AssertionError("help should not build live services")

        printed = self._invoke([], service_builder=broken_services)

        self.assertIn("inspect", printed)

    def test_count_lists_matching_document_count(self):
        argv = ["inspect", "count", "--source", "redmine", "--project", "customer-service"]

        printed = self.run_cli(argv)

        self.assertIn("12", printed)

    def test_list_shows_snippet_and_metadata_by_default(self):
        argv = ["inspect", "list", "--limit", "5", "--source", "redmine", "--project", "customer-service"]

        printed = self.run_cli(argv)

        self.assertIn("redmine:issue:39779:chunk:0", printed)
        self.assertIn("issue #39779", printed.lower())
        for fragment in (
            "customer-service",
            "contact=#1890",
            "Callum Mackeonis",
            "callum@safetagtracking.com",
        ):
            self.assertIn(fragment, printed)
        self.assertNotIn("Full indexed text", printed)

    def test_search_runs_query_and_prints_citation(self):
        argv = ["inspect", "search", "order status", "--limit", "3", "--project", "customer-service"]

        printed = self.run_cli(argv)

        for fragment in ("score=0.5800", "http://redmine/issues/39779"):
            self.assertIn(fragment, printed)

    def test_show_prints_full_document_text(self):
        printed = self.run_cli(["inspect", "show", "redmine:issue:39778:chunk:0"])

        for fragment in ("Full indexed text", "doc_type=journal"):
            self.assertIn(fragment, printed)

    def test_preview_redmine_maps_documents_without_writing(self):
        argv = ["inspect", "preview-redmine", "--limit", "1", "--project", "customer-service"]

        printed = self.run_cli(argv)

        for fragment in (
            "redmine:issue:39779:chunk:0",
            "project=customer-service",
            "Please return our goods",
        ):
            self.assertIn(fragment, printed)

    def test_preview_redmine_uses_minimal_service_builder(self):
        services = []

        def minimal_builder(settings):
            services.append(settings.redmine_project_identifier)
            return {"settings": settings, "redmine_source": FakeRedmineSource()}

        def forbidden_builder():
            # The preview path must never fall back to the full service graph.
            raise AssertionError("full services should not be built")

        printed = self._invoke(
            ["inspect", "preview-redmine", "--limit", "1", "--project", "customer-service"],
            service_builder=forbidden_builder,
            preview_service_builder=minimal_builder,
            settings_loader=lambda: fake_services()["settings"],
        )

        self.assertEqual(["customer-service"], services)
        self.assertIn("redmine:issue:39779:chunk:0", printed)

    def test_audit_prints_doc_type_counts_contact_coverage_and_attachment_check(self):
        argv = ["inspect", "audit", "--limit", "10", "--source", "redmine", "--project", "customer-service"]

        printed = self.run_cli(argv)

        for fragment in (
            "documents=4",
            "doc_type issue=2",
            "doc_type journal=1",
            "doc_type contact=1",
            "contact_metadata 3/4",
            "helpdesk_contact_metadata 3/3",
            "project customer-service=3",
            "project hiring=1",
            "attachments=0",
        ):
            self.assertIn(fragment, printed)
        self.assertNotIn("missing_contact redmine:issue:39800:chunk:0", printed)

    def test_audit_json_returns_machine_readable_summary(self):
        argv = ["inspect", "audit", "--limit", "10", "--project", "customer-service", "--json"]

        summary = json.loads(self.run_cli(argv))

        self.assertEqual(4, summary["total_documents"])
        self.assertEqual(2, summary["doc_type_counts"]["issue"])
        self.assertEqual(3, summary["project_counts"]["customer-service"])
        self.assertEqual(1, summary["project_counts"]["hiring"])
        self.assertEqual([], summary["missing_helpdesk_contact_metadata"])

    def test_compare_redmine_reports_missing_stale_and_contact_mismatches(self):
        argv = ["inspect", "compare-redmine", "--limit", "1", "--project", "customer-service"]

        printed = self.run_cli(argv)

        for fragment in (
            "preview_documents=2",
            "indexed_documents=4",
            "stale",
            "redmine:issue:39779:chunk:0",
        ):
            self.assertIn(fragment, printed)

    def test_compare_redmine_fetches_a_large_index_window_to_avoid_false_missing_results(self):
        store = FakeStore()

        self._invoke(
            ["inspect", "compare-redmine", "--limit", "3", "--project", "customer-service"],
            service_builder=lambda: fake_services(store=store),
        )

        self.assertEqual(5000, store.list_limits[0])

    def test_smoke_search_prints_pass_fail_for_known_queries(self):
        argv = [
            "inspect",
            "smoke-search",
            "--project",
            "customer-service",
            "--email",
            "callum@safetagtracking.com",
            "--issue-id",
            "39779",
        ]

        printed = self.run_cli(argv)

        for fragment in (
            "PASS email callum@safetagtracking.com",
            "PASS issue 39779",
            "redmine:contact:1890:issue:39779:chunk:0",
        ):
            self.assertIn(fragment, printed)

    def test_smoke_search_uses_issue_id_filter_for_issue_checks(self):
        search = FakeSearchService()

        self._invoke(
            ["inspect", "smoke-search", "--project", "customer-service", "--issue-id", "39779"],
            service_builder=lambda: fake_services(search=search),
        )

        issue_queries = [recorded for recorded in search.queries if recorded.text == "39779"]
        self.assertEqual(39779, issue_queries[0].issue_id)

    def test_smoke_search_json_returns_check_results(self):
        argv = [
            "inspect",
            "smoke-search",
            "--project",
            "customer-service",
            "--email",
            "missing@example.test",
            "--json",
        ]

        first_check = json.loads(self.run_cli(argv))["checks"][0]

        self.assertFalse(first_check["passed"])
        self.assertEqual("email", first_check["kind"])

    def test_backfill_redmine_projects_cli_parses_comma_separated_projects(self):
        backfill = FakeBackfillService()
        services = {**fake_services(), "backfill": backfill}
        argv = [
            "--backfill-redmine-projects",
            "--projects",
            "customer-service,hiring",
            "--per-project-limit",
            "25",
        ]

        printed = self._invoke(argv, service_builder=lambda: services)

        self.assertEqual(("projects", ["customer-service", "hiring"], 25), backfill.calls[0])
        self.assertIn("'projects': 2", printed)

    def test_backfill_redmine_projects_cli_parses_project_specific_limits(self):
        backfill = FakeBackfillService()
        services = {**fake_services(), "backfill": backfill}
        argv = [
            "--backfill-redmine-projects",
            "--project-limits",
            "customer-service=500,hiring=200",
        ]

        printed = self._invoke(argv, service_builder=lambda: services)

        self.assertEqual(("project_limits", {"customer-service": 500, "hiring": 200}), backfill.calls[0])
        self.assertIn("'issues': 700", printed)
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
+58
View File
@@ -0,0 +1,58 @@
import subprocess
import tempfile
import unittest
from pathlib import Path
# Third ancestor of this file (``parents[2]``); presumably the repository
# root, since the installer path below is resolved relative to it.
ROOT = Path(__file__).resolve().parents[2]
# Installer script that the tests in this module execute as a subprocess.
INSTALLER = ROOT / "deploy" / "semantic-index" / "install.sh"
class SemanticIndexInstallerTest(unittest.TestCase):
    """Runs ``install.sh`` as a subprocess and checks its modes and warnings."""

    def run_installer(self, *args, env=None):
        """Execute the installer with ``ROOT`` as working directory.

        Args:
            *args: Command-line flags forwarded to the installer.
            env: Complete environment for the child process. When omitted, the
                caller's environment is used with every ``SEMANTIC_INDEX_*``
                variable removed, so assertions about the installer's default
                locations do not depend on what the developer shell or CI host
                happens to export.

        Returns:
            The ``subprocess.CompletedProcess`` with text ``stdout`` and
            ``stderr``; a non-zero exit status does not raise.
        """
        import os  # local import: this module's header does not import os

        if env is None:
            env = {
                key: value
                for key, value in os.environ.items()
                if not key.startswith("SEMANTIC_INDEX_")
            }
        return subprocess.run(
            [str(INSTALLER), *args],
            cwd=ROOT,
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
            env=env,
        )

    def test_default_mode_is_dry_run(self):
        result = self.run_installer()

        self.assertEqual(0, result.returncode, result.stderr)
        self.assertIn("mode=dry-run", result.stdout)
        self.assertIn("would run: sudo mkdir -p /opt/semantic-index", result.stdout)
        self.assertIn("would run: sudo rsync", result.stdout)
        self.assertNotIn("Semantic Index installed, but deployment is not complete.", result.stdout)

    def test_apply_prints_manual_next_step_warning(self):
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            # Point every installer location at the scratch directory so that
            # --apply never touches the real system paths.
            env = {
                "PATH": "/usr/bin:/bin",
                "SEMANTIC_INDEX_INSTALL_DIR": str(tmp_path / "opt" / "semantic-index"),
                "SEMANTIC_INDEX_ENV_FILE": str(tmp_path / "etc" / "semantic-index.env"),
                "SEMANTIC_INDEX_STATE_DIR": str(tmp_path / "var" / "lib" / "semantic-index"),
                "SEMANTIC_INDEX_LOG_DIR": str(tmp_path / "var" / "log" / "semantic-index"),
                "SEMANTIC_INDEX_SYSTEMD_DIR": str(tmp_path / "etc" / "systemd" / "system"),
            }

            # --no-system skips sudo/systemd operations; --skip-deps skips venv
            # creation and dependency installation.
            result = self.run_installer("--apply", "--no-system", "--skip-deps", env=env)

        self.assertEqual(0, result.returncode, result.stderr)
        self.assertIn("Semantic Index installed, but deployment is not complete.", result.stdout)
        self.assertIn("The refresh timer was NOT enabled automatically.", result.stdout)
        self.assertIn("Do not use --force-rebuild", result.stdout)

    def test_invalid_argument_fails_with_usage(self):
        # --force-rebuild is deliberately not an installer flag; it must be rejected.
        result = self.run_installer("--force-rebuild")

        self.assertEqual(2, result.returncode)
        self.assertIn("Usage:", result.stderr)
# Run the installer tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
+187
View File
@@ -0,0 +1,187 @@
import unittest
from semantic_index.models import IndexDocument
from semantic_index.qdrant_store import QdrantStore
class FakeMatchValue:
    """Minimal double for an exact-match condition; it only stores ``value``."""

    def __init__(self, value):
        # Kept verbatim so tests can assert on what the store asked to match.
        self.value = value
class FakeFieldCondition:
    """Minimal double for a per-field filter condition.

    Holds the payload ``key`` plus an optional ``match`` and ``range``; the
    parameter names are kept as-is because callers may pass them by keyword.
    """

    def __init__(self, key, match=None, range=None):
        self.key, self.match, self.range = key, match, range
class FakeFilter:
    """Minimal double for a filter object; it only stores the ``must`` conditions."""

    def __init__(self, must):
        # The conditions are stored as given so tests can inspect them.
        self.must = must
class FakeFilterSelector:
    """Minimal double for a selector that wraps a filter."""

    def __init__(self, filter):
        # The parameter name is kept as ``filter`` because callers may pass it by keyword.
        self.filter = filter
class FakePointIdsList:
    """Minimal double for a selector that names explicit point ids."""

    def __init__(self, points):
        # The ids are stored as given so tests can inspect them.
        self.points = points
class FakeQModels:
    """Namespace double assigned to ``store.qmodels`` in these tests.

    Each attribute maps a model name to the matching in-file fake class.
    """

    MatchValue = FakeMatchValue
    FieldCondition = FakeFieldCondition
    Filter = FakeFilter
    FilterSelector = FakeFilterSelector
    PointIdsList = FakePointIdsList

    class PointStruct:
        """Plain holder for one point: its id, vector and payload."""

        def __init__(self, id, vector, payload):
            self.id, self.vector, self.payload = id, vector, payload
class FakeCountResult:
    """Canned count response; every instance reports seven documents."""

    # Class-level constant, so instances need no constructor arguments.
    count = 7
class FakeRecord:
    """Canned stored point with an opaque id and a fresh payload dict per instance."""

    def __init__(self):
        self.id = "point-id"
        # ``document_id`` and ``text`` travel inside the payload alongside the
        # ordinary metadata fields.
        self.payload = dict(
            document_id="redmine:issue:1:chunk:0",
            text="Indexed text",
            source="redmine",
            project_identifier="customer-service",
        )
class FakeClient:
    """Client double that remembers what it was asked and returns canned data."""

    def __init__(self):
        # Most recent filter/selector seen by each operation; None until used.
        self.count_filter = None
        self.scroll_filter = None
        self.delete_filter = None
        self.delete_selector = None
        # One entry per ``upsert`` call, holding that call's points.
        self.upsert_batches = []

    def get_collections(self):
        """Report a single existing collection named ``semantic``."""

        class Collection:
            name = "semantic"

        class Collections:
            collections = [Collection()]

        return Collections()

    def count(self, collection_name, count_filter, exact):
        """Remember the filter and return the canned count result."""
        self.count_filter = count_filter
        return FakeCountResult()

    def scroll(self, collection_name, scroll_filter, limit, with_payload, with_vectors, offset=None):
        """Remember the filter and return one canned record with no next offset."""
        self.scroll_filter = scroll_filter
        records = [FakeRecord()]
        return records, None

    def delete(self, collection_name, points_selector):
        """Remember the selector and, when it carries one, its filter."""
        self.delete_selector = points_selector
        # Selectors without a ``filter`` attribute leave ``delete_filter`` as None.
        self.delete_filter = getattr(points_selector, "filter", None)

    def upsert(self, collection_name, points):
        """Append this call's points as one batch."""
        self.upsert_batches.append(points)
class QdrantStoreReadTest(unittest.TestCase):
    """Checks the filters, selectors and batches ``QdrantStore`` sends to its client."""

    def make_store(self):
        """Build a ``QdrantStore`` without running ``__init__`` and wire in fakes."""
        store = object.__new__(QdrantStore)
        wiring = {
            "client": FakeClient(),
            "collection": "semantic",
            "vector_size": 1536,
            "qmodels": FakeQModels,
            "upsert_batch_size": 2,
        }
        for attribute, value in wiring.items():
            setattr(store, attribute, value)
        return store

    def test_count_documents_builds_metadata_filter(self):
        store = self.make_store()

        count = store.count_documents(source="redmine", project_identifier="customer-service", doc_type="issue")

        self.assertEqual(7, count)
        conditions = store.client.count_filter.must
        keys = [condition.key for condition in conditions]
        self.assertEqual(["source", "project_identifier", "doc_type"], keys)
        self.assertEqual("customer-service", conditions[1].match.value)

    def test_list_documents_strips_internal_payload_fields(self):
        store = self.make_store()

        documents = store.list_documents(limit=5, source="redmine", project_identifier="customer-service")

        first = documents[0]
        self.assertEqual("redmine:issue:1:chunk:0", first["id"])
        self.assertEqual("Indexed text", first["text"])
        self.assertNotIn("document_id", first["payload"])
        self.assertNotIn("text", first["payload"])

    def test_delete_by_source_can_be_limited_to_project_scope(self):
        store = self.make_store()

        store.delete_by_source("redmine", project_identifier="customer-service")

        conditions = store.client.delete_filter.must
        keys = [condition.key for condition in conditions]
        self.assertEqual(["source", "project_identifier"], keys)
        self.assertEqual("redmine", conditions[0].match.value)
        self.assertEqual("customer-service", conditions[1].match.value)

    def test_list_documents_can_be_limited_to_issue_scope(self):
        store = self.make_store()

        store.list_documents(limit=5, source="redmine", project_identifier="customer-service", issue_id=39779)

        conditions = store.client.scroll_filter.must
        keys = [condition.key for condition in conditions]
        self.assertEqual(["source", "project_identifier", "issue_id"], keys)
        self.assertEqual(39779, conditions[2].match.value)

    def test_delete_documents_deletes_stable_document_point_ids(self):
        store = self.make_store()

        store.delete_documents(["redmine:issue:39779:chunk:0"])

        deleted_points = store.client.delete_selector.points
        self.assertEqual(1, len(deleted_points))
        # The point id sent to the client must differ from the raw document id.
        self.assertNotEqual("redmine:issue:39779:chunk:0", deleted_points[0])

    def test_upsert_sends_points_in_batches(self):
        store = self.make_store()
        documents = []
        for issue_id in range(5):
            documents.append(
                IndexDocument(
                    id=f"redmine:issue:{issue_id}:chunk:0",
                    text=f"Issue {issue_id}",
                    payload={"source": "redmine"},
                )
            )
        vectors = [[0.1, 0.2, 0.3] for _ in documents]

        store.upsert(documents, vectors)

        batch_sizes = [len(batch) for batch in store.client.upsert_batches]
        self.assertEqual([2, 2, 1], batch_sizes)
        self.assertEqual("Issue 0", store.client.upsert_batches[0][0].payload["text"])

    def test_list_documents_paginates_qdrant_scroll_until_requested_limit(self):
        class PagedClient(FakeClient):
            """Serves two records per page and exactly two pages."""

            def __init__(self):
                super().__init__()
                self.offsets = []

            def scroll(self, collection_name, scroll_filter, limit, with_payload, with_vectors, offset=None):
                self.offsets.append(offset)
                page_number = len(self.offsets)
                records = []
                for suffix in "ab":
                    record = FakeRecord()
                    record.payload = {**record.payload, "document_id": f"doc:{page_number}{suffix}"}
                    records.append(record)
                # Only the first request (no offset) advertises a further page.
                next_offset = "next" if offset is None else None
                return records, next_offset

        store = self.make_store()
        store.client = PagedClient()

        documents = store.list_documents(limit=3, source="redmine")

        self.assertEqual(["doc:1a", "doc:1b", "doc:2a"], [document["id"] for document in documents])
        self.assertEqual([None, "next"], store.client.offsets)
# Run the store tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
+102
View File
@@ -0,0 +1,102 @@
import unittest
from semantic_index.redmine import RedmineApiSource
class RecordingRedmineSource(RedmineApiSource):
    """Redmine source whose JSON fetch is replaced by canned data and a URL log."""

    def __init__(self):
        super().__init__(
            redmine_url="http://redmine.local",
            api_key="secret",
            project_identifier="customer-service",
        )
        # Every URL handed to ``_get_json``, in request order.
        self.urls = []

    def _get_json(self, url):
        """Log ``url`` and answer with a one-issue listing or a single issue."""
        self.urls.append(url)
        is_listing = url.startswith("http://redmine.local/issues.json")
        if not is_listing:
            return {"issue": {"id": 39779, "subject": "Goods return"}}
        return {"issues": [{"id": 39779}]}
class PagedRedmineSource(RedmineApiSource):
    """Redmine source that synthesises listing pages from ``offset`` and ``limit``."""

    def __init__(self):
        super().__init__(
            redmine_url="http://redmine.local",
            api_key="secret",
            project_identifier="customer-service",
        )
        # Every URL handed to ``_get_json``, in request order.
        self.urls = []

    def _get_json(self, url):
        """Log ``url`` and fabricate the listing page or issue it asks for."""
        self.urls.append(url)
        if not url.startswith("http://redmine.local/issues.json"):
            # Detail request: the id sits between "/issues/" and the extension.
            tail = url.split("/issues/", 1)[1]
            issue_id = int(tail.split(".", 1)[0])
            return {"issue": {"id": issue_id, "subject": f"Issue {issue_id}"}}
        query = url.split("?", 1)[1]
        params = {}
        for pair in query.split("&"):
            name, value = pair.split("=", 1)
            params[name] = value
        offset = int(params.get("offset", "0"))
        limit = int(params.get("limit", "0"))
        # Ids are consecutive, so page N continues where page N-1 ended.
        first_id = offset + 1
        return {"issues": [{"id": issue_id} for issue_id in range(first_id, first_id + limit)]}
class DuplicatePagedRedmineSource(RedmineApiSource):
    """Redmine source whose second listing page repeats an id from the first."""

    def __init__(self):
        super().__init__(
            redmine_url="http://redmine.local",
            api_key="secret",
            project_identifier="customer-service",
        )

    def _get_json(self, url):
        """Serve overlapping listing pages, or a fabricated issue for detail URLs."""
        if not url.startswith("http://redmine.local/issues.json"):
            tail = url.split("/issues/", 1)[1]
            issue_id = int(tail.split(".", 1)[0])
            return {"issue": {"id": issue_id, "subject": f"Issue {issue_id}"}}
        query = url.split("?", 1)[1]
        params = {}
        for pair in query.split("&"):
            name, value = pair.split("=", 1)
            params[name] = value
        offset = int(params.get("offset", "0"))
        # Issue 2 appears on both pages; any other offset yields an empty page.
        ids_by_offset = {0: [1, 2], 2: [2, 3]}
        return {"issues": [{"id": issue_id} for issue_id in ids_by_offset.get(offset, [])]}
class RedmineApiSourceTest(unittest.TestCase):
    """Checks which URLs ``RedmineApiSource`` requests and how it pages results."""

    LISTING_PREFIX = "http://redmine.local/issues.json"

    def test_recent_issue_summaries_do_not_fetch_issue_details(self):
        source = RecordingRedmineSource()

        summaries = list(source.recent_issue_summaries(limit=1))

        self.assertEqual(39779, summaries[0]["id"])
        self.assertEqual(1, len(source.urls))
        self.assertTrue(source.urls[0].startswith(self.LISTING_PREFIX))

    def test_issue_detail_fetches_journals_and_helpdesk(self):
        source = RecordingRedmineSource()

        detail = source.issue_detail(39779)

        self.assertEqual(39779, detail["id"])
        self.assertIn("include=journals%2Chelpdesk", source.urls[0])

    def test_recent_helpdesk_issues_requests_helpdesk_include_with_journals(self):
        source = RecordingRedmineSource()

        issues = list(source.recent_helpdesk_issues(limit=1))

        self.assertEqual(39779, issues[0]["id"])
        listing_url, detail_url = source.urls[0], source.urls[1]
        self.assertIn("include=journals%2Chelpdesk", detail_url)
        self.assertIn("subproject_id=%21%2A", listing_url)

    def test_recent_helpdesk_issues_paginates_past_redmine_page_limit(self):
        source = PagedRedmineSource()

        issues = list(source.recent_helpdesk_issues(limit=250))

        self.assertEqual(250, len(issues))
        list_urls = [url for url in source.urls if url.startswith(self.LISTING_PREFIX)]
        self.assertEqual(3, len(list_urls))
        self.assertIn("limit=100", list_urls[0])
        for page, expected_offset in enumerate(("offset=0", "offset=100", "offset=200")):
            self.assertIn(expected_offset, list_urls[page])

    def test_recent_helpdesk_issues_skips_duplicate_issue_ids_across_pages(self):
        source = DuplicatePagedRedmineSource()

        issues = list(source.recent_helpdesk_issues(limit=3))

        self.assertEqual([1, 2, 3], [fetched["id"] for fetched in issues])
# Run the Redmine source tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
+277
View File
@@ -0,0 +1,277 @@
import io
import json
import tempfile
import unittest
from contextlib import redirect_stdout
from pathlib import Path
from semantic_index.__main__ import main
from semantic_index.models import IndexDocument
from semantic_index.refresh import FileRefreshState, RedmineRefreshService
def issue(updated_on="2026-04-25T12:00:00Z"):
    """Build the canned Redmine issue used throughout these tests.

    Args:
        updated_on: Timestamp string stored under ``updated_on``.

    Returns:
        A new issue dict on every call.
    """
    project = dict(id=1, identifier="customer-service", name="Customer Service")
    return dict(
        id=39779,
        subject="Goods return",
        description="Please return our goods.",
        updated_on=updated_on,
        project=project,
    )
class FakeRedmineSource:
    """In-memory Redmine source that serves a fixed issue list and logs each fetch."""

    # Class-level default; whatever value is current is recorded with every call.
    project_identifier = None

    def __init__(self, issues=None):
        """Store the issues to serve.

        Args:
            issues: Issues returned by ``recent_helpdesk_issues``. Only an
                omitted (``None``) argument selects the single default issue;
                an explicitly empty list is honoured, so a test can model a
                source that has nothing to return.
        """
        self.issues = [issue()] if issues is None else issues
        # One ``(project_identifier, limit)`` tuple per fetch, in call order.
        self.calls = []

    def recent_helpdesk_issues(self, limit):
        """Log the request and return at most ``limit`` of the stored issues."""
        self.calls.append((self.project_identifier, limit))
        return self.issues[:limit]
class SummaryDetailRedmineSource(FakeRedmineSource):
    """Source double with the two-step API: issue summaries, then per-issue detail."""

    def __init__(self, summaries, details):
        """Store canned summaries and a ``{issue_id: detail}`` mapping."""
        super().__init__([])
        self.summaries, self.details = summaries, details
        # Separate logs so tests can tell summary scans from detail fetches.
        self.summary_calls, self.detail_calls = [], []

    def recent_issue_summaries(self, limit):
        """Log the request and return at most ``limit`` summaries."""
        request = (self.project_identifier, limit)
        self.summary_calls.append(request)
        return self.summaries[:limit]

    def issue_detail(self, issue_id):
        """Log the id and return its detail; an unknown id raises ``KeyError``."""
        self.detail_calls.append(issue_id)
        return self.details[issue_id]
class RecordingEmbedder:
    """Embedder double that logs each batch and returns one fixed vector per document."""

    def __init__(self):
        # One list per ``embed_documents`` call, holding that call's documents.
        self.calls = []

    def embed_documents(self, docs):
        """Record a copy of ``docs`` and return ``[0.1, 0.2, 0.3]`` for each one."""
        batch = list(docs)
        self.calls.append(batch)
        # Iterate ``docs`` itself, and build a new vector list per document.
        return [[0.1, 0.2, 0.3] for _ in docs]
class RefreshStore:
    """In-memory store double that serves pre-indexed documents and logs writes."""

    def __init__(self, existing=None):
        """Set up the store.

        Args:
            existing: Mapping of document id to stored document. Only an
                omitted (``None``) argument creates a new dict; a mapping the
                caller passes in, even an empty one, is used as-is so later
                changes on either side stay visible to both.
        """
        self.existing = {} if existing is None else existing
        # One ``(documents, vectors)`` tuple per ``upsert`` call.
        self.upserts = []
        # Flat list of every id passed to ``delete_documents``.
        self.deleted_ids = []

    def list_documents(self, limit=10, source=None, project_identifier=None, doc_type=None, issue_id=None):
        """Return up to ``limit`` stored documents; the filter arguments are ignored."""
        return list(self.existing.values())[:limit]

    def upsert(self, docs, vectors):
        """Record copies of the documents and vectors without storing them."""
        self.upserts.append((list(docs), list(vectors)))

    def delete_documents(self, document_ids):
        """Record the ids requested for deletion; ``existing`` is left untouched."""
        self.deleted_ids.extend(document_ids)
class RedmineRefreshServiceTest(unittest.TestCase):
    """Covers incremental refresh: skipping, re-embedding, deletion, dry runs and state."""

    def _service(self, source=None, state=None):
        """Build a refresh service over fresh fakes and return ``(service, embedder)``."""
        embedder = RecordingEmbedder()
        wiring = {
            "source": FakeRedmineSource() if source is None else source,
            "embedder": embedder,
            "store": RefreshStore(),
        }
        if state is not None:
            wiring["state"] = state
        return RedmineRefreshService(**wiring), embedder

    @staticmethod
    def _mark_indexed(service, document, **overrides):
        """Place ``document`` in the fake store as though it were already indexed."""
        stored = {"id": document.id, "text": document.text, "payload": dict(document.payload)}
        stored.update(overrides)
        service.store.existing[document.id] = stored

    @staticmethod
    def _refreshed_state(directory):
        """Create a state file in ``directory`` with one recorded successful refresh."""
        state = FileRefreshState(Path(directory) / "refresh.json")
        state.mark_success("customer-service", "2026-04-25T12:00:00Z")
        return state

    def test_refresh_skips_embeddings_when_source_hash_matches_existing_document(self):
        service, embedder = self._service()
        candidate = service.mapper.issue_to_documents(issue())[0]
        self._mark_indexed(service, candidate)

        result = service.refresh_redmine_project_limits({"customer-service": 1})

        self.assertEqual(1, result["unchanged_documents"])
        self.assertEqual(0, result["embedded_documents"])
        self.assertEqual([], embedder.calls)
        self.assertEqual([], service.store.upserts)

    def test_refresh_embeds_only_changed_and_new_documents(self):
        service, embedder = self._service()
        candidate = service.mapper.issue_to_documents(issue())[0]
        self._mark_indexed(
            service,
            candidate,
            text="Old text",
            payload={**candidate.payload, "source_hash": "old-hash"},
        )

        result = service.refresh_redmine_project_limits({"customer-service": 1})

        self.assertEqual(1, result["changed_documents"])
        self.assertEqual(1, result["embedded_documents"])
        self.assertEqual([[candidate]], embedder.calls)
        upserted_documents = service.store.upserts[0][0]
        self.assertEqual([candidate.id], [doc.id for doc in upserted_documents])

    def test_refresh_deletes_stale_issue_documents_without_embedding(self):
        service, embedder = self._service()
        candidate = service.mapper.issue_to_documents(issue())[0]
        self._mark_indexed(service, candidate)
        stale_id = "redmine:issue:39779:journal:1:chunk:0"
        service.store.existing[stale_id] = {
            "id": stale_id,
            "text": "Deleted note",
            "payload": {"source_hash": "gone", "issue_id": 39779},
        }

        result = service.refresh_redmine_project_limits({"customer-service": 1})

        self.assertEqual(1, result["stale_documents"])
        self.assertEqual(["redmine:issue:39779:journal:1:chunk:0"], service.store.deleted_ids)
        self.assertEqual([], embedder.calls)

    def test_dry_run_reports_planned_embeddings_without_embedding_or_mutating(self):
        service, embedder = self._service()

        result = service.refresh_redmine_project_limits({"customer-service": 1}, dry_run=True)

        self.assertEqual(1, result["new_documents"])
        self.assertEqual(1, result["would_embed_documents"])
        self.assertEqual(0, result["embedded_documents"])
        self.assertEqual([], embedder.calls)
        self.assertEqual([], service.store.upserts)
        self.assertEqual([], service.store.deleted_ids)

    def test_force_rebuild_embeds_unchanged_documents(self):
        service, embedder = self._service()
        candidate = service.mapper.issue_to_documents(issue())[0]
        self._mark_indexed(service, candidate)

        result = service.refresh_redmine_project_limits({"customer-service": 1}, force_rebuild=True)

        self.assertEqual(1, result["force_rebuilt_documents"])
        self.assertEqual(1, result["embedded_documents"])
        self.assertEqual([[candidate]], embedder.calls)

    def test_force_rebuild_ignores_refresh_state_window_for_fetched_candidates(self):
        # The issue predates the recorded refresh by two hours.
        source = FakeRedmineSource([issue(updated_on="2026-04-25T10:00:00Z")])
        with tempfile.TemporaryDirectory() as tmp:
            service, _embedder = self._service(source=source, state=self._refreshed_state(tmp))

            result = service.refresh_redmine_project_limits(
                {"customer-service": 1},
                force_rebuild=True,
                overlap_minutes=15,
            )

            self.assertEqual(0, result["skipped_issues"])
            self.assertEqual(1, result["embedded_documents"])

    def test_file_refresh_state_updates_only_when_called(self):
        with tempfile.TemporaryDirectory() as tmp:
            state_file = Path(tmp) / "refresh.json"
            state = FileRefreshState(state_file)
            self.assertEqual({}, state.load())

            state.mark_success("customer-service", "2026-04-25T12:00:00Z")

            expected = {"projects": {"customer-service": {"last_successful_refresh_at": "2026-04-25T12:00:00Z"}}}
            self.assertEqual(expected, json.loads(state_file.read_text(encoding="utf-8")))

    def test_refresh_state_skips_issues_older_than_overlap_window(self):
        source = FakeRedmineSource([issue(updated_on="2026-04-25T10:00:00Z")])
        with tempfile.TemporaryDirectory() as tmp:
            service, embedder = self._service(source=source, state=self._refreshed_state(tmp))

            result = service.refresh_redmine_project_limits(
                {"customer-service": 1},
                dry_run=True,
                overlap_minutes=15,
            )

            self.assertEqual(1, result["issues"])
            self.assertEqual(1, result["skipped_issues"])
            self.assertEqual(0, result["documents"])
            self.assertEqual([], embedder.calls)

    def test_refresh_skips_old_summaries_without_fetching_issue_detail(self):
        old_summary = {"id": 39779, "updated_on": "2026-04-25T10:00:00Z"}
        new_summary = {"id": 39780, "updated_on": "2026-04-25T11:50:00Z"}
        # Detail exists only for the newer issue, so fetching the older one would fail.
        source = SummaryDetailRedmineSource(
            summaries=[old_summary, new_summary],
            details={39780: {**issue("2026-04-25T11:50:00Z"), "id": 39780}},
        )
        with tempfile.TemporaryDirectory() as tmp:
            service, _embedder = self._service(source=source, state=self._refreshed_state(tmp))

            result = service.refresh_redmine_project_limits(
                {"customer-service": 2},
                dry_run=True,
                overlap_minutes=15,
            )

            self.assertEqual(2, result["scanned_issues"])
            self.assertEqual(1, result["skipped_issues"])
            self.assertEqual(1, result["detail_fetched_issues"])
            self.assertEqual([39780], source.detail_calls)
class RefreshCliTest(unittest.TestCase):
    """Checks how ``main`` parses the ``--refresh-redmine-projects`` options."""

    @staticmethod
    def _run_main(argv, services):
        """Call ``main`` with a fixed service mapping and return its stdout."""
        captured = io.StringIO()
        with redirect_stdout(captured):
            main(argv, service_builder=lambda: services)
        return captured.getvalue()

    def test_refresh_redmine_projects_cli_parses_project_limits_and_dry_run(self):
        class FakeRefresh:
            """Records the arguments of every refresh call."""

            def __init__(self):
                self.calls = []

            def refresh_redmine_project_limits(self, project_limits, dry_run=False, force_rebuild=False, overlap_minutes=15):
                received = (project_limits, dry_run, force_rebuild, overlap_minutes)
                self.calls.append(received)
                return {
                    "source": "redmine",
                    "projects": len(project_limits),
                    "issues": sum(project_limits.values()),
                }

        refresh = FakeRefresh()
        argv = [
            "--refresh-redmine-projects",
            "--project-limits",
            "customer-service=5,hiring=2",
            "--dry-run",
            "--overlap-minutes",
            "30",
        ]

        printed = self._run_main(argv, {"refresh": refresh})

        self.assertEqual(({"customer-service": 5, "hiring": 2}, True, False, 30), refresh.calls[0])
        self.assertIn("'projects': 2", printed)

    def test_refresh_redmine_projects_cli_can_override_state_path(self):
        class FakeRefresh:
            """Echoes back the path of whatever state object it has been given."""

            def __init__(self):
                self.state = None

            def refresh_redmine_project_limits(self, project_limits, dry_run=False, force_rebuild=False, overlap_minutes=15):
                return {"state_path": str(self.state.path)}

        refresh = FakeRefresh()
        argv = [
            "--refresh-redmine-projects",
            "--project-limits",
            "customer-service=1",
            "--state-path",
            "/tmp/semantic-refresh-state.json",
        ]

        printed = self._run_main(argv, {"refresh": refresh})

        self.assertIn("/tmp/semantic-refresh-state.json", printed)
# Run the refresh tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
+85
View File
@@ -0,0 +1,85 @@
import unittest
from semantic_index.models import IndexDocument, SearchQuery, SearchResult
from semantic_index.qdrant_store import build_filter, point_id_for_document
from semantic_index.search import HybridSearchService, keyword_boost
class FakeEmbedder:
    """Embedder double that maps every query to the same three-component vector."""

    def embed_query(self, text):
        """Ignore ``text`` and return a new ``[0.1, 0.2, 0.3]`` list."""
        fixed_vector = [0.1, 0.2, 0.3]
        return fixed_vector
class FakeStore:
    """Store double that returns two canned hits and remembers the last query."""

    def __init__(self):
        # Set by ``search``; stays None until the first call.
        self.query = None

    def search(self, vector, query, limit):
        """Remember ``query`` and return at most ``limit`` canned results.

        The higher-scored hit has generic text, while the lower-scored hit
        contains an email address and an order number.
        """
        self.query = query
        weak = SearchResult(
            id="weak",
            score=0.7,
            text="general support text",
            payload={"redmine_url": "http://redmine/issues/1"},
        )
        strong = SearchResult(
            id="strong",
            score=0.6,
            text="Customer ada@example.com asked about ORD-12345",
            payload={"redmine_url": "http://redmine/issues/2"},
        )
        return [weak, strong][:limit]
class SearchTest(unittest.TestCase):
    """Covers point-id derivation, filter construction and keyword boosting."""

    def test_qdrant_point_id_is_deterministic_uuid_for_stable_document_id(self):
        document_id = "redmine:issue:42:journal:5:chunk:0"

        first = point_id_for_document(document_id)
        second = point_id_for_document(document_id)

        self.assertEqual(first, second)
        self.assertRegex(first, r"^[0-9a-f-]{36}$")

    def test_filter_maps_supported_metadata(self):
        query = SearchQuery(
            text="printer",
            source="redmine",
            project_identifier="fud-helpdesk",
            doc_type="message",
            issue_id=42,
            contact_email="ada@example.com",
            date_from="2026-04-01T00:00:00Z",
            date_to="2026-04-30T23:59:59Z",
        )
        # Both date bounds are expected to collapse into one range condition.
        expected_conditions = [
            {"key": "source", "match": {"value": "redmine"}},
            {"key": "project_identifier", "match": {"value": "fud-helpdesk"}},
            {"key": "doc_type", "match": {"value": "message"}},
            {"key": "issue_id", "match": {"value": 42}},
            {"key": "contact_email", "match": {"value": "ada@example.com"}},
            {"key": "created_on", "range": {"gte": "2026-04-01T00:00:00Z", "lte": "2026-04-30T23:59:59Z"}},
        ]

        qfilter = build_filter(query)

        self.assertEqual(expected_conditions, qfilter["must"])

    def test_keyword_boost_prioritizes_exact_email_and_order_matches(self):
        query_text = 'ada@example.com "ORD-12345"'
        weak = SearchResult(id="weak", score=0.7, text="general support text", payload={})
        strong = SearchResult(
            id="strong",
            score=0.6,
            text="Customer ada@example.com asked about ORD-12345",
            payload={},
        )

        strong_boost = keyword_boost(query_text, strong)
        weak_boost = keyword_boost(query_text, weak)
        self.assertGreater(strong_boost, weak_boost)

        service = HybridSearchService(embedder=FakeEmbedder(), store=FakeStore())
        results = service.search(SearchQuery(text='ada@example.com "ORD-12345"', limit=2))

        top_hit = results[0]
        self.assertEqual("strong", top_hit.id)
        self.assertEqual("http://redmine/issues/2", top_hit.citation["url"])
# Run the search tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
@@ -0,0 +1,41 @@
import os
import subprocess
import tempfile
import unittest
from pathlib import Path
# Third ancestor of this file (``parents[2]``); presumably the repository
# root, since the wrapper path below is resolved relative to it.
ROOT = Path(__file__).resolve().parents[2]
# Shell wrapper that the test in this module executes as a subprocess.
REFRESH = ROOT / "semantic_index" / "refresh.sh"
class SemanticIndexShellWrapperTest(unittest.TestCase):
    """Runs ``refresh.sh`` from a foreign working directory with a stub interpreter."""

    def test_refresh_wrapper_is_self_locating_when_called_from_another_directory(self):
        with tempfile.TemporaryDirectory() as tmp:
            scratch = Path(tmp)
            env = dict(os.environ)
            env.update(
                {
                    # /bin/echo stands in for Python, so the wrapper prints the
                    # command line it would have executed.
                    "PYTHON": "/bin/echo",
                    "SEMANTIC_INDEX_PROJECT_LIMITS": "customer-service=5",
                    "SEMANTIC_INDEX_LOG_DIR": str(scratch / "logs"),
                    "SEMANTIC_INDEX_STATE_PATH": str(scratch / "state" / "refresh_state.json"),
                }
            )

            # The scratch directory is the working directory, so the wrapper
            # cannot rely on being started from the repository.
            result = subprocess.run(
                [str(REFRESH)],
                cwd=tmp,
                env=env,
                text=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )

        self.assertEqual(0, result.returncode, result.stderr)
        for fragment in (
            "-m semantic_index --refresh-redmine-projects",
            "--project-limits customer-service=5",
            "log_file=",
        ):
            self.assertIn(fragment, result.stdout)
# Run the shell wrapper test when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()