open-notebook/api/routers/embedding_rebuild.py

191 lines
6.9 KiB
Python

from fastapi import APIRouter, HTTPException
from loguru import logger
from surreal_commands import get_command_status
from api.command_service import CommandService
from api.models import (
RebuildProgress,
RebuildRequest,
RebuildResponse,
RebuildStats,
RebuildStatusResponse,
)
from open_notebook.database.repository import repo_query
# Router that the embedding-rebuild endpoints in this module are registered on.
router = APIRouter()
def _extract_count(result) -> int:
    """Pull an integer count out of a count-query result.

    ``result`` is the value returned by ``repo_query``. Only its first row is
    inspected: it may be a dict carrying a ``count`` key or a bare integer.
    Anything else (empty result, missing key, non-integer value) yields 0 so
    that a malformed row cannot break the estimate.
    """
    if not result:
        return 0
    first = result[0]
    if isinstance(first, dict):
        first = first.get("count", 0)
    return first if isinstance(first, int) else 0


@router.post("/rebuild", response_model=RebuildResponse)
async def start_rebuild(request: RebuildRequest):
    """
    Start a background job to rebuild embeddings.

    - **mode**: "existing" (re-embed items with embeddings) or "all" (embed everything)
    - **include_sources**: Include sources in rebuild (default: true)
    - **include_notes**: Include notes in rebuild (default: true)
    - **include_insights**: Include insights in rebuild (default: true)

    Returns command ID to track progress and estimated item count.
    """
    # Any failure below (import, count query, job submission) is reported to
    # the client as an HTTP 500 carrying the original error text.
    try:
        logger.info(f"Starting rebuild request: mode={request.mode}")

        # Imported for its side effect: ensures the commands are registered.
        import commands.embedding_commands  # noqa: F401

        # Pick one quick count query per requested item type. The total is
        # only a rough estimate made before the command itself runs.
        existing_only = request.mode == "existing"
        count_queries = []

        if request.include_sources:
            if existing_only:
                # Count sources with embeddings
                count_queries.append(
                    """
                    SELECT VALUE count(array::distinct(
                        SELECT VALUE source.id
                        FROM source_embedding
                        WHERE embedding != none AND array::len(embedding) > 0
                    )) as count FROM {}
                    """
                )
            else:
                # Count all sources with content
                count_queries.append(
                    "SELECT VALUE count() as count FROM source WHERE full_text != none GROUP ALL"
                )

        if request.include_notes:
            if existing_only:
                count_queries.append(
                    "SELECT VALUE count() as count FROM note WHERE embedding != none AND array::len(embedding) > 0 GROUP ALL"
                )
            else:
                count_queries.append(
                    "SELECT VALUE count() as count FROM note WHERE content != none GROUP ALL"
                )

        if request.include_insights:
            if existing_only:
                count_queries.append(
                    "SELECT VALUE count() as count FROM source_insight WHERE embedding != none AND array::len(embedding) > 0 GROUP ALL"
                )
            else:
                count_queries.append(
                    "SELECT VALUE count() as count FROM source_insight GROUP ALL"
                )

        # Queries run one at a time, in the same order as the flags above.
        total_estimate = 0
        for query in count_queries:
            total_estimate += _extract_count(await repo_query(query))

        logger.info(f"Estimated {total_estimate} items to process")

        # Submit command
        command_id = await CommandService.submit_command_job(
            "open_notebook",
            "rebuild_embeddings",
            {
                "mode": request.mode,
                "include_sources": request.include_sources,
                "include_notes": request.include_notes,
                "include_insights": request.include_insights,
            },
        )
        logger.info(f"Submitted rebuild command: {command_id}")

        return RebuildResponse(
            command_id=command_id,
            total_items=total_estimate,
            message=f"Rebuild operation started. Estimated {total_estimate} items to process.",
        )
    except Exception as e:
        # One call records both the message and the traceback.
        logger.exception(f"Failed to start rebuild: {e}")
        raise HTTPException(
            status_code=500, detail=f"Failed to start rebuild operation: {str(e)}"
        ) from e
@router.get("/rebuild/{command_id}/status", response_model=RebuildStatusResponse)
async def get_rebuild_status(command_id: str):
    """
    Get the status of a rebuild operation.

    Returns:
    - **status**: queued, running, completed, failed
    - **progress**: processed count, total count, percentage
    - **stats**: breakdown by type (sources, notes, insights, failed)
    - **timestamps**: started_at, completed_at
    """
    # Responds 404 when no command status is found; any other failure is
    # reported as an HTTP 500 carrying the original error text.
    try:
        # Get command status from surreal_commands
        status = await get_command_status(command_id)
        if not status:
            raise HTTPException(status_code=404, detail="Rebuild command not found")

        # Build response based on status
        response = RebuildStatusResponse(
            command_id=command_id,
            status=status.status,
        )

        # Only a non-empty dict result carries progress/stats metadata.
        result = status.result if isinstance(status.result, dict) else None

        if result:
            total = result.get("total_items")
            processed = result.get("processed_items")
            # A key that is present but None is treated like a missing key,
            # so a half-populated result cannot turn a status poll into a 500.
            if total is not None and processed is not None:
                response.progress = RebuildProgress(
                    processed=processed,
                    total=total,
                    # Guard against division by zero when nothing is queued.
                    percentage=round((processed / total * 100) if total > 0 else 0, 2),
                )

            # Build stats
            response.stats = RebuildStats(
                sources=result.get("sources_processed", 0),
                notes=result.get("notes_processed", 0),
                insights=result.get("insights_processed", 0),
                failed=result.get("failed_items", 0),
            )

        # Add timestamps
        created_at = getattr(status, "created", None)
        if created_at:
            response.started_at = str(created_at)
        # NOTE(review): `updated` is reported as completed_at for every status,
        # including non-terminal ones -- confirm this is intended.
        updated_at = getattr(status, "updated", None)
        if updated_at:
            response.completed_at = str(updated_at)

        # Add error message if failed
        if status.status == "failed" and result:
            response.error_message = result.get("error_message", "Unknown error")

        return response
    except HTTPException:
        # Let the 404 above (and any other HTTP error) pass through unchanged.
        raise
    except Exception as e:
        # One call records both the message and the traceback.
        logger.exception(f"Failed to get rebuild status: {e}")
        raise HTTPException(
            status_code=500, detail=f"Failed to get rebuild status: {str(e)}"
        ) from e