refac: metadata handling in vectordb

This commit is contained in:
Timothy Jaeryang Baek 2025-07-31 17:45:06 +04:00
parent aa83ebae58
commit 6a17ba5b7a
5 changed files with 31 additions and 24 deletions

View File

@ -11,6 +11,8 @@ from open_webui.retrieval.vector.main import (
SearchResult,
GetResult,
)
from open_webui.retrieval.vector.utils import stringify_metadata
from open_webui.config import (
CHROMA_DATA_PATH,
CHROMA_HTTP_HOST,
@ -144,7 +146,7 @@ class ChromaClient(VectorDBBase):
ids = [item["id"] for item in items]
documents = [item["text"] for item in items]
embeddings = [item["vector"] for item in items]
metadatas = [item["metadata"] for item in items]
metadatas = [stringify_metadata(item["metadata"]) for item in items]
for batch in create_batches(
api=self.client,
@ -164,7 +166,7 @@ class ChromaClient(VectorDBBase):
ids = [item["id"] for item in items]
documents = [item["text"] for item in items]
embeddings = [item["vector"] for item in items]
metadatas = [item["metadata"] for item in items]
metadatas = [stringify_metadata(item["metadata"]) for item in items]
collection.upsert(
ids=ids, documents=documents, embeddings=embeddings, metadatas=metadatas

View File

@ -3,6 +3,8 @@ from pymilvus import FieldSchema, DataType
import json
import logging
from typing import Optional
from open_webui.retrieval.vector.utils import stringify_metadata
from open_webui.retrieval.vector.main import (
VectorDBBase,
VectorItem,
@ -311,7 +313,7 @@ class MilvusClient(VectorDBBase):
"id": item["id"],
"vector": item["vector"],
"data": {"text": item["text"]},
"metadata": item["metadata"],
"metadata": stringify_metadata(item["metadata"]),
}
for item in items
],
@ -347,7 +349,7 @@ class MilvusClient(VectorDBBase):
"id": item["id"],
"vector": item["vector"],
"data": {"text": item["text"]},
"metadata": item["metadata"],
"metadata": stringify_metadata(item["metadata"]),
}
for item in items
],

View File

@ -26,6 +26,8 @@ from pgvector.sqlalchemy import Vector
from sqlalchemy.ext.mutable import MutableDict
from sqlalchemy.exc import NoSuchTableError
from open_webui.retrieval.vector.utils import stringify_metadata
from open_webui.retrieval.vector.main import (
VectorDBBase,
VectorItem,
@ -235,7 +237,7 @@ class PgvectorClient(VectorDBBase):
vector=vector,
collection_name=collection_name,
text=item["text"],
vmetadata=item["metadata"],
vmetadata=stringify_metadata(item["metadata"]),
)
new_items.append(new_chunk)
self.session.bulk_save_objects(new_items)
@ -292,7 +294,7 @@ class PgvectorClient(VectorDBBase):
if existing:
existing.vector = vector
existing.text = item["text"]
existing.vmetadata = item["metadata"]
existing.vmetadata = stringify_metadata(item["metadata"])
existing.collection_name = (
collection_name # Update collection_name if necessary
)
@ -302,7 +304,7 @@ class PgvectorClient(VectorDBBase):
vector=vector,
collection_name=collection_name,
text=item["text"],
vmetadata=item["metadata"],
vmetadata=stringify_metadata(item["metadata"]),
)
self.session.add(new_chunk)
self.session.commit()

View File

@ -0,0 +1,14 @@
from datetime import datetime
from typing import Any
def stringify_metadata(
    metadata: dict[str, Any],
) -> dict[str, Any]:
    """Return a copy of *metadata* with complex values converted to strings.

    Values that are ``datetime``, ``list`` or ``dict`` instances are replaced
    by their ``str()`` representation; every other value is passed through
    unchanged.

    The input mapping is not modified: a new dict is built and returned, so
    the caller's metadata keeps its original values.

    Args:
        metadata: Mapping of metadata keys to arbitrary values.

    Returns:
        A new dict with the same keys, in the same order, where datetime,
        list and dict values have been stringified.
    """
    return {
        key: str(value) if isinstance(value, (datetime, list, dict)) else value
        for key, value in metadata.items()
    }

View File

@ -1229,27 +1229,14 @@ def save_docs_to_vector_db(
{
**doc.metadata,
**(metadata if metadata else {}),
"embedding_config": json.dumps(
{
"engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
"model": request.app.state.config.RAG_EMBEDDING_MODEL,
}
),
"embedding_config": {
"engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
"model": request.app.state.config.RAG_EMBEDDING_MODEL,
},
}
for doc in docs
]
# ChromaDB does not like datetime formats
# for meta-data so convert them to string.
for metadata in metadatas:
for key, value in metadata.items():
if (
isinstance(value, datetime)
or isinstance(value, list)
or isinstance(value, dict)
):
metadata[key] = str(value)
try:
if VECTOR_DB_CLIENT.has_collection(collection_name=collection_name):
log.info(f"collection {collection_name} already exists")