refac: metadata handling in vectordb

This commit is contained in:
Timothy Jaeryang Baek 2025-07-31 17:45:06 +04:00
parent aa83ebae58
commit 6a17ba5b7a
5 changed files with 31 additions and 24 deletions

View File

@ -11,6 +11,8 @@ from open_webui.retrieval.vector.main import (
SearchResult, SearchResult,
GetResult, GetResult,
) )
from open_webui.retrieval.vector.utils import stringify_metadata
from open_webui.config import ( from open_webui.config import (
CHROMA_DATA_PATH, CHROMA_DATA_PATH,
CHROMA_HTTP_HOST, CHROMA_HTTP_HOST,
@ -144,7 +146,7 @@ class ChromaClient(VectorDBBase):
ids = [item["id"] for item in items] ids = [item["id"] for item in items]
documents = [item["text"] for item in items] documents = [item["text"] for item in items]
embeddings = [item["vector"] for item in items] embeddings = [item["vector"] for item in items]
metadatas = [item["metadata"] for item in items] metadatas = [stringify_metadata(item["metadata"]) for item in items]
for batch in create_batches( for batch in create_batches(
api=self.client, api=self.client,
@ -164,7 +166,7 @@ class ChromaClient(VectorDBBase):
ids = [item["id"] for item in items] ids = [item["id"] for item in items]
documents = [item["text"] for item in items] documents = [item["text"] for item in items]
embeddings = [item["vector"] for item in items] embeddings = [item["vector"] for item in items]
metadatas = [item["metadata"] for item in items] metadatas = [stringify_metadata(item["metadata"]) for item in items]
collection.upsert( collection.upsert(
ids=ids, documents=documents, embeddings=embeddings, metadatas=metadatas ids=ids, documents=documents, embeddings=embeddings, metadatas=metadatas

View File

@ -3,6 +3,8 @@ from pymilvus import FieldSchema, DataType
import json import json
import logging import logging
from typing import Optional from typing import Optional
from open_webui.retrieval.vector.utils import stringify_metadata
from open_webui.retrieval.vector.main import ( from open_webui.retrieval.vector.main import (
VectorDBBase, VectorDBBase,
VectorItem, VectorItem,
@ -311,7 +313,7 @@ class MilvusClient(VectorDBBase):
"id": item["id"], "id": item["id"],
"vector": item["vector"], "vector": item["vector"],
"data": {"text": item["text"]}, "data": {"text": item["text"]},
"metadata": item["metadata"], "metadata": stringify_metadata(item["metadata"]),
} }
for item in items for item in items
], ],
@ -347,7 +349,7 @@ class MilvusClient(VectorDBBase):
"id": item["id"], "id": item["id"],
"vector": item["vector"], "vector": item["vector"],
"data": {"text": item["text"]}, "data": {"text": item["text"]},
"metadata": item["metadata"], "metadata": stringify_metadata(item["metadata"]),
} }
for item in items for item in items
], ],

View File

@ -26,6 +26,8 @@ from pgvector.sqlalchemy import Vector
from sqlalchemy.ext.mutable import MutableDict from sqlalchemy.ext.mutable import MutableDict
from sqlalchemy.exc import NoSuchTableError from sqlalchemy.exc import NoSuchTableError
from open_webui.retrieval.vector.utils import stringify_metadata
from open_webui.retrieval.vector.main import ( from open_webui.retrieval.vector.main import (
VectorDBBase, VectorDBBase,
VectorItem, VectorItem,
@ -235,7 +237,7 @@ class PgvectorClient(VectorDBBase):
vector=vector, vector=vector,
collection_name=collection_name, collection_name=collection_name,
text=item["text"], text=item["text"],
vmetadata=item["metadata"], vmetadata=stringify_metadata(item["metadata"]),
) )
new_items.append(new_chunk) new_items.append(new_chunk)
self.session.bulk_save_objects(new_items) self.session.bulk_save_objects(new_items)
@ -292,7 +294,7 @@ class PgvectorClient(VectorDBBase):
if existing: if existing:
existing.vector = vector existing.vector = vector
existing.text = item["text"] existing.text = item["text"]
existing.vmetadata = item["metadata"] existing.vmetadata = stringify_metadata(item["metadata"])
existing.collection_name = ( existing.collection_name = (
collection_name # Update collection_name if necessary collection_name # Update collection_name if necessary
) )
@ -302,7 +304,7 @@ class PgvectorClient(VectorDBBase):
vector=vector, vector=vector,
collection_name=collection_name, collection_name=collection_name,
text=item["text"], text=item["text"],
vmetadata=item["metadata"], vmetadata=stringify_metadata(item["metadata"]),
) )
self.session.add(new_chunk) self.session.add(new_chunk)
self.session.commit() self.session.commit()

View File

@ -0,0 +1,14 @@
from datetime import datetime
from typing import Any
def stringify_metadata(
    metadata: dict[str, Any],
) -> dict[str, Any]:
    """Return a copy of *metadata* with complex values converted to strings.

    Values that are ``datetime``, ``list`` or ``dict`` instances are replaced
    by their ``str()`` representation; every other value is passed through
    unchanged. This conversion previously lived in the document-saving path,
    where it was noted that ChromaDB does not accept datetime values in
    metadata.

    The input mapping is left untouched: a new dict is built and returned, so
    the caller's item metadata is not rewritten as a side effect of an
    insert/upsert.

    Args:
        metadata: Mapping of metadata keys to arbitrary values.

    Returns:
        A new dict with the same keys, in the same order, where datetime,
        list and dict values have been stringified.
    """
    return {
        key: str(value) if isinstance(value, (datetime, list, dict)) else value
        for key, value in metadata.items()
    }

View File

@ -1229,27 +1229,14 @@ def save_docs_to_vector_db(
{ {
**doc.metadata, **doc.metadata,
**(metadata if metadata else {}), **(metadata if metadata else {}),
"embedding_config": json.dumps( "embedding_config": {
{ "engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
"engine": request.app.state.config.RAG_EMBEDDING_ENGINE, "model": request.app.state.config.RAG_EMBEDDING_MODEL,
"model": request.app.state.config.RAG_EMBEDDING_MODEL, },
}
),
} }
for doc in docs for doc in docs
] ]
# ChromaDB does not like datetime formats
# for meta-data so convert them to string.
for metadata in metadatas:
for key, value in metadata.items():
if (
isinstance(value, datetime)
or isinstance(value, list)
or isinstance(value, dict)
):
metadata[key] = str(value)
try: try:
if VECTOR_DB_CLIENT.has_collection(collection_name=collection_name): if VECTOR_DB_CLIENT.has_collection(collection_name=collection_name):
log.info(f"collection {collection_name} already exists") log.info(f"collection {collection_name} already exists")