refac: metadata handling in vectordb
This commit is contained in:
parent
aa83ebae58
commit
6a17ba5b7a
|
@ -11,6 +11,8 @@ from open_webui.retrieval.vector.main import (
|
|||
SearchResult,
|
||||
GetResult,
|
||||
)
|
||||
from open_webui.retrieval.vector.utils import stringify_metadata
|
||||
|
||||
from open_webui.config import (
|
||||
CHROMA_DATA_PATH,
|
||||
CHROMA_HTTP_HOST,
|
||||
|
@ -144,7 +146,7 @@ class ChromaClient(VectorDBBase):
|
|||
ids = [item["id"] for item in items]
|
||||
documents = [item["text"] for item in items]
|
||||
embeddings = [item["vector"] for item in items]
|
||||
metadatas = [item["metadata"] for item in items]
|
||||
metadatas = [stringify_metadata(item["metadata"]) for item in items]
|
||||
|
||||
for batch in create_batches(
|
||||
api=self.client,
|
||||
|
@ -164,7 +166,7 @@ class ChromaClient(VectorDBBase):
|
|||
ids = [item["id"] for item in items]
|
||||
documents = [item["text"] for item in items]
|
||||
embeddings = [item["vector"] for item in items]
|
||||
metadatas = [item["metadata"] for item in items]
|
||||
metadatas = [stringify_metadata(item["metadata"]) for item in items]
|
||||
|
||||
collection.upsert(
|
||||
ids=ids, documents=documents, embeddings=embeddings, metadatas=metadatas
|
||||
|
|
|
@ -3,6 +3,8 @@ from pymilvus import FieldSchema, DataType
|
|||
import json
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from open_webui.retrieval.vector.utils import stringify_metadata
|
||||
from open_webui.retrieval.vector.main import (
|
||||
VectorDBBase,
|
||||
VectorItem,
|
||||
|
@ -311,7 +313,7 @@ class MilvusClient(VectorDBBase):
|
|||
"id": item["id"],
|
||||
"vector": item["vector"],
|
||||
"data": {"text": item["text"]},
|
||||
"metadata": item["metadata"],
|
||||
"metadata": stringify_metadata(item["metadata"]),
|
||||
}
|
||||
for item in items
|
||||
],
|
||||
|
@ -347,7 +349,7 @@ class MilvusClient(VectorDBBase):
|
|||
"id": item["id"],
|
||||
"vector": item["vector"],
|
||||
"data": {"text": item["text"]},
|
||||
"metadata": item["metadata"],
|
||||
"metadata": stringify_metadata(item["metadata"]),
|
||||
}
|
||||
for item in items
|
||||
],
|
||||
|
|
|
@ -26,6 +26,8 @@ from pgvector.sqlalchemy import Vector
|
|||
from sqlalchemy.ext.mutable import MutableDict
|
||||
from sqlalchemy.exc import NoSuchTableError
|
||||
|
||||
|
||||
from open_webui.retrieval.vector.utils import stringify_metadata
|
||||
from open_webui.retrieval.vector.main import (
|
||||
VectorDBBase,
|
||||
VectorItem,
|
||||
|
@ -235,7 +237,7 @@ class PgvectorClient(VectorDBBase):
|
|||
vector=vector,
|
||||
collection_name=collection_name,
|
||||
text=item["text"],
|
||||
vmetadata=item["metadata"],
|
||||
vmetadata=stringify_metadata(item["metadata"]),
|
||||
)
|
||||
new_items.append(new_chunk)
|
||||
self.session.bulk_save_objects(new_items)
|
||||
|
@ -292,7 +294,7 @@ class PgvectorClient(VectorDBBase):
|
|||
if existing:
|
||||
existing.vector = vector
|
||||
existing.text = item["text"]
|
||||
existing.vmetadata = item["metadata"]
|
||||
existing.vmetadata = stringify_metadata(item["metadata"])
|
||||
existing.collection_name = (
|
||||
collection_name # Update collection_name if necessary
|
||||
)
|
||||
|
@ -302,7 +304,7 @@ class PgvectorClient(VectorDBBase):
|
|||
vector=vector,
|
||||
collection_name=collection_name,
|
||||
text=item["text"],
|
||||
vmetadata=item["metadata"],
|
||||
vmetadata=stringify_metadata(item["metadata"]),
|
||||
)
|
||||
self.session.add(new_chunk)
|
||||
self.session.commit()
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
from datetime import datetime
from typing import Any
|
||||
|
||||
|
||||
def stringify_metadata(
    metadata: dict[str, Any],
) -> dict[str, Any]:
    """Return a copy of *metadata* with complex values converted to strings.

    Values that are ``datetime``, ``list`` or ``dict`` instances are replaced
    by ``str(value)``; every other value is passed through unchanged. Note
    that ``str()`` on a list or dict yields its Python repr, not JSON.

    The input mapping is not modified: a new dict is built and returned, so
    callers that keep a reference to the original metadata still see the
    original objects after a vector DB insert/upsert.

    Args:
        metadata: Mapping of metadata keys to arbitrary values.

    Returns:
        A new dict with the same keys, in the same order, whose
        datetime/list/dict values have been stringified.
    """
    # Types that get flattened to a plain string before storage.
    stringified_types = (datetime, list, dict)
    return {
        key: str(value) if isinstance(value, stringified_types) else value
        for key, value in metadata.items()
    }
|
|
@ -1229,27 +1229,14 @@ def save_docs_to_vector_db(
|
|||
{
|
||||
**doc.metadata,
|
||||
**(metadata if metadata else {}),
|
||||
"embedding_config": json.dumps(
|
||||
{
|
||||
"engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
|
||||
"model": request.app.state.config.RAG_EMBEDDING_MODEL,
|
||||
}
|
||||
),
|
||||
"embedding_config": {
|
||||
"engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
|
||||
"model": request.app.state.config.RAG_EMBEDDING_MODEL,
|
||||
},
|
||||
}
|
||||
for doc in docs
|
||||
]
|
||||
|
||||
# ChromaDB does not like datetime formats
|
||||
# for meta-data so convert them to string.
|
||||
for metadata in metadatas:
|
||||
for key, value in metadata.items():
|
||||
if (
|
||||
isinstance(value, datetime)
|
||||
or isinstance(value, list)
|
||||
or isinstance(value, dict)
|
||||
):
|
||||
metadata[key] = str(value)
|
||||
|
||||
try:
|
||||
if VECTOR_DB_CLIENT.has_collection(collection_name=collection_name):
|
||||
log.info(f"collection {collection_name} already exists")
|
||||
|
|
Loading…
Reference in New Issue