refac: metadata handling in vectordb
This commit is contained in:
parent
aa83ebae58
commit
6a17ba5b7a
|
@ -11,6 +11,8 @@ from open_webui.retrieval.vector.main import (
|
||||||
SearchResult,
|
SearchResult,
|
||||||
GetResult,
|
GetResult,
|
||||||
)
|
)
|
||||||
|
from open_webui.retrieval.vector.utils import stringify_metadata
|
||||||
|
|
||||||
from open_webui.config import (
|
from open_webui.config import (
|
||||||
CHROMA_DATA_PATH,
|
CHROMA_DATA_PATH,
|
||||||
CHROMA_HTTP_HOST,
|
CHROMA_HTTP_HOST,
|
||||||
|
@ -144,7 +146,7 @@ class ChromaClient(VectorDBBase):
|
||||||
ids = [item["id"] for item in items]
|
ids = [item["id"] for item in items]
|
||||||
documents = [item["text"] for item in items]
|
documents = [item["text"] for item in items]
|
||||||
embeddings = [item["vector"] for item in items]
|
embeddings = [item["vector"] for item in items]
|
||||||
metadatas = [item["metadata"] for item in items]
|
metadatas = [stringify_metadata(item["metadata"]) for item in items]
|
||||||
|
|
||||||
for batch in create_batches(
|
for batch in create_batches(
|
||||||
api=self.client,
|
api=self.client,
|
||||||
|
@ -164,7 +166,7 @@ class ChromaClient(VectorDBBase):
|
||||||
ids = [item["id"] for item in items]
|
ids = [item["id"] for item in items]
|
||||||
documents = [item["text"] for item in items]
|
documents = [item["text"] for item in items]
|
||||||
embeddings = [item["vector"] for item in items]
|
embeddings = [item["vector"] for item in items]
|
||||||
metadatas = [item["metadata"] for item in items]
|
metadatas = [stringify_metadata(item["metadata"]) for item in items]
|
||||||
|
|
||||||
collection.upsert(
|
collection.upsert(
|
||||||
ids=ids, documents=documents, embeddings=embeddings, metadatas=metadatas
|
ids=ids, documents=documents, embeddings=embeddings, metadatas=metadatas
|
||||||
|
|
|
@ -3,6 +3,8 @@ from pymilvus import FieldSchema, DataType
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
from open_webui.retrieval.vector.utils import stringify_metadata
|
||||||
from open_webui.retrieval.vector.main import (
|
from open_webui.retrieval.vector.main import (
|
||||||
VectorDBBase,
|
VectorDBBase,
|
||||||
VectorItem,
|
VectorItem,
|
||||||
|
@ -311,7 +313,7 @@ class MilvusClient(VectorDBBase):
|
||||||
"id": item["id"],
|
"id": item["id"],
|
||||||
"vector": item["vector"],
|
"vector": item["vector"],
|
||||||
"data": {"text": item["text"]},
|
"data": {"text": item["text"]},
|
||||||
"metadata": item["metadata"],
|
"metadata": stringify_metadata(item["metadata"]),
|
||||||
}
|
}
|
||||||
for item in items
|
for item in items
|
||||||
],
|
],
|
||||||
|
@ -347,7 +349,7 @@ class MilvusClient(VectorDBBase):
|
||||||
"id": item["id"],
|
"id": item["id"],
|
||||||
"vector": item["vector"],
|
"vector": item["vector"],
|
||||||
"data": {"text": item["text"]},
|
"data": {"text": item["text"]},
|
||||||
"metadata": item["metadata"],
|
"metadata": stringify_metadata(item["metadata"]),
|
||||||
}
|
}
|
||||||
for item in items
|
for item in items
|
||||||
],
|
],
|
||||||
|
|
|
@ -26,6 +26,8 @@ from pgvector.sqlalchemy import Vector
|
||||||
from sqlalchemy.ext.mutable import MutableDict
|
from sqlalchemy.ext.mutable import MutableDict
|
||||||
from sqlalchemy.exc import NoSuchTableError
|
from sqlalchemy.exc import NoSuchTableError
|
||||||
|
|
||||||
|
|
||||||
|
from open_webui.retrieval.vector.utils import stringify_metadata
|
||||||
from open_webui.retrieval.vector.main import (
|
from open_webui.retrieval.vector.main import (
|
||||||
VectorDBBase,
|
VectorDBBase,
|
||||||
VectorItem,
|
VectorItem,
|
||||||
|
@ -235,7 +237,7 @@ class PgvectorClient(VectorDBBase):
|
||||||
vector=vector,
|
vector=vector,
|
||||||
collection_name=collection_name,
|
collection_name=collection_name,
|
||||||
text=item["text"],
|
text=item["text"],
|
||||||
vmetadata=item["metadata"],
|
vmetadata=stringify_metadata(item["metadata"]),
|
||||||
)
|
)
|
||||||
new_items.append(new_chunk)
|
new_items.append(new_chunk)
|
||||||
self.session.bulk_save_objects(new_items)
|
self.session.bulk_save_objects(new_items)
|
||||||
|
@ -292,7 +294,7 @@ class PgvectorClient(VectorDBBase):
|
||||||
if existing:
|
if existing:
|
||||||
existing.vector = vector
|
existing.vector = vector
|
||||||
existing.text = item["text"]
|
existing.text = item["text"]
|
||||||
existing.vmetadata = item["metadata"]
|
existing.vmetadata = stringify_metadata(item["metadata"])
|
||||||
existing.collection_name = (
|
existing.collection_name = (
|
||||||
collection_name # Update collection_name if necessary
|
collection_name # Update collection_name if necessary
|
||||||
)
|
)
|
||||||
|
@ -302,7 +304,7 @@ class PgvectorClient(VectorDBBase):
|
||||||
vector=vector,
|
vector=vector,
|
||||||
collection_name=collection_name,
|
collection_name=collection_name,
|
||||||
text=item["text"],
|
text=item["text"],
|
||||||
vmetadata=item["metadata"],
|
vmetadata=stringify_metadata(item["metadata"]),
|
||||||
)
|
)
|
||||||
self.session.add(new_chunk)
|
self.session.add(new_chunk)
|
||||||
self.session.commit()
|
self.session.commit()
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
from datetime import datetime
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
def stringify_metadata(
    metadata: dict[str, Any],
) -> dict[str, Any]:
    """Return a copy of *metadata* with complex values converted to strings.

    Values that are ``datetime``, ``list`` or ``dict`` instances are replaced
    by ``str(value)``; every other value (str, int, float, bool, None, ...)
    is carried over unchanged.

    The input mapping is NOT modified: a new dict is built and returned, so
    callers that keep a reference to the original metadata still see the
    original values after an insert/upsert.

    Args:
        metadata: Metadata mapping for a single item.

    Returns:
        A new dict with the same keys, in the same order.
    """
    # NOTE: str() yields the Python repr of lists/dicts (single-quoted),
    # not JSON. The format is kept as-is so stored values do not change.
    return {
        key: str(value) if isinstance(value, (datetime, list, dict)) else value
        for key, value in metadata.items()
    }
|
|
@ -1229,27 +1229,14 @@ def save_docs_to_vector_db(
|
||||||
{
|
{
|
||||||
**doc.metadata,
|
**doc.metadata,
|
||||||
**(metadata if metadata else {}),
|
**(metadata if metadata else {}),
|
||||||
"embedding_config": json.dumps(
|
"embedding_config": {
|
||||||
{
|
"engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
|
||||||
"engine": request.app.state.config.RAG_EMBEDDING_ENGINE,
|
"model": request.app.state.config.RAG_EMBEDDING_MODEL,
|
||||||
"model": request.app.state.config.RAG_EMBEDDING_MODEL,
|
},
|
||||||
}
|
|
||||||
),
|
|
||||||
}
|
}
|
||||||
for doc in docs
|
for doc in docs
|
||||||
]
|
]
|
||||||
|
|
||||||
# ChromaDB does not like datetime formats
|
|
||||||
# for meta-data so convert them to string.
|
|
||||||
for metadata in metadatas:
|
|
||||||
for key, value in metadata.items():
|
|
||||||
if (
|
|
||||||
isinstance(value, datetime)
|
|
||||||
or isinstance(value, list)
|
|
||||||
or isinstance(value, dict)
|
|
||||||
):
|
|
||||||
metadata[key] = str(value)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if VECTOR_DB_CLIENT.has_collection(collection_name=collection_name):
|
if VECTOR_DB_CLIENT.has_collection(collection_name=collection_name):
|
||||||
log.info(f"collection {collection_name} already exists")
|
log.info(f"collection {collection_name} already exists")
|
||||||
|
|
Loading…
Reference in New Issue