Merge pull request #15903 from Hisma/marker-api-update
feat: Add configurable API URL (for self-hosting) and additional_config parameter for Datalab Marker API
This commit is contained in:
commit
5db60ca34f
|
@ -2032,10 +2032,16 @@ DATALAB_MARKER_API_KEY = PersistentConfig(
|
|||
os.environ.get("DATALAB_MARKER_API_KEY", ""),
|
||||
)
|
||||
|
||||
DATALAB_MARKER_LANGS = PersistentConfig(
|
||||
"DATALAB_MARKER_LANGS",
|
||||
"rag.datalab_marker_langs",
|
||||
os.environ.get("DATALAB_MARKER_LANGS", ""),
|
||||
DATALAB_MARKER_API_BASE_URL = PersistentConfig(
|
||||
"DATALAB_MARKER_API_BASE_URL",
|
||||
"rag.datalab_marker_api_base_url",
|
||||
os.environ.get("DATALAB_MARKER_API_BASE_URL", ""),
|
||||
)
|
||||
|
||||
DATALAB_MARKER_ADDITIONAL_CONFIG = PersistentConfig(
|
||||
"DATALAB_MARKER_ADDITIONAL_CONFIG",
|
||||
"rag.datalab_marker_additional_config",
|
||||
os.environ.get("DATALAB_MARKER_ADDITIONAL_CONFIG", ""),
|
||||
)
|
||||
|
||||
DATALAB_MARKER_USE_LLM = PersistentConfig(
|
||||
|
@ -2075,6 +2081,12 @@ DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = PersistentConfig(
|
|||
== "true",
|
||||
)
|
||||
|
||||
DATALAB_MARKER_FORMAT_LINES = PersistentConfig(
|
||||
"DATALAB_MARKER_FORMAT_LINES",
|
||||
"rag.datalab_marker_format_lines",
|
||||
os.environ.get("DATALAB_MARKER_FORMAT_LINES", "false").lower() == "true",
|
||||
)
|
||||
|
||||
DATALAB_MARKER_OUTPUT_FORMAT = PersistentConfig(
|
||||
"DATALAB_MARKER_OUTPUT_FORMAT",
|
||||
"rag.datalab_marker_output_format",
|
||||
|
|
|
@ -226,12 +226,14 @@ from open_webui.config import (
|
|||
CHUNK_SIZE,
|
||||
CONTENT_EXTRACTION_ENGINE,
|
||||
DATALAB_MARKER_API_KEY,
|
||||
DATALAB_MARKER_LANGS,
|
||||
DATALAB_MARKER_API_BASE_URL,
|
||||
DATALAB_MARKER_ADDITIONAL_CONFIG,
|
||||
DATALAB_MARKER_SKIP_CACHE,
|
||||
DATALAB_MARKER_FORCE_OCR,
|
||||
DATALAB_MARKER_PAGINATE,
|
||||
DATALAB_MARKER_STRIP_EXISTING_OCR,
|
||||
DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
|
||||
DATALAB_MARKER_FORMAT_LINES,
|
||||
DATALAB_MARKER_OUTPUT_FORMAT,
|
||||
DATALAB_MARKER_USE_LLM,
|
||||
EXTERNAL_DOCUMENT_LOADER_URL,
|
||||
|
@ -771,7 +773,8 @@ app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERI
|
|||
|
||||
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
|
||||
app.state.config.DATALAB_MARKER_API_KEY = DATALAB_MARKER_API_KEY
|
||||
app.state.config.DATALAB_MARKER_LANGS = DATALAB_MARKER_LANGS
|
||||
app.state.config.DATALAB_MARKER_API_BASE_URL = DATALAB_MARKER_API_BASE_URL
|
||||
app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = DATALAB_MARKER_ADDITIONAL_CONFIG
|
||||
app.state.config.DATALAB_MARKER_SKIP_CACHE = DATALAB_MARKER_SKIP_CACHE
|
||||
app.state.config.DATALAB_MARKER_FORCE_OCR = DATALAB_MARKER_FORCE_OCR
|
||||
app.state.config.DATALAB_MARKER_PAGINATE = DATALAB_MARKER_PAGINATE
|
||||
|
@ -779,6 +782,7 @@ app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR = DATALAB_MARKER_STRIP_EXISTI
|
|||
app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = (
|
||||
DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION
|
||||
)
|
||||
app.state.config.DATALAB_MARKER_FORMAT_LINES = DATALAB_MARKER_FORMAT_LINES
|
||||
app.state.config.DATALAB_MARKER_USE_LLM = DATALAB_MARKER_USE_LLM
|
||||
app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = DATALAB_MARKER_OUTPUT_FORMAT
|
||||
app.state.config.EXTERNAL_DOCUMENT_LOADER_URL = EXTERNAL_DOCUMENT_LOADER_URL
|
||||
|
|
|
@ -15,24 +15,28 @@ class DatalabMarkerLoader:
|
|||
self,
|
||||
file_path: str,
|
||||
api_key: str,
|
||||
langs: Optional[str] = None,
|
||||
api_base_url: str,
|
||||
additional_config: Optional[str] = None,
|
||||
use_llm: bool = False,
|
||||
skip_cache: bool = False,
|
||||
force_ocr: bool = False,
|
||||
paginate: bool = False,
|
||||
strip_existing_ocr: bool = False,
|
||||
disable_image_extraction: bool = False,
|
||||
format_lines: bool = False,
|
||||
output_format: str = None,
|
||||
):
|
||||
self.file_path = file_path
|
||||
self.api_key = api_key
|
||||
self.langs = langs
|
||||
self.api_base_url = api_base_url
|
||||
self.additional_config = additional_config
|
||||
self.use_llm = use_llm
|
||||
self.skip_cache = skip_cache
|
||||
self.force_ocr = force_ocr
|
||||
self.paginate = paginate
|
||||
self.strip_existing_ocr = strip_existing_ocr
|
||||
self.disable_image_extraction = disable_image_extraction
|
||||
self.format_lines = format_lines
|
||||
self.output_format = output_format
|
||||
|
||||
def _get_mime_type(self, filename: str) -> str:
|
||||
|
@ -60,7 +64,7 @@ class DatalabMarkerLoader:
|
|||
return mime_map.get(ext, "application/octet-stream")
|
||||
|
||||
def check_marker_request_status(self, request_id: str) -> dict:
|
||||
url = f"https://www.datalab.to/api/v1/marker/{request_id}"
|
||||
url = f"{self.api_base_url}/{request_id}"
|
||||
headers = {"X-Api-Key": self.api_key}
|
||||
try:
|
||||
response = requests.get(url, headers=headers)
|
||||
|
@ -81,22 +85,25 @@ class DatalabMarkerLoader:
|
|||
)
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
url = "https://www.datalab.to/api/v1/marker"
|
||||
url = self.api_base_url
|
||||
filename = os.path.basename(self.file_path)
|
||||
mime_type = self._get_mime_type(filename)
|
||||
headers = {"X-Api-Key": self.api_key}
|
||||
|
||||
form_data = {
|
||||
"langs": self.langs,
|
||||
"use_llm": str(self.use_llm).lower(),
|
||||
"skip_cache": str(self.skip_cache).lower(),
|
||||
"force_ocr": str(self.force_ocr).lower(),
|
||||
"paginate": str(self.paginate).lower(),
|
||||
"strip_existing_ocr": str(self.strip_existing_ocr).lower(),
|
||||
"disable_image_extraction": str(self.disable_image_extraction).lower(),
|
||||
"format_lines": str(self.format_lines).lower(),
|
||||
"output_format": self.output_format,
|
||||
}
|
||||
|
||||
if self.additional_config and self.additional_config.strip():
|
||||
form_data["additional_config"] = self.additional_config
|
||||
|
||||
log.info(
|
||||
f"Datalab Marker POST request parameters: {{'filename': '{filename}', 'mime_type': '{mime_type}', **{form_data}}}"
|
||||
)
|
||||
|
@ -133,74 +140,92 @@ class DatalabMarkerLoader:
|
|||
|
||||
check_url = result.get("request_check_url")
|
||||
request_id = result.get("request_id")
|
||||
if not check_url:
|
||||
raise HTTPException(
|
||||
status.HTTP_502_BAD_GATEWAY, detail="No request_check_url returned."
|
||||
)
|
||||
|
||||
for _ in range(300): # Up to 10 minutes
|
||||
time.sleep(2)
|
||||
try:
|
||||
poll_response = requests.get(check_url, headers=headers)
|
||||
poll_response.raise_for_status()
|
||||
poll_result = poll_response.json()
|
||||
except (requests.HTTPError, ValueError) as e:
|
||||
raw_body = poll_response.text
|
||||
log.error(f"Polling error: {e}, response body: {raw_body}")
|
||||
raise HTTPException(
|
||||
status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}"
|
||||
)
|
||||
|
||||
status_val = poll_result.get("status")
|
||||
success_val = poll_result.get("success")
|
||||
|
||||
if status_val == "complete":
|
||||
summary = {
|
||||
k: poll_result.get(k)
|
||||
for k in (
|
||||
"status",
|
||||
"output_format",
|
||||
"success",
|
||||
"error",
|
||||
"page_count",
|
||||
"total_cost",
|
||||
# Check if this is a direct response (self-hosted) or polling response (DataLab)
|
||||
if check_url:
|
||||
# DataLab polling pattern
|
||||
for _ in range(300): # Up to 10 minutes
|
||||
time.sleep(2)
|
||||
try:
|
||||
poll_response = requests.get(check_url, headers=headers)
|
||||
poll_response.raise_for_status()
|
||||
poll_result = poll_response.json()
|
||||
except (requests.HTTPError, ValueError) as e:
|
||||
raw_body = poll_response.text
|
||||
log.error(f"Polling error: {e}, response body: {raw_body}")
|
||||
raise HTTPException(
|
||||
status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}"
|
||||
)
|
||||
}
|
||||
log.info(
|
||||
f"Marker processing completed successfully: {json.dumps(summary, indent=2)}"
|
||||
)
|
||||
break
|
||||
|
||||
if status_val == "failed" or success_val is False:
|
||||
log.error(
|
||||
f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}"
|
||||
)
|
||||
error_msg = (
|
||||
poll_result.get("error")
|
||||
or "Marker returned failure without error message"
|
||||
status_val = poll_result.get("status")
|
||||
success_val = poll_result.get("success")
|
||||
|
||||
if status_val == "complete":
|
||||
summary = {
|
||||
k: poll_result.get(k)
|
||||
for k in (
|
||||
"status",
|
||||
"output_format",
|
||||
"success",
|
||||
"error",
|
||||
"page_count",
|
||||
"total_cost",
|
||||
)
|
||||
}
|
||||
log.info(
|
||||
f"Marker processing completed successfully: {json.dumps(summary, indent=2)}"
|
||||
)
|
||||
break
|
||||
|
||||
if status_val == "failed" or success_val is False:
|
||||
log.error(
|
||||
f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}"
|
||||
)
|
||||
error_msg = (
|
||||
poll_result.get("error")
|
||||
or "Marker returned failure without error message"
|
||||
)
|
||||
raise HTTPException(
|
||||
status.HTTP_400_BAD_REQUEST,
|
||||
detail=f"Marker processing failed: {error_msg}",
|
||||
)
|
||||
else:
|
||||
raise HTTPException(
|
||||
status.HTTP_504_GATEWAY_TIMEOUT,
|
||||
detail="Marker processing timed out",
|
||||
)
|
||||
|
||||
if not poll_result.get("success", False):
|
||||
error_msg = poll_result.get("error") or "Unknown processing error"
|
||||
raise HTTPException(
|
||||
status.HTTP_400_BAD_REQUEST,
|
||||
detail=f"Marker processing failed: {error_msg}",
|
||||
detail=f"Final processing failed: {error_msg}",
|
||||
)
|
||||
|
||||
# DataLab format - content in format-specific fields
|
||||
content_key = self.output_format.lower()
|
||||
raw_content = poll_result.get(content_key)
|
||||
final_result = poll_result
|
||||
else:
|
||||
raise HTTPException(
|
||||
status.HTTP_504_GATEWAY_TIMEOUT, detail="Marker processing timed out"
|
||||
)
|
||||
# Self-hosted direct response - content in "output" field
|
||||
if "output" in result:
|
||||
log.info("Self-hosted Marker returned direct response without polling")
|
||||
raw_content = result.get("output")
|
||||
final_result = result
|
||||
else:
|
||||
available_fields = (
|
||||
list(result.keys())
|
||||
if isinstance(result, dict)
|
||||
else "non-dict response"
|
||||
)
|
||||
raise HTTPException(
|
||||
status.HTTP_502_BAD_GATEWAY,
|
||||
detail=f"Custom Marker endpoint returned success but no 'output' field found. Available fields: {available_fields}. Expected either 'request_check_url' for polling or 'output' field for direct response.",
|
||||
)
|
||||
|
||||
if not poll_result.get("success", False):
|
||||
error_msg = poll_result.get("error") or "Unknown processing error"
|
||||
raise HTTPException(
|
||||
status.HTTP_400_BAD_REQUEST,
|
||||
detail=f"Final processing failed: {error_msg}",
|
||||
)
|
||||
|
||||
content_key = self.output_format.lower()
|
||||
raw_content = poll_result.get(content_key)
|
||||
|
||||
if content_key == "json":
|
||||
if self.output_format.lower() == "json":
|
||||
full_text = json.dumps(raw_content, indent=2)
|
||||
elif content_key in {"markdown", "html"}:
|
||||
elif self.output_format.lower() in {"markdown", "html"}:
|
||||
full_text = str(raw_content).strip()
|
||||
else:
|
||||
raise HTTPException(
|
||||
|
@ -211,14 +236,14 @@ class DatalabMarkerLoader:
|
|||
if not full_text:
|
||||
raise HTTPException(
|
||||
status.HTTP_400_BAD_REQUEST,
|
||||
detail="Datalab Marker returned empty content",
|
||||
detail="Marker returned empty content",
|
||||
)
|
||||
|
||||
marker_output_dir = os.path.join("/app/backend/data/uploads", "marker_output")
|
||||
os.makedirs(marker_output_dir, exist_ok=True)
|
||||
|
||||
file_ext_map = {"markdown": "md", "json": "json", "html": "html"}
|
||||
file_ext = file_ext_map.get(content_key, "txt")
|
||||
file_ext = file_ext_map.get(self.output_format.lower(), "txt")
|
||||
output_filename = f"{os.path.splitext(filename)[0]}.{file_ext}"
|
||||
output_path = os.path.join(marker_output_dir, output_filename)
|
||||
|
||||
|
@ -231,13 +256,13 @@ class DatalabMarkerLoader:
|
|||
|
||||
metadata = {
|
||||
"source": filename,
|
||||
"output_format": poll_result.get("output_format", self.output_format),
|
||||
"page_count": poll_result.get("page_count", 0),
|
||||
"output_format": final_result.get("output_format", self.output_format),
|
||||
"page_count": final_result.get("page_count", 0),
|
||||
"processed_with_llm": self.use_llm,
|
||||
"request_id": request_id or "",
|
||||
}
|
||||
|
||||
images = poll_result.get("images", {})
|
||||
images = final_result.get("images", {})
|
||||
if images:
|
||||
metadata["image_count"] = len(images)
|
||||
metadata["images"] = json.dumps(list(images.keys()))
|
||||
|
|
|
@ -281,10 +281,15 @@ class Loader:
|
|||
"tiff",
|
||||
]
|
||||
):
|
||||
api_base_url = self.kwargs.get("DATALAB_MARKER_API_BASE_URL", "")
|
||||
if not api_base_url or api_base_url.strip() == "":
|
||||
api_base_url = "https://www.datalab.to/api/v1/marker"
|
||||
|
||||
loader = DatalabMarkerLoader(
|
||||
file_path=file_path,
|
||||
api_key=self.kwargs["DATALAB_MARKER_API_KEY"],
|
||||
langs=self.kwargs.get("DATALAB_MARKER_LANGS"),
|
||||
api_base_url=api_base_url,
|
||||
additional_config=self.kwargs.get("DATALAB_MARKER_ADDITIONAL_CONFIG"),
|
||||
use_llm=self.kwargs.get("DATALAB_MARKER_USE_LLM", False),
|
||||
skip_cache=self.kwargs.get("DATALAB_MARKER_SKIP_CACHE", False),
|
||||
force_ocr=self.kwargs.get("DATALAB_MARKER_FORCE_OCR", False),
|
||||
|
@ -295,6 +300,7 @@ class Loader:
|
|||
disable_image_extraction=self.kwargs.get(
|
||||
"DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", False
|
||||
),
|
||||
format_lines=self.kwargs.get("DATALAB_MARKER_FORMAT_LINES", False),
|
||||
output_format=self.kwargs.get(
|
||||
"DATALAB_MARKER_OUTPUT_FORMAT", "markdown"
|
||||
),
|
||||
|
|
|
@ -401,12 +401,14 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
|
|||
"CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
|
||||
"DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
|
||||
"DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS,
|
||||
"DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
|
||||
"DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
|
||||
"DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
|
||||
"DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR,
|
||||
"DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE,
|
||||
"DATALAB_MARKER_STRIP_EXISTING_OCR": request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR,
|
||||
"DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION": request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
|
||||
"DATALAB_MARKER_FORMAT_LINES": request.app.state.config.DATALAB_MARKER_FORMAT_LINES,
|
||||
"DATALAB_MARKER_USE_LLM": request.app.state.config.DATALAB_MARKER_USE_LLM,
|
||||
"DATALAB_MARKER_OUTPUT_FORMAT": request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT,
|
||||
"EXTERNAL_DOCUMENT_LOADER_URL": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,
|
||||
|
@ -566,12 +568,14 @@ class ConfigForm(BaseModel):
|
|||
CONTENT_EXTRACTION_ENGINE: Optional[str] = None
|
||||
PDF_EXTRACT_IMAGES: Optional[bool] = None
|
||||
DATALAB_MARKER_API_KEY: Optional[str] = None
|
||||
DATALAB_MARKER_LANGS: Optional[str] = None
|
||||
DATALAB_MARKER_API_BASE_URL: Optional[str] = None
|
||||
DATALAB_MARKER_ADDITIONAL_CONFIG: Optional[str] = None
|
||||
DATALAB_MARKER_SKIP_CACHE: Optional[bool] = None
|
||||
DATALAB_MARKER_FORCE_OCR: Optional[bool] = None
|
||||
DATALAB_MARKER_PAGINATE: Optional[bool] = None
|
||||
DATALAB_MARKER_STRIP_EXISTING_OCR: Optional[bool] = None
|
||||
DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION: Optional[bool] = None
|
||||
DATALAB_MARKER_FORMAT_LINES: Optional[bool] = None
|
||||
DATALAB_MARKER_USE_LLM: Optional[bool] = None
|
||||
DATALAB_MARKER_OUTPUT_FORMAT: Optional[str] = None
|
||||
EXTERNAL_DOCUMENT_LOADER_URL: Optional[str] = None
|
||||
|
@ -683,10 +687,15 @@ async def update_rag_config(
|
|||
if form_data.DATALAB_MARKER_API_KEY is not None
|
||||
else request.app.state.config.DATALAB_MARKER_API_KEY
|
||||
)
|
||||
request.app.state.config.DATALAB_MARKER_LANGS = (
|
||||
form_data.DATALAB_MARKER_LANGS
|
||||
if form_data.DATALAB_MARKER_LANGS is not None
|
||||
else request.app.state.config.DATALAB_MARKER_LANGS
|
||||
request.app.state.config.DATALAB_MARKER_API_BASE_URL = (
|
||||
form_data.DATALAB_MARKER_API_BASE_URL
|
||||
if form_data.DATALAB_MARKER_API_BASE_URL is not None
|
||||
else request.app.state.config.DATALAB_MARKER_API_BASE_URL
|
||||
)
|
||||
request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = (
|
||||
form_data.DATALAB_MARKER_ADDITIONAL_CONFIG
|
||||
if form_data.DATALAB_MARKER_ADDITIONAL_CONFIG is not None
|
||||
else request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG
|
||||
)
|
||||
request.app.state.config.DATALAB_MARKER_SKIP_CACHE = (
|
||||
form_data.DATALAB_MARKER_SKIP_CACHE
|
||||
|
@ -713,6 +722,11 @@ async def update_rag_config(
|
|||
if form_data.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION is not None
|
||||
else request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION
|
||||
)
|
||||
request.app.state.config.DATALAB_MARKER_FORMAT_LINES = (
|
||||
form_data.DATALAB_MARKER_FORMAT_LINES
|
||||
if form_data.DATALAB_MARKER_FORMAT_LINES is not None
|
||||
else request.app.state.config.DATALAB_MARKER_FORMAT_LINES
|
||||
)
|
||||
request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = (
|
||||
form_data.DATALAB_MARKER_OUTPUT_FORMAT
|
||||
if form_data.DATALAB_MARKER_OUTPUT_FORMAT is not None
|
||||
|
@ -1006,7 +1020,8 @@ async def update_rag_config(
|
|||
"CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
|
||||
"DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
|
||||
"DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS,
|
||||
"DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
|
||||
"DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
|
||||
"DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
|
||||
"DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR,
|
||||
"DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE,
|
||||
|
@ -1393,12 +1408,14 @@ def process_file(
|
|||
loader = Loader(
|
||||
engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
|
||||
DATALAB_MARKER_API_KEY=request.app.state.config.DATALAB_MARKER_API_KEY,
|
||||
DATALAB_MARKER_LANGS=request.app.state.config.DATALAB_MARKER_LANGS,
|
||||
DATALAB_MARKER_API_BASE_URL=request.app.state.config.DATALAB_MARKER_API_BASE_URL,
|
||||
DATALAB_MARKER_ADDITIONAL_CONFIG=request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
|
||||
DATALAB_MARKER_SKIP_CACHE=request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
|
||||
DATALAB_MARKER_FORCE_OCR=request.app.state.config.DATALAB_MARKER_FORCE_OCR,
|
||||
DATALAB_MARKER_PAGINATE=request.app.state.config.DATALAB_MARKER_PAGINATE,
|
||||
DATALAB_MARKER_STRIP_EXISTING_OCR=request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR,
|
||||
DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION=request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
|
||||
DATALAB_MARKER_FORMAT_LINES=request.app.state.config.DATALAB_MARKER_FORMAT_LINES,
|
||||
DATALAB_MARKER_USE_LLM=request.app.state.config.DATALAB_MARKER_USE_LLM,
|
||||
DATALAB_MARKER_OUTPUT_FORMAT=request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT,
|
||||
EXTERNAL_DOCUMENT_LOADER_URL=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,
|
||||
|
|
|
@ -170,6 +170,19 @@
|
|||
return;
|
||||
}
|
||||
|
||||
if (
|
||||
RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker' &&
|
||||
RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG &&
|
||||
RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG.trim() !== ''
|
||||
) {
|
||||
try {
|
||||
JSON.parse(RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG);
|
||||
} catch (e) {
|
||||
toast.error($i18n.t('Invalid JSON format in Additional Config'));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (
|
||||
RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence' &&
|
||||
(RAGConfig.DOCUMENT_INTELLIGENCE_ENDPOINT === '' ||
|
||||
|
@ -195,10 +208,6 @@
|
|||
ALLOWED_FILE_EXTENSIONS: RAGConfig.ALLOWED_FILE_EXTENSIONS.split(',')
|
||||
.map((ext) => ext.trim())
|
||||
.filter((ext) => ext !== ''),
|
||||
DATALAB_MARKER_LANGS: RAGConfig.DATALAB_MARKER_LANGS.split(',')
|
||||
.map((code) => code.trim())
|
||||
.filter((code) => code !== '')
|
||||
.join(', '),
|
||||
DOCLING_PICTURE_DESCRIPTION_LOCAL: JSON.parse(
|
||||
RAGConfig.DOCLING_PICTURE_DESCRIPTION_LOCAL || '{}'
|
||||
),
|
||||
|
@ -243,6 +252,11 @@
|
|||
2
|
||||
);
|
||||
|
||||
// Set default API Base URL if empty
|
||||
if (!config.DATALAB_MARKER_API_BASE_URL) {
|
||||
config.DATALAB_MARKER_API_BASE_URL = 'https://www.datalab.to/api/v1/marker';
|
||||
}
|
||||
|
||||
RAGConfig = config;
|
||||
});
|
||||
</script>
|
||||
|
@ -336,6 +350,21 @@
|
|||
</div>
|
||||
</div>
|
||||
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker'}
|
||||
<div class="my-0.5 flex gap-2 pr-2">
|
||||
<Tooltip
|
||||
content={$i18n.t(
|
||||
'API Base URL for Datalab Marker service. Defaults to: https://www.datalab.to/api/v1/marker'
|
||||
)}
|
||||
placement="top-start"
|
||||
className="w-full"
|
||||
>
|
||||
<input
|
||||
class="flex-1 w-full text-sm bg-transparent outline-hidden"
|
||||
placeholder={$i18n.t('Enter Datalab Marker API Base URL')}
|
||||
bind:value={RAGConfig.DATALAB_MARKER_API_BASE_URL}
|
||||
/>
|
||||
</Tooltip>
|
||||
</div>
|
||||
<div class="my-0.5 flex gap-2 pr-2">
|
||||
<SensitiveInput
|
||||
placeholder={$i18n.t('Enter Datalab Marker API Key')}
|
||||
|
@ -344,24 +373,33 @@
|
|||
/>
|
||||
</div>
|
||||
|
||||
<div class="flex justify-between w-full mt-2">
|
||||
<div class="text-xs font-medium">
|
||||
{$i18n.t('Languages')}
|
||||
<div class="flex flex-col gap-2 mt-2">
|
||||
<div class=" flex flex-col w-full justify-between">
|
||||
<div class=" mb-1 text-xs font-medium">
|
||||
{$i18n.t('Additional Config')}
|
||||
</div>
|
||||
<div class="flex w-full items-center relative">
|
||||
<Tooltip
|
||||
content={$i18n.t(
|
||||
'Additional configuration options for marker. This should be a JSON string with key-value pairs. For example, \'{"key": "value"}\'. Supported keys include: disable_links, keep_pageheader_in_output, keep_pagefooter_in_output, filter_blank_pages, drop_repeated_text, layout_coverage_threshold, merge_threshold, height_tolerance, gap_threshold, image_threshold, min_line_length, level_count, default_level'
|
||||
)}
|
||||
placement="top-start"
|
||||
className="w-full"
|
||||
>
|
||||
<Textarea
|
||||
bind:value={RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG}
|
||||
placeholder={$i18n.t('Enter JSON config (e.g., {"disable_links": true})')}
|
||||
/>
|
||||
</Tooltip>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<input
|
||||
class="text-sm bg-transparent outline-hidden"
|
||||
type="text"
|
||||
bind:value={RAGConfig.DATALAB_MARKER_LANGS}
|
||||
placeholder={$i18n.t('e.g.) en,fr,de')}
|
||||
/>
|
||||
</div>
|
||||
|
||||
<div class="flex justify-between w-full mt-2">
|
||||
<div class="self-center text-xs font-medium">
|
||||
<Tooltip
|
||||
content={$i18n.t(
|
||||
'Significantly improves accuracy by using an LLM to enhance tables, forms, inline math, and layout detection. Will increase latency. Defaults to True.'
|
||||
'Significantly improves accuracy by using an LLM to enhance tables, forms, inline math, and layout detection. Will increase latency. Defaults to False.'
|
||||
)}
|
||||
placement="top-start"
|
||||
>
|
||||
|
@ -445,6 +483,21 @@
|
|||
<Switch bind:state={RAGConfig.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION} />
|
||||
</div>
|
||||
</div>
|
||||
<div class="flex justify-between w-full mt-2">
|
||||
<div class="self-center text-xs font-medium">
|
||||
<Tooltip
|
||||
content={$i18n.t(
|
||||
'Format the lines in the output. Defaults to False. If set to True, the lines will be formatted to detect inline math and styles.'
|
||||
)}
|
||||
placement="top-start"
|
||||
>
|
||||
{$i18n.t('Format Lines')}
|
||||
</Tooltip>
|
||||
</div>
|
||||
<div class="flex items-center">
|
||||
<Switch bind:state={RAGConfig.DATALAB_MARKER_FORMAT_LINES} />
|
||||
</div>
|
||||
</div>
|
||||
<div class="flex justify-between w-full mt-2">
|
||||
<div class="self-center text-xs font-medium">
|
||||
<Tooltip
|
||||
|
|
Loading…
Reference in New Issue