Merge pull request #15903 from Hisma/marker-api-update

feat: Add configurable API URL (for self-hosting) and additional_config parameter for Datalab Marker API
This commit is contained in:
Tim Jaeryang Baek 2025-08-04 15:21:03 +04:00 committed by GitHub
commit 5db60ca34f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 215 additions and 98 deletions

View File

@ -2032,10 +2032,16 @@ DATALAB_MARKER_API_KEY = PersistentConfig(
os.environ.get("DATALAB_MARKER_API_KEY", ""),
)
DATALAB_MARKER_LANGS = PersistentConfig(
"DATALAB_MARKER_LANGS",
"rag.datalab_marker_langs",
os.environ.get("DATALAB_MARKER_LANGS", ""),
DATALAB_MARKER_API_BASE_URL = PersistentConfig(
"DATALAB_MARKER_API_BASE_URL",
"rag.datalab_marker_api_base_url",
os.environ.get("DATALAB_MARKER_API_BASE_URL", ""),
)
DATALAB_MARKER_ADDITIONAL_CONFIG = PersistentConfig(
"DATALAB_MARKER_ADDITIONAL_CONFIG",
"rag.datalab_marker_additional_config",
os.environ.get("DATALAB_MARKER_ADDITIONAL_CONFIG", ""),
)
DATALAB_MARKER_USE_LLM = PersistentConfig(
@ -2075,6 +2081,12 @@ DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = PersistentConfig(
== "true",
)
DATALAB_MARKER_FORMAT_LINES = PersistentConfig(
"DATALAB_MARKER_FORMAT_LINES",
"rag.datalab_marker_format_lines",
os.environ.get("DATALAB_MARKER_FORMAT_LINES", "false").lower() == "true",
)
DATALAB_MARKER_OUTPUT_FORMAT = PersistentConfig(
"DATALAB_MARKER_OUTPUT_FORMAT",
"rag.datalab_marker_output_format",

View File

@ -226,12 +226,14 @@ from open_webui.config import (
CHUNK_SIZE,
CONTENT_EXTRACTION_ENGINE,
DATALAB_MARKER_API_KEY,
DATALAB_MARKER_LANGS,
DATALAB_MARKER_API_BASE_URL,
DATALAB_MARKER_ADDITIONAL_CONFIG,
DATALAB_MARKER_SKIP_CACHE,
DATALAB_MARKER_FORCE_OCR,
DATALAB_MARKER_PAGINATE,
DATALAB_MARKER_STRIP_EXISTING_OCR,
DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
DATALAB_MARKER_FORMAT_LINES,
DATALAB_MARKER_OUTPUT_FORMAT,
DATALAB_MARKER_USE_LLM,
EXTERNAL_DOCUMENT_LOADER_URL,
@ -771,7 +773,8 @@ app.state.config.ENABLE_WEB_LOADER_SSL_VERIFICATION = ENABLE_WEB_LOADER_SSL_VERI
app.state.config.CONTENT_EXTRACTION_ENGINE = CONTENT_EXTRACTION_ENGINE
app.state.config.DATALAB_MARKER_API_KEY = DATALAB_MARKER_API_KEY
app.state.config.DATALAB_MARKER_LANGS = DATALAB_MARKER_LANGS
app.state.config.DATALAB_MARKER_API_BASE_URL = DATALAB_MARKER_API_BASE_URL
app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = DATALAB_MARKER_ADDITIONAL_CONFIG
app.state.config.DATALAB_MARKER_SKIP_CACHE = DATALAB_MARKER_SKIP_CACHE
app.state.config.DATALAB_MARKER_FORCE_OCR = DATALAB_MARKER_FORCE_OCR
app.state.config.DATALAB_MARKER_PAGINATE = DATALAB_MARKER_PAGINATE
@ -779,6 +782,7 @@ app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR = DATALAB_MARKER_STRIP_EXISTI
app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION = (
DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION
)
app.state.config.DATALAB_MARKER_FORMAT_LINES = DATALAB_MARKER_FORMAT_LINES
app.state.config.DATALAB_MARKER_USE_LLM = DATALAB_MARKER_USE_LLM
app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = DATALAB_MARKER_OUTPUT_FORMAT
app.state.config.EXTERNAL_DOCUMENT_LOADER_URL = EXTERNAL_DOCUMENT_LOADER_URL

View File

@ -15,24 +15,28 @@ class DatalabMarkerLoader:
self,
file_path: str,
api_key: str,
langs: Optional[str] = None,
api_base_url: str,
additional_config: Optional[str] = None,
use_llm: bool = False,
skip_cache: bool = False,
force_ocr: bool = False,
paginate: bool = False,
strip_existing_ocr: bool = False,
disable_image_extraction: bool = False,
format_lines: bool = False,
output_format: str = None,
):
self.file_path = file_path
self.api_key = api_key
self.langs = langs
self.api_base_url = api_base_url
self.additional_config = additional_config
self.use_llm = use_llm
self.skip_cache = skip_cache
self.force_ocr = force_ocr
self.paginate = paginate
self.strip_existing_ocr = strip_existing_ocr
self.disable_image_extraction = disable_image_extraction
self.format_lines = format_lines
self.output_format = output_format
def _get_mime_type(self, filename: str) -> str:
@ -60,7 +64,7 @@ class DatalabMarkerLoader:
return mime_map.get(ext, "application/octet-stream")
def check_marker_request_status(self, request_id: str) -> dict:
url = f"https://www.datalab.to/api/v1/marker/{request_id}"
url = f"{self.api_base_url}/{request_id}"
headers = {"X-Api-Key": self.api_key}
try:
response = requests.get(url, headers=headers)
@ -81,22 +85,25 @@ class DatalabMarkerLoader:
)
def load(self) -> List[Document]:
url = "https://www.datalab.to/api/v1/marker"
url = self.api_base_url
filename = os.path.basename(self.file_path)
mime_type = self._get_mime_type(filename)
headers = {"X-Api-Key": self.api_key}
form_data = {
"langs": self.langs,
"use_llm": str(self.use_llm).lower(),
"skip_cache": str(self.skip_cache).lower(),
"force_ocr": str(self.force_ocr).lower(),
"paginate": str(self.paginate).lower(),
"strip_existing_ocr": str(self.strip_existing_ocr).lower(),
"disable_image_extraction": str(self.disable_image_extraction).lower(),
"format_lines": str(self.format_lines).lower(),
"output_format": self.output_format,
}
if self.additional_config and self.additional_config.strip():
form_data["additional_config"] = self.additional_config
log.info(
f"Datalab Marker POST request parameters: {{'filename': '{filename}', 'mime_type': '{mime_type}', **{form_data}}}"
)
@ -133,74 +140,92 @@ class DatalabMarkerLoader:
check_url = result.get("request_check_url")
request_id = result.get("request_id")
if not check_url:
raise HTTPException(
status.HTTP_502_BAD_GATEWAY, detail="No request_check_url returned."
)
for _ in range(300): # Up to 10 minutes
time.sleep(2)
try:
poll_response = requests.get(check_url, headers=headers)
poll_response.raise_for_status()
poll_result = poll_response.json()
except (requests.HTTPError, ValueError) as e:
raw_body = poll_response.text
log.error(f"Polling error: {e}, response body: {raw_body}")
raise HTTPException(
status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}"
)
status_val = poll_result.get("status")
success_val = poll_result.get("success")
if status_val == "complete":
summary = {
k: poll_result.get(k)
for k in (
"status",
"output_format",
"success",
"error",
"page_count",
"total_cost",
# Check if this is a direct response (self-hosted) or polling response (DataLab)
if check_url:
# DataLab polling pattern
for _ in range(300): # Up to 10 minutes
time.sleep(2)
try:
poll_response = requests.get(check_url, headers=headers)
poll_response.raise_for_status()
poll_result = poll_response.json()
except (requests.HTTPError, ValueError) as e:
raw_body = poll_response.text
log.error(f"Polling error: {e}, response body: {raw_body}")
raise HTTPException(
status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}"
)
}
log.info(
f"Marker processing completed successfully: {json.dumps(summary, indent=2)}"
)
break
if status_val == "failed" or success_val is False:
log.error(
f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}"
)
error_msg = (
poll_result.get("error")
or "Marker returned failure without error message"
status_val = poll_result.get("status")
success_val = poll_result.get("success")
if status_val == "complete":
summary = {
k: poll_result.get(k)
for k in (
"status",
"output_format",
"success",
"error",
"page_count",
"total_cost",
)
}
log.info(
f"Marker processing completed successfully: {json.dumps(summary, indent=2)}"
)
break
if status_val == "failed" or success_val is False:
log.error(
f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}"
)
error_msg = (
poll_result.get("error")
or "Marker returned failure without error message"
)
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail=f"Marker processing failed: {error_msg}",
)
else:
raise HTTPException(
status.HTTP_504_GATEWAY_TIMEOUT,
detail="Marker processing timed out",
)
if not poll_result.get("success", False):
error_msg = poll_result.get("error") or "Unknown processing error"
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail=f"Marker processing failed: {error_msg}",
detail=f"Final processing failed: {error_msg}",
)
# DataLab format - content in format-specific fields
content_key = self.output_format.lower()
raw_content = poll_result.get(content_key)
final_result = poll_result
else:
raise HTTPException(
status.HTTP_504_GATEWAY_TIMEOUT, detail="Marker processing timed out"
)
# Self-hosted direct response - content in "output" field
if "output" in result:
log.info("Self-hosted Marker returned direct response without polling")
raw_content = result.get("output")
final_result = result
else:
available_fields = (
list(result.keys())
if isinstance(result, dict)
else "non-dict response"
)
raise HTTPException(
status.HTTP_502_BAD_GATEWAY,
detail=f"Custom Marker endpoint returned success but no 'output' field found. Available fields: {available_fields}. Expected either 'request_check_url' for polling or 'output' field for direct response.",
)
if not poll_result.get("success", False):
error_msg = poll_result.get("error") or "Unknown processing error"
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail=f"Final processing failed: {error_msg}",
)
content_key = self.output_format.lower()
raw_content = poll_result.get(content_key)
if content_key == "json":
if self.output_format.lower() == "json":
full_text = json.dumps(raw_content, indent=2)
elif content_key in {"markdown", "html"}:
elif self.output_format.lower() in {"markdown", "html"}:
full_text = str(raw_content).strip()
else:
raise HTTPException(
@ -211,14 +236,14 @@ class DatalabMarkerLoader:
if not full_text:
raise HTTPException(
status.HTTP_400_BAD_REQUEST,
detail="Datalab Marker returned empty content",
detail="Marker returned empty content",
)
marker_output_dir = os.path.join("/app/backend/data/uploads", "marker_output")
os.makedirs(marker_output_dir, exist_ok=True)
file_ext_map = {"markdown": "md", "json": "json", "html": "html"}
file_ext = file_ext_map.get(content_key, "txt")
file_ext = file_ext_map.get(self.output_format.lower(), "txt")
output_filename = f"{os.path.splitext(filename)[0]}.{file_ext}"
output_path = os.path.join(marker_output_dir, output_filename)
@ -231,13 +256,13 @@ class DatalabMarkerLoader:
metadata = {
"source": filename,
"output_format": poll_result.get("output_format", self.output_format),
"page_count": poll_result.get("page_count", 0),
"output_format": final_result.get("output_format", self.output_format),
"page_count": final_result.get("page_count", 0),
"processed_with_llm": self.use_llm,
"request_id": request_id or "",
}
images = poll_result.get("images", {})
images = final_result.get("images", {})
if images:
metadata["image_count"] = len(images)
metadata["images"] = json.dumps(list(images.keys()))

View File

@ -281,10 +281,15 @@ class Loader:
"tiff",
]
):
api_base_url = self.kwargs.get("DATALAB_MARKER_API_BASE_URL", "")
if not api_base_url or api_base_url.strip() == "":
api_base_url = "https://www.datalab.to/api/v1/marker"
loader = DatalabMarkerLoader(
file_path=file_path,
api_key=self.kwargs["DATALAB_MARKER_API_KEY"],
langs=self.kwargs.get("DATALAB_MARKER_LANGS"),
api_base_url=api_base_url,
additional_config=self.kwargs.get("DATALAB_MARKER_ADDITIONAL_CONFIG"),
use_llm=self.kwargs.get("DATALAB_MARKER_USE_LLM", False),
skip_cache=self.kwargs.get("DATALAB_MARKER_SKIP_CACHE", False),
force_ocr=self.kwargs.get("DATALAB_MARKER_FORCE_OCR", False),
@ -295,6 +300,7 @@ class Loader:
disable_image_extraction=self.kwargs.get(
"DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION", False
),
format_lines=self.kwargs.get("DATALAB_MARKER_FORMAT_LINES", False),
output_format=self.kwargs.get(
"DATALAB_MARKER_OUTPUT_FORMAT", "markdown"
),

View File

@ -401,12 +401,14 @@ async def get_rag_config(request: Request, user=Depends(get_admin_user)):
"CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
"DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
"DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS,
"DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
"DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
"DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
"DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR,
"DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE,
"DATALAB_MARKER_STRIP_EXISTING_OCR": request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR,
"DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION": request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
"DATALAB_MARKER_FORMAT_LINES": request.app.state.config.DATALAB_MARKER_FORMAT_LINES,
"DATALAB_MARKER_USE_LLM": request.app.state.config.DATALAB_MARKER_USE_LLM,
"DATALAB_MARKER_OUTPUT_FORMAT": request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT,
"EXTERNAL_DOCUMENT_LOADER_URL": request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,
@ -566,12 +568,14 @@ class ConfigForm(BaseModel):
CONTENT_EXTRACTION_ENGINE: Optional[str] = None
PDF_EXTRACT_IMAGES: Optional[bool] = None
DATALAB_MARKER_API_KEY: Optional[str] = None
DATALAB_MARKER_LANGS: Optional[str] = None
DATALAB_MARKER_API_BASE_URL: Optional[str] = None
DATALAB_MARKER_ADDITIONAL_CONFIG: Optional[str] = None
DATALAB_MARKER_SKIP_CACHE: Optional[bool] = None
DATALAB_MARKER_FORCE_OCR: Optional[bool] = None
DATALAB_MARKER_PAGINATE: Optional[bool] = None
DATALAB_MARKER_STRIP_EXISTING_OCR: Optional[bool] = None
DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION: Optional[bool] = None
DATALAB_MARKER_FORMAT_LINES: Optional[bool] = None
DATALAB_MARKER_USE_LLM: Optional[bool] = None
DATALAB_MARKER_OUTPUT_FORMAT: Optional[str] = None
EXTERNAL_DOCUMENT_LOADER_URL: Optional[str] = None
@ -683,10 +687,15 @@ async def update_rag_config(
if form_data.DATALAB_MARKER_API_KEY is not None
else request.app.state.config.DATALAB_MARKER_API_KEY
)
request.app.state.config.DATALAB_MARKER_LANGS = (
form_data.DATALAB_MARKER_LANGS
if form_data.DATALAB_MARKER_LANGS is not None
else request.app.state.config.DATALAB_MARKER_LANGS
request.app.state.config.DATALAB_MARKER_API_BASE_URL = (
form_data.DATALAB_MARKER_API_BASE_URL
if form_data.DATALAB_MARKER_API_BASE_URL is not None
else request.app.state.config.DATALAB_MARKER_API_BASE_URL
)
request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG = (
form_data.DATALAB_MARKER_ADDITIONAL_CONFIG
if form_data.DATALAB_MARKER_ADDITIONAL_CONFIG is not None
else request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG
)
request.app.state.config.DATALAB_MARKER_SKIP_CACHE = (
form_data.DATALAB_MARKER_SKIP_CACHE
@ -713,6 +722,11 @@ async def update_rag_config(
if form_data.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION is not None
else request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION
)
request.app.state.config.DATALAB_MARKER_FORMAT_LINES = (
form_data.DATALAB_MARKER_FORMAT_LINES
if form_data.DATALAB_MARKER_FORMAT_LINES is not None
else request.app.state.config.DATALAB_MARKER_FORMAT_LINES
)
request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT = (
form_data.DATALAB_MARKER_OUTPUT_FORMAT
if form_data.DATALAB_MARKER_OUTPUT_FORMAT is not None
@ -1006,7 +1020,8 @@ async def update_rag_config(
"CONTENT_EXTRACTION_ENGINE": request.app.state.config.CONTENT_EXTRACTION_ENGINE,
"PDF_EXTRACT_IMAGES": request.app.state.config.PDF_EXTRACT_IMAGES,
"DATALAB_MARKER_API_KEY": request.app.state.config.DATALAB_MARKER_API_KEY,
"DATALAB_MARKER_LANGS": request.app.state.config.DATALAB_MARKER_LANGS,
"DATALAB_MARKER_API_BASE_URL": request.app.state.config.DATALAB_MARKER_API_BASE_URL,
"DATALAB_MARKER_ADDITIONAL_CONFIG": request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
"DATALAB_MARKER_SKIP_CACHE": request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
"DATALAB_MARKER_FORCE_OCR": request.app.state.config.DATALAB_MARKER_FORCE_OCR,
"DATALAB_MARKER_PAGINATE": request.app.state.config.DATALAB_MARKER_PAGINATE,
@ -1393,12 +1408,14 @@ def process_file(
loader = Loader(
engine=request.app.state.config.CONTENT_EXTRACTION_ENGINE,
DATALAB_MARKER_API_KEY=request.app.state.config.DATALAB_MARKER_API_KEY,
DATALAB_MARKER_LANGS=request.app.state.config.DATALAB_MARKER_LANGS,
DATALAB_MARKER_API_BASE_URL=request.app.state.config.DATALAB_MARKER_API_BASE_URL,
DATALAB_MARKER_ADDITIONAL_CONFIG=request.app.state.config.DATALAB_MARKER_ADDITIONAL_CONFIG,
DATALAB_MARKER_SKIP_CACHE=request.app.state.config.DATALAB_MARKER_SKIP_CACHE,
DATALAB_MARKER_FORCE_OCR=request.app.state.config.DATALAB_MARKER_FORCE_OCR,
DATALAB_MARKER_PAGINATE=request.app.state.config.DATALAB_MARKER_PAGINATE,
DATALAB_MARKER_STRIP_EXISTING_OCR=request.app.state.config.DATALAB_MARKER_STRIP_EXISTING_OCR,
DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION=request.app.state.config.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION,
DATALAB_MARKER_FORMAT_LINES=request.app.state.config.DATALAB_MARKER_FORMAT_LINES,
DATALAB_MARKER_USE_LLM=request.app.state.config.DATALAB_MARKER_USE_LLM,
DATALAB_MARKER_OUTPUT_FORMAT=request.app.state.config.DATALAB_MARKER_OUTPUT_FORMAT,
EXTERNAL_DOCUMENT_LOADER_URL=request.app.state.config.EXTERNAL_DOCUMENT_LOADER_URL,

View File

@ -170,6 +170,19 @@
return;
}
if (
RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker' &&
RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG &&
RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG.trim() !== ''
) {
try {
JSON.parse(RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG);
} catch (e) {
toast.error($i18n.t('Invalid JSON format in Additional Config'));
return;
}
}
if (
RAGConfig.CONTENT_EXTRACTION_ENGINE === 'document_intelligence' &&
(RAGConfig.DOCUMENT_INTELLIGENCE_ENDPOINT === '' ||
@ -195,10 +208,6 @@
ALLOWED_FILE_EXTENSIONS: RAGConfig.ALLOWED_FILE_EXTENSIONS.split(',')
.map((ext) => ext.trim())
.filter((ext) => ext !== ''),
DATALAB_MARKER_LANGS: RAGConfig.DATALAB_MARKER_LANGS.split(',')
.map((code) => code.trim())
.filter((code) => code !== '')
.join(', '),
DOCLING_PICTURE_DESCRIPTION_LOCAL: JSON.parse(
RAGConfig.DOCLING_PICTURE_DESCRIPTION_LOCAL || '{}'
),
@ -243,6 +252,11 @@
2
);
// Set default API Base URL if empty
if (!config.DATALAB_MARKER_API_BASE_URL) {
config.DATALAB_MARKER_API_BASE_URL = 'https://www.datalab.to/api/v1/marker';
}
RAGConfig = config;
});
</script>
@ -336,6 +350,21 @@
</div>
</div>
{:else if RAGConfig.CONTENT_EXTRACTION_ENGINE === 'datalab_marker'}
<div class="my-0.5 flex gap-2 pr-2">
<Tooltip
content={$i18n.t(
'API Base URL for Datalab Marker service. Defaults to: https://www.datalab.to/api/v1/marker'
)}
placement="top-start"
className="w-full"
>
<input
class="flex-1 w-full text-sm bg-transparent outline-hidden"
placeholder={$i18n.t('Enter Datalab Marker API Base URL')}
bind:value={RAGConfig.DATALAB_MARKER_API_BASE_URL}
/>
</Tooltip>
</div>
<div class="my-0.5 flex gap-2 pr-2">
<SensitiveInput
placeholder={$i18n.t('Enter Datalab Marker API Key')}
@ -344,24 +373,33 @@
/>
</div>
<div class="flex justify-between w-full mt-2">
<div class="text-xs font-medium">
{$i18n.t('Languages')}
<div class="flex flex-col gap-2 mt-2">
<div class=" flex flex-col w-full justify-between">
<div class=" mb-1 text-xs font-medium">
{$i18n.t('Additional Config')}
</div>
<div class="flex w-full items-center relative">
<Tooltip
content={$i18n.t(
'Additional configuration options for marker. This should be a JSON string with key-value pairs. For example, \'{"key": "value"}\'. Supported keys include: disable_links, keep_pageheader_in_output, keep_pagefooter_in_output, filter_blank_pages, drop_repeated_text, layout_coverage_threshold, merge_threshold, height_tolerance, gap_threshold, image_threshold, min_line_length, level_count, default_level'
)}
placement="top-start"
className="w-full"
>
<Textarea
bind:value={RAGConfig.DATALAB_MARKER_ADDITIONAL_CONFIG}
placeholder={$i18n.t('Enter JSON config (e.g., {"disable_links": true})')}
/>
</Tooltip>
</div>
</div>
<input
class="text-sm bg-transparent outline-hidden"
type="text"
bind:value={RAGConfig.DATALAB_MARKER_LANGS}
placeholder={$i18n.t('e.g.) en,fr,de')}
/>
</div>
<div class="flex justify-between w-full mt-2">
<div class="self-center text-xs font-medium">
<Tooltip
content={$i18n.t(
'Significantly improves accuracy by using an LLM to enhance tables, forms, inline math, and layout detection. Will increase latency. Defaults to True.'
'Significantly improves accuracy by using an LLM to enhance tables, forms, inline math, and layout detection. Will increase latency. Defaults to False.'
)}
placement="top-start"
>
@ -445,6 +483,21 @@
<Switch bind:state={RAGConfig.DATALAB_MARKER_DISABLE_IMAGE_EXTRACTION} />
</div>
</div>
<div class="flex justify-between w-full mt-2">
<div class="self-center text-xs font-medium">
<Tooltip
content={$i18n.t(
'Format the lines in the output. Defaults to False. If set to True, the lines will be formatted to detect inline math and styles.'
)}
placement="top-start"
>
{$i18n.t('Format Lines')}
</Tooltip>
</div>
<div class="flex items-center">
<Switch bind:state={RAGConfig.DATALAB_MARKER_FORMAT_LINES} />
</div>
</div>
<div class="flex justify-between w-full mt-2">
<div class="self-center text-xs font-medium">
<Tooltip