2025-05-27 12:44:07 +08:00
import os
import time
import requests
import logging
import json
from typing import List , Optional
from langchain_core . documents import Document
from fastapi import HTTPException , status
log = logging . getLogger ( __name__ )
class DatalabMarkerLoader:
    """Convert a file through a Marker HTTP API and wrap the text in a Document.

    The file is uploaded with a multipart POST to ``api_base_url``. Two
    response styles are handled:

    * polling: the reply carries ``request_check_url``; that URL is polled
      until the job reports ``complete`` or a failure, and the content is
      read from the field named after the output format;
    * direct: the reply already carries the converted content in ``output``.

    All failures are reported as ``fastapi.HTTPException``.
    """

    # Extension -> MIME type sent with the upload; anything else is sent as
    # ``application/octet-stream``.
    _MIME_TYPES = {
        "pdf": "application/pdf",
        "xls": "application/vnd.ms-excel",
        "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "ods": "application/vnd.oasis.opendocument.spreadsheet",
        "doc": "application/msword",
        "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "odt": "application/vnd.oasis.opendocument.text",
        "ppt": "application/vnd.ms-powerpoint",
        "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "odp": "application/vnd.oasis.opendocument.presentation",
        "html": "text/html",
        "epub": "application/epub+zip",
        "png": "image/png",
        "jpeg": "image/jpeg",
        "jpg": "image/jpeg",
        "webp": "image/webp",
        "gif": "image/gif",
        "tiff": "image/tiff",
    }

    # Output formats ``load`` can extract, mapped to the extension used when
    # the converted text is saved to disk.
    _FILE_EXTENSIONS = {"markdown": "md", "json": "json", "html": "html"}

    # Used when the caller leaves ``output_format`` unset.
    # NOTE(review): chosen to match Datalab's documented default - confirm
    # for self-hosted endpoints.
    _DEFAULT_OUTPUT_FORMAT = "markdown"

    # 300 polls spaced 2 seconds apart: roughly 10 minutes of waiting.
    _POLL_INTERVAL_SECONDS = 2
    _MAX_POLL_ATTEMPTS = 300

    # Timeout (seconds) for the small status/poll GET requests. The upload
    # POST is deliberately left without one: a direct-response server returns
    # the converted output in that same reply, so it may legitimately take as
    # long as the conversion itself.
    _STATUS_REQUEST_TIMEOUT = 60

    _OUTPUT_DIR = os.path.join("/app/backend/data/uploads", "marker_output")

    def __init__(
        self,
        file_path: str,
        api_key: str,
        api_base_url: str,
        additional_config: Optional[str] = None,
        use_llm: bool = False,
        skip_cache: bool = False,
        force_ocr: bool = False,
        paginate: bool = False,
        strip_existing_ocr: bool = False,
        disable_image_extraction: bool = False,
        format_lines: bool = False,
        output_format: str = None,
    ):
        """Store the upload target and the conversion options.

        Args:
            file_path: Path of the local file to upload.
            api_key: Value sent in the ``X-Api-Key`` header.
            api_base_url: URL the file is POSTed to; ``/<request_id>`` is
                appended to it for status checks.
            additional_config: Optional string forwarded verbatim as the
                ``additional_config`` form field when non-blank.
            use_llm, skip_cache, force_ocr, paginate, strip_existing_ocr,
            disable_image_extraction, format_lines: Boolean options forwarded
                to the API as lowercase ``"true"``/``"false"`` form fields.
            output_format: ``"markdown"``, ``"json"`` or ``"html"`` (case
                insensitive). ``None`` or empty falls back to markdown.
        """
        self.file_path = file_path
        self.api_key = api_key
        self.api_base_url = api_base_url
        self.additional_config = additional_config
        self.use_llm = use_llm
        self.skip_cache = skip_cache
        self.force_ocr = force_ocr
        self.paginate = paginate
        self.strip_existing_ocr = strip_existing_ocr
        self.disable_image_extraction = disable_image_extraction
        self.format_lines = format_lines
        self.output_format = output_format

    def _get_mime_type(self, filename: str) -> str:
        """Return the MIME type for ``filename`` based on its extension.

        The extension is the text after the last dot, compared case
        insensitively. Names without a dot and unknown extensions map to
        ``application/octet-stream``.
        """
        # rpartition (unlike rsplit) tells us whether a dot was present, so an
        # extension-less file called "pdf" is not mistaken for a PDF.
        _, dot, ext = filename.rpartition(".")
        if not dot:
            return "application/octet-stream"
        return self._MIME_TYPES.get(ext.lower(), "application/octet-stream")

    def _resolved_output_format(self) -> str:
        """Return the lowercase output format, defaulting when unset."""
        return (self.output_format or self._DEFAULT_OUTPUT_FORMAT).lower()

    def check_marker_request_status(self, request_id: str) -> dict:
        """Fetch and return the JSON status of a previously submitted request.

        Args:
            request_id: Identifier appended to ``api_base_url``.

        Returns:
            The decoded JSON body of the status response.

        Raises:
            HTTPException: 502 when the request fails (network error, timeout
                or HTTP error status) or when the body is not valid JSON.
        """
        url = f"{self.api_base_url}/{request_id}"
        headers = {"X-Api-Key": self.api_key}
        # The network call and the JSON decoding are kept in separate try
        # blocks because requests' JSON decode error is both a
        # RequestException and a ValueError; each failure keeps its own
        # message this way.
        try:
            response = requests.get(
                url, headers=headers, timeout=self._STATUS_REQUEST_TIMEOUT
            )
            response.raise_for_status()
        except requests.RequestException as e:
            log.error(f"Error checking Marker request status: {e}")
            raise HTTPException(
                status.HTTP_502_BAD_GATEWAY,
                detail=f"Failed to check Marker request: {e}",
            ) from e
        try:
            result = response.json()
        except ValueError as e:
            log.error(f"Invalid JSON checking Marker request: {e}")
            raise HTTPException(
                status.HTTP_502_BAD_GATEWAY, detail=f"Invalid JSON: {e}"
            ) from e
        log.info(f"Marker API status check for request {request_id}: {result}")
        return result

    def load(self) -> "List[Document]":
        """Upload the file, wait for the conversion and return one Document.

        Returns:
            A single-element list whose Document holds the converted text and
            metadata (source, output format, page count, LLM flag, request id
            and, when present, image names).

        Raises:
            HTTPException: 400 for an unsupported output format, a rejected or
                failed conversion, or empty content; 404 when the file does
                not exist; 502 for invalid or unexpected API responses; 504
                when polling runs out of attempts; 500 for any other upload
                error.
        """
        output_format = self._resolved_output_format()
        # Reject unsupported formats before uploading instead of after the
        # (possibly billed) conversion has already run.
        if output_format not in self._FILE_EXTENSIONS:
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail=f"Unsupported output format: {self.output_format}",
            )

        filename = os.path.basename(self.file_path)
        headers = {"X-Api-Key": self.api_key}
        result = self._submit(filename, headers)

        check_url = result.get("request_check_url")
        request_id = result.get("request_id")

        # Check if this is a direct response (self-hosted) or polling response (DataLab)
        if check_url:
            # DataLab format - content in format-specific fields
            final_result = self._poll_until_done(check_url, headers)
            raw_content = final_result.get(output_format)
        elif "output" in result:
            # Self-hosted direct response - content in "output" field
            log.info("Self-hosted Marker returned direct response without polling")
            final_result = result
            raw_content = result.get("output")
        else:
            available_fields = list(result.keys())
            raise HTTPException(
                status.HTTP_502_BAD_GATEWAY,
                detail=f"Custom Marker endpoint returned success but no 'output' field found. Available fields: {available_fields}. Expected either 'request_check_url' for polling or 'output' field for direct response.",
            )

        full_text = self._format_content(raw_content, output_format)
        if not full_text:
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail="Marker returned empty content",
            )

        self._save_output(filename, full_text, output_format)
        metadata = self._build_metadata(
            filename, final_result, output_format, request_id
        )
        return [Document(page_content=full_text, metadata=metadata)]

    def _submit(self, filename: str, headers: dict) -> dict:
        """POST the file with the conversion options and return the JSON reply."""
        mime_type = self._get_mime_type(filename)
        form_data = {
            "use_llm": str(self.use_llm).lower(),
            "skip_cache": str(self.skip_cache).lower(),
            "force_ocr": str(self.force_ocr).lower(),
            "paginate": str(self.paginate).lower(),
            "strip_existing_ocr": str(self.strip_existing_ocr).lower(),
            "disable_image_extraction": str(self.disable_image_extraction).lower(),
            "format_lines": str(self.format_lines).lower(),
            # Ask explicitly for the format we will later read back.
            "output_format": self.output_format or self._DEFAULT_OUTPUT_FORMAT,
        }
        if self.additional_config and self.additional_config.strip():
            form_data["additional_config"] = self.additional_config

        log.info(
            f"Datalab Marker POST request parameters: {{'filename': '{filename}', 'mime_type': '{mime_type}', **{form_data}}}"
        )

        try:
            with open(self.file_path, "rb") as f:
                files = {"file": (filename, f, mime_type)}
                response = requests.post(
                    self.api_base_url,
                    data=form_data,
                    files=files,
                    headers=headers,
                )
                response.raise_for_status()
                result = response.json()
        except FileNotFoundError as e:
            raise HTTPException(
                status.HTTP_404_NOT_FOUND, detail=f"File not found: {self.file_path}"
            ) from e
        except requests.HTTPError as e:
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail=f"Datalab Marker request failed: {e}",
            ) from e
        except ValueError as e:
            raise HTTPException(
                status.HTTP_502_BAD_GATEWAY, detail=f"Invalid JSON response: {e}"
            ) from e
        except Exception as e:
            raise HTTPException(
                status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)
            ) from e

        # Valid JSON that is not an object (e.g. a list) has no ``.get``.
        if not isinstance(result, dict):
            raise HTTPException(
                status.HTTP_502_BAD_GATEWAY,
                detail="Invalid JSON response: expected a JSON object",
            )
        if not result.get("success"):
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail=f"Datalab Marker request failed: {result.get('error', 'Unknown error')}",
            )
        return result

    def _poll_until_done(self, check_url: str, headers: dict) -> dict:
        """Poll ``check_url`` until the job completes and return its final JSON."""
        for _ in range(self._MAX_POLL_ATTEMPTS):
            time.sleep(self._POLL_INTERVAL_SECONDS)
            # Reset per attempt: if requests.get itself raises, the handler
            # must not read an unbound or stale response.
            poll_response = None
            try:
                poll_response = requests.get(
                    check_url,
                    headers=headers,
                    timeout=self._STATUS_REQUEST_TIMEOUT,
                )
                poll_response.raise_for_status()
                poll_result = poll_response.json()
            except (requests.RequestException, ValueError) as e:
                raw_body = poll_response.text if poll_response is not None else ""
                log.error(f"Polling error: {e}, response body: {raw_body}")
                raise HTTPException(
                    status.HTTP_502_BAD_GATEWAY, detail=f"Polling failed: {e}"
                ) from e

            if not isinstance(poll_result, dict):
                raise HTTPException(
                    status.HTTP_502_BAD_GATEWAY,
                    detail="Polling failed: expected a JSON object",
                )

            status_val = poll_result.get("status")
            success_val = poll_result.get("success")

            if status_val == "complete":
                summary = {
                    k: poll_result.get(k)
                    for k in (
                        "status",
                        "output_format",
                        "success",
                        "error",
                        "page_count",
                        "total_cost",
                    )
                }
                log.info(
                    f"Marker processing completed successfully: {json.dumps(summary, indent=2)}"
                )
                break

            if status_val == "failed" or success_val is False:
                log.error(
                    f"Marker poll failed full response: {json.dumps(poll_result, indent=2)}"
                )
                error_msg = (
                    poll_result.get("error")
                    or "Marker returned failure without error message"
                )
                raise HTTPException(
                    status.HTTP_400_BAD_REQUEST,
                    detail=f"Marker processing failed: {error_msg}",
                )
        else:
            # The loop ran out of attempts without reaching "complete".
            raise HTTPException(
                status.HTTP_504_GATEWAY_TIMEOUT,
                detail="Marker processing timed out",
            )

        # A job can report "complete" while still flagging success as falsy.
        if not poll_result.get("success", False):
            error_msg = poll_result.get("error") or "Unknown processing error"
            raise HTTPException(
                status.HTTP_400_BAD_REQUEST,
                detail=f"Final processing failed: {error_msg}",
            )
        return poll_result

    @staticmethod
    def _format_content(raw_content, output_format: str) -> str:
        """Render the API content as text; missing content yields ``""``.

        ``None`` is handled explicitly so it is not turned into the literal
        text ``"None"`` / ``"null"`` and mistaken for real content.
        """
        if raw_content is None:
            return ""
        if output_format == "json":
            return json.dumps(raw_content, indent=2)
        return str(raw_content).strip()

    def _save_output(self, filename: str, full_text: str, output_format: str) -> None:
        """Best-effort copy of the converted text to the marker output folder."""
        file_ext = self._FILE_EXTENSIONS.get(output_format, "txt")
        output_filename = f"{os.path.splitext(filename)[0]}.{file_ext}"
        output_path = os.path.join(self._OUTPUT_DIR, output_filename)
        # Saving is only a convenience copy: any failure, including not being
        # able to create the directory, is logged and must not fail the load.
        try:
            os.makedirs(self._OUTPUT_DIR, exist_ok=True)
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(full_text)
            log.info(f"Saved Marker output to: {output_path}")
        except Exception as e:
            log.warning(f"Failed to write marker output to disk: {e}")

    def _build_metadata(
        self,
        filename: str,
        final_result: dict,
        output_format: str,
        request_id: Optional[str],
    ) -> dict:
        """Assemble Document metadata with dict/list/None values flattened."""
        metadata = {
            "source": filename,
            "output_format": final_result.get("output_format", output_format),
            "page_count": final_result.get("page_count", 0),
            "processed_with_llm": self.use_llm,
            "request_id": request_id or "",
        }
        images = final_result.get("images", {})
        if isinstance(images, dict) and images:
            metadata["image_count"] = len(images)
            metadata["images"] = json.dumps(list(images.keys()))

        # Containers become JSON strings and None becomes "", leaving only
        # scalar metadata values.
        flattened = {}
        for key, value in metadata.items():
            if isinstance(value, (dict, list)):
                flattened[key] = json.dumps(value)
            elif value is None:
                flattened[key] = ""
            else:
                flattened[key] = value
        return flattened