# Source: docetl/pyproject.toml (TOML; listed by the file viewer as 135 lines, 3.9 KiB)

# Core package metadata (PEP 621).
[project]
name = "docetl"
version = "0.2.5"
description = "ETL with LLM operations."
readme = "README.md"
requires-python = ">=3.10"
# SPDX license expression; the `{ text = ... }` table form is deprecated by PEP 639.
license = "MIT"
authors = [
    { name = "Shreya Shankar", email = "shreyashankar@berkeley.edu" },
]
# Core runtime dependencies (migrated from Poetry), installed with every
# `pip install docetl`. Kept in alphabetical order.
dependencies = [
    "asteval>=1.0.4",
    "boto3>=1.37.27",
    "diskcache>=5.6.3",
    "frozendict>=2.4.4",
    "jsonschema>=4.23.0",
    "litellm>=1.75.4",
    "lzstring>=1.0.4",
    "pandas>=2.3.0",
    "pydantic>=2.9.2",
    "pyrate-limiter>=3.7.0",
    "python-multipart>=0.0.20",
    "rank-bm25>=0.2.2",
    "rapidfuzz>=3.10.0",
    "rich>=13.7.1",
    "scikit-learn>=1.5.2",
    "tqdm>=4.66.4",
    "typer>=0.16.0",
    "websockets>=13.1",
]

# Optional extras (carried over from the Poetry extras / optional deps).
# Install with e.g. `pip install "docetl[parsing]"` or `"docetl[server]"`.
# Entries within each extra are kept in alphabetical order.
[project.optional-dependencies]
parsing = [
    "azure-ai-documentintelligence>=1.0.0b4",
    "openpyxl>=3.1.5",
    # Upper bound is deliberate: releases from 3.2 onward are excluded.
    "paddlepaddle>=2.6.2,<3.2",
    "pydub>=0.25.1",
    "pymupdf>=1.24.10",
    "python-docx>=1.1.2",
    "python-pptx>=1.0.2",
]
server = [
    "azure-ai-documentintelligence>=1.0.0b4",
    "azure-ai-formrecognizer>=3.3.3",
    "docling>=2.5.2",
    "fastapi>=0.115.4",
    "httpx>=0.27.2",
    "uvicorn>=0.31.0",
]

# Console script: installing the package creates a `docetl` command that
# invokes the `app` object defined in the `docetl.cli` module.
[project.scripts]
docetl = "docetl.cli:app"
# Plugin entry points for pipeline operations, registered under the
# namespaced group "docetl.operation".
# Each key is the operation name; each value is a "module:Class" object
# reference. All targets live under the `docetl.operations` package.
# The group name is quoted because it contains a dot.
[project.entry-points."docetl.operation"]
map = "docetl.operations.map:MapOperation"
parallel_map = "docetl.operations.map:ParallelMapOperation"
filter = "docetl.operations.filter:FilterOperation"
unnest = "docetl.operations.unnest:UnnestOperation"
equijoin = "docetl.operations.equijoin:EquijoinOperation"
split = "docetl.operations.split:SplitOperation"
reduce = "docetl.operations.reduce:ReduceOperation"
resolve = "docetl.operations.resolve:ResolveOperation"
gather = "docetl.operations.gather:GatherOperation"
cluster = "docetl.operations.cluster:ClusterOperation"
sample = "docetl.operations.sample:SampleOperation"
topk = "docetl.operations.topk:TopKOperation"
link_resolve = "docetl.operations.link_resolve:LinkResolveOperation"
# The three code_* operations share a single module, `code_operations`.
code_map = "docetl.operations.code_operations:CodeMapOperation"
code_reduce = "docetl.operations.code_operations:CodeReduceOperation"
code_filter = "docetl.operations.code_operations:CodeFilterOperation"
# Plugin entry points for parsing tools, registered under the group
# "docetl.parser". Every value references a function of the same name in the
# `docetl.parsing_tools` module.
# NOTE(review): several names suggest third-party libraries (llama-index,
# whisper, paddleocr) that are not listed in the `parsing` extra of
# [project.optional-dependencies] - confirm how those are expected to be installed.
[project.entry-points."docetl.parser"]
llama_index_simple_directory_reader = "docetl.parsing_tools:llama_index_simple_directory_reader"
llama_index_wikipedia_reader = "docetl.parsing_tools:llama_index_wikipedia_reader"
whisper_speech_to_text = "docetl.parsing_tools:whisper_speech_to_text"
xlsx_to_string = "docetl.parsing_tools:xlsx_to_string"
txt_to_string = "docetl.parsing_tools:txt_to_string"
docx_to_string = "docetl.parsing_tools:docx_to_string"
pptx_to_string = "docetl.parsing_tools:pptx_to_string"
azure_di_read = "docetl.parsing_tools:azure_di_read"
paddleocr_pdf_to_string = "docetl.parsing_tools:paddleocr_pdf_to_string"
# Development-only dependency groups (PEP 735), migrated from Poetry's
# `group.dev`. These are not part of the published package metadata.
# Kept in alphabetical order.
[dependency-groups]
dev = [
    "linkchecker>=10.5.0",
    "mkdocs>=1.6.1",
    "mkdocs-glightbox>=0.4.0",
    "mkdocs-material>=9.5.34",
    "mkdocstrings>=0.26.1",
    "mkdocstrings-python>=1.11.1",
    "mypy>=1.11.1",
    "pre-commit>=3.8.0",
    "pytest>=8.3.2",
    "pytest-sugar>=1.0.0",
    "pytest-xdist>=3.6.1",
    "python-dotenv>=1.0.1",
    "pytkdocs>=0.16.2",
    "ruff>=0.6.1",
    "types-pyyaml>=6.0.12.20250516",
    "types-requests>=2.32.4.20250611",
]

# PEP 517 build backend: Hatchling builds the sdist and wheel.
# No version bound is given, so whichever Hatchling release the installer
# resolves at build time is used.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
# Hatch file selection, set at the global build level rather than per target.
# NOTE(review): Hatch documents `packages` as behaving like `only-include`,
# which restricts directory traversal to the listed paths. If that applies
# here, the `server/**`, `README.md` and `LICENSE` include patterns (and the
# `website` exclude) may never match - confirm by inspecting a built sdist/wheel.
[tool.hatch.build]
packages = ["docetl"]
include = ["docetl/**", "server/**", "README.md", "LICENSE"]
exclude = ["website/**/*"]
# pytest configuration.
[tool.pytest.ini_options]
testpaths = ["tests"]
# Fixed base directory for pytest's temporary files.
# NOTE(review): `/tmp/pytest` is a hard-coded POSIX path shared by every run on
# the machine - confirm this is intended for non-Linux or multi-user setups.
addopts = "--basetemp=/tmp/pytest"
# Silence these warning categories wholesale during test runs.
filterwarnings = [
    "ignore::DeprecationWarning",
    "ignore::UserWarning",
    "ignore::RuntimeWarning",
]

# mypy static type-checking configuration.
[tool.mypy]
# What gets checked, and where modules are looked up.
files = "docetl"
mypy_path = "docetl"
# `exclude` entries are regular expressions, hence the literal (single-quoted) string.
exclude = ['docetl/tests*']
# Strictness settings.
disallow_untyped_defs = true
warn_return_any = true
warn_unused_configs = true
# Import handling and error reporting.
ignore_missing_imports = true
show_error_codes = true