diff --git a/pdf-table-extraction-docling-vs-llamaparse/README.md b/pdf-table-extraction-docling-vs-llamaparse/README.md new file mode 100644 index 0000000000..a308319dc6 --- /dev/null +++ b/pdf-table-extraction-docling-vs-llamaparse/README.md @@ -0,0 +1,56 @@ +# PDF Table Extraction: Docling vs LlamaParse + +This folder contains the code examples for the Real Python tutorial [PDF Table Extraction: Docling vs LlamaParse](https://realpython.com/pdf-table-extraction-docling-vs-llamaparse/). + +The scripts parse `sample_report.pdf`, a short financial report with tables, and compare two approaches: + +- **[Docling](https://github.com/docling-project/docling)** runs locally and exports structured document data, including tables as pandas DataFrames. +- **[LlamaParse](https://docs.cloud.llamaindex.ai/llamaparse/getting_started)** uses the Llama Cloud API for parsing and schema-driven extraction. + +## Files + +| File | Description | +|------|-------------| +| `sample_report.pdf` | Sample PDF used by all scripts | +| `docling_extraction.py` | Parse the PDF with Docling and print Markdown output | +| `docling_tables.py` | Inspect detected tables and print selected DataFrames | +| `docling_formats.py` | Export Docling results to Markdown, JSON, HTML, and DataFrames | +| `llamaparse_extraction.py` | Parse the PDF with LlamaParse and print Markdown output | +| `llamaparse_formats.py` | Export LlamaParse results to Markdown, plain text, and JSON | +| `requirements.txt` | Dependencies for this folder (Docling is pinned; the other packages use version ranges) | + +## Installation + +Create and activate a [virtual environment](https://realpython.com/python-virtual-environments-a-primer/), then install the dependencies: + +```shell +$ python3 -m venv venv/ +$ source venv/bin/activate +(venv) $ python -m pip install -r requirements.txt +``` + +Run the scripts from this folder so the relative path to `sample_report.pdf` resolves correctly. + +## Docling examples + +Docling runs on your machine and does not require an API key. 
def main() -> None:
    """Convert the sample PDF with Docling and print a Markdown preview.

    Prints the first 3000 characters of the Markdown export, a separator,
    and then the number of pages and tables on the converted document.
    """
    conversion = DocumentConverter().convert(PDF_PATH)
    document = conversion.document

    # Only a preview is printed; the slice caps it at 3000 characters.
    preview = document.export_to_markdown()[:3000]
    print(preview)
    print("\n---\n")
    print(f"Pages parsed: {len(document.pages)}")
    print(f"Tables found: {len(document.tables)}")
def main() -> None:
    """Summarize every table Docling detects, then print the first two.

    For each table this prints its index, the sorted set of page numbers
    taken from its provenance entries, and the shape of its DataFrame.
    Afterwards table 0 is printed in full (without the index column) and
    the first four rows of table 1 are printed as a preview.
    """
    document = DocumentConverter().convert(PDF_PATH).document

    print(f"Tables found: {len(document.tables)}\n")

    # Export each table exactly once and keep the DataFrames so the
    # detailed printouts below reuse them instead of exporting again.
    frames = []
    for index, table in enumerate(document.tables):
        pages = sorted({prov.page_no for prov in table.prov})
        frame = table.export_to_dataframe(doc=document)
        frames.append(frame)
        print(f"Table {index}: pages {pages}, shape {frame.shape}")

    # This script expects at least two tables. Guard the lookups so a
    # document with fewer tables does not end in an IndexError.
    if frames:
        print("\nFinancial statement index (table 0):")
        print(frames[0].to_string(index=False), end="\n\n")

    if len(frames) > 1:
        print("Operations statement preview (table 1, first 4 rows):")
        print(frames[1].head(4).to_string())
def main() -> None:
    """Parse the sample PDF with LlamaParse and print a Markdown preview.

    Uploads ``PDF_PATH`` with ``purpose="parse"``, requests a parse with the
    ``markdown`` expansion, and prints the first 3000 characters of the
    per-page Markdown (every page followed by a ``---`` separator line),
    then the number of pages returned.

    Raises:
        KeyError: If ``LLAMA_CLOUD_API_KEY`` is not set in the environment.
    """
    client = LlamaCloud(api_key=os.environ["LLAMA_CLOUD_API_KEY"])

    uploaded = client.files.create(file=PDF_PATH, purpose="parse")
    result = client.parsing.parse(
        file_id=uploaded.id,
        tier="agentic",
        version="latest",
        expand=["markdown"],
    )

    markdown_pages = result.markdown.pages
    # Join in one pass instead of growing a string with ``+=``. Each page
    # is still followed by the separator, so the output is unchanged.
    combined = "".join(page.markdown + "\n---\n" for page in markdown_pages)

    print(combined[:3000])
    print(f"Pages parsed: {len(markdown_pages)}")
def main() -> None:
    """Export LlamaParse output as Markdown, plain text, and extracted JSON.

    Parses ``PDF_PATH`` with the ``markdown`` and ``text`` expansions and
    writes ``output_llamaparse.md`` and, when text pages are present,
    ``output_llamaparse.text``. It then runs an extraction configured with
    the ``RevenueTable`` JSON schema, writes the result to
    ``output_llamaparse.json``, and prints the same JSON to stdout.

    Raises:
        KeyError: If ``LLAMA_CLOUD_API_KEY`` is not set in the environment.
    """
    client = LlamaCloud(api_key=os.environ["LLAMA_CLOUD_API_KEY"])

    uploaded = client.files.create(file=PDF_PATH, purpose="parse")

    parsed = client.parsing.parse(
        file_id=uploaded.id,
        tier="agentic",
        version="latest",
        expand=["markdown", "text"],
    )

    markdown_pages = "\n\n".join(
        page.markdown for page in parsed.markdown.pages
    )
    Path("output_llamaparse.md").write_text(markdown_pages, encoding="utf-8")

    # The text file is written only when the response carries text pages.
    if parsed.text and parsed.text.pages:
        text_pages = "\n".join(page.text for page in parsed.text.pages)
        Path("output_llamaparse.text").write_text(text_pages, encoding="utf-8")

    # The extraction step uses its own upload, created with purpose="extract".
    extract_file = client.files.create(file=PDF_PATH, purpose="extract")
    job = client.extract.run(
        file_input=extract_file.id,
        configuration={
            "data_schema": RevenueTable.model_json_schema(),
            "extraction_target": "per_doc",
            "tier": "agentic",
        },
    )

    # Serialize once and reuse the string for both the file and stdout.
    extraction_json = json.dumps(job.extract_result, indent=2)
    Path("output_llamaparse.json").write_text(
        extraction_json,
        encoding="utf-8",
    )
    print(extraction_json)
0000000000..bce6248ad1 Binary files /dev/null and b/pdf-table-extraction-docling-vs-llamaparse/sample_report.pdf differ