Skip to main content

Python Examples

Complete Python examples for integrating with the Extract API.

Installation

pip install requests pandas  # pandas is only needed for the "Integration with Pandas" example

Basic Invoice Processing

import requests
import json

# Root URL of the API; process_invoice() appends "/extract/process" to it.
BASE_URL = "http://localhost/api/v1"
# Placeholder credential - replace with a real API key before running.
API_KEY = "your_api_key"

# Bearer-token Authorization header built from API_KEY.
headers = {"Authorization": f"Bearer {API_KEY}"}

def process_invoice(filepath: str, company_name: str = None, timeout: float = 120.0) -> dict:
    """Process an invoice and return extracted data.

    Uploads the file to ``{BASE_URL}/extract/process`` as a multipart form,
    selecting the ``detailed_invoice`` template.

    Args:
        filepath: Path of the invoice file to upload; it is opened in binary
            mode.
        company_name: Optional company name. When truthy it is sent as the
            JSON-encoded ``context`` form field; otherwise no context is sent.
        timeout: Seconds ``requests`` waits for the server before raising
            ``requests.Timeout``. Pass ``None`` to wait indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        OSError: If ``filepath`` cannot be opened.
        requests.HTTPError: If the server replies with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    form = {"template_id": "detailed_invoice"}
    if company_name:
        # Omit the field entirely when there is no context, rather than
        # relying on requests to drop a None-valued form entry.
        form["context"] = json.dumps({"company_name": company_name})

    with open(filepath, "rb") as f:
        # Without a timeout, requests would block forever on a stalled server.
        response = requests.post(
            f"{BASE_URL}/extract/process",
            headers=headers,
            files={"file": f},
            data=form,
            timeout=timeout,
        )

    response.raise_for_status()
    return response.json()


# Usage
# Upload "invoice.pdf" and pass the company name as extraction context.
result = process_invoice("invoice.pdf", company_name="Acme Corp")

# Extracted fields are read from result["result"]["parsed_data"];
# the cost of the call is read from result["usage"]["cost"].
print(f"Invoice Number: {result['result']['parsed_data']['invoice_number']}")
print(f"Total: ${result['result']['parsed_data']['total']}")
print(f"Cost: ${result['usage']['cost']:.4f}")

Receipt Processing with Error Handling

import requests
from typing import Optional

class ExtractClient:
    """Minimal client for the Extract API's processing and job endpoints.

    Attributes:
        base_url: API root with any trailing ``/`` removed.
        headers: Bearer-token ``Authorization`` header sent with each request.
        timeout: Seconds ``requests`` waits on the server; ``None`` disables
            the timeout.
    """

    # HTTP status -> (exception type, message prefix) raised by process_receipt.
    _STATUS_ERRORS = {
        400: (ValueError, "Invalid file"),
        402: (PermissionError, "Budget exceeded"),
        404: (FileNotFoundError, "Template not found"),
    }

    def __init__(self, base_url: str, api_key: str, timeout: float = 120.0):
        """Store the connection settings; no request is made here."""
        self.base_url = base_url.rstrip("/")
        self.headers = {"Authorization": f"Bearer {api_key}"}
        self.timeout = timeout

    @staticmethod
    def _error_detail(response) -> str:
        """Return the ``detail`` of an error response as text.

        Falls back to the raw body (or the status code) when the body is not
        a JSON object with a ``detail`` key, so that a proxy's HTML error page
        does not replace the real failure with a JSON decoding error.
        """
        try:
            payload = response.json()
        except ValueError:  # requests' JSON decode errors derive from ValueError
            payload = None
        if isinstance(payload, dict) and "detail" in payload:
            return str(payload["detail"])
        return response.text or f"HTTP {response.status_code}"

    def process_receipt(self, filepath: str) -> dict:
        """Process a receipt and return extracted data.

        Uploads the file to ``/extract/process`` with the ``simple_receipt``
        template.

        Args:
            filepath: Path of the receipt file; opened in binary mode.

        Returns:
            The decoded JSON body of the response.

        Raises:
            ValueError: The server answered 400.
            PermissionError: The server answered 402.
            FileNotFoundError: The server answered 404, or ``filepath`` does
                not exist locally.
            requests.HTTPError: Any other 4xx/5xx status.
        """
        with open(filepath, "rb") as f:
            response = requests.post(
                f"{self.base_url}/extract/process",
                headers=self.headers,
                files={"file": f},
                data={"template_id": "simple_receipt"},
                timeout=self.timeout,
            )

        mapped = self._STATUS_ERRORS.get(response.status_code)
        if mapped is not None:
            exc_type, prefix = mapped
            raise exc_type(f"{prefix}: {self._error_detail(response)}")

        response.raise_for_status()
        return response.json()

    def get_job(self, job_id: str) -> dict:
        """Get job details by ID.

        Raises:
            requests.HTTPError: If the server replies with a 4xx/5xx status.
        """
        response = requests.get(
            f"{self.base_url}/extract/jobs/{job_id}",
            headers=self.headers,
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.json()


# Usage
client = ExtractClient("http://localhost/api/v1", "your_api_key")

try:
    # Keep the try body to the one call that raises the handled errors.
    result = client.process_receipt("receipt.jpg")
except ValueError as e:
    print(f"File error: {e}")
except PermissionError as e:
    print(f"Budget error: {e}")
except FileNotFoundError as e:
    # process_receipt raises this for HTTP 404; open() raises it too when
    # the local file is missing.
    print(f"Not found: {e}")
else:
    data = result["result"]["parsed_data"]
    print(f"Store: {data['store_name']}")
    print(f"Date: {data['date']}")
    print(f"Total: ${data['total']}")

    # Treat a missing or null warnings list the same as an empty one.
    warnings = result["result"].get("validation_warnings") or []
    if warnings:
        print("Warnings:")
        for warning in warnings:
            print(f" - {warning}")

Custom Template Creation

import requests
import json

def create_purchase_order_template(base_url: str, api_key: str, timeout: float = 120.0) -> dict:
    """Create a custom template for purchase orders.

    Posts a template with id ``purchase_order`` to ``/extract/templates``.
    The template carries a system prompt, a user prompt, a context schema
    declaring ``company_name``, and a JSON-schema-style ``output_schema``.

    Args:
        base_url: API root; a trailing ``/`` is tolerated.
        api_key: Key sent as a bearer token.
        timeout: Seconds ``requests`` waits for the server; ``None`` waits
            indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server replies with a 4xx/5xx status.
    """
    template = {
        "id": "purchase_order",
        "system_prompt": """You are a document extraction specialist.
Extract data accurately from purchase orders.
Return valid JSON matching the requested structure.
Use null for fields that are not visible in the document.
Do not invent or hallucinate data.""",

        # "{company_name}" is sent literally (this is not an f-string);
        # presumably the server fills it from the request context - confirm.
        "user_prompt": """Extract the following from this purchase order for {company_name}:

1. PO Number
2. Vendor information (name, address, contact)
3. Order date
4. Expected delivery date
5. Line items:
- Part/SKU number
- Description
- Quantity
- Unit price
- Line total
6. Shipping method and terms
7. Subtotal, tax, and total amount
8. Payment terms
9. Special instructions or notes""",

        "context_schema": {
            "company_name": {
                "type": "string",
                "description": "Name of the company placing the order",
            },
        },

        "output_schema": {
            "type": "object",
            "required": ["po_number", "vendor", "line_items", "total_amount"],
            "properties": {
                "po_number": {"type": "string"},
                "vendor": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "address": {"type": "string"},
                        "contact_email": {"type": "string"},
                        "contact_phone": {"type": "string"},
                    },
                },
                "order_date": {"type": "string", "format": "date"},
                "delivery_date": {"type": "string", "format": "date"},
                "line_items": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "required": ["description", "quantity", "unit_price"],
                        "properties": {
                            "part_number": {"type": "string"},
                            "description": {"type": "string"},
                            "quantity": {"type": "number", "minimum": 0},
                            "unit_price": {"type": "number", "minimum": 0},
                            "line_total": {"type": "number", "minimum": 0},
                        },
                    },
                },
                "shipping": {
                    "type": "object",
                    "properties": {
                        "method": {"type": "string"},
                        "terms": {"type": "string"},
                    },
                },
                "subtotal": {"type": "number"},
                "tax": {"type": "number"},
                "total_amount": {"type": "number"},
                "payment_terms": {"type": "string"},
                "notes": {"type": "string"},
            },
        },
    }

    response = requests.post(
        # Strip a trailing slash so the URL never contains "//extract".
        f"{base_url.rstrip('/')}/extract/templates",
        headers={"Authorization": f"Bearer {api_key}"},
        json=template,
        timeout=timeout,
    )

    response.raise_for_status()
    return response.json()


# Usage
# Register the purchase-order template against the local API.
template = create_purchase_order_template(
"http://localhost/api/v1",
"your_api_key"
)
# The response's "id" key is read to report which template was created.
print(f"Created template: {template['id']}")

Batch Processing

import json
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from typing import List, Optional

import requests

@dataclass
class ProcessingResult:
    """Outcome of processing one file in a batch."""

    # Path of the file that was submitted.
    filepath: str
    # True when the server answered 200 and the payload could be read.
    success: bool
    # Value of result["result"]["parsed_data"] on success, else None.
    data: Optional[dict] = None
    # Failure description on error, else None.
    error: Optional[str] = None
    # Value of result["usage"]["cost"] on success, else 0.0.
    cost: float = 0.0


def batch_process_invoices(
    directory: str,
    base_url: str,
    api_key: str,
    max_workers: int = 5,
    timeout: float = 120.0,
) -> List[ProcessingResult]:
    """Process all PDF invoices in a directory.

    Every entry of ``directory`` whose name ends in ``.pdf`` (case-insensitive)
    is uploaded to ``/extract/process`` with the ``detailed_invoice`` template,
    using a thread pool. One status line per file is printed as it completes.

    Args:
        directory: Directory to scan (not recursive).
        base_url: API root; a trailing ``/`` is tolerated.
        api_key: Key sent as a bearer token.
        max_workers: Size of the thread pool.
        timeout: Per-request timeout in seconds passed to ``requests``;
            ``None`` waits indefinitely.

    Returns:
        One ``ProcessingResult`` per PDF, in completion order (not directory
        order). Failures are reported in the result, never raised.

    Raises:
        OSError: If ``directory`` cannot be listed.
    """
    headers = {"Authorization": f"Bearer {api_key}"}
    endpoint = f"{base_url.rstrip('/')}/extract/process"

    def process_file(filepath: str) -> ProcessingResult:
        """Upload one file and convert any failure into a result object."""
        try:
            with open(filepath, "rb") as f:
                response = requests.post(
                    endpoint,
                    headers=headers,
                    files={"file": f},
                    data={"template_id": "detailed_invoice"},
                    timeout=timeout,
                )

            if response.status_code != 200:
                try:
                    detail = response.json().get("detail", "Unknown error")
                except (ValueError, AttributeError):
                    # Body was not a JSON object; report the status instead of
                    # a misleading JSON decoding error.
                    detail = f"HTTP {response.status_code}"
                return ProcessingResult(filepath=filepath, success=False, error=detail)

            result = response.json()
            return ProcessingResult(
                filepath=filepath,
                success=True,
                data=result["result"]["parsed_data"],
                cost=result["usage"]["cost"],
            )
        except Exception as e:
            # Worker boundary: one bad file must not abort the whole batch,
            # so every failure is recorded rather than propagated.
            return ProcessingResult(filepath=filepath, success=False, error=str(e))

    # Find all PDFs (sorted so the submission order is deterministic)
    pdf_files = [
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.lower().endswith(".pdf")
    ]

    # Process in parallel
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_file, path) for path in pdf_files]

        for future in as_completed(futures):
            result = future.result()
            results.append(result)

            status = "OK" if result.success else f"FAILED: {result.error}"
            print(f"{os.path.basename(result.filepath)}: {status}")

    return results


# Usage
results = batch_process_invoices(
    directory="invoices/",
    base_url="http://localhost/api/v1",
    api_key="your_api_key",
    max_workers=5,
)

# Summary
successful = [r for r in results if r.success]
failed = [r for r in results if not r.success]
total_cost = sum(r.cost for r in successful)

print(f"\nProcessed: {len(results)} files")
print(f"Successful: {len(successful)}")
print(f"Failed: {len(failed)}")
print(f"Total cost: ${total_cost:.4f}")

# Export results
output = {
    "summary": {
        "total": len(results),
        "successful": len(successful),
        "failed": len(failed),
        "total_cost": total_cost,
    },
    "results": [
        {
            "file": r.filepath,
            "success": r.success,
            "data": r.data,
            "error": r.error,
        }
        for r in results
    ],
}

# Explicit encoding so the report does not depend on the platform default.
with open("extraction_results.json", "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2)

Template Management

import requests

class TemplateManager:
    """CRUD helper for the ``/extract/templates`` endpoints.

    Attributes:
        base_url: API root with any trailing ``/`` removed.
        headers: Bearer-token ``Authorization`` header sent with each request.
        timeout: Seconds ``requests`` waits on the server; ``None`` disables
            the timeout.

    Every method raises ``requests.HTTPError`` when the server replies with a
    4xx/5xx status.
    """

    def __init__(self, base_url: str, api_key: str, timeout: float = 120.0):
        """Store the connection settings; no request is made here."""
        self.base_url = base_url.rstrip("/")
        self.headers = {"Authorization": f"Bearer {api_key}"}
        self.timeout = timeout

    def _request(self, method: str, path: str = "", **kwargs):
        """Send one request under ``/extract/templates`` and raise on HTTP errors."""
        response = requests.request(
            method,
            f"{self.base_url}/extract/templates{path}",
            headers=self.headers,
            timeout=self.timeout,
            **kwargs,
        )
        response.raise_for_status()
        return response

    def list_templates(self) -> list:
        """List all available templates (the ``templates`` key of the response)."""
        return self._request("GET").json()["templates"]

    def get_template(self, template_id: str) -> dict:
        """Get a specific template."""
        return self._request("GET", f"/{template_id}").json()

    def create_template(self, template: dict) -> dict:
        """Create a new template from the given definition."""
        return self._request("POST", json=template).json()

    def update_template(self, template_id: str, updates: dict) -> dict:
        """Update an existing template by PUT-ing ``updates`` as JSON."""
        return self._request("PUT", f"/{template_id}", json=updates).json()

    def delete_template(self, template_id: str) -> None:
        """Delete a template."""
        self._request("DELETE", f"/{template_id}")

    def reset_defaults(self) -> dict:
        """Reset default templates to original state."""
        return self._request("POST", "/reset-defaults").json()


# Usage
manager = TemplateManager("http://localhost/api/v1", "your_api_key")

# List all templates
templates = manager.list_templates()
for t in templates:
    print(f"{t['id']}: {'(default)' if t.get('is_default') else ''}")

# Get template details
invoice_template = manager.get_template("detailed_invoice")
print(f"System prompt: {invoice_template['system_prompt'][:100]}...")

# Update template
# NOTE: this overwrites the prompt of "detailed_invoice" on the server.
manager.update_template("detailed_invoice", {
    "user_prompt": "Updated instructions for invoice extraction..."
})

# Reset to defaults
manager.reset_defaults()
print("Default templates restored")

Integration with Pandas

import pandas as pd
import requests
import json
from pathlib import Path

def invoices_to_dataframe(
    directory: str,
    base_url: str,
    api_key: str,
    timeout: float = 120.0,
) -> pd.DataFrame:
    """Process invoices and return a DataFrame of line items.

    Each ``*.pdf`` directly inside ``directory`` is uploaded to
    ``/extract/process`` with the ``detailed_invoice`` template. Files whose
    response status is not 200 are reported on stdout and skipped.

    Args:
        directory: Directory to scan (not recursive).
        base_url: API root; a trailing ``/`` is tolerated.
        api_key: Key sent as a bearer token.
        timeout: Per-request timeout in seconds passed to ``requests``;
            ``None`` waits indefinitely.

    Returns:
        One row per extracted line item with the columns ``file``,
        ``invoice_number``, ``invoice_date``, ``vendor``, ``description``,
        ``quantity``, ``unit_price`` and ``amount``. The columns are present
        even when no line item was extracted.
    """
    columns = [
        "file",
        "invoice_number",
        "invoice_date",
        "vendor",
        "description",
        "quantity",
        "unit_price",
        "amount",
    ]
    headers = {"Authorization": f"Bearer {api_key}"}
    endpoint = f"{base_url.rstrip('/')}/extract/process"
    all_items = []

    # Sorted so the row order does not depend on filesystem enumeration order.
    for filepath in sorted(Path(directory).glob("*.pdf")):
        with open(filepath, "rb") as f:
            response = requests.post(
                endpoint,
                headers=headers,
                files={"file": f},
                data={"template_id": "detailed_invoice"},
                timeout=timeout,
            )

        if response.status_code != 200:
            print(f"Failed: {filepath.name}")
            continue

        data = response.json()["result"]["parsed_data"]
        # A key may be present with a null value, so `.get(key, default)`
        # alone is not enough - fall back with `or`.
        vendor = data.get("vendor") or {}

        for item in data.get("line_items") or []:
            all_items.append({
                "file": filepath.name,
                "invoice_number": data.get("invoice_number"),
                "invoice_date": data.get("invoice_date"),
                "vendor": vendor.get("name"),
                "description": item.get("description"),
                "quantity": item.get("quantity"),
                "unit_price": item.get("unit_price"),
                "amount": item.get("amount"),
            })

    # Naming the columns keeps df["amount"] / df["vendor"] valid when the
    # list of rows is empty.
    return pd.DataFrame(all_items, columns=columns)


# Usage
df = invoices_to_dataframe(
    "invoices/",
    "http://localhost/api/v1",
    "your_api_key",
)

print(df.head())
if df.empty:
    # With no rows there may be no "amount"/"vendor" columns to aggregate.
    print("\nNo line items were extracted.")
else:
    print(f"\nTotal spend: ${df['amount'].sum():.2f}")
    print("\nSpend by vendor:")
    print(df.groupby("vendor")["amount"].sum().sort_values(ascending=False))