#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
"""
Evaluate a Docling JSON export and suggest pipeline / option changes.

Typical flow (agent or human):
  docling input.pdf --to json --output /tmp/
  docling input.pdf --to md --output /tmp/
  python3 scripts/docling-evaluate.py /tmp/input.json --markdown /tmp/input.md

Exit codes: 0 = pass; 1 = fail or --fail-on-warn with status warn
(1 is also returned when the JSON input is missing or cannot be read/parsed).
"""

from __future__ import annotations

import argparse
import json
import sys
from collections import Counter
from pathlib import Path
from typing import Any


def load_document(path: Path) -> tuple[Any, Any]:
    """Read *path* as JSON and try to validate it as a ``DoclingDocument``.

    Returns:
        ``(doc, data)`` where ``data`` is the parsed JSON and ``doc`` is the
        validated ``DoclingDocument``, or ``None`` when ``docling_core`` cannot
        be imported or validation fails.

    Raises:
        OSError: the file cannot be read.
        ValueError: the file is not valid UTF-8 or not valid JSON
            (``UnicodeDecodeError`` and ``json.JSONDecodeError`` are both
            ``ValueError`` subclasses).
    """
    data = json.loads(path.read_text(encoding="utf-8"))
    try:
        from docling_core.types.doc.document import DoclingDocument

        return DoclingDocument.model_validate(data), data
    except Exception:
        # Deliberate best effort: a missing docling-core install or a schema
        # mismatch must not abort the run; callers fall back to
        # heuristic_metrics() on the raw data instead.
        return None, data


def page_numbers_from_doc(doc) -> set[int]:
    """Return the distinct ``page_no`` values found in the items' ``prov`` entries."""
    pages: set[int] = set()
    for item, _ in doc.iterate_items():
        for prov in getattr(item, "prov", None) or []:
            page_no = getattr(prov, "page_no", None)
            if page_no is not None:
                pages.add(int(page_no))
    return pages


def collect_text_samples(doc, limit: int = 200) -> list[str]:
    """Return up to *limit* stripped, non-blank ``text`` values in iteration order."""
    texts: list[str] = []
    if limit <= 0:
        return texts
    for item, _ in doc.iterate_items():
        raw = getattr(item, "text", None)
        stripped = str(raw).strip() if raw else ""
        if stripped:
            texts.append(stripped)
            if len(texts) >= limit:
                break
    return texts


def metrics_from_doc(doc) -> dict[str, Any]:
    """Compute quality metrics from a validated document object.

    The document is accessed by duck typing: ``tables``, ``pictures``,
    ``iterate_items()`` (yielding ``(item, level)`` pairs) and
    ``export_to_markdown()``.

    Returns:
        A dict with the keys ``page_count``, ``section_headers``,
        ``text_items``, ``total_text_chars``, ``chars_per_page``, ``tables``,
        ``pictures``, ``markdown_chars``, ``replacement_chars``,
        ``most_repeated_text_count`` and ``duplicate_heavy``.
    """
    n_tables = len(getattr(doc, "tables", []) or [])
    n_pictures = len(getattr(doc, "pictures", []) or [])

    n_headers = 0
    n_text_items = 0
    total_chars = 0
    for item, _ in doc.iterate_items():
        label = getattr(getattr(item, "label", None), "name", None) or ""
        if label == "SECTION_HEADER":
            n_headers += 1
        text = getattr(item, "text", None)
        if text:
            n_text_items += 1
            total_chars += len(str(text))

    # Page count is derived from item provenance only.
    n_pages = len(page_numbers_from_doc(doc))
    # Without page provenance the "density" is simply the total character count.
    density = (total_chars / n_pages) if n_pages else total_chars

    samples = collect_text_samples(doc)
    rep = Counter(samples)
    top_count = rep.most_common(1)[0][1] if rep else 0
    # Occurrences of texts seen more than twice, relative to the number of
    # DISTINCT sampled texts.
    # NOTE(review): this can exceed 1.0; confirm whether len(samples) was the
    # intended denominator. Left unchanged to keep the heuristic stable.
    dup_ratio = sum(c for c in rep.values() if c > 2) / len(rep) if rep else 0.0

    md = ""
    try:
        md = doc.export_to_markdown()
    except Exception:
        # Best effort: markdown export is optional for the metrics below.
        pass

    # NOTE(review): sampled texts presumably also appear in the markdown
    # export, so a replacement character may be counted twice here - confirm
    # before tuning the thresholds used in evaluate().
    replacement = md.count("\ufffd") + sum(s.count("\ufffd") for s in samples)

    return {
        "page_count": n_pages,
        "section_headers": n_headers,
        "text_items": n_text_items,
        "total_text_chars": total_chars,
        "chars_per_page": round(density, 2),
        "tables": n_tables,
        "pictures": n_pictures,
        "markdown_chars": len(md),
        "replacement_chars": replacement,
        "most_repeated_text_count": int(top_count),
        "duplicate_heavy": dup_ratio > 0.15 and len(samples) > 10,
    }


def _as_list(value: Any) -> list:
    """Return *value* if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def heuristic_metrics(data: dict) -> dict[str, Any]:
    """Fallback when DoclingDocument cannot be validated (older export / drift).

    Reads only the top-level ``texts``, ``tables``, ``pictures`` and
    ``body.children`` entries of the raw JSON. Anything that is not shaped as
    expected (including a non-object top level) counts as empty instead of
    raising, so the caller always gets a report.
    """
    if not isinstance(data, dict):
        data = {}

    texts = _as_list(data.get("texts"))
    body = data.get("body")
    children = body.get("children") if isinstance(body, dict) else None
    char_sum = sum(
        len(str(t.get("text") or "")) for t in texts if isinstance(t, dict)
    )

    return {
        "page_count": 0,
        "section_headers": 0,
        "text_items": len(texts),
        "total_text_chars": char_sum,
        "chars_per_page": 0.0,
        "tables": len(_as_list(data.get("tables"))),
        "pictures": len(_as_list(data.get("pictures"))),
        "markdown_chars": 0,
        "replacement_chars": 0,
        "most_repeated_text_count": 0,
        "duplicate_heavy": False,
        "heuristic_only": True,
        "body_children": len(children) if isinstance(children, list) else 0,
    }


def evaluate(
    m: dict[str, Any],
    *,
    expect_tables: bool,
    min_chars_per_page: float,
    min_markdown_chars: int,
) -> tuple[str, list[str], list[str]]:
    """Turn a metrics dict into a status, a list of issues and suggested actions.

    Args:
        m: metrics as produced by ``metrics_from_doc`` or ``heuristic_metrics``.
        expect_tables: report an issue when no tables were detected.
        min_chars_per_page: density threshold, applied when page_count >= 2.
        min_markdown_chars: minimum markdown length, applied when the markdown
            length is known (> 0) and page_count >= 1.

    Returns:
        ``(status, issues, actions)``. ``status`` is ``"pass"`` when there are
        no issues (both lists are then empty), ``"fail"`` when there are no
        text items, the markdown is 1-49 chars with at least one page, or more
        than 20 replacement characters were counted; otherwise ``"warn"``.
        ``actions`` is de-duplicated, keeping first-seen order.
    """
    issues: list[str] = []
    actions: list[str] = []

    page_count = m.get("page_count", 0)
    text_items = m.get("text_items", 0)
    replacement_chars = m.get("replacement_chars", 0)

    if m.get("heuristic_only"):
        issues.append("Could not load full DoclingDocument; metrics are partial.")
        actions.append(
            "Ensure docling-core matches export; re-export with: docling --to json --output "
        )

    cpp = m.get("chars_per_page") or 0
    if page_count >= 2 and cpp < min_chars_per_page:
        issues.append(
            f"Low text density ({cpp} chars/page); likely scan, image-heavy PDF, or extraction gap."
        )
        actions.append("Retry: docling --ocr-engine tesserocr (or rapidocr, ocrmac)")
        actions.append("Retry: docling --pipeline vlm")

    if replacement_chars > 5:
        issues.append(
            "Unicode replacement characters detected; OCR may be garbling text."
        )
        actions.append("Retry: docling --ocr-engine tesserocr (or rapidocr)")
        actions.append(
            "Retry: docling --pipeline vlm (use force_backend_text=True via Python API for hybrid)"
        )

    if m.get("duplicate_heavy") or (m.get("most_repeated_text_count", 0) > 8):
        issues.append(
            "Repeated text blocks; possible layout/OCR loop or bad reading order."
        )
        actions.append("Retry: docling --pipeline vlm")
        actions.append(
            "If using VLM: try force_backend_text=True via Python API for text-heavy pages"
        )

    if expect_tables and m.get("tables", 0) == 0:
        issues.append("No tables detected but tables were expected.")
        actions.append(
            "Retry: docling (tables are enabled by default; remove --no-tables if set)"
        )
        actions.append(
            "Retry: docling --pipeline vlm (better for merged-cell or visual tables)"
        )

    mc = m.get("markdown_chars", 0)
    if 0 < mc < min_markdown_chars and page_count >= 1:
        issues.append(f"Markdown export is very short ({mc} chars) for the page count.")
        actions.append("Retry: docling --pipeline vlm (or try different --ocr-engine)")

    if text_items == 0 and page_count == 0:
        issues.append(
            "No text items and no page provenance; export may be empty or invalid."
        )
        actions.append(
            "Verify source file opens correctly; retry with: docling --pipeline standard"
        )

    if not issues:
        return "pass", [], []

    # dict.fromkeys drops duplicate actions while keeping first-seen order.
    uniq_actions = list(dict.fromkeys(actions))

    severe = text_items == 0 or (page_count >= 1 and 0 < mc < 50)
    status = "fail" if severe or replacement_chars > 20 else "warn"
    return status, issues, uniq_actions


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options (``sys.argv[1:]`` when *argv* is ``None``)."""
    p = argparse.ArgumentParser(description="Evaluate Docling JSON export quality")
    p.add_argument(
        "json_path", type=Path, help="Path to DoclingDocument JSON (export_to_dict)"
    )
    p.add_argument(
        "--markdown",
        type=Path,
        default=None,
        help="Optional markdown file to cross-check length",
    )
    p.add_argument("--expect-tables", action="store_true")
    p.add_argument("--min-chars-per-page", type=float, default=120.0)
    p.add_argument("--min-markdown-chars", type=int, default=200)
    p.add_argument("--fail-on-warn", action="store_true")
    p.add_argument(
        "--quiet", action="store_true", help="Only print JSON report to stdout"
    )
    return p.parse_args(argv)


def _markdown_length(path: Path) -> int | None:
    """Return the character count of *path*, or ``None`` if it cannot be read."""
    if not path.is_file():
        return None
    try:
        return len(path.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError):
        return None


def run(argv: list[str] | None = None) -> int:
    """Evaluate one export, print the report and return the process exit code.

    The JSON report goes to stdout; the human-readable summary, warnings and
    errors go to stderr. Returns 1 when the input is missing or unreadable,
    when the status is ``fail``, or when it is ``warn`` with ``--fail-on-warn``;
    otherwise 0.
    """
    args = parse_args(argv)

    if not args.json_path.is_file():
        print(json.dumps({"error": f"not found: {args.json_path}"}), file=sys.stderr)
        return 1

    try:
        doc, raw = load_document(args.json_path)
    except (OSError, ValueError) as exc:
        # Unreadable file, bad encoding or malformed JSON: report it the same
        # way as a missing file instead of dying with a traceback.
        print(
            json.dumps({"error": f"cannot read {args.json_path}: {exc}"}),
            file=sys.stderr,
        )
        return 1

    m = metrics_from_doc(doc) if doc is not None else heuristic_metrics(raw)

    if args.markdown is not None:
        md_len = _markdown_length(args.markdown)
        if md_len is None:
            if not args.quiet:
                print(
                    f"warning: markdown file not readable, skipped: {args.markdown}",
                    file=sys.stderr,
                )
        else:
            m["markdown_file_chars"] = md_len
            # Only fill in the markdown length when the document itself did
            # not provide one (heuristic mode or failed export).
            if m.get("markdown_chars", 0) == 0:
                m["markdown_chars"] = md_len

    status, issues, actions = evaluate(
        m,
        expect_tables=args.expect_tables,
        min_chars_per_page=args.min_chars_per_page,
        min_markdown_chars=args.min_markdown_chars,
    )

    report = {
        "status": status,
        "metrics": m,
        "issues": issues,
        "recommended_actions": actions,
        "next_steps_for_agent": [
            "Re-run docling with flags from recommended_actions.",
            "Re-export JSON and run this script again until status is pass.",
            "Append a row to improvement-log.md (see SKILL.md).",
        ],
    }
    print(json.dumps(report, indent=2, ensure_ascii=False))

    if not args.quiet:
        print(f"\nstatus={status}", file=sys.stderr)
        if issues:
            print("issues:", file=sys.stderr)
            for issue in issues:
                print(f" - {issue}", file=sys.stderr)
        if actions:
            print("recommended_actions:", file=sys.stderr)
            for action in actions:
                print(f" - {action}", file=sys.stderr)

    if status == "fail" or (status == "warn" and args.fail_on_warn):
        return 1
    return 0


def main(argv: list[str] | None = None) -> None:
    """CLI entry point: run the evaluation and exit with its status code."""
    sys.exit(run(argv))


if __name__ == "__main__":
    main()