Spaces:

cmboulanger
/

tei-annotator

Sleeping

App Files Files Community

cmboulanger committed on Mar 2

Commit

c460c34

1 Parent(s): 406ca65

Add human-readable output for comparison of gold and llm-annotation

Browse files

Files changed (4) hide show

.gitignore +3 -0
pyproject.toml +1 -0
scripts/evaluate_llm.py +135 -77
uv.lock +4 -0

.gitignore CHANGED Viewed

@@ -10,3 +10,6 @@ wheels/
 .venv
 .env*
 .DS_Store

 .venv
 .env*
 .DS_Store
+# Local files
+.local/

pyproject.toml CHANGED Viewed

@@ -23,6 +23,7 @@ markers = [
 dev = [
     "pytest>=8.0",
     "pytest-cov>=5.0",
 ]
 [build-system]

 dev = [
     "pytest>=8.0",
     "pytest-cov>=5.0",
+    "tqdm>=4.0",
 ]
 [build-system]

scripts/evaluate_llm.py CHANGED Viewed

@@ -25,6 +25,7 @@ from __future__ import annotations
 import argparse
 import json
 import os
 import sys
 import urllib.error
 import urllib.request
@@ -238,20 +239,31 @@ def run_evaluation(
     max_items: int | None,
     gliner_model: str | None = None,
     show_annotations: bool = False,
 ) -> bool:
     """
     Evaluate one provider: iterate over gold records with live progress,
     then print overall and per-element metrics.
     Returns True on success, False if a fatal exception occurred.
     """
     import warnings
     from lxml import etree
     from tei_annotator import preload_gliner_model
     from tei_annotator.evaluation import evaluate_element, aggregate, MatchMode
-    from tei_annotator.evaluation.extractor import extract_spans
     from tei_annotator.inference.endpoint import EndpointCapability, EndpointConfig
     _TEI_NS = "http://www.tei-c.org/ns/1.0"
     mode_map = {
@@ -278,83 +290,119 @@ def run_evaluation(
         all_bibls = all_bibls[:max_items]
     n_total = len(all_bibls)
-    sep = "─" * 64
-    print(f"\n{sep}")
-    print(f"  Provider  : {provider_name}")
-    print(f"  Gold file : {GOLD_FILE.relative_to(_REPO)}")
-    print(f"  Records   : {n_total}   match-mode: {match_mode_str}")
-    print(f"  GLiNER    : {gliner_model or 'disabled'}")
-    print(sep)
-    if gliner_model:
-        print(f"  Loading GLiNER model '{gliner_model}'...", flush=True)
-        preload_gliner_model(gliner_model)
-        print(f"  GLiNER model ready.")
-    per_record = []
-    failed = 0
-    for i, bibl in enumerate(all_bibls, 1):
-        plain_text = "".join(bibl.itertext())
-        snippet = plain_text[:60].replace("\n", " ")
-        print(f"  [{i:3d}/{n_total}] {snippet}...", end="\r\n", flush=True)
-        try:
-            # Suppress the pipeline's best-effort XML validation warning here;
-            # it surfaces again in the evaluator warning if parsing fails.
-            with warnings.catch_warnings():
-                warnings.filterwarnings(
-                    "ignore",
-                    message="Output XML validation failed",
-                )
-                result = evaluate_element(
-                    gold_element=bibl,
-                    schema=schema,
-                    endpoint=endpoint,
-                    gliner_model=gliner_model,
-                    match_mode=match_mode,
-                )
-            if show_annotations and result.annotation_xml is not None:
-                sep60 = "─" * 60
-                print(f"\n  {sep60}")
-                print(f"  Annotation:")
-                print(f"  {result.annotation_xml}")
-                print(f"  F1={result.micro_f1:.3f}  "
-                      f"missed={[s.element for s in result.unmatched_gold]}  "
-                      f"spurious={[s.element for s in result.unmatched_pred]}")
-                print(f"  {sep60}\n")
-            per_record.append(result)
-        except Exception as exc:
-            print(f"\n  [{i:3d}/{n_total}] ERROR — {exc}")
-            failed += 1
-    # Clear the progress line
-    print(" " * 70, end="\r")
-    if not per_record:
-        print("  ✗ All records failed — no results to report.")
-        return False
-    overall = aggregate(per_record)
-    n_ok = len(per_record)
-    print(f"\n  Completed: {n_ok}/{n_total} records"
-          + (f"  ({failed} failed)" if failed else "") + "\n")
-    print(overall.report(title=f"Overall — {provider_name}"))
-    # Show the five worst records (by F1) for diagnostics
-    worst = sorted(enumerate(per_record, 1), key=lambda x: x[1].micro_f1)[:5]
-    if worst and worst[0][1].micro_f1 < 1.0:
-        print(f"\n  Lowest-F1 records (top 5):")
-        for idx, r in worst:
-            gold_bibl = all_bibls[idx - 1]
-            snippet = "".join(gold_bibl.itertext())[:55].replace("\n", " ")
-            fn_tags = [s.element for s in r.unmatched_gold]
-            fp_tags = [s.element for s in r.unmatched_pred]
-            print(
-                f"    #{idx:3d}  F1={r.micro_f1:.3f}"
-                f"  missed={fn_tags}  spurious={fp_tags}"
-            )
-            print(f'         "{snippet}..."')
-    return True
 # ---------------------------------------------------------------------------
@@ -395,6 +443,15 @@ def _parse_args() -> argparse.Namespace:
         default=False,
         help="Print the annotated XML output for each record (useful for inspection runs).",
     )
     p.add_argument(
         "--provider",
         choices=["gemini", "kisski", "all"],
@@ -443,6 +500,7 @@ def main() -> int:
             max_items=args.max_items,
             gliner_model=args.gliner_model,
             show_annotations=args.show_annotations,
         )
         results.append(ok)

 import argparse
 import json
 import os
+import re
 import sys
 import urllib.error
 import urllib.request
     max_items: int | None,
     gliner_model: str | None = None,
     show_annotations: bool = False,
+    output_file: Path | None = None,
 ) -> bool:
     """
     Evaluate one provider: iterate over gold records with live progress,
     then print overall and per-element metrics.
+    When *output_file* is set all text output is written to that file and a
+    tqdm progress bar is shown in the terminal instead of per-record lines.
     Returns True on success, False if a fatal exception occurred.
     """
+    import contextlib
+    import io
     import warnings
     from lxml import etree
     from tei_annotator import preload_gliner_model
     from tei_annotator.evaluation import evaluate_element, aggregate, MatchMode
     from tei_annotator.inference.endpoint import EndpointCapability, EndpointConfig
+    try:
+        from tqdm import tqdm as _tqdm
+    except ImportError:
+        _tqdm = None
     _TEI_NS = "http://www.tei-c.org/ns/1.0"
     mode_map = {
         all_bibls = all_bibls[:max_items]
     n_total = len(all_bibls)
+    # --- output destination and progress display ----------------------------
+    # When --output-file: buffer all prints → file; show tqdm bar on stderr.
+    # Otherwise: print to stdout and show manual per-record progress lines.
+    _buf = io.StringIO() if output_file else None
+    _pbar = (
+        _tqdm(total=n_total, desc="Annotating", unit="rec", file=sys.stderr)
+        if output_file and _tqdm
+        else None
+    )
+    if output_file and not _tqdm:
+        print("WARNING: tqdm not installed — no progress bar. Run: pip install tqdm",
+              file=sys.stderr)
+    _ok = False
+    with contextlib.redirect_stdout(_buf) if _buf else contextlib.nullcontext():
+        sep = "─" * 64
+        print(f"\n{sep}")
+        print(f"  Provider  : {provider_name}")
+        print(f"  Gold file : {GOLD_FILE.relative_to(_REPO)}")
+        print(f"  Records   : {n_total}   match-mode: {match_mode_str}")
+        print(f"  GLiNER    : {gliner_model or 'disabled'}")
+        print(sep)
+        if gliner_model:
+            print(f"  Loading GLiNER model '{gliner_model}'...", flush=True)
+            preload_gliner_model(gliner_model)
+            print(f"  GLiNER model ready.")
+        per_record = []
+        failed = 0
+        for i, bibl in enumerate(all_bibls, 1):
+            plain_text = "".join(bibl.itertext())
+            snippet = plain_text[:60].replace("\n", " ")
+            if _pbar:
+                _pbar.set_description(snippet[:45])
+            else:
+                print(f"  [{i:3d}/{n_total}] {snippet}...", end="\r\n", flush=True)
+            try:
+                # Suppress the pipeline's best-effort XML validation warning here;
+                # it surfaces again in the evaluator warning if parsing fails.
+                with warnings.catch_warnings():
+                    warnings.filterwarnings(
+                        "ignore",
+                        message="Output XML validation failed",
+                    )
+                    result = evaluate_element(
+                        gold_element=bibl,
+                        schema=schema,
+                        endpoint=endpoint,
+                        gliner_model=gliner_model,
+                        match_mode=match_mode,
+                    )
+                if show_annotations and result.annotation_xml is not None:
+                    sep60 = "─" * 60
+                    gold_parts = [bibl.text or ""]
+                    for child in bibl:
+                        child_xml = etree.tostring(child, encoding="unicode", with_tail=True)
+                        gold_parts.append(re.sub(r'\s+xmlns(?::\w+)?="[^"]*"', "", child_xml))
+                    gold_xml = "".join(gold_parts)
+                    print(f"\n  {sep60}")
+                    print(f"  Gold:       {gold_xml}")
+                    print(f"  Annotation: {result.annotation_xml}")
+                    print(f"  F1={result.micro_f1:.3f}  "
+                          f"missed={[s.element for s in result.unmatched_gold]}  "
+                          f"spurious={[s.element for s in result.unmatched_pred]}")
+                    print(f"  {sep60}\n")
+                per_record.append(result)
+                if _pbar:
+                    _pbar.update(1)
+                    _pbar.set_postfix(F1=f"{result.micro_f1:.3f}")
+            except Exception as exc:
+                print(f"\n  [{i:3d}/{n_total}] ERROR — {exc}")
+                failed += 1
+                if _pbar:
+                    _pbar.update(1)
+        if _pbar:
+            _pbar.close()
+        else:
+            # Clear the progress line
+            print(" " * 70, end="\r")
+        if not per_record:
+            print("  ✗ All records failed — no results to report.")
+        else:
+            overall = aggregate(per_record)
+            n_ok = len(per_record)
+            print(f"\n  Completed: {n_ok}/{n_total} records"
+                  + (f"  ({failed} failed)" if failed else "") + "\n")
+            print(overall.report(title=f"Overall — {provider_name}"))
+            # Show the five worst records (by F1) for diagnostics
+            worst = sorted(enumerate(per_record, 1), key=lambda x: x[1].micro_f1)[:5]
+            if worst and worst[0][1].micro_f1 < 1.0:
+                print(f"\n  Lowest-F1 records (top 5):")
+                for idx, r in worst:
+                    gold_bibl = all_bibls[idx - 1]
+                    snippet = "".join(gold_bibl.itertext())[:55].replace("\n", " ")
+                    fn_tags = [s.element for s in r.unmatched_gold]
+                    fp_tags = [s.element for s in r.unmatched_pred]
+                    print(
+                        f"    #{idx:3d}  F1={r.micro_f1:.3f}"
+                        f"  missed={fn_tags}  spurious={fp_tags}"
+                    )
+                    print(f'         "{snippet}..."')
+            _ok = True
+    if _buf is not None:
+        output_file.write_text(_buf.getvalue(), encoding="utf-8")
+        print(f"\n  Output written to: {output_file}")
+    return _ok
 # ---------------------------------------------------------------------------
         default=False,
         help="Print the annotated XML output for each record (useful for inspection runs).",
     )
+    p.add_argument(
+        "--output-file",
+        default=None,
+        metavar="PATH",
+        help=(
+            "Write all evaluation output to this file. "
+            "A tqdm progress bar is shown in the terminal instead of per-record lines."
+        ),
+    )
     p.add_argument(
         "--provider",
         choices=["gemini", "kisski", "all"],
             max_items=args.max_items,
             gliner_model=args.gliner_model,
             show_annotations=args.show_annotations,
+            output_file=Path(args.output_file) if args.output_file else None,
         )
         results.append(ok)

uv.lock CHANGED Viewed

@@ -918,6 +918,7 @@ wheels = [
 name = "regex"
 version = "2026.2.28"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/07/42/9061b03cf0fc4b5fa2c3984cbbaed54324377e440a5c5a29d29a72518d62/regex-2026.2.28-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:fcf26c3c6d0da98fada8ae4ef0aa1c3405a431c0a77eb17306d38a89b02adcd7", size = 489574, upload-time = "2026-02-28T02:16:50.455Z" },
     { url = "https://files.pythonhosted.org/packages/77/83/0c8a5623a233015595e3da499c5a1c13720ac63c107897a6037bb97af248/regex-2026.2.28-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:02473c954af35dd2defeb07e44182f5705b30ea3f351a7cbffa9177beb14da5d", size = 291426, upload-time = "2026-02-28T02:16:52.52Z" },
@@ -998,6 +999,7 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6b/ca/d2c03b0efde47e13db895b975b2be6a73ed90b8ba963677927283d43bf74/regex-2026.2.28-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:1c2c95e1a2b0f89d01e821ff4de1be4b5d73d1f4b0bf679fa27c1ad8d2327f1a", size = 800366, upload-time = "2026-02-28T02:19:34.248Z" },
     { url = "https://files.pythonhosted.org/packages/14/bd/ee13b20b763b8989f7c75d592bfd5de37dc1181814a2a2747fedcf97e3ba/regex-2026.2.28-cp314-cp314t-win32.whl", hash = "sha256:bbb882061f742eb5d46f2f1bd5304055be0a66b783576de3d7eef1bed4778a6e", size = 274936, upload-time = "2026-02-28T02:19:36.313Z" },
     { url = "https://files.pythonhosted.org/packages/cb/e7/d8020e39414c93af7f0d8688eabcecece44abfd5ce314b21dfda0eebd3d8/regex-2026.2.28-cp314-cp314t-win_amd64.whl", hash = "sha256:6591f281cb44dc13de9585b552cec6fc6cf47fb2fe7a48892295ee9bc4a612f9", size = 284779, upload-time = "2026-02-28T02:19:38.625Z" },
 ]
 [[package]]
@@ -1132,6 +1134,7 @@ gliner = [
 dev = [
     { name = "pytest" },
     { name = "pytest-cov" },
 ]
 [package.metadata]
@@ -1147,6 +1150,7 @@ provides-extras = ["gliner"]
 dev = [
     { name = "pytest", specifier = ">=8.0" },
     { name = "pytest-cov", specifier = ">=5.0" },
 ]
 [[package]]

 name = "regex"
 version = "2026.2.28"
 source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/8b/71/41455aa99a5a5ac1eaf311f5d8efd9ce6433c03ac1e0962de163350d0d97/regex-2026.2.28.tar.gz", hash = "sha256:a729e47d418ea11d03469f321aaf67cdee8954cde3ff2cf8403ab87951ad10f2", size = 415184, upload-time = "2026-02-28T02:19:42.792Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/07/42/9061b03cf0fc4b5fa2c3984cbbaed54324377e440a5c5a29d29a72518d62/regex-2026.2.28-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:fcf26c3c6d0da98fada8ae4ef0aa1c3405a431c0a77eb17306d38a89b02adcd7", size = 489574, upload-time = "2026-02-28T02:16:50.455Z" },
     { url = "https://files.pythonhosted.org/packages/77/83/0c8a5623a233015595e3da499c5a1c13720ac63c107897a6037bb97af248/regex-2026.2.28-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:02473c954af35dd2defeb07e44182f5705b30ea3f351a7cbffa9177beb14da5d", size = 291426, upload-time = "2026-02-28T02:16:52.52Z" },
     { url = "https://files.pythonhosted.org/packages/6b/ca/d2c03b0efde47e13db895b975b2be6a73ed90b8ba963677927283d43bf74/regex-2026.2.28-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:1c2c95e1a2b0f89d01e821ff4de1be4b5d73d1f4b0bf679fa27c1ad8d2327f1a", size = 800366, upload-time = "2026-02-28T02:19:34.248Z" },
     { url = "https://files.pythonhosted.org/packages/14/bd/ee13b20b763b8989f7c75d592bfd5de37dc1181814a2a2747fedcf97e3ba/regex-2026.2.28-cp314-cp314t-win32.whl", hash = "sha256:bbb882061f742eb5d46f2f1bd5304055be0a66b783576de3d7eef1bed4778a6e", size = 274936, upload-time = "2026-02-28T02:19:36.313Z" },
     { url = "https://files.pythonhosted.org/packages/cb/e7/d8020e39414c93af7f0d8688eabcecece44abfd5ce314b21dfda0eebd3d8/regex-2026.2.28-cp314-cp314t-win_amd64.whl", hash = "sha256:6591f281cb44dc13de9585b552cec6fc6cf47fb2fe7a48892295ee9bc4a612f9", size = 284779, upload-time = "2026-02-28T02:19:38.625Z" },
+    { url = "https://files.pythonhosted.org/packages/13/c0/ad225f4a405827486f1955283407cf758b6d2fb966712644c5f5aef33d1b/regex-2026.2.28-cp314-cp314t-win_arm64.whl", hash = "sha256:dee50f1be42222f89767b64b283283ef963189da0dda4a515aa54a5563c62dec", size = 275010, upload-time = "2026-02-28T02:19:40.65Z" },
 ]
 [[package]]
 dev = [
     { name = "pytest" },
     { name = "pytest-cov" },
+    { name = "tqdm" },
 ]
 [package.metadata]
 dev = [
     { name = "pytest", specifier = ">=8.0" },
     { name = "pytest-cov", specifier = ">=5.0" },
+    { name = "tqdm", specifier = ">=4.0" },
 ]
 [[package]]