dejanseo
/

latent-entity

Safetensors

gemma3_text

Model card Files Files and versions

xet

Community

dejanseo committed on Mar 24

Commit

7a907f5

verified ·

1 Parent(s): b76832a

Upload train.py

Browse files

Files changed (1) hide show

cross-entropy/train.py +346 -0

cross-entropy/train.py ADDED Viewed

	@@ -0,0 +1,346 @@

+import json
+import re
+import argparse
+import numpy as np
+from sklearn.model_selection import train_test_split
+# Command-line interface. Arguments are parsed here, before the transformers / datasets /
+# wandb imports further down are executed.
+parser = argparse.ArgumentParser()
+# --bump N: when N > 0 the run trains for 1 + N total epochs and trainer.train() is asked to
+# resume from a checkpoint; with the default 0 it is a fresh 1-epoch run.
+parser.add_argument("--bump", type=int, default=0, help="Extra epochs to train (resumes from last checkpoint)")
+args = parser.parse_args()
+from transformers import (
+    AutoTokenizer,
+    AutoModelForTokenClassification,
+    TrainingArguments,
+    Trainer,
+    DataCollatorForTokenClassification,
+)
+from datasets import Dataset
+import wandb
+# Checkpoint name passed to both the tokenizer and the token-classification model.
+MODEL_NAME = "microsoft/deberta-v3-large"
+# JSON file of training records; the "annotated" field of each record is parsed.
+TRAIN_FILE = "train.json"
+# JSON cache of tokenized chunks; when this file exists TRAIN_FILE is not read at all.
+CACHE_FILE = "chunks.cache.json"
+# Length every chunk is padded to (special tokens and title tokens included).
+MAX_LEN = 512
+# Number of text tokens the chunk window start advances by between consecutive chunks.
+STRIDE = 128
+# BIO tagging scheme with a single span type, and its inverse mapping.
+LABEL2ID = {"O": 0, "B-SPAN": 1, "I-SPAN": 2}
+ID2LABEL = {v: k for k, v in LABEL2ID.items()}
+def parse_annotated(annotated):
+    """Parse 'title[SEP]text with [SPAN]...[/SPAN]' into title, plain_text, and char offsets."""
+    title, body = annotated.split("[SEP]", 1)
+    # Extract span offsets from body
+    spans = []
+    plain = ""
+    i = 0
+    while i < len(body):
+        if body[i:i+6] == "[SPAN]":
+            start = len(plain)
+            i += 6
+            while i < len(body) and body[i:i+7] != "[/SPAN]":
+                plain += body[i]
+                i += 1
+            end = len(plain)
+            spans.append((start, end))
+            if body[i:i+7] == "[/SPAN]":
+                i += 7
+        else:
+            plain += body[i]
+            i += 1
+    return title.strip(), plain, spans
+def chunk_with_title(title_ids, text_ids, text_labels, max_len, stride):
+    """Create overlapping chunks, each prefixed with title tokens."""
+    # Reserve space: [CLS] + title + [SEP] + ... + [SEP]
+    title_budget = len(title_ids) + 3  # CLS, SEP after title, SEP at end
+    text_budget = max_len - title_budget
+    if text_budget <= 0:
+        return []
+    chunks = []
+    start = 0
+    while start < len(text_ids):
+        end = min(start + text_budget, len(text_ids))
+        chunk_text_ids = text_ids[start:end]
+        chunk_labels = list(text_labels[start:end])
+        # Fix BIO boundary: if chunk starts mid-span, first span token must be B-SPAN
+        for j, lbl in enumerate(chunk_labels):
+            if lbl == LABEL2ID["I-SPAN"]:
+                chunk_labels[j] = LABEL2ID["B-SPAN"]
+                break
+            elif lbl != -100:
+                break
+        # Build full sequence: [CLS] title [SEP] text_chunk [SEP]
+        input_ids = [tokenizer.cls_token_id] + title_ids + [tokenizer.sep_token_id] + chunk_text_ids + [tokenizer.sep_token_id]
+        labels = [-100] + [-100] * len(title_ids) + [-100] + chunk_labels + [-100]
+        attention_mask = [1] * len(input_ids)
+        # Pad to max_len
+        pad_len = max_len - len(input_ids)
+        if pad_len > 0:
+            input_ids += [tokenizer.pad_token_id] * pad_len
+            labels += [-100] * pad_len
+            attention_mask += [0] * pad_len
+        chunks.append({
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "labels": labels,
+        })
+        if end >= len(text_ids):
+            break
+        start += stride
+    return chunks
+# The tokenizer is a module-level global: chunk_with_title() reads it for the CLS/SEP/PAD ids.
+print("Loading tokenizer...")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+import os
+# Build `all_chunks` (a list of dicts with "input_ids", "attention_mask", "labels"), either from
+# the JSON cache or by parsing and tokenizing TRAIN_FILE.
+# NOTE(review): only the cache file's existence is checked, so a cache written with a different
+# MODEL_NAME, MAX_LEN, STRIDE or TRAIN_FILE is loaded unchanged; delete it after changing those.
+if os.path.exists(CACHE_FILE):
+    print(f"Loading cached chunks from {CACHE_FILE}...")
+    with open(CACHE_FILE, "r", encoding="utf-8") as f:
+        all_chunks = json.load(f)
+    print(f"Loaded {len(all_chunks):,} chunks from cache")
+else:
+    print(f"Loading {TRAIN_FILE}...")
+    with open(TRAIN_FILE, "r", encoding="utf-8") as f:
+        raw_data = json.load(f)
+    print(f"Parsing and tokenizing {len(raw_data):,} articles...")
+    all_chunks = []
+    for i, item in enumerate(raw_data):
+        title, plain_text, span_offsets = parse_annotated(item["annotated"])
+        # Tokenize title (no special tokens)
+        title_enc = tokenizer(title, add_special_tokens=False)
+        title_ids = title_enc["input_ids"]
+        # Tokenize text with offset mapping
+        text_enc = tokenizer(plain_text, add_special_tokens=False, return_offsets_mapping=True)
+        text_ids = text_enc["input_ids"]
+        text_offsets_map = text_enc["offset_mapping"]
+        # Build token-level BIO labels for text tokens
+        text_labels = []
+        for tok_idx, (tok_start, tok_end) in enumerate(text_offsets_map):
+            # Tokens whose offsets are (0, 0) are excluded from the loss.
+            if tok_start == 0 and tok_end == 0:
+                text_labels.append(-100)
+                continue
+            label = LABEL2ID["O"]
+            # A token is labelled as part of a span only when it lies entirely inside it:
+            # B-SPAN if it starts exactly at the span start, I-SPAN otherwise. Tokens that
+            # only partly overlap a span stay "O".
+            # NOTE(review): if the tokenizer reports offsets that include leading whitespace,
+            # a span's first token would fail the containment test and the span would begin
+            # with I-SPAN — confirm on a sample.
+            for span_start, span_end in span_offsets:
+                if tok_start >= span_start and tok_end <= span_end:
+                    if tok_start == span_start:
+                        label = LABEL2ID["B-SPAN"]
+                    else:
+                        label = LABEL2ID["I-SPAN"]
+                    break
+            text_labels.append(label)
+        # Chunk
+        chunks = chunk_with_title(title_ids, text_ids, text_labels, MAX_LEN, STRIDE)
+        all_chunks.extend(chunks)
+        # Progress report every 2000 articles.
+        if (i + 1) % 2000 == 0:
+            print(f"  [{i+1:,}/{len(raw_data):,}] chunks so far: {len(all_chunks):,}")
+    print(f"Total chunks: {len(all_chunks):,}")
+    print(f"Saving cache to {CACHE_FILE}...")
+    with open(CACHE_FILE, "w", encoding="utf-8") as f:
+        json.dump(all_chunks, f)
+    print("Cache saved.")
+# Verify label distribution
+all_labels_flat = []
+for c in all_chunks:
+    all_labels_flat.extend([l for l in c["labels"] if l >= 0])
+from collections import Counter
+dist = Counter(all_labels_flat)
+total_labeled = sum(dist.values())
+print(f"Label distribution:")
+for label_id, count in sorted(dist.items()):
+    print(f"  {ID2LABEL[label_id]}: {count:,} ({count/total_labeled*100:.2f}%)")
+# Split train/val
+print("Splitting 95/5 train/val...")
+train_chunks, val_chunks = train_test_split(all_chunks, test_size=0.05, random_state=42)
+print(f"Train: {len(train_chunks):,} | Val: {len(val_chunks):,}")
+train_ds = Dataset.from_list(train_chunks)
+val_ds = Dataset.from_list(val_chunks)
+# Model: token-classification head with one output per entry of LABEL2ID, initialised
+# from the MODEL_NAME checkpoint.
+print("Loading model...")
+model = AutoModelForTokenClassification.from_pretrained(
+    MODEL_NAME,
+    num_labels=len(LABEL2ID),
+    id2label=ID2LABEL,
+    label2id=LABEL2ID,
+)
+model = model.float()  # DeBERTa-v3 stores weights in FP16 natively; cast to FP32 for stable optimizer updates
+# padding=False: chunk_with_title() already pads every chunk to MAX_LEN.
+data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=False)
+def extract_spans_from_bio(seq):
+    """Extract contiguous spans from a BIO label sequence. Returns list of (start, end) tuples."""
+    spans = []
+    start = None
+    for i, label in enumerate(seq):
+        if label == LABEL2ID["B-SPAN"]:
+            if start is not None:
+                spans.append((start, i))
+            start = i
+        elif label == LABEL2ID["I-SPAN"]:
+            if start is None:
+                start = i  # treat orphan I as B
+        else:
+            if start is not None:
+                spans.append((start, i))
+                start = None
+    if start is not None:
+        spans.append((start, len(seq)))
+    return spans
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred
+    preds = np.argmax(logits, axis=-1)
+    # Token-level per-class metrics
+    mask = labels.flatten() >= 0
+    flat_labels = labels.flatten()[mask]
+    flat_preds = preds.flatten()[mask]
+    results = {}
+    for label_name, label_id in LABEL2ID.items():
+        tp = ((flat_preds == label_id) & (flat_labels == label_id)).sum()
+        fp = ((flat_preds == label_id) & (flat_labels != label_id)).sum()
+        fn = ((flat_preds != label_id) & (flat_labels == label_id)).sum()
+        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
+        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
+        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
+        results[f"{label_name}_precision"] = float(precision)
+        results[f"{label_name}_recall"] = float(recall)
+        results[f"{label_name}_f1"] = float(f1)
+    # Entity-level span F1 (overlap-based)
+    total_tp = 0
+    total_pred = 0
+    total_true = 0
+    for i in range(len(labels)):
+        # Build valid label/pred sequences (skip -100)
+        valid_labels = []
+        valid_preds = []
+        for j in range(len(labels[i])):
+            if labels[i][j] >= 0:
+                valid_labels.append(labels[i][j])
+                valid_preds.append(preds[i][j])
+        pred_spans = extract_spans_from_bio(valid_preds)
+        true_spans = extract_spans_from_bio(valid_labels)
+        total_pred += len(pred_spans)
+        total_true += len(true_spans)
+        # Match: pred span overlaps >= 50% with a true span (and vice versa)
+        matched_true = set()
+        for ps, pe in pred_spans:
+            for idx, (ts, te) in enumerate(true_spans):
+                if idx in matched_true:
+                    continue
+                overlap = max(0, min(pe, te) - max(ps, ts))
+                pred_len = pe - ps
+                true_len = te - ts
+                if pred_len > 0 and true_len > 0:
+                    if overlap / pred_len >= 0.5 and overlap / true_len >= 0.5:
+                        total_tp += 1
+                        matched_true.add(idx)
+                        break
+    entity_precision = total_tp / total_pred if total_pred > 0 else 0
+    entity_recall = total_tp / total_true if total_true > 0 else 0
+    entity_f1 = 2 * entity_precision * entity_recall / (entity_precision + entity_recall) if (entity_precision + entity_recall) > 0 else 0
+    results["entity_precision"] = float(entity_precision)
+    results["entity_recall"] = float(entity_recall)
+    results["entity_f1"] = float(entity_f1)
+    # Console report
+    total = len(flat_preds)
+    print(f"\n{'='*60}")
+    print(f"  EVAL — Token-level ({total:,} tokens)")
+    print(f"  {'Class':<10} {'Prec':>8} {'Rec':>8} {'F1':>8} | {'Pred':>8} {'True':>8}")
+    print(f"  {'-'*54}")
+    for label_name, label_id in LABEL2ID.items():
+        p = results[f"{label_name}_precision"]
+        r = results[f"{label_name}_recall"]
+        f = results[f"{label_name}_f1"]
+        pred_count = (flat_preds == label_id).sum()
+        true_count = (flat_labels == label_id).sum()
+        print(f"  {label_name:<10} {p:>8.4f} {r:>8.4f} {f:>8.4f} | {pred_count:>8,} {true_count:>8,}")
+    print(f"  {'-'*54}")
+    print(f"  Entity-level: P={entity_precision:.4f} R={entity_recall:.4f} F1={entity_f1:.4f} ({total_tp}/{total_pred} pred, {total_true} true)")
+    print(f"{'='*60}\n")
+    return results
+# --bump 0: fresh run of 1 epoch. --bump N (N > 0): resume from a checkpoint and train until
+# 1 + N epochs have been completed in total.
+resume = args.bump > 0
+total_epochs = 1 + args.bump
+# The W&B run name gets a "-bump<N>" suffix on resumed runs.
+wandb.init(project="span-extractor", name=f"deberta-v3-large-ce{f'-bump{args.bump}' if resume else ''}")
+# Training configuration. Per device, 4 sequences x 4 accumulation steps = 16 sequences per
+# optimizer step. Evaluation and checkpointing both happen every 500 steps, and the best
+# checkpoint is chosen by the "entity_f1" value returned from compute_metrics (higher is better).
+training_args = TrainingArguments(
+    output_dir="./span_model_ce",
+    num_train_epochs=total_epochs,
+    per_device_train_batch_size=4,
+    per_device_eval_batch_size=8,
+    gradient_accumulation_steps=4,
+    learning_rate=2e-5,
+    weight_decay=0.01,
+    warmup_ratio=0.1,
+    bf16=True,
+    logging_steps=1,
+    eval_strategy="steps",
+    eval_steps=500,
+    save_strategy="steps",
+    save_steps=500,
+    save_total_limit=3,
+    load_best_model_at_end=True,
+    metric_for_best_model="entity_f1",
+    greater_is_better=True,
+    dataloader_num_workers=0,
+    report_to="wandb",
+    remove_unused_columns=False,
+)
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_ds,
+    eval_dataset=val_ds,
+    data_collator=data_collator,
+    compute_metrics=compute_metrics,
+)
+print(f"Training... (epochs={total_epochs}, resume={resume})")
+# `resume` is a bool here, not a path.
+# NOTE(review): with True the Trainer presumably picks up the latest checkpoint under
+# output_dir and fails if there is none — confirm against the installed transformers version.
+trainer.train(resume_from_checkpoint=resume)
+print("Saving final model...")
+# Model and tokenizer are written to the same directory.
+trainer.save_model("./span_model_ce/final")
+tokenizer.save_pretrained("./span_model_ce/final")
+wandb.finish()
+print("Done.")