Spaces:

dwablimol
/

token_detective

Sleeping

App Files Files Community

Martin Elstner committed on Dec 1, 2025

Commit

11f25fb

1 Parent(s): 9e5545b

Application added

Browse files

Files changed (4) hide show

.gitignore +1 -0
README.md +9 -1
app.py +184 -0
requirements.txt +5 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .venv/

README.md CHANGED Viewed

@@ -11,4 +11,12 @@ license: mit
 short_description: Explore how different tokenisers handle rare symbols
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 short_description: Explore how different tokenisers handle rare symbols
 ---
+For local usage, clone the repository and run:
+```bash
+uv venv
+uv pip install -r requirements.txt
+uv run app.py
+```
+Then open the link printed in the terminal (default: http://localhost:7860) in your browser.

app.py ADDED Viewed

	@@ -0,0 +1,184 @@

+import gradio as gr
+from transformers import AutoTokenizer
+import collections
+# Map of display names to HF model IDs.
+# The keys are the choices offered in the model dropdown; the values are the
+# repository IDs handed to AutoTokenizer.from_pretrained() in get_tokenizer().
+MODEL_MAP = {
+    "Nomic Embed v1.5": "nomic-ai/nomic-embed-text-v1.5",
+    "MixedBread XSmall v1": "mixedbread-ai/mxbai-embed-xsmall-v1",
+    "Google EmbeddingGemma 300m": "google/embeddinggemma-300m",
+    "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
+    "BGE-M3": "BAAI/bge-m3",
+    "BERT Base (Baseline WordPiece)": "bert-base-uncased",
+    "RoBERTa Base (Byte-Level BPE)": "roberta-base",
+    "E5 Mistral 7B (Llama Tokenizer)": "intfloat/e5-mistral-7b-instruct",
+}
+# Global cache for tokenizers, keyed by HF model ID (not by display name), so
+# each tokenizer is loaded at most once per process.
+tokenizer_cache = {}
+def get_tokenizer(model_name):
+    """Return the lazily loaded, cached tokenizer for a display name.
+
+    Args:
+        model_name: A key of MODEL_MAP (the display name shown in the UI).
+
+    Returns:
+        A ``(tokenizer, error)`` tuple. On success ``error`` is None; on
+        failure ``tokenizer`` is None and ``error`` is a human-readable
+        message. No exception is raised for unknown names or load failures.
+    """
+    model_id = MODEL_MAP.get(model_name)
+    if model_id is None:
+        # Report unknown names through the same (None, error) channel as load
+        # failures instead of letting a KeyError escape to the caller.
+        return None, f"Error loading tokenizer: unknown model {model_name!r}"
+    if model_id not in tokenizer_cache:
+        print(f"Loading tokenizer: {model_id}...")
+        try:
+            # NOTE(security): trust_remote_code=True allows the model
+            # repository to run its own Python code while loading. Keep
+            # MODEL_MAP restricted to repositories you trust.
+            tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+        except Exception as e:
+            # Nothing is cached on failure, so a later call retries the load.
+            return None, f"Error loading tokenizer: {str(e)}"
+    return tokenizer_cache[model_id], None
+def _gpt2_char_to_byte(char):
+    """Return the byte a non-ASCII GPT-2/RoBERTa byte-level character stands for, or None."""
+    code_point = ord(char)
+    # Printable Latin-1 bytes (0xA1-0xAC, 0xAE-0xFF) are represented by the
+    # character with the same code point.
+    if 0xA1 <= code_point <= 0xAC or 0xAE <= code_point <= 0xFF:
+        return code_point
+    # The 68 remaining bytes (0x00-0x20, 0x7F-0xA0, 0xAD) are assigned, in
+    # ascending byte order, to the code points starting at U+0100.
+    index = code_point - 0x100
+    if 0 <= index <= 0x20:
+        return index
+    if 0x21 <= index <= 0x42:
+        return 0x7F + (index - 0x21)
+    if index == 0x43:
+        return 0xAD
+    return None
+def format_byte_token(text):
+    """Render a RoBERTa/GPT-2 byte stand-in character as ``<0xXX>``.
+
+    Byte-level BPE vocabularies spell every raw byte as one printable unicode
+    character (e.g. 'â' for 0xE2, 'Ġ' for 0x20). If *text* is exactly one such
+    non-ASCII stand-in, the byte it represents is returned as ``<0xXX>`` with
+    two uppercase hex digits. Any other input (ASCII, multi-character tokens,
+    the empty string, characters outside the byte table) is returned unchanged.
+
+    Args:
+        text: A single token string.
+
+    Returns:
+        ``<0xXX>`` for a byte stand-in character, otherwise *text* itself.
+    """
+    if len(text) != 1:
+        return text
+    byte_value = _gpt2_char_to_byte(text)
+    if byte_value is None:
+        return text
+    return f"<0x{byte_value:02X}>"
+def analyze_tokenization(text, model_name=next(iter(MODEL_MAP))):
+    """Tokenize *text* with the selected tokenizer and label every token.
+
+    Args:
+        text: The input string to tokenize.
+        model_name: A key of MODEL_MAP; defaults to its first entry.
+
+    Returns:
+        A ``(spans, status)`` tuple. ``spans`` is a list of
+        ``(display_text, label)`` pairs, one per token, where ``label`` is
+        "UNK (Data Loss)", "Byte/Fragment", "Subword" or None. ``status`` is
+        the total token count, or an error message (with ``spans`` empty) when
+        the tokenizer could not be loaded or tokenization failed.
+    """
+    tokenizer, error = get_tokenizer(model_name)
+    if error:
+        return [], error
+    try:
+        # Offsets tie every token back to the character span it came from.
+        encoding = tokenizer(text, add_special_tokens=False, return_offsets_mapping=True)
+    except Exception as e:
+        return [], f"Tokenization failed: {str(e)}"
+    ids = encoding["input_ids"]
+    offsets = encoding["offset_mapping"]
+    tokens = tokenizer.convert_ids_to_tokens(ids)
+    # Count how many tokens cover each character index of the input.
+    coverage = collections.Counter()
+    for start, end in offsets:
+        coverage.update(range(start, end))
+    output_spans = []
+    for i, (token, token_id) in enumerate(zip(tokens, ids)):
+        start, end = offsets[i]
+        # --- Visual cleanup for RoBERTa/GPT-2 ---
+        # 'Ġ' represents a space, 'Ċ' a newline and 'ĉ' a tab.
+        display_text = token.replace('Ġ', ' ').replace('Ċ', '\n').replace('ĉ', '\t')
+        # A single input character covered by several tokens was split into
+        # pieces; Llama/Mistral byte-fallback tokens are spelled "<0xNN>".
+        is_fragment = (
+            (end - start == 1 and coverage[start] > 1)
+            or (token.startswith("<0x") and token.endswith(">"))
+        )
+        label = None
+        if token_id == tokenizer.unk_token_id:
+            # Check 1: explicit UNK (the "hard failure") wins over everything.
+            label = "UNK (Data Loss)"
+        elif is_fragment:
+            # Check 2: byte fallback / fragmentation.
+            label = "Byte/Fragment"
+            # A lone non-ASCII character here is most likely a mapped byte;
+            # bracket it so it does not read as random noise.
+            if len(display_text) == 1 and ord(display_text) > 127:
+                display_text = f"<{display_text}>"
+        elif token.startswith("##"):
+            # Check 3a: WordPiece continuation pieces.
+            label = "Subword"
+        elif i > 0 and not token.startswith(("Ġ", " ", "▁")) and start == offsets[i - 1][1]:
+            # Check 3b: a token that begins exactly where the previous one
+            # ended continues the same word, unless it carries a word-start
+            # marker ('Ġ' for byte-level BPE, '▁' for SentencePiece).
+            label = "Subword"
+        output_spans.append((display_text, label))
+    return output_spans, f"Total Tokens: {len(tokens)}"
+# Scientific text example: used both as the initial textbox value and as one
+# of the clickable examples below.
+scientific_text = "Acidity (pKa)2.97 (25 °C)[5] 13.82 (20 °C)[3] UV-vis (λmax)210 nm (χ)−72.23·10−6 cm3/mol"
+# Build the UI: inputs (text, model choice, button) in the left column,
+# highlighted tokens and the statistics label in the right column.
+with gr.Blocks(title="Embedding Model Tokenizer Detective") as demo:
+    gr.Markdown(
+        """
+        # 🕵️‍♀️ Embedding Model Tokenizer Detective
+        Different embedding models handle unknown characters (OOV) differently.
+        * **Red (UNK):** The model **deleted** information. It saw a symbol it didn't know and replaced it with a generic placeholder.
+        * **Orange (Byte/Fragment):** The model **struggled** and split a single character (like a Greek letter or math symbol) into multiple raw bytes.
+        * **Blue:** Standard subword splitting.
+        """
+    )
+    with gr.Row():
+        with gr.Column():
+            input_text = gr.Textbox(
+                label="Input Text",
+                lines=5,
+                placeholder="Enter scientific or multilingual text here...",
+                value=scientific_text
+            )
+            model_selector = gr.Dropdown(
+                label="Select Embedding Model / Tokenizer",
+                choices=list(MODEL_MAP.keys()),
+                value="Nomic Embed v1.5"
+            )
+            analyze_btn = gr.Button("Diagnose Tokenization", variant="primary")
+        with gr.Column():
+            # The color_map keys must match the label strings produced by
+            # analyze_tokenization exactly.
+            output_display = gr.HighlightedText(
+                label="Tokenized Analysis",
+                combine_adjacent=False,
+                show_legend=True,
+                color_map={"UNK (Data Loss)": "red", "Byte/Fragment": "orange", "Subword": "blue"}
+            )
+            stats_output = gr.Label(label="Statistics")
+    # The button runs the analysis on the current text and selected model and
+    # fills both output components.
+    analyze_btn.click(
+        fn=analyze_tokenization,
+        inputs=[input_text, model_selector],
+        outputs=[output_display, stats_output]
+    )
+    # Clickable sample inputs. Only the textbox is passed to fn, so
+    # analyze_tokenization runs with its default model_name here.
+    gr.Examples(
+        examples=[
+            ["The quick brown fox jumps over the lazy dog."],
+            [scientific_text],
+            ["susceptibility (Ⅹ) = −72.23·10−6 cm3/mol"],
+            ["汉字漢字カタカナひらがな"],
+            ["⅕ of a pizza is 2 slices."],
+            ["😊 😂 🥺"],
+        ],
+        inputs=[input_text],
+        # NOTE(review): with outputs left commented out, the values returned by
+        # fn do not appear to be routed to any component - confirm that
+        # clicking an example is only meant to fill the textbox.
+        #outputs=[output_display, stats_output],
+        fn=analyze_tokenization,
+        run_on_click=True
+    )
+# Start the Gradio server only when the file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio
+transformers
+torch
+sentencepiece
+protobuf