Martin Elstner committed on
Commit
11f25fb
·
1 Parent(s): 9e5545b

Application added

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. README.md +9 -1
  3. app.py +184 -0
  4. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .venv/
README.md CHANGED
@@ -11,4 +11,12 @@ license: mit
11
  short_description: Explore how different tokenisers handle rare symbols
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
11
  short_description: Explore how different tokenisers handle rare symbols
12
  ---
13
 
14
+ For local usage, clone the repository and run:
15
+
16
+ ```bash
17
+ uv venv
18
+ uv pip install -r requirements.txt
19
+ uv run app.py
20
+ ```
21
+
22
+ Then open the app in your browser using the link printed in the terminal (default: http://localhost:7860).
app.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoTokenizer
3
+ import collections
4
+
5
# Display name shown in the UI -> Hugging Face Hub model ID whose tokenizer
# is loaded for that entry.
MODEL_MAP = {
    "Nomic Embed v1.5": "nomic-ai/nomic-embed-text-v1.5",
    "MixedBread XSmall v1": "mixedbread-ai/mxbai-embed-xsmall-v1",
    "Google EmbeddingGemma 300m": "google/embeddinggemma-300m",
    "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
    "BGE-M3": "BAAI/bge-m3",
    "BERT Base (Baseline WordPiece)": "bert-base-uncased",
    "RoBERTa Base (Byte-Level BPE)": "roberta-base",
    "E5 Mistral 7B (Llama Tokenizer)": "intfloat/e5-mistral-7b-instruct",
}

# Tokenizers loaded so far, keyed by Hub model ID (filled lazily by
# get_tokenizer so each tokenizer is only loaded once per process).
tokenizer_cache = {}
19
+
20
def get_tokenizer(model_name):
    """Lazily load and cache the tokenizer for a display name in MODEL_MAP.

    Args:
        model_name: A key of MODEL_MAP (the name shown in the UI dropdown).

    Returns:
        A ``(tokenizer, error)`` tuple. On success ``error`` is ``None``;
        on failure ``tokenizer`` is ``None`` and ``error`` is a
        human-readable message. The function never raises for an unknown
        name or a failed load, so callers only need to check ``error``.
    """
    model_id = MODEL_MAP.get(model_name)
    if model_id is None:
        # Report unknown names through the same channel as load failures
        # instead of leaking a bare KeyError to the caller.
        return None, f"Error loading tokenizer: unknown model '{model_name}'"

    if model_id not in tokenizer_cache:
        print(f"Loading tokenizer: {model_id}...")
        try:
            # NOTE(security): trust_remote_code=True lets the model repository
            # execute its own Python code at load time. Keep MODEL_MAP limited
            # to repositories you trust.
            tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(
                model_id, trust_remote_code=True
            )
        except Exception as e:
            # Failed loads are not cached, so a later call retries.
            return None, f"Error loading tokenizer: {str(e)}"
    return tokenizer_cache[model_id], None
30
+
31
def _build_gpt2_char_to_byte():
    """Build the reverse of the GPT-2/RoBERTa byte-to-unicode table."""
    # Bytes that byte-level BPE keeps as the identical code point
    # ('!'..'~', '¡'..'¬', '®'..'ÿ'). Every other byte is shifted to
    # U+0100 and up, in ascending byte order.
    kept = set(range(0x21, 0x7F)) | set(range(0xA1, 0xAD)) | set(range(0xAE, 0x100))
    table = {}
    next_code = 256
    for byte in range(256):
        if byte in kept:
            table[chr(byte)] = byte
        else:
            table[chr(next_code)] = byte
            next_code += 1
    return table


_GPT2_CHAR_TO_BYTE = _build_gpt2_char_to_byte()


def format_byte_token(text):
    """Render a GPT-2/RoBERTa byte-mapped character as ``<0xXX>``.

    Byte-level BPE vocabularies represent each raw byte as one printable
    unicode character (e.g. 'â' stands for byte 0xE2 and 'Ġ' for the
    space byte 0x20). A single character from that alphabet which stands
    for a byte outside printable ASCII is converted to ``<0xXX>``, with
    XX being the byte value in upper-case hex.

    This is a heuristic: the token string alone cannot tell whether 'â'
    is a mapped byte or a genuine letter from a non-byte-level vocabulary,
    so only call it for tokens already suspected to be byte fragments.

    Args:
        text: The token string to inspect.

    Returns:
        ``<0xXX>`` for a single byte-alphabet character mapping to a
        non-printable-ASCII byte; otherwise ``text`` unchanged (including
        empty strings, multi-character tokens, printable ASCII, and
        characters outside the byte alphabet such as 'λ').
    """
    if len(text) != 1:
        return text
    byte = _GPT2_CHAR_TO_BYTE.get(text)
    # Printable ASCII maps to itself and is already readable.
    if byte is None or 0x21 <= byte <= 0x7E:
        return text
    return f"<0x{byte:02X}>"
43
+
44
def analyze_tokenization(text, model_name=next(iter(MODEL_MAP))):
    """Tokenize *text* with the selected tokenizer and label every token.

    Args:
        text: The input string to tokenize.
        model_name: A key of MODEL_MAP; defaults to its first entry.

    Returns:
        A ``(spans, status)`` tuple. ``spans`` is a list of
        ``(display_text, label)`` pairs, one per token, where ``label`` is
        ``"UNK (Data Loss)"``, ``"Byte/Fragment"``, ``"Subword"`` or
        ``None``. ``status`` is the token count message, or an error
        message (with ``spans`` empty) if the tokenizer could not be
        loaded or tokenization failed.
    """
    tokenizer, error = get_tokenizer(model_name)
    if error:
        return [], error

    try:
        # Tokenize with offsets
        encoding = tokenizer(text, add_special_tokens=False, return_offsets_mapping=True)
    except Exception as e:
        return [], f"Tokenization failed: {str(e)}"

    ids = encoding["input_ids"]
    tokens = tokenizer.convert_ids_to_tokens(ids)
    offsets = encoding["offset_mapping"]

    # Map character indices to the list of tokens that cover them
    char_coverage = collections.defaultdict(list)
    for i, (start, end) in enumerate(offsets):
        for char_idx in range(start, end):
            char_coverage[char_idx].append(i)

    # Prefixes that mark a token as starting a new word: 'Ġ' (byte-level BPE
    # space), a literal space, and '▁' (SentencePiece word-start marker).
    word_start_markers = ("Ġ", " ", "▁")

    output_spans = []

    for i, (token, token_id) in enumerate(zip(tokens, ids)):
        label = None

        # --- Visual Cleanup for RoBERTa/GPT-2 ---
        # 'Ġ' stands for a space, 'Ċ' for a newline and 'ĉ' for a tab in
        # byte-level BPE vocabularies.
        display_text = token.replace('Ġ', ' ').replace('Ċ', '\n').replace('ĉ', '\t')

        # Check 1: Explicit UNK (The "Hard Failure")
        if token_id == tokenizer.unk_token_id:
            label = "UNK (Data Loss)"

        # Check 2: Byte Fallback / Fragmentation
        start, end = offsets[i]
        is_fragment = False

        # If a single character in the input generated multiple tokens,
        # it's a fragmentation/byte-split
        if (end - start) == 1 and len(char_coverage[start]) > 1:
            is_fragment = True

        # Check for Llama/Mistral style byte tokens (<0xE2>)
        if token.startswith("<0x") and token.endswith(">"):
            is_fragment = True

        if is_fragment and label is None:
            label = "Byte/Fragment"
            # A lone non-ASCII character here is likely a mapped byte (like
            # 'â'). It is not decoded to hex; it is only wrapped in angle
            # brackets so it reads as an artifact rather than as real text.
            if len(display_text) == 1 and ord(display_text) > 127:
                display_text = f"<{display_text}>"

        # Check 3: Subwords (Blue)
        if label is None:
            # WordPiece '##'
            if token.startswith("##"):
                label = "Subword"
            # Tokens without a word-start marker that begin exactly where
            # the previous token ended are continuations of the same word.
            elif i > 0 and not token.startswith(word_start_markers):
                prev_end = offsets[i - 1][1]
                if start == prev_end:
                    label = "Subword"

        output_spans.append((display_text, label))

    return output_spans, f"Total Tokens: {len(tokens)}"
121
+
122
# Scientific text example
scientific_text = "Acidity (pKa)2.97 (25 °C)[5] 13.82 (20 °C)[3] UV-vis (λmax)210 nm (χ)−72.23·10−6 cm3/mol"

# Build the Gradio UI: text + model selection on the left, highlighted
# tokens + statistics on the right.
with gr.Blocks(title="Embedding Model Tokenizer Detective") as demo:
    gr.Markdown(
        """
        # 🕵️‍♀️ Embedding Model Tokenizer Detective

        Different embedding models handle unknown characters (OOV) differently.

        * **Red (UNK):** The model **deleted** information. It saw a symbol it didn't know and replaced it with a generic placeholder.
        * **Orange (Byte/Fragment):** The model **struggled** and split a single character (like a Greek letter or math symbol) into multiple raw bytes.
        * **Blue:** Standard subword splitting.
        """
    )

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                lines=5,
                placeholder="Enter scientific or multilingual text here...",
                value=scientific_text,
            )
            model_selector = gr.Dropdown(
                label="Select Embedding Model / Tokenizer",
                choices=list(MODEL_MAP.keys()),
                value="Nomic Embed v1.5",
            )
            analyze_btn = gr.Button("Diagnose Tokenization", variant="primary")

        with gr.Column():
            output_display = gr.HighlightedText(
                label="Tokenized Analysis",
                combine_adjacent=False,
                show_legend=True,
                # Keys must match the labels emitted by analyze_tokenization.
                color_map={"UNK (Data Loss)": "red", "Byte/Fragment": "orange", "Subword": "blue"},
            )
            stats_output = gr.Label(label="Statistics")

    analyze_btn.click(
        fn=analyze_tokenization,
        inputs=[input_text, model_selector],
        outputs=[output_display, stats_output],
    )

    gr.Examples(
        examples=[
            ["The quick brown fox jumps over the lazy dog."],
            [scientific_text],
            ["susceptibility (Ⅹ) = −72.23·10−6 cm3/mol"],
            ["汉字漢字カタカナひらがな"],
            ["⅕ of a pizza is 2 slices."],
            ["😊 😂 🥺"],
        ],
        inputs=[input_text],
        # Without outputs, run_on_click executes the function and throws the
        # result away. Examples only supply the text, so the analysis runs
        # with analyze_tokenization's default model.
        outputs=[output_display, stats_output],
        fn=analyze_tokenization,
        run_on_click=True,
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ torch
4
+ sentencepiece
5
+ protobuf