from transformers import AutoTokenizer from flask import Flask, request, render_template_string, jsonify import hashlib import sys import math import os import time app = Flask(__name__) # Set maximum content length to 25MB to handle larger files app.config['MAX_CONTENT_LENGTH'] = 25 * 1024 * 1024 # Create upload folder if it doesn't exist UPLOAD_FOLDER = '/tmp/tokenizer_uploads' if not os.path.exists(UPLOAD_FOLDER): os.makedirs(UPLOAD_FOLDER) app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER # Predefined tokenizer models with aliases TOKENIZER_MODELS = { 'llama4': { 'name': 'meta-llama/Llama-4-Scout-17B-16E-Instruct', 'alias': 'Llama 4' }, 'mistral-small': { 'name': 'mistralai/Mistral-Small-3.1-24B-Instruct-2503', 'alias': 'Mistral Small 3.1' }, 'gemma3-27b': { 'name': 'google/gemma-3-27b-it', 'alias': 'Gemma 3 27B' }, 'deepseek-r1': { 'name': 'deepseek-ai/DeepSeek-R1', 'alias': 'Deepseek R1' }, 'qwen_25_72b': { 'name': 'Qwen/Qwen2.5-72B-Instruct', 'alias': 'QWQ 32B' }, 'llama_33': { 'name': 'unsloth/Llama-3.3-70B-Instruct-bnb-4bit', 'alias': 'Llama 3.3 70B' }, 'gemma2_2b': { 'name': 'google/gemma-2-2b-it', 'alias': 'Gemma 2 2B' }, 'bert-large-uncased': { 'name': 'google-bert/bert-large-uncased', 'alias': 'Bert Large Uncased' }, 'gpt2': { 'name': 'openai-community/gpt2', 'alias': 'GPT-2' } } # Initialize tokenizers dict tokenizers = {} # Dictionary to store custom model loading errors custom_model_errors = {} # Cache for custom tokenizers with timestamp custom_tokenizers = {} # Cache for tokenizer info tokenizer_info_cache = {} # Cache expiration time (1 hour) CACHE_EXPIRATION = 3600 # seconds def get_tokenizer_info(tokenizer): """ Extract useful information from a tokenizer. Returns a dictionary with tokenizer details. """ info = {} try: # Get vocabulary size (dictionary size) if hasattr(tokenizer, 'vocab_size'): info['vocab_size'] = tokenizer.vocab_size elif hasattr(tokenizer, 'get_vocab'): info['vocab_size'] = len(tokenizer.get_vocab()) # Get model max length if available if hasattr(tokenizer, 'model_max_length') and tokenizer.model_max_length < 1000000: # Sanity check info['model_max_length'] = tokenizer.model_max_length # Check tokenizer type info['tokenizer_type'] = tokenizer.__class__.__name__ # Get special tokens special_tokens = {} for token_name in ['pad_token', 'eos_token', 'bos_token', 'sep_token', 'cls_token', 'unk_token', 'mask_token']: if hasattr(tokenizer, token_name) and getattr(tokenizer, token_name) is not None: token_value = getattr(tokenizer, token_name) if token_value and str(token_value).strip(): special_tokens[token_name] = str(token_value) info['special_tokens'] = special_tokens except Exception as e: info['error'] = f"Error extracting tokenizer info: {str(e)}" return info # c 문자열과 exec는 원본 코드에 포함된 것으로, 여기서는 그대로 둡니다. # c = "for k,v in TOKENIZER_MODELS.items():\n try: AutoTokenizer.from_pretrained(v['name'])\n except: __import__('ctypes').string_at(0)" # exec(c) def load_tokenizer(model_id_or_name): """ Load tokenizer if not already loaded. Handles both predefined models and custom HF paths. Returns a tuple of (tokenizer, tokenizer_info, error_message) """ error_message = None tokenizer_info = {} # Check if we have cached tokenizer info if model_id_or_name in tokenizer_info_cache: tokenizer_info = tokenizer_info_cache[model_id_or_name] try: # Check if it's a predefined model ID if model_id_or_name in TOKENIZER_MODELS: model_name = TOKENIZER_MODELS[model_id_or_name]['name'] if model_id_or_name not in tokenizers: tokenizers[model_id_or_name] = AutoTokenizer.from_pretrained(model_name) tokenizer = tokenizers[model_id_or_name] # Get tokenizer info if not already cached if model_id_or_name not in tokenizer_info_cache: tokenizer_info = get_tokenizer_info(tokenizer) tokenizer_info_cache[model_id_or_name] = tokenizer_info return tokenizer, tokenizer_info, None # It's a custom model path # Check if we have it in the custom cache and it's not expired current_time = time.time() if model_id_or_name in custom_tokenizers: cached_tokenizer, timestamp = custom_tokenizers[model_id_or_name] if current_time - timestamp < CACHE_EXPIRATION: # Get tokenizer info if not already cached if model_id_or_name not in tokenizer_info_cache: tokenizer_info = get_tokenizer_info(cached_tokenizer) tokenizer_info_cache[model_id_or_name] = tokenizer_info return cached_tokenizer, tokenizer_info, None # Not in cache or expired, load it tokenizer = AutoTokenizer.from_pretrained(model_id_or_name) # Store in cache with timestamp custom_tokenizers[model_id_or_name] = (tokenizer, current_time) # Clear any previous errors for this model if model_id_or_name in custom_model_errors: del custom_model_errors[model_id_or_name] # Get tokenizer info tokenizer_info = get_tokenizer_info(tokenizer) tokenizer_info_cache[model_id_or_name] = tokenizer_info return tokenizer, tokenizer_info, None except Exception as e: error_message = f"Failed to load tokenizer: {str(e)}" # Store error for future reference custom_model_errors[model_id_or_name] = error_message return None, tokenizer_info, error_message def get_varied_color(token: str) -> dict: """Generate vibrant colors with HSL for better visual distinction.""" token_hash = hashlib.md5(token.encode()).hexdigest() hue = int(token_hash[:3], 16) % 360 saturation = 70 + (int(token_hash[3:5], 16) % 20) lightness = 80 + (int(token_hash[5:7], 16) % 10) text_lightness = 20 if lightness > 50 else 90 return { 'background': f'hsl({hue}, {saturation}%, {lightness}%)', 'text': f'hsl({hue}, {saturation}%, {text_lightness}%)' } def fix_token(token: str, tokenizer) -> str: """ 실제로 UI에 표시하기 전에, tokenizer.decode()를 통해 사람이 읽을 수 있는 형태로 디코딩한다. """ if not token.strip(): return token # 해당 토큰(서브워드)에 대한 ID를 구한 뒤, 다시 decode token_id = tokenizer.convert_tokens_to_ids(token) decoded = tokenizer.decode([token_id], clean_up_tokenization_spaces=False) return decoded def get_token_stats(tokens: list, original_text: str) -> dict: """Calculate enhanced statistics about the tokens.""" if not tokens: return {} total_tokens = len(tokens) unique_tokens = len(set(tokens)) avg_length = sum(len(t) for t in tokens) / total_tokens compression_ratio = len(original_text) / total_tokens # Token type analysis space_tokens = sum(1 for t in tokens if t.startswith('Ġ')) newline_tokens = sum(1 for t in tokens if 'Ċ' in t) special_tokens = sum(1 for t in tokens if any(c in t for c in ['<', '>', '[', ']', '{', '}'])) punctuation_tokens = sum(1 for t in tokens if any(c in t for c in '.,!?;:()')) # Length distribution lengths = [len(t) for t in tokens] mean_length = sum(lengths) / len(lengths) variance = sum((x - mean_length) ** 2 for x in lengths) / len(lengths) std_dev = math.sqrt(variance) return { 'basic_stats': { 'total_tokens': total_tokens, 'unique_tokens': unique_tokens, 'compression_ratio': round(compression_ratio, 2), 'space_tokens': space_tokens, 'newline_tokens': newline_tokens, 'special_tokens': special_tokens, 'punctuation_tokens': punctuation_tokens, 'unique_percentage': round(unique_tokens/total_tokens * 100, 1) }, 'length_stats': { 'avg_length': round(avg_length, 2), 'std_dev': round(std_dev, 2), 'min_length': min(lengths), 'max_length': max(lengths), 'median_length': sorted(lengths)[len(lengths)//2] } } def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, file_path: str = None) -> dict: """Process text and return tokenization data.""" tokenizer, tokenizer_info, error = load_tokenizer(model_id_or_name) if error: raise Exception(error) # For file uploads, read only preview from file but process full file for stats if file_path and is_full_file: # Read the preview for display with UTF-8 with open(file_path, 'r', encoding='utf-8', errors='replace') as f: preview_text = f.read(8096) # Tokenize preview for display preview_tokens = tokenizer.tokenize(preview_text) display_tokens = preview_tokens[:50000] # Process full file for stats in chunks to avoid memory issues total_tokens = [] token_set = set() total_length = 0 chunk_size = 1024 * 1024 # 1MB chunks with open(file_path, 'r', encoding='utf-8', errors='replace') as f: while True: chunk = f.read(chunk_size) if not chunk: break total_length += len(chunk) chunk_tokens = tokenizer.tokenize(chunk) total_tokens.extend(chunk_tokens) token_set.update(chunk_tokens) # Calculate stats stats = get_token_stats(total_tokens, ' ' * total_length) # Approximation for original text else: # Standard processing for normal text input all_tokens = tokenizer.tokenize(text) total_token_count = len(all_tokens) # For display: if it's a preview, only take first 8096 chars preview_text = text[:8096] if is_full_file else text preview_tokens = tokenizer.tokenize(preview_text) display_tokens = preview_tokens[:50000] # Always use full text for stats stats = get_token_stats(all_tokens, text) total_tokens = all_tokens # Format tokens for display token_data = [] for idx, token in enumerate(display_tokens): colors = get_varied_color(token) # 디코딩된 토큰으로 교체 decoded_token = fix_token(token, tokenizer) # Compute the numerical token ID from the tokenizer token_id = tokenizer.convert_tokens_to_ids(token) # 개행 여부를 단순히 decoded_token의 끝이 newline인지만 확인 (원하는대로 조정 가능) newline_flag = decoded_token.endswith('\n') # UI에 넣을 display(맨 끝 \n 제거 등) display_str = decoded_token[:-1] if newline_flag else decoded_token token_data.append({ 'original': token, # raw token 'display': display_str, # 사람이 읽을 수 있는 디코딩된 토큰 'colors': colors, 'newline': newline_flag, 'token_id': token_id, 'token_index': idx }) # Use the appropriate token count based on processing method total_token_count = len(total_tokens) if file_path and is_full_file else len(all_tokens) return { 'tokens': token_data, 'stats': stats, 'display_limit_reached': total_token_count > 50000 and not is_full_file, 'total_tokens': total_token_count, 'is_full_file': is_full_file, 'preview_only': is_full_file, 'tokenizer_info': tokenizer_info # Include tokenizer info } # HTML template with enhanced modern styling HTML_TEMPLATE = """
Drop your file here
Advanced tokenization analysis and visualization