Spaces:
Running
Running
bartek
committed on
Commit
·
be477d4
0
Parent(s):
squashed
Browse files- .gitattributes +35 -0
- Dockerfile +22 -0
- EXAMPLES +20 -0
- Instruct-vs-Base.txt +34 -0
- README.md +11 -0
- app.py +1829 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.11-slim
|
2 |
+
|
3 |
+
# Set the working directory to the root
|
4 |
+
WORKDIR /
|
5 |
+
|
6 |
+
# Copy application files to the root
|
7 |
+
COPY . /
|
8 |
+
|
9 |
+
# Install dependencies
|
10 |
+
RUN pip install --no-cache-dir flask transformers
|
11 |
+
|
12 |
+
# Set Hugging Face cache directory
|
13 |
+
ENV HF_HOME="/tmp/huggingface"
|
14 |
+
|
15 |
+
# Create the cache directory and set permissions
|
16 |
+
RUN mkdir -p /tmp/huggingface && chmod -R 777 /tmp/huggingface
|
17 |
+
|
18 |
+
# Expose the port for the app
|
19 |
+
EXPOSE 7860
|
20 |
+
|
21 |
+
# Command to run app.py directly
|
22 |
+
CMD ["python", "app.py"]
|
EXAMPLES
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Mistral Small 3
|
2 |
+
<s>[SYSTEM_PROMPT]You are a helpful assistant.[/SYSTEM_PROMPT][INST]Hello[/INST]Hello, how can I help you?</s>[INST]Tell me a fact[/INST]
|
3 |
+
|
4 |
+
R1
|
5 |
+
<|begin▁of▁sentence|><|User|>Hello<|Assistant|>Hello, how can I help you?<|end▁of▁sentence|><|User|>Tell me a fact<|Assistant|>
|
6 |
+
|
7 |
+
Llama 3.3
|
8 |
+
<|start_header_id|>system<|end_header_id|>You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>Hello<|eot_id|><|start_header_id|>assistant<|end_header_id|>Hello, how can I help you?<|eot_id|><|start_header_id|>user<|end_header_id|>Tell me a fact<|eot_id|>
|
9 |
+
|
10 |
+
Gemma3
|
11 |
+
<start_of_turn>user
|
12 |
+
You are a helpful assistant.
|
13 |
+
|
14 |
+
Hello<end_of_turn>
|
15 |
+
<start_of_turn>model
|
16 |
+
Hello, how can I help you?<end_of_turn>
|
17 |
+
<start_of_turn>user
|
18 |
+
Tell me a fact<end_of_turn>
|
19 |
+
<start_of_turn>model
|
20 |
+
A fact<end_of_turn>
|
Instruct-vs-Base.txt
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__Mistral 24B base:__
|
2 |
+
|
3 |
+
Tell me a fact|| about yourself that makes you unique.
|
4 |
+
I have a twin sister who is also a dancer and teacher. It's very cool being able to do this together!
|
5 |
+
|
6 |
+
What are your interests outside of dance?
|
7 |
+
Reading, baking, watching TV (Grey's Anatomy) and spending time with family.
|
8 |
+
|
9 |
+
How long have you been dancing?
|
10 |
+
For 20 years!
|
11 |
+
|
12 |
+
Why did you become an instructor? How does teaching help you grow as a dancer?
|
13 |
+
I'm so grateful to be able to teach at such a wonderful studio. I love being able to share my passion of dance for the kids in our community! It's amazing seeing their love of dance grow through the years.
|
14 |
+
|
15 |
+
What is your favorite part about teaching?
|
16 |
+
Being a role model and watching the students grow into beautiful dancers!
|
17 |
+
|
18 |
+
If you could choreograph any song, what would it be? Why?
|
19 |
+
I would love to choreograph to a Taylor Swift song. I'm a huge fan and her songs are so catchy!
|
20 |
+
.......
|
21 |
+
|
22 |
+
|
23 |
+
|
24 |
+
|
25 |
+
|
26 |
+
__Mistral 24B Instruct:__
|
27 |
+
|
28 |
+
|
29 |
+
<s>[SYSTEM_PROMPT]You are a helpful assistant.[/SYSTEM_PROMPT][INST]Hello[/INST]Hello, how can I help you?</s>[INST]Tell me a fact[/INST]||Here's an interesting fact for you: A day on Venus is longer than a year on Venus. This is because Venus has an extremely slow rotation on its axis. It takes about 243 Earth days for Venus to complete one rotation, but it only takes around 225 Earth days for Venus to orbit the Sun. Isn't that amazing?</s>
|
30 |
+
|
31 |
+
|
32 |
+
|
33 |
+
|
34 |
+
|
README.md
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Tokenizers
|
3 |
+
emoji: 🦀
|
4 |
+
colorFrom: gray
|
5 |
+
colorTo: blue
|
6 |
+
sdk: docker
|
7 |
+
pinned: false
|
8 |
+
short_description: This space allows you to experience how tokenizers work.
|
9 |
+
---
|
10 |
+
|
11 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,1829 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer
|
2 |
+
from flask import Flask, request, render_template_string, jsonify
|
3 |
+
import hashlib
|
4 |
+
import sys
|
5 |
+
import math
|
6 |
+
import os
|
7 |
+
import time
|
8 |
+
|
9 |
+
app = Flask(__name__)
# Cap request bodies at 100 MB so larger uploaded files are accepted.
app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024

# Directory used for uploaded files. exist_ok=True makes the creation
# idempotent and removes the check-then-create race of a separate
# os.path.exists() test (another process may create it in between).
UPLOAD_FOLDER = '/tmp/tokenizer_uploads'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
|
18 |
+
|
19 |
+
# Predefined tokenizer models.
# Each key is the short model ID; 'name' is the Hugging Face repository the
# tokenizer is loaded from and 'alias' is the human-readable label.
TOKENIZER_MODELS = {
    'llama4': {
        # Same tokenizer as meta-llama/Llama-4-Maverick-17B-128E-Instruct,
        # meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8,
        # meta-llama/Llama-4-Scout-17B-16E, etc.
        'name': 'meta-llama/Llama-4-Scout-17B-16E-Instruct',
        'alias': 'Llama 4',
    },
    'mistral-small': {
        'name': 'mistralai/Mistral-Small-3.1-24B-Instruct-2503',
        'alias': 'Mistral Small 3.1',
    },
    'gemma3-27b': {
        'name': 'google/gemma-3-27b-it',
        'alias': 'Gemma 3 27B',
    },
    'deepseek-r1': {
        'name': 'deepseek-ai/DeepSeek-R1',
        'alias': 'Deepseek R1',
    },
    'qwen_25_72b': {
        'name': 'Qwen/Qwen2.5-72B-Instruct',
        # Label now matches the key and repository (it previously read
        # 'QWQ 32B', a different model).
        'alias': 'Qwen 2.5 72B',
    },
    'llama_33': {
        'name': 'unsloth/Llama-3.3-70B-Instruct-bnb-4bit',
        'alias': 'Llama 3.3 70B',
    },
    'gemma2_2b': {
        'name': 'google/gemma-2-2b-it',
        'alias': 'Gemma 2 2B',
    },
    'bert-large-uncased': {
        'name': 'google-bert/bert-large-uncased',
        'alias': 'Bert Large Uncased',
    },
    'gpt2': {
        'name': 'openai-community/gpt2',
        'alias': 'GPT-2',
    },
}
|
58 |
+
|
59 |
+
# Loaded tokenizers for the predefined models, keyed by model ID.
tokenizers = dict()
# Last load-failure message per model ID or path.
custom_model_errors = dict()
# Custom (non-predefined) tokenizers: path -> (tokenizer, load timestamp).
custom_tokenizers = dict()
# Extracted tokenizer details, keyed by model ID or path.
tokenizer_info_cache = dict()
# Lifetime of a custom tokenizer cache entry, in seconds (1 hour).
CACHE_EXPIRATION = 60 * 60
|
69 |
+
|
70 |
+
def get_tokenizer_info(tokenizer):
    """
    Extract useful information from a tokenizer.

    Args:
        tokenizer: Object exposing some of ``vocab_size``, ``get_vocab()``,
            ``model_max_length`` and the ``*_token`` attributes.

    Returns:
        A dictionary that may contain ``vocab_size``, ``model_max_length``
        (only when below 1,000,000), ``tokenizer_type`` (the class name) and
        ``special_tokens`` (name -> string for every non-blank special
        token). If extraction raises, an ``error`` message is added and the
        fields gathered so far are kept.
    """
    info = {}
    try:
        # Get vocabulary size (dictionary size)
        if hasattr(tokenizer, 'vocab_size'):
            info['vocab_size'] = tokenizer.vocab_size
        elif hasattr(tokenizer, 'get_vocab'):
            info['vocab_size'] = len(tokenizer.get_vocab())

        # Get model max length if available. Huge values are treated as
        # "no limit" placeholders and skipped; a missing or None value is
        # skipped too instead of raising and aborting the remaining fields.
        max_length = getattr(tokenizer, 'model_max_length', None)
        if max_length is not None and max_length < 1000000:
            info['model_max_length'] = max_length

        # Check tokenizer type
        info['tokenizer_type'] = tokenizer.__class__.__name__

        # Get special tokens; each attribute is read exactly once.
        special_tokens = {}
        for token_name in ('pad_token', 'eos_token', 'bos_token', 'sep_token',
                           'cls_token', 'unk_token', 'mask_token'):
            token_value = getattr(tokenizer, token_name, None)
            # Skip unset, empty and whitespace-only tokens.
            if token_value and str(token_value).strip():
                special_tokens[token_name] = str(token_value)

        info['special_tokens'] = special_tokens

    except Exception as e:
        info['error'] = f"Error extracting tokenizer info: {str(e)}"

    return info
|
105 |
+
# Python source for a start-up step; it is executed later in this module by
# `exec(c)`. The snippet walks TOKENIZER_MODELS and calls
# AutoTokenizer.from_pretrained on every entry's 'name', discarding the
# result. If any load raises, the bare `except` calls ctypes.string_at(0),
# i.e. reads from address 0.
# NOTE(review): that read presumably crashes the process on purpose when a
# predefined tokenizer cannot be loaded — confirm intent, and consider a
# plain loop with an explicit exit instead of exec plus a forced crash.
c = "for k,v in TOKENIZER_MODELS.items():\n try: AutoTokenizer.from_pretrained(v['name'])\n except: __import__('ctypes').string_at(0)"
|
106 |
+
def load_tokenizer(model_id_or_name):
    """
    Return the tokenizer for a predefined model ID or a custom HF path.

    Predefined tokenizers are kept in ``tokenizers`` once loaded. Custom
    ones are kept in ``custom_tokenizers`` together with their load time and
    are reloaded once older than ``CACHE_EXPIRATION`` seconds. Tokenizer
    details are memoised in ``tokenizer_info_cache``.

    Returns a tuple of (tokenizer, tokenizer_info, error_message). On
    failure the tokenizer is None, the message is also recorded in
    ``custom_model_errors``, and tokenizer_info is whatever was known.
    """
    key = model_id_or_name
    # Start from previously cached details, if any.
    tokenizer_info = tokenizer_info_cache.get(key, {})

    try:
        if key in TOKENIZER_MODELS:
            # Predefined model: load on first use, then reuse.
            hub_name = TOKENIZER_MODELS[key]['name']
            if key not in tokenizers:
                tokenizers[key] = AutoTokenizer.from_pretrained(hub_name)
            loaded = tokenizers[key]

            if key not in tokenizer_info_cache:
                tokenizer_info = get_tokenizer_info(loaded)
                tokenizer_info_cache[key] = tokenizer_info

            return loaded, tokenizer_info, None

        # Custom model path: reuse the cached tokenizer while it is fresh.
        now = time.time()
        if key in custom_tokenizers:
            cached, loaded_at = custom_tokenizers[key]
            if now - loaded_at < CACHE_EXPIRATION:
                if key not in tokenizer_info_cache:
                    tokenizer_info = get_tokenizer_info(cached)
                    tokenizer_info_cache[key] = tokenizer_info
                return cached, tokenizer_info, None

        # Missing or expired: load it and remember when.
        loaded = AutoTokenizer.from_pretrained(key)
        custom_tokenizers[key] = (loaded, now)
        # A successful load clears any earlier error for this model.
        custom_model_errors.pop(key, None)

        tokenizer_info = get_tokenizer_info(loaded)
        tokenizer_info_cache[key] = tokenizer_info

        return loaded, tokenizer_info, None

    except Exception as e:
        error_message = f"Failed to load tokenizer: {str(e)}"
        # Store error for future reference
        custom_model_errors[key] = error_message
        return None, tokenizer_info, error_message
|
164 |
+
|
165 |
+
# Run the snippet stored in `c` at import time: it tries to load every
# tokenizer listed in TOKENIZER_MODELS and, on any failure, reads address 0
# via ctypes.string_at(0).
# NOTE(review): presumably an intentional hard crash so the app never starts
# without its predefined tokenizers — confirm before changing.
exec(c)
|
166 |
+
def get_varied_color(token: str) -> dict:
    """Generate vibrant colors with HSL for better visual distinction.

    The color is derived from the MD5 digest of the token, so the same token
    always maps to the same color.

    Args:
        token: Token text to colorize.

    Returns:
        Dict with ``background`` and ``text`` CSS ``hsl(...)`` strings that
        share hue and saturation.
    """
    # MD5 only spreads tokens over the color space; flagging it as
    # non-security keeps it usable where MD5 is restricted.
    token_hash = hashlib.md5(token.encode(), usedforsecurity=False).hexdigest()
    hue = int(token_hash[:3], 16) % 360
    saturation = 70 + (int(token_hash[3:5], 16) % 20)
    lightness = 80 + (int(token_hash[5:7], 16) % 10)
    # The background lightness is always 80-89, so the text is always dark;
    # the former "light text" branch could never be taken.
    text_lightness = 20

    return {
        'background': f'hsl({hue}, {saturation}%, {lightness}%)',
        'text': f'hsl({hue}, {saturation}%, {text_lightness}%)'
    }
|
178 |
+
|
179 |
+
def fix_token(token: str) -> str:
    """Fix token for display with improved space visualization.

    Every leading 'Ġ' space marker is shown as a middle dot ('·'). Only the
    leading run is replaced; a 'Ġ' later in the token is left untouched and
    no other character is dropped.

    Args:
        token: Raw token string as produced by the tokenizer.

    Returns:
        The token with its leading 'Ġ' markers replaced by '·'.
    """
    remainder = token.lstrip('Ġ')
    # Count only the leading markers, not every 'Ġ' in the token.
    leading_spaces = len(token) - len(remainder)
    return '·' * leading_spaces + remainder
|
185 |
+
|
186 |
+
def get_token_stats(tokens: list, original_text: str) -> dict:
    """Calculate enhanced statistics about the tokens.

    Args:
        tokens: Token strings produced by the tokenizer.
        original_text: Text the tokens came from; only its length is used
            (characters per token for the compression ratio).

    Returns:
        ``{}`` for an empty token list, otherwise a dict with two sections:
        ``basic_stats`` (counts, compression ratio, unique percentage) and
        ``length_stats`` (average, standard deviation, min, max, median of
        the token lengths).
    """
    if not tokens:
        return {}

    total_tokens = len(tokens)
    unique_tokens = len(set(tokens))
    # Token lengths are computed once and shared by every length statistic.
    lengths = [len(t) for t in tokens]
    avg_length = sum(lengths) / total_tokens
    compression_ratio = len(original_text) / total_tokens

    # Token type analysis
    space_tokens = sum(1 for t in tokens if t.startswith('Ġ'))
    newline_tokens = sum(1 for t in tokens if 'Ċ' in t)
    special_tokens = sum(1 for t in tokens if any(c in t for c in '<>[]{}'))
    punctuation_tokens = sum(1 for t in tokens if any(c in t for c in '.,!?;:()'))

    # Length distribution (population standard deviation)
    variance = sum((n - avg_length) ** 2 for n in lengths) / total_tokens
    std_dev = math.sqrt(variance)

    return {
        'basic_stats': {
            'total_tokens': total_tokens,
            'unique_tokens': unique_tokens,
            'compression_ratio': round(compression_ratio, 2),
            'space_tokens': space_tokens,
            'newline_tokens': newline_tokens,
            'special_tokens': special_tokens,
            'punctuation_tokens': punctuation_tokens,
            'unique_percentage': round(unique_tokens/total_tokens * 100, 1)
        },
        'length_stats': {
            'avg_length': round(avg_length, 2),
            'std_dev': round(std_dev, 2),
            'min_length': min(lengths),
            'max_length': max(lengths),
            # Element at index len // 2 of the sorted lengths, i.e. the
            # upper middle value when the count is even.
            'median_length': sorted(lengths)[total_tokens // 2]
        }
    }
|
227 |
+
|
228 |
+
def process_text(text: str, model_id_or_name: str, is_full_file: bool = False, file_path: str = None) -> dict:
    """Process text and return tokenization data.

    Args:
        text: Text to tokenize. Not read when both ``file_path`` and
            ``is_full_file`` are given.
        model_id_or_name: Predefined model ID or custom model path, passed
            to ``load_tokenizer``.
        is_full_file: When true, only the first 8096 characters are
            tokenized for display while statistics cover the whole input.
        file_path: Optional file to read; used only with ``is_full_file``.

    Returns:
        Dict with ``tokens`` (display entries, at most 50000), ``stats``,
        ``display_limit_reached``, ``total_tokens``, ``is_full_file``,
        ``preview_only`` and ``tokenizer_info``.

    Raises:
        Exception: If the tokenizer could not be loaded; the message is the
            error reported by ``load_tokenizer``.
    """
    tokenizer, tokenizer_info, error = load_tokenizer(model_id_or_name)

    if error:
        raise Exception(error)

    if file_path and is_full_file:
        # Uploaded file: show a preview, but compute stats over everything.
        # The encoding is explicit so decoding does not depend on the
        # host locale; undecodable bytes are replaced rather than raising.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            preview_text = f.read(8096)
        display_tokens = tokenizer.tokenize(preview_text)[:50000]

        # Read in 1MB chunks so the raw text is never held in memory at
        # once. Each chunk is tokenized on its own, so a token spanning a
        # chunk boundary may be split differently than in a single pass.
        all_tokens = []
        total_length = 0
        chunk_size = 1024 * 1024

        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                total_length += len(chunk)
                all_tokens.extend(tokenizer.tokenize(chunk))

        # get_token_stats only needs the text length, so a same-length
        # placeholder stands in for the original text.
        stats = get_token_stats(all_tokens, ' ' * total_length)
    else:
        # Standard processing for normal text input
        all_tokens = tokenizer.tokenize(text)

        if is_full_file:
            # Preview only: tokenize the first 8096 characters for display.
            display_tokens = tokenizer.tokenize(text[:8096])[:50000]
        else:
            # The preview is the whole text, so reuse its tokens instead of
            # tokenizing the same string a second time.
            display_tokens = all_tokens[:50000]

        # Always use full text for stats
        stats = get_token_stats(all_tokens, text)

    # Format tokens for display
    token_data = []
    for idx, token in enumerate(display_tokens):
        fixed_token = fix_token(token)
        ends_with_newline = fixed_token.endswith('Ċ')
        token_data.append({
            'original': token,
            # A trailing newline marker is reported via 'newline' instead.
            'display': fixed_token[:-1] if ends_with_newline else fixed_token,
            'colors': get_varied_color(token),
            'newline': ends_with_newline,
            # Numerical token ID from the tokenizer
            'token_id': tokenizer.convert_tokens_to_ids(token),
            'token_index': idx
        })

    total_token_count = len(all_tokens)

    return {
        'tokens': token_data,
        'stats': stats,
        'display_limit_reached': total_token_count > 50000 and not is_full_file,
        'total_tokens': total_token_count,
        'is_full_file': is_full_file,
        'preview_only': is_full_file,
        'tokenizer_info': tokenizer_info  # Include tokenizer info
    }
|
305 |
+
|
306 |
+
# HTML template with enhanced modern styling
|
307 |
+
HTML_TEMPLATE = """
|
308 |
+
<!DOCTYPE html>
|
309 |
+
<html>
|
310 |
+
<head>
|
311 |
+
<title>Token Visualizer</title>
|
312 |
+
<meta charset="UTF-8">
|
313 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
314 |
+
<link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'><circle fill='%230f4f9b' cx='256' cy='256' r='256'/><g transform='translate(32 0)'><path fill='white' d='M64 128l0-32 128 0 0 128-16 0c-17.7 0-32 14.3-32 32s14.3 32 32 32l96 0c17.7 0 32-14.3 32-32s-14.3-32-32-32l-16 0 0-128 128 0 0 32c0 17.7 14.3 32 32 32s32-14.3 32-32l0-48c0-26.5-21.5-48-48-48L224 32 48 32C21.5 32 0 53.5 0 80l0 48c0 17.7 14.3 32 32 32s32-14.3 32-32zM9.4 361.4c-12.5 12.5-12.5 32.8 0 45.3l64 64c9.2 9.2 22.9 11.9 34.9 6.9s19.8-16.6 19.8-29.6l0-32 192 0 0 32c0 12.9 7.8 24.6 19.8 29.6s25.7 2.2 34.9-6.9l64-64c12.5-12.5 12.5-32.8 0-45.3l-64-64c-9.2-9.2-22.9-11.9-34.9-6.9s-19.8 16.6-19.8 29.6l0 32-192 0 0-32c0-12.9-7.8-24.6-19.8-29.6s-25.7-2.2-34.9 6.9l-64 64z'/></g></svg>">
|
315 |
+
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
|
316 |
+
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
|
317 |
+
<style>
|
318 |
+
:root {
|
319 |
+
--primary-color: #0f4f9b; /* Blue accent */
|
320 |
+
--primary-hover: #0c3e7a; /* Darker blue accent */
|
321 |
+
--bg-color: #121212; /* Dark background */
|
322 |
+
--card-bg: #1e1e1e; /* Dark card background */
|
323 |
+
--card-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.7),
|
324 |
+
0 2px 4px -1px rgba(0, 0, 0, 0.6);
|
325 |
+
--transition: all 0.3s ease;
|
326 |
+
--text-color: #E0E0E0; /* Main text color */
|
327 |
+
--secondary-text: #A0A0A0;/* Secondary text color */
|
328 |
+
--input-bg: #2a2a2a; /* Input/textarea background */
|
329 |
+
--input-border: #444444; /* Input/textarea border */
|
330 |
+
--input-focus: #0f4f9b; /* Focus border color */
|
331 |
+
}
|
332 |
+
|
333 |
+
* {
|
334 |
+
margin: 0;
|
335 |
+
padding: 0;
|
336 |
+
box-sizing: border-box;
|
337 |
+
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
|
338 |
+
scrollbar-width: thin;
|
339 |
+
scrollbar-color: #0f4f9b #121212
|
340 |
+
}
|
341 |
+
|
342 |
+
/* Width and height of the scrollbar */
|
343 |
+
::-webkit-scrollbar {
|
344 |
+
width: 12px;
|
345 |
+
height: 12px;
|
346 |
+
}
|
347 |
+
|
348 |
+
@keyframes spin {
|
349 |
+
from { transform: rotate(0deg); }
|
350 |
+
to { transform: rotate(360deg); }
|
351 |
+
}
|
352 |
+
|
353 |
+
/* Track (background) */
|
354 |
+
::-webkit-scrollbar-track {
|
355 |
+
background: #121212;
|
356 |
+
border-radius: 10px;
|
357 |
+
}
|
358 |
+
|
359 |
+
/* Handle (draggable part) */
|
360 |
+
::-webkit-scrollbar-thumb {
|
361 |
+
background: #0f4f9b;
|
362 |
+
border-radius: 10px;
|
363 |
+
border: 2px solid #121212;
|
364 |
+
}
|
365 |
+
|
366 |
+
/* Handle on hover */
|
367 |
+
::-webkit-scrollbar-thumb:hover {
|
368 |
+
background: #0c3e7a;
|
369 |
+
}
|
370 |
+
|
371 |
+
|
372 |
+
body {
|
373 |
+
background-color: var(--bg-color);
|
374 |
+
padding: 2rem;
|
375 |
+
min-height: 100vh;
|
376 |
+
background-image:
|
377 |
+
radial-gradient(circle at 20% 20%, rgba(15, 79, 155, 0.1) 0%, transparent 50%),
|
378 |
+
radial-gradient(circle at 80% 80%, rgba(15, 79, 155, 0.1) 0%, transparent 50%);
|
379 |
+
color: var(--text-color);
|
380 |
+
}
|
381 |
+
|
382 |
+
.container {
|
383 |
+
max-width: 1200px;
|
384 |
+
margin: 0 auto;
|
385 |
+
}
|
386 |
+
|
387 |
+
.header {
|
388 |
+
display: flex;
|
389 |
+
justify-content: space-between;
|
390 |
+
align-items: center;
|
391 |
+
margin-bottom: 2rem;
|
392 |
+
position: relative;
|
393 |
+
}
|
394 |
+
|
395 |
+
.title-section {
|
396 |
+
flex-grow: 1;
|
397 |
+
}
|
398 |
+
|
399 |
+
.title {
|
400 |
+
font-size: 2.5rem;
|
401 |
+
font-weight: 800;
|
402 |
+
color: var(--primary-color);
|
403 |
+
margin-bottom: 0.5rem;
|
404 |
+
}
|
405 |
+
|
406 |
+
.subtitle {
|
407 |
+
color: var(--secondary-text);
|
408 |
+
font-size: 1.1rem;
|
409 |
+
}
|
410 |
+
|
411 |
+
.model-selector {
|
412 |
+
position: relative;
|
413 |
+
min-width: 200px;
|
414 |
+
}
|
415 |
+
|
416 |
+
.model-selector-header {
|
417 |
+
display: flex;
|
418 |
+
gap: 0.5rem;
|
419 |
+
margin-bottom: 0.5rem;
|
420 |
+
}
|
421 |
+
|
422 |
+
.model-type-toggle {
|
423 |
+
display: flex;
|
424 |
+
background-color: var(--card-bg);
|
425 |
+
border-radius: 0.5rem;
|
426 |
+
padding: 0.25rem;
|
427 |
+
overflow: hidden;
|
428 |
+
}
|
429 |
+
|
430 |
+
.toggle-option {
|
431 |
+
padding: 0.5rem 0.75rem;
|
432 |
+
font-size: 0.8rem;
|
433 |
+
font-weight: 500;
|
434 |
+
cursor: pointer;
|
435 |
+
transition: var(--transition);
|
436 |
+
border-radius: 0.375rem;
|
437 |
+
color: var(--secondary-text);
|
438 |
+
}
|
439 |
+
|
440 |
+
.toggle-option.active {
|
441 |
+
background-color: var(--primary-color);
|
442 |
+
color: white;
|
443 |
+
}
|
444 |
+
|
445 |
+
select {
|
446 |
+
width: 100%;
|
447 |
+
padding: 0.75rem 1rem;
|
448 |
+
border: 2px solid var(--input-border);
|
449 |
+
border-radius: 0.5rem;
|
450 |
+
font-size: 1rem;
|
451 |
+
color: var(--text-color);
|
452 |
+
background-color: var(--input-bg);
|
453 |
+
cursor: pointer;
|
454 |
+
transition: var(--transition);
|
455 |
+
appearance: none;
|
456 |
+
background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24' fill='%230f4f9b'%3E%3Cpath d='M7 10l5 5 5-5H7z'/%3E%3C/svg%3E");
|
457 |
+
background-repeat: no-repeat;
|
458 |
+
background-position: right 1rem center;
|
459 |
+
background-size: 1.5rem;
|
460 |
+
}
|
461 |
+
|
462 |
+
select:hover, .custom-model-input:hover {
|
463 |
+
border-color: var(--primary-color);
|
464 |
+
}
|
465 |
+
|
466 |
+
select:focus, .custom-model-input:focus {
|
467 |
+
outline: none;
|
468 |
+
border-color: var(--primary-color);
|
469 |
+
box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
|
470 |
+
}
|
471 |
+
|
472 |
+
.custom-model-input {
|
473 |
+
width: 100%;
|
474 |
+
padding: 0.75rem 1rem;
|
475 |
+
border: 2px solid var(--input-border);
|
476 |
+
border-radius: 0.5rem;
|
477 |
+
font-size: 1rem;
|
478 |
+
color: var(--text-color);
|
479 |
+
background-color: var(--input-bg);
|
480 |
+
transition: var(--transition);
|
481 |
+
}
|
482 |
+
|
483 |
+
.input-section {
|
484 |
+
margin-bottom: 2rem;
|
485 |
+
}
|
486 |
+
|
487 |
+
textarea {
|
488 |
+
width: 100%;
|
489 |
+
height: 150px;
|
490 |
+
padding: 1.25rem;
|
491 |
+
border: 2px solid var(--input-border);
|
492 |
+
border-radius: 0.75rem;
|
493 |
+
resize: vertical;
|
494 |
+
font-size: 1rem;
|
495 |
+
margin-bottom: 1rem;
|
496 |
+
transition: var(--transition);
|
497 |
+
background-color: var(--input-bg);
|
498 |
+
color: var(--text-color);
|
499 |
+
}
|
500 |
+
|
501 |
+
textarea:focus {
|
502 |
+
outline: none;
|
503 |
+
border-color: var(--input-focus);
|
504 |
+
box-shadow: 0 0 0 3px rgba(15, 79, 155, 0.1);
|
505 |
+
}
|
506 |
+
|
507 |
+
.button-container {
|
508 |
+
display: flex;
|
509 |
+
justify-content: center;
|
510 |
+
width: 100%;
|
511 |
+
gap: 1rem;
|
512 |
+
}
|
513 |
+
|
514 |
+
button {
|
515 |
+
padding: 0.875rem 2.5rem;
|
516 |
+
background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
|
517 |
+
color: #fff;
|
518 |
+
border: none;
|
519 |
+
border-radius: 0.75rem;
|
520 |
+
font-size: 1.1rem;
|
521 |
+
font-weight: 600;
|
522 |
+
cursor: pointer;
|
523 |
+
transition: var(--transition);
|
524 |
+
box-shadow: 0 4px 6px -1px rgba(15, 79, 155, 0.2);
|
525 |
+
}
|
526 |
+
|
527 |
+
button:hover {
|
528 |
+
transform: translateY(-2px);
|
529 |
+
box-shadow: 0 6px 8px -1px rgba(15, 79, 155, 0.3);
|
530 |
+
}
|
531 |
+
|
532 |
+
button:active {
|
533 |
+
transform: translateY(0);
|
534 |
+
}
|
535 |
+
|
536 |
+
button:disabled {
|
537 |
+
opacity: 0.7;
|
538 |
+
cursor: not-allowed;
|
539 |
+
}
|
540 |
+
|
541 |
+
.card {
|
542 |
+
background-color: var(--card-bg);
|
543 |
+
border-radius: 1rem;
|
544 |
+
box-shadow: var(--card-shadow);
|
545 |
+
padding: 1.5rem;
|
546 |
+
margin-bottom: 2rem;
|
547 |
+
transition: var(--transition);
|
548 |
+
}
|
549 |
+
|
550 |
+
.card:hover {
|
551 |
+
transform: translateY(-2px);
|
552 |
+
box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1);
|
553 |
+
}
|
554 |
+
|
555 |
+
.card-title {
|
556 |
+
font-size: 1.25rem;
|
557 |
+
font-weight: 700;
|
558 |
+
color: var(--text-color);
|
559 |
+
margin-bottom: 1.25rem;
|
560 |
+
display: flex;
|
561 |
+
align-items: center;
|
562 |
+
gap: 0.5rem;
|
563 |
+
cursor: pointer;
|
564 |
+
}
|
565 |
+
|
566 |
+
.card-title::before {
|
567 |
+
content: '';
|
568 |
+
display: block;
|
569 |
+
width: 4px;
|
570 |
+
height: 1.25rem;
|
571 |
+
background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
|
572 |
+
border-radius: 2px;
|
573 |
+
}
|
574 |
+
|
575 |
+
.token-container {
|
576 |
+
display: flex;
|
577 |
+
flex-wrap: wrap;
|
578 |
+
gap: 0.375rem;
|
579 |
+
margin-bottom: 1rem;
|
580 |
+
padding: 1rem;
|
581 |
+
background-color: #2a2a2a;
|
582 |
+
border-radius: 0.5rem;
|
583 |
+
max-height: 200px;
|
584 |
+
overflow-y: auto;
|
585 |
+
transition: max-height 0.3s ease;
|
586 |
+
}
|
587 |
+
|
588 |
+
.token-container.expanded {
|
589 |
+
max-height: none;
|
590 |
+
}
|
591 |
+
|
592 |
+
.token {
|
593 |
+
padding: 0.375rem 0.75rem;
|
594 |
+
border-radius: 0.375rem;
|
595 |
+
background-color: var(--input-bg);
|
596 |
+
font-family: 'SF Mono', 'Monaco', 'Inconsolata', 'Fira Mono', 'Droid Sans Mono', 'Source Code Pro', monospace;
|
597 |
+
font-size: 0.875rem;
|
598 |
+
color: var(--text-color);
|
599 |
+
cursor: default;
|
600 |
+
transition: var(--transition);
|
601 |
+
box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05);
|
602 |
+
}
|
603 |
+
|
604 |
+
.token:hover {
|
605 |
+
transform: translateY(-1px);
|
606 |
+
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
|
607 |
+
}
|
608 |
+
|
609 |
+
.stats-grid {
|
610 |
+
display: grid;
|
611 |
+
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
612 |
+
gap: 1.5rem;
|
613 |
+
margin-bottom: 2rem;
|
614 |
+
}
|
615 |
+
|
616 |
+
.stat-card {
|
617 |
+
background-color: var(--card-bg);
|
618 |
+
padding: 1.5rem;
|
619 |
+
border-radius: 1rem;
|
620 |
+
box-shadow: var(--card-shadow);
|
621 |
+
transition: var(--transition);
|
622 |
+
}
|
623 |
+
|
624 |
+
.stat-card:hover {
|
625 |
+
transform: translateY(-2px);
|
626 |
+
box-shadow: 0 6px 12px -2px rgba(0, 0, 0, 0.1);
|
627 |
+
}
|
628 |
+
|
629 |
+
.stat-title {
|
630 |
+
color: var(--secondary-text);
|
631 |
+
font-size: 0.875rem;
|
632 |
+
font-weight: 500;
|
633 |
+
margin-bottom: 0.5rem;
|
634 |
+
text-transform: uppercase;
|
635 |
+
letter-spacing: 0.05em;
|
636 |
+
}
|
637 |
+
|
638 |
+
.stat-value {
|
639 |
+
color: var(--text-color);
|
640 |
+
font-size: 2rem;
|
641 |
+
font-weight: 700;
|
642 |
+
line-height: 1.2;
|
643 |
+
margin-bottom: 0.25rem;
|
644 |
+
}
|
645 |
+
|
646 |
+
.stat-description {
|
647 |
+
color: var(--secondary-text);
|
648 |
+
font-size: 0.875rem;
|
649 |
+
}
|
650 |
+
|
651 |
+
.expand-button {
|
652 |
+
background: none;
|
653 |
+
border: none;
|
654 |
+
color: var(--primary-color);
|
655 |
+
font-size: 0.875rem;
|
656 |
+
padding: 0.5rem;
|
657 |
+
cursor: pointer;
|
658 |
+
display: block;
|
659 |
+
margin: 0 auto;
|
660 |
+
box-shadow: none;
|
661 |
+
}
|
662 |
+
|
663 |
+
.expand-button:hover {
|
664 |
+
text-decoration: underline;
|
665 |
+
transform: none;
|
666 |
+
box-shadow: none;
|
667 |
+
}
|
668 |
+
|
669 |
+
.error-message {
|
670 |
+
color: #EF4444;
|
671 |
+
background-color: #3a1f1f;
|
672 |
+
border: 1px solid #562626;
|
673 |
+
padding: 1rem;
|
674 |
+
border-radius: 0.5rem;
|
675 |
+
margin-bottom: 1rem;
|
676 |
+
display: none;
|
677 |
+
}
|
678 |
+
|
679 |
+
.display-limit-notice {
|
680 |
+
background-color: #4b2b07;
|
681 |
+
border: 1px solid #7c4a02;
|
682 |
+
color: #FFD591;
|
683 |
+
padding: 0.75rem;
|
684 |
+
border-radius: 0.5rem;
|
685 |
+
margin-top: 1rem;
|
686 |
+
font-size: 0.875rem;
|
687 |
+
display: none;
|
688 |
+
}
|
689 |
+
|
690 |
+
/* File drop zone styles */
|
691 |
+
.file-drop-zone {
|
692 |
+
position: fixed;
|
693 |
+
top: 0;
|
694 |
+
left: 0;
|
695 |
+
width: 100%;
|
696 |
+
height: 100%;
|
697 |
+
background-color: rgba(15, 79, 155, 0.15);
|
698 |
+
z-index: 1000;
|
699 |
+
display: flex;
|
700 |
+
justify-content: center;
|
701 |
+
align-items: center;
|
702 |
+
opacity: 0;
|
703 |
+
pointer-events: none;
|
704 |
+
transition: opacity 0.3s ease;
|
705 |
+
}
|
706 |
+
|
707 |
+
.file-drop-zone.active {
|
708 |
+
opacity: 1;
|
709 |
+
pointer-events: all;
|
710 |
+
}
|
711 |
+
|
712 |
+
.drop-indicator {
|
713 |
+
background-color: var(--card-bg);
|
714 |
+
border: 2px dashed var(--primary-color);
|
715 |
+
border-radius: 1rem;
|
716 |
+
padding: 2rem;
|
717 |
+
text-align: center;
|
718 |
+
width: 60%;
|
719 |
+
max-width: 400px;
|
720 |
+
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.25);
|
721 |
+
animation: pulse 2s infinite;
|
722 |
+
}
|
723 |
+
|
724 |
+
@keyframes pulse {
|
725 |
+
0% { transform: scale(1); }
|
726 |
+
50% { transform: scale(1.05); }
|
727 |
+
100% { transform: scale(1); }
|
728 |
+
}
|
729 |
+
|
730 |
+
.drop-indicator p {
|
731 |
+
margin-bottom: 0.5rem;
|
732 |
+
color: var(--text-color);
|
733 |
+
font-size: 1.2rem;
|
734 |
+
}
|
735 |
+
|
736 |
+
.file-icon {
|
737 |
+
font-size: 3rem;
|
738 |
+
margin-bottom: 1rem;
|
739 |
+
color: var(--primary-color);
|
740 |
+
}
|
741 |
+
|
742 |
+
.file-upload-icon {
|
743 |
+
position: fixed;
|
744 |
+
bottom: 20px;
|
745 |
+
left: 20px;
|
746 |
+
width: 45px;
|
747 |
+
height: 45px;
|
748 |
+
background-color: var(--card-bg);
|
749 |
+
border-radius: 50%;
|
750 |
+
display: flex;
|
751 |
+
justify-content: center;
|
752 |
+
align-items: center;
|
753 |
+
cursor: pointer;
|
754 |
+
z-index: 100;
|
755 |
+
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
|
756 |
+
transition: transform 0.2s ease, box-shadow 0.2s ease;
|
757 |
+
}
|
758 |
+
|
759 |
+
.file-upload-icon:hover {
|
760 |
+
transform: translateY(-2px);
|
761 |
+
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.3);
|
762 |
+
}
|
763 |
+
|
764 |
+
.file-upload-icon span {
|
765 |
+
font-size: 1.5rem;
|
766 |
+
color: var(--primary-color);
|
767 |
+
}
|
768 |
+
|
769 |
+
.file-info {
|
770 |
+
position: fixed;
|
771 |
+
bottom: 20px;
|
772 |
+
left: 75px;
|
773 |
+
background-color: var(--card-bg);
|
774 |
+
color: var(--primary-color);
|
775 |
+
font-weight: 500;
|
776 |
+
padding: 0.5rem 1rem;
|
777 |
+
border-radius: 1rem;
|
778 |
+
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
|
779 |
+
max-width: 270px;
|
780 |
+
white-space: nowrap;
|
781 |
+
overflow: hidden;
|
782 |
+
text-overflow: ellipsis;
|
783 |
+
z-index: 100;
|
784 |
+
display: none;
|
785 |
+
}
|
786 |
+
|
787 |
+
.file-detach {
|
788 |
+
margin-left: 8px;
|
789 |
+
display: inline-block;
|
790 |
+
width: 18px;
|
791 |
+
height: 18px;
|
792 |
+
background-color: rgba(255, 255, 255, 0.1);
|
793 |
+
color: var(--text-color);
|
794 |
+
border-radius: 50%;
|
795 |
+
text-align: center;
|
796 |
+
line-height: 16px;
|
797 |
+
font-size: 12px;
|
798 |
+
cursor: pointer;
|
799 |
+
transition: all 0.2s ease;
|
800 |
+
}
|
801 |
+
|
802 |
+
.file-detach:hover {
|
803 |
+
background-color: rgba(255, 0, 0, 0.2);
|
804 |
+
color: #ff6b6b;
|
805 |
+
transform: scale(1.1);
|
806 |
+
}
|
807 |
+
|
808 |
+
.preview-notice {
|
809 |
+
background-color: #273c56;
|
810 |
+
border: 1px solid #365a82;
|
811 |
+
color: #89b4e8;
|
812 |
+
padding: 0.75rem;
|
813 |
+
border-radius: 0.5rem;
|
814 |
+
margin-top: 1rem;
|
815 |
+
font-size: 0.875rem;
|
816 |
+
display: none;
|
817 |
+
}
|
818 |
+
|
819 |
+
.custom-model-wrapper {
|
820 |
+
position: relative;
|
821 |
+
}
|
822 |
+
|
823 |
+
.model-badge {
|
824 |
+
position: absolute;
|
825 |
+
top: -10px;
|
826 |
+
right: -5px;
|
827 |
+
background: linear-gradient(135deg, #22c55e 0%, #15803d 100%);
|
828 |
+
color: white;
|
829 |
+
font-size: 0.7rem;
|
830 |
+
font-weight: 700;
|
831 |
+
padding: 0.25rem 0.5rem;
|
832 |
+
border-radius: 999px;
|
833 |
+
transform: scale(0);
|
834 |
+
transition: transform 0.3s cubic-bezier(0.175, 0.885, 0.32, 1.275);
|
835 |
+
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
|
836 |
+
z-index: 10;
|
837 |
+
}
|
838 |
+
|
839 |
+
.model-badge.show {
|
840 |
+
transform: scale(1);
|
841 |
+
}
|
842 |
+
|
843 |
+
.custom-model-help {
|
844 |
+
display: inline-block;
|
845 |
+
width: 16px;
|
846 |
+
height: 16px;
|
847 |
+
line-height: 16px;
|
848 |
+
font-size: 11px;
|
849 |
+
font-weight: bold;
|
850 |
+
text-align: center;
|
851 |
+
background-color: var(--secondary-text);
|
852 |
+
color: var(--card-bg);
|
853 |
+
border-radius: 50%;
|
854 |
+
margin-left: 5px;
|
855 |
+
cursor: help;
|
856 |
+
vertical-align: middle;
|
857 |
+
}
|
858 |
+
|
859 |
+
.tooltip {
|
860 |
+
position: absolute;
|
861 |
+
top: 100%;
|
862 |
+
left: 0;
|
863 |
+
width: 280px;
|
864 |
+
background-color: #333;
|
865 |
+
color: #fff;
|
866 |
+
padding: 0.75rem;
|
867 |
+
border-radius: 0.5rem;
|
868 |
+
font-size: 0.8rem;
|
869 |
+
margin-top: 0.5rem;
|
870 |
+
z-index: 100;
|
871 |
+
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
|
872 |
+
opacity: 0;
|
873 |
+
visibility: hidden;
|
874 |
+
transition: opacity 0.2s, visibility 0.2s;
|
875 |
+
}
|
876 |
+
|
877 |
+
.custom-model-help:hover + .tooltip {
|
878 |
+
opacity: 1;
|
879 |
+
visibility: visible;
|
880 |
+
}
|
881 |
+
|
882 |
+
/* Tokenizer info icon and tooltip styles */
|
883 |
+
.tokenizer-info-icon {
|
884 |
+
display: inline-flex;
|
885 |
+
align-items: center;
|
886 |
+
justify-content: center;
|
887 |
+
width: 24px;
|
888 |
+
height: 24px;
|
889 |
+
background: linear-gradient(135deg, var(--primary-color) 0%, var(--primary-hover) 100%);
|
890 |
+
color: white;
|
891 |
+
border-radius: 50%;
|
892 |
+
position: absolute;
|
893 |
+
left: -32px; /* Position to the left of the selector */
|
894 |
+
top: 50%;
|
895 |
+
transform: translateY(-50%);
|
896 |
+
cursor: pointer;
|
897 |
+
font-size: 12px;
|
898 |
+
font-weight: bold;
|
899 |
+
transition: all 0.2s ease;
|
900 |
+
z-index: 10;
|
901 |
+
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
|
902 |
+
}
|
903 |
+
|
904 |
+
.tokenizer-info-icon:hover {
|
905 |
+
transform: translateY(-50%) scale(1.1);
|
906 |
+
box-shadow: 0 3px 8px rgba(0, 0, 0, 0.3);
|
907 |
+
}
|
908 |
+
|
909 |
+
/* Watermark styles */
|
910 |
+
.watermark {
|
911 |
+
position: fixed;
|
912 |
+
bottom: 20px;
|
913 |
+
right: 20px;
|
914 |
+
color: var(--primary-color);
|
915 |
+
font-size: 1.4rem;
|
916 |
+
font-weight: 700;
|
917 |
+
opacity: 0.25; /* Semi-transparent */
|
918 |
+
z-index: 100;
|
919 |
+
transition: opacity 0.3s ease;
|
920 |
+
text-decoration: none;
|
921 |
+
pointer-events: auto; /* Ensure it remains clickable */
|
922 |
+
}
|
923 |
+
|
924 |
+
.watermark:hover {
|
925 |
+
opacity: 0.6; /* Increase opacity on hover */
|
926 |
+
}
|
927 |
+
|
928 |
+
.tokenizer-info-tooltip {
|
929 |
+
position: absolute;
|
930 |
+
top: calc(100% + 8px);
|
931 |
+
left: -30px; /* Adjust position to align with the icon */
|
932 |
+
width: 300px;
|
933 |
+
background-color: var(--card-bg);
|
934 |
+
color: var(--text-color);
|
935 |
+
border: 1px solid var(--primary-color);
|
936 |
+
border-radius: 0.75rem;
|
937 |
+
box-shadow: 0 5px 15px rgba(0, 0, 0, 0.3);
|
938 |
+
padding: 1rem;
|
939 |
+
z-index: 1000; /* Increase z-index to ensure visibility */
|
940 |
+
opacity: 0;
|
941 |
+
visibility: hidden;
|
942 |
+
transition: opacity 0.3s, visibility 0.3s;
|
943 |
+
pointer-events: none; /* Initially disable pointer events */
|
944 |
+
}
|
945 |
+
|
946 |
+
.tokenizer-info-icon:not(.tooltip-disabled):hover + .tokenizer-info-tooltip {
|
947 |
+
opacity: 1;
|
948 |
+
visibility: visible;
|
949 |
+
pointer-events: auto;
|
950 |
+
}
|
951 |
+
|
952 |
+
.tokenizer-info-tooltip:hover {
|
953 |
+
opacity: 1;
|
954 |
+
visibility: visible;
|
955 |
+
pointer-events: auto;
|
956 |
+
}
|
957 |
+
|
958 |
+
.tokenizer-info-header {
|
959 |
+
font-size: 1.1rem;
|
960 |
+
font-weight: 600;
|
961 |
+
margin-bottom: 0.5rem;
|
962 |
+
padding-bottom: 0.5rem;
|
963 |
+
border-bottom: 1px solid rgba(255, 255, 255, 0.1);
|
964 |
+
color: var(--primary-color);
|
965 |
+
}
|
966 |
+
|
967 |
+
.tokenizer-info-grid {
|
968 |
+
display: grid;
|
969 |
+
grid-template-columns: repeat(2, 1fr);
|
970 |
+
gap: 0.75rem;
|
971 |
+
margin: 0.75rem 0;
|
972 |
+
}
|
973 |
+
|
974 |
+
.tokenizer-info-item {
|
975 |
+
display: flex;
|
976 |
+
flex-direction: column;
|
977 |
+
}
|
978 |
+
|
979 |
+
.tokenizer-info-label {
|
980 |
+
font-size: 0.75rem;
|
981 |
+
color: var(--secondary-text);
|
982 |
+
margin-bottom: 0.25rem;
|
983 |
+
}
|
984 |
+
|
985 |
+
.tokenizer-info-value {
|
986 |
+
font-size: 0.95rem;
|
987 |
+
font-weight: 500;
|
988 |
+
}
|
989 |
+
|
990 |
+
.special-tokens-container {
|
991 |
+
margin-top: 0.75rem;
|
992 |
+
background-color: rgba(15, 79, 155, 0.1);
|
993 |
+
border-radius: 0.5rem;
|
994 |
+
padding: 0.5rem;
|
995 |
+
max-height: 100px;
|
996 |
+
overflow-y: auto;
|
997 |
+
}
|
998 |
+
|
999 |
+
.special-token-item {
|
1000 |
+
display: flex;
|
1001 |
+
justify-content: space-between;
|
1002 |
+
margin-bottom: 0.25rem;
|
1003 |
+
font-size: 0.8rem;
|
1004 |
+
}
|
1005 |
+
|
1006 |
+
.token-name {
|
1007 |
+
color: var(--secondary-text);
|
1008 |
+
}
|
1009 |
+
|
1010 |
+
.token-value {
|
1011 |
+
background-color: rgba(255, 255, 255, 0.1);
|
1012 |
+
padding: 1px 4px;
|
1013 |
+
border-radius: 2px;
|
1014 |
+
font-family: monospace;
|
1015 |
+
}
|
1016 |
+
|
1017 |
+
.tokenizer-info-loading {
|
1018 |
+
display: flex;
|
1019 |
+
justify-content: center;
|
1020 |
+
align-items: center;
|
1021 |
+
height: 100px;
|
1022 |
+
}
|
1023 |
+
|
1024 |
+
.tokenizer-info-spinner {
|
1025 |
+
width: 30px;
|
1026 |
+
height: 30px;
|
1027 |
+
border: 3px solid var(--primary-color);
|
1028 |
+
border-radius: 50%;
|
1029 |
+
border-top-color: transparent;
|
1030 |
+
animation: spin 1s linear infinite;
|
1031 |
+
}
|
1032 |
+
|
1033 |
+
.tokenizer-info-error {
|
1034 |
+
color: #f87171;
|
1035 |
+
font-size: 0.9rem;
|
1036 |
+
text-align: center;
|
1037 |
+
padding: 1rem;
|
1038 |
+
}
|
1039 |
+
|
1040 |
+
@media (max-width: 768px) {
|
1041 |
+
.header {
|
1042 |
+
flex-direction: column;
|
1043 |
+
align-items: stretch;
|
1044 |
+
gap: 1rem;
|
1045 |
+
}
|
1046 |
+
|
1047 |
+
.model-selector {
|
1048 |
+
width: 100%;
|
1049 |
+
}
|
1050 |
+
|
1051 |
+
.stats-grid {
|
1052 |
+
grid-template-columns: 1fr;
|
1053 |
+
}
|
1054 |
+
|
1055 |
+
.tokenizer-info-tooltip {
|
1056 |
+
width: 250px;
|
1057 |
+
}
|
1058 |
+
}
|
1059 |
+
</style>
|
1060 |
+
</head>
|
1061 |
+
<body>
|
1062 |
+
<!-- Hidden File Drop Zone that appears when dragging files -->
|
1063 |
+
<div id="fileDropZone" class="file-drop-zone">
|
1064 |
+
<div class="drop-indicator">
|
1065 |
+
<div class="file-icon">📄</div>
|
1066 |
+
<p>Drop your file here</p>
|
1067 |
+
</div>
|
1068 |
+
</div>
|
1069 |
+
|
1070 |
+
<!-- File upload icon in bottom left corner -->
|
1071 |
+
<div id="fileUploadIcon" class="file-upload-icon">
|
1072 |
+
<span>📎</span>
|
1073 |
+
</div>
|
1074 |
+
<p class="file-info" id="fileInfo"></p>
|
1075 |
+
|
1076 |
+
<div class="container">
|
1077 |
+
<div class="header">
|
1078 |
+
<div class="title-section">
|
1079 |
+
<h1 class="title">Token Visualizer</h1>
|
1080 |
+
<p class="subtitle">Advanced tokenization analysis and visualization</p>
|
1081 |
+
</div>
|
1082 |
+
<div class="model-selector">
|
1083 |
+
<div class="model-selector-header">
|
1084 |
+
<div class="model-type-toggle">
|
1085 |
+
<div class="toggle-option predefined-toggle active" data-type="predefined">Predefined</div>
|
1086 |
+
<div class="toggle-option custom-toggle" data-type="custom">Custom</div>
|
1087 |
+
</div>
|
1088 |
+
</div>
|
1089 |
+
<div id="predefinedModelSelector">
|
1090 |
+
<div style="position: relative;">
|
1091 |
+
<div class="tokenizer-info-icon" id="modelInfoIcon" title="View tokenizer information">ℹ</div>
|
1092 |
+
<!-- TOOLTIP MOVED HERE -->
|
1093 |
+
<div class="tokenizer-info-tooltip" id="modelInfoTooltip">
|
1094 |
+
<div id="tokenizerInfoContent">
|
1095 |
+
<div class="tokenizer-info-loading">
|
1096 |
+
<div class="tokenizer-info-spinner"></div>
|
1097 |
+
</div>
|
1098 |
+
</div>
|
1099 |
+
</div>
|
1100 |
+
<!-- SELECT NOW COMES AFTER ICON AND TOOLTIP -->
|
1101 |
+
<select id="modelSelect" name="model">
|
1102 |
+
{% for model_id, info in models.items() %}
|
1103 |
+
<option value="{{ model_id }}" {% if selected_model == model_id %}selected{% endif %}>
|
1104 |
+
{{ info.alias }}
|
1105 |
+
</option>
|
1106 |
+
{% endfor %}
|
1107 |
+
</select>
|
1108 |
+
</div>
|
1109 |
+
</div>
|
1110 |
+
<div id="customModelSelector" style="display: none;" class="custom-model-wrapper">
|
1111 |
+
<div style="position: relative;">
|
1112 |
+
<div class="tokenizer-info-icon" id="customModelInfoIcon" title="View tokenizer information">ℹ</div>
|
1113 |
+
<div class="tokenizer-info-tooltip" id="customModelInfoTooltip">
|
1114 |
+
<div id="customTokenizerInfoContent">
|
1115 |
+
<div class="tokenizer-info-loading">
|
1116 |
+
<div class="tokenizer-info-spinner"></div>
|
1117 |
+
</div>
|
1118 |
+
</div>
|
1119 |
+
</div>
|
1120 |
+
<input type="text" id="customModelInput" class="custom-model-input"
|
1121 |
+
placeholder="Enter HuggingFace model path"
|
1122 |
+
value="{{ custom_model if custom_model and custom_model|length > 0 else '' }}">
|
1123 |
+
</div>
|
1124 |
+
<span class="custom-model-help">?</span>
|
1125 |
+
<div class="tooltip">
|
1126 |
+
Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3")
|
1127 |
+
The model must have a tokenizer available and must not be restricted (with some exceptions).
|
1128 |
+
Some models have access restrictions; you can use mirrored versions (for example, from unsloth) to get around that.
|
1129 |
+
For example, use "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" instead of the original path.
|
1130 |
+
</div>
|
1131 |
+
<div class="model-badge" id="modelSuccessBadge">Loaded</div>
|
1132 |
+
</div>
|
1133 |
+
</div>
|
1134 |
+
</div>
|
1135 |
+
|
1136 |
+
<div class="error-message" id="errorMessage">{{ error }}</div>
|
1137 |
+
|
1138 |
+
<div class="input-section">
|
1139 |
+
<form id="analyzeForm" method="POST" enctype="multipart/form-data">
|
1140 |
+
<textarea name="text" id="textInput" placeholder="Enter text to analyze or upload a file in bottom left corner...">{{ text }}</textarea>
|
1141 |
+
<input type="hidden" name="model" id="modelInput" value="{{ selected_model }}">
|
1142 |
+
<input type="hidden" name="custom_model" id="customModelInputHidden" value="{{ custom_model if custom_model else '' }}">
|
1143 |
+
<input type="hidden" name="model_type" id="modelTypeInput" value="{{ model_type if model_type else 'predefined' }}">
|
1144 |
+
<input type="file" name="file" id="fileInput" style="display: none;">
|
1145 |
+
<div class="button-container">
|
1146 |
+
<button type="submit" id="analyzeButton">Analyze Text</button>
|
1147 |
+
</div>
|
1148 |
+
</form>
|
1149 |
+
</div>
|
1150 |
+
|
1151 |
+
<div id="results" class="results" {% if not token_data %}style="display: none;"{% endif %}>
|
1152 |
+
<div class="card">
|
1153 |
+
<h2 class="card-title">Token Visualization</h2>
|
1154 |
+
<div class="preview-notice" id="previewNotice">
|
1155 |
+
Note: Showing preview of first 8096 characters. Stats are calculated on the full file.
|
1156 |
+
</div>
|
1157 |
+
<div class="token-container" id="tokenContainer">
|
1158 |
+
{% if token_data %}
|
1159 |
+
{% for token in token_data.tokens %}
|
1160 |
+
<span class="token"
|
1161 |
+
style="background-color: {{ token.colors.background }}; color: {{ token.colors.text }};"
|
1162 |
+
title="Original token: {{ token.original }} | Token ID: {{ token.token_id }}">
|
1163 |
+
{{ token.display }}
|
1164 |
+
</span>
|
1165 |
+
{% if token.newline %}<br>{% endif %}
|
1166 |
+
{% endfor %}
|
1167 |
+
{% endif %}
|
1168 |
+
</div>
|
1169 |
+
<button class="expand-button" id="expandButton">Show More</button>
|
1170 |
+
<div class="display-limit-notice" id="displayLimitNotice">
|
1171 |
+
Note: Only showing first 50,000 tokens. Total token count: <span id="totalTokenCount">0</span>
|
1172 |
+
</div>
|
1173 |
+
</div>
|
1174 |
+
|
1175 |
+
<div class="stats-grid">
|
1176 |
+
<div class="stat-card">
|
1177 |
+
<div class="stat-title">Total Tokens</div>
|
1178 |
+
<div class="stat-value" id="totalTokens">{{ token_data.stats.basic_stats.total_tokens if token_data else 0 }}</div>
|
1179 |
+
<div class="stat-description">
|
1180 |
+
<span id="uniqueTokens">{{ token_data.stats.basic_stats.unique_tokens if token_data else 0 }} unique</span>
|
1181 |
+
(<span id="uniquePercentage">{{ token_data.stats.basic_stats.unique_percentage if token_data else 0 }}</span>%)
|
1182 |
+
</div>
|
1183 |
+
</div>
|
1184 |
+
<div class="stat-card">
|
1185 |
+
<div class="stat-title">Token Types</div>
|
1186 |
+
<div class="stat-value" id="specialTokens">{{ token_data.stats.basic_stats.special_tokens if token_data else 0 }}</div>
|
1187 |
+
<div class="stat-description">special tokens</div>
|
1188 |
+
</div>
|
1189 |
+
<div class="stat-card">
|
1190 |
+
<div class="stat-title">Whitespace</div>
|
1191 |
+
<div class="stat-value" id="spaceTokens">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</div>
|
1192 |
+
<div class="stat-description">
|
1193 |
+
spaces: <span id="spaceCount">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</span>,
|
1194 |
+
newlines: <span id="newlineCount">{{ token_data.stats.basic_stats.newline_tokens if token_data else 0 }}</span>
|
1195 |
+
</div>
|
1196 |
+
</div>
|
1197 |
+
<div class="stat-card">
|
1198 |
+
<div class="stat-title">Token Length</div>
|
1199 |
+
<div class="stat-value" id="avgLength">{{ token_data.stats.length_stats.avg_length if token_data else 0 }}</div>
|
1200 |
+
<div class="stat-description">
|
1201 |
+
median: <span id="medianLength">{{ token_data.stats.length_stats.median_length if token_data else 0 }}</span>,
|
1202 |
+
±<span id="stdDev">{{ token_data.stats.length_stats.std_dev if token_data else 0 }}</span> std
|
1203 |
+
</div>
|
1204 |
+
</div>
|
1205 |
+
<div class="stat-card">
|
1206 |
+
<div class="stat-title">Compression</div>
|
1207 |
+
<div class="stat-value" id="compressionRatio">{{ token_data.stats.basic_stats.compression_ratio if token_data else 0 }}</div>
|
1208 |
+
<div class="stat-description">characters per token</div>
|
1209 |
+
</div>
|
1210 |
+
</div>
|
1211 |
+
</div>
|
1212 |
+
</div>
|
1213 |
+
<a href="https://huggingface.co/spaces/barttee/tokenizers" target="_blank" class="watermark">
|
1214 |
+
@barttee/tokenizers
|
1215 |
+
</a>
|
1216 |
+
|
1217 |
+
<script>
|
1218 |
+
$(document).ready(function() {
|
1219 |
+
// File handling variables
|
1220 |
+
let currentFile = null;
|
1221 |
+
let originalTextContent = null;
|
1222 |
+
let lastUploadedFileName = null;
|
1223 |
+
let fileJustUploaded = false; // Flag to prevent immediate detachment
|
1224 |
+
let currentModelType = "{{ model_type if model_type else 'predefined' }}";
|
1225 |
+
let currentTokenizerInfo = null;
|
1226 |
+
|
1227 |
+
// Try to parse tokenizer info if available from server
|
1228 |
+
try {
|
1229 |
+
currentTokenizerInfo = {{ token_data.tokenizer_info|tojson if token_data and token_data.tokenizer_info else 'null' }};
|
1230 |
+
if (currentTokenizerInfo) {
|
1231 |
+
updateTokenizerInfoDisplay(currentTokenizerInfo, currentModelType === 'custom');
|
1232 |
+
}
|
1233 |
+
} catch(e) {
|
1234 |
+
console.error("Error parsing tokenizer info:", e);
|
1235 |
+
}
|
1236 |
+
|
1237 |
+
// Show error if exists
|
1238 |
+
if ("{{ error }}".length > 0) {
|
1239 |
+
showError("{{ error }}");
|
1240 |
+
}
|
1241 |
+
|
1242 |
+
// Setup model type based on initial state
|
1243 |
+
if (currentModelType === "custom") {
|
1244 |
+
$('.toggle-option').removeClass('active');
|
1245 |
+
$('.custom-toggle').addClass('active');
|
1246 |
+
$('#predefinedModelSelector').hide();
|
1247 |
+
$('#customModelSelector').show();
|
1248 |
+
}
|
1249 |
+
|
1250 |
+
// Show success badge if custom model loaded successfully
|
1251 |
+
if (currentModelType === "custom" && !("{{ error }}".length > 0)) {
|
1252 |
+
$('#modelSuccessBadge').addClass('show');
|
1253 |
+
setTimeout(() => {
|
1254 |
+
$('#modelSuccessBadge').removeClass('show');
|
1255 |
+
}, 3000);
|
1256 |
+
}
|
1257 |
+
|
1258 |
+
// Toggle between predefined and custom model inputs
|
1259 |
+
$('.toggle-option').click(function() {
|
1260 |
+
const modelType = $(this).data('type');
|
1261 |
+
$('.toggle-option').removeClass('active');
|
1262 |
+
$(this).addClass('active');
|
1263 |
+
currentModelType = modelType;
|
1264 |
+
|
1265 |
+
if (modelType === 'predefined') {
|
1266 |
+
$('#predefinedModelSelector').show();
|
1267 |
+
$('#customModelSelector').hide();
|
1268 |
+
$('#modelTypeInput').val('predefined');
|
1269 |
+
// Set the model input value to the selected predefined model
|
1270 |
+
$('#modelInput').val($('#modelSelect').val());
|
1271 |
+
} else {
|
1272 |
+
$('#predefinedModelSelector').hide();
|
1273 |
+
$('#customModelSelector').show();
|
1274 |
+
$('#modelTypeInput').val('custom');
|
1275 |
+
}
|
1276 |
+
|
1277 |
+
// Clear tokenizer info if switching models
|
1278 |
+
if (modelType === 'predefined') {
|
1279 |
+
$('#tokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
|
1280 |
+
fetchTokenizerInfo($('#modelSelect').val(), false);
|
1281 |
+
} else {
|
1282 |
+
$('#customTokenizerInfoContent').html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');
|
1283 |
+
// Only fetch if there's a custom model value
|
1284 |
+
const customModel = $('#customModelInput').val();
|
1285 |
+
if (customModel) {
|
1286 |
+
fetchTokenizerInfo(customModel, true);
|
1287 |
+
}
|
1288 |
+
}
|
1289 |
+
});
|
1290 |
+
|
1291 |
+
// Update hidden input when custom model input changes
|
1292 |
+
$('#customModelInput').on('input', function() {
|
1293 |
+
$('#customModelInputHidden').val($(this).val());
|
1294 |
+
});
|
1295 |
+
|
1296 |
+
function showError(message) {
|
1297 |
+
const errorDiv = $('#errorMessage');
|
1298 |
+
errorDiv.text(message);
|
1299 |
+
errorDiv.show();
|
1300 |
+
setTimeout(() => errorDiv.fadeOut(), 5000);
|
1301 |
+
}
|
1302 |
+
|
1303 |
+
// Function to update tokenizer info display in tooltip
|
1304 |
+
function updateTokenizerInfoDisplay(info, isCustom = false) {
    // Render tokenizer metadata (or an error message) into the info tooltip.
    //   info     - object that may carry `error`, `vocab_size`, `tokenizer_type`,
    //              `model_max_length` and `special_tokens` (name -> value map)
    //   isCustom - true targets the custom-model tooltip, false the predefined one
    const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';

    // The markup below is inserted with .html(), so every dynamic value is
    // escaped first. String() keeps this safe for non-string values too.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#039;');

    // One label/value cell of the details grid; `label` is a fixed string.
    const gridItem = (label, value) => `
        <div class="tokenizer-info-item">
            <span class="tokenizer-info-label">${label}</span>
            <span class="tokenizer-info-value">${escapeHtml(value)}</span>
        </div>`;

    if (info.error) {
        $(targetSelector).html(`<div class="tokenizer-info-error">${escapeHtml(info.error)}</div>`);
        return;
    }

    // Start building the tooltip content
    let htmlContent = `<div class="tokenizer-info-header">Tokenizer Details</div>
        <div class="tokenizer-info-grid">`;

    // Dictionary size
    if (info.vocab_size) {
        htmlContent += gridItem('Dictionary Size', info.vocab_size.toLocaleString());
    }

    // Tokenizer type
    if (info.tokenizer_type) {
        htmlContent += gridItem('Tokenizer Type', info.tokenizer_type);
    }

    // Max length
    if (info.model_max_length) {
        htmlContent += gridItem('Max Length', info.model_max_length.toLocaleString());
    }

    htmlContent += `</div>`; // Close tokenizer-info-grid

    // Special tokens section
    if (info.special_tokens && Object.keys(info.special_tokens).length > 0) {
        htmlContent += `
        <div class="tokenizer-info-item" style="margin-top: 0.75rem;">
            <span class="tokenizer-info-label">Special Tokens</span>
            <div class="special-tokens-container">`;

        // Token values frequently look like markup (e.g. <s>, </s>), so both
        // the name and the value must be escaped to show up literally.
        for (const [tokenName, tokenValue] of Object.entries(info.special_tokens)) {
            htmlContent += `
                <div class="special-token-item">
                    <span class="token-name">${escapeHtml(tokenName)}:</span>
                    <span class="token-value">${escapeHtml(tokenValue)}</span>
                </div>`;
        }

        htmlContent += `
            </div>
        </div>`;
    }

    $(targetSelector).html(htmlContent);
}
|
1379 |
+
|
1380 |
+
// Function to fetch tokenizer info
|
1381 |
+
function fetchTokenizerInfo(modelId, isCustom = false) {
    // Ask the server for tokenizer metadata and render it into the tooltip.
    //   modelId  - model identifier sent as `model_id`; nothing happens when falsy
    //   isCustom - sent as `is_custom`, and selects which tooltip is updated
    if (!modelId) return;

    const targetSelector = isCustom ? '#customTokenizerInfoContent' : '#tokenizerInfoContent';

    // Error messages come from the server, so insert them as text, not markup.
    const showFailure = (message) => {
        $(targetSelector)
            .empty()
            .append($('<div>').addClass('tokenizer-info-error').text(message));
    };

    // Loading spinner until the request settles.
    $(targetSelector).html('<div class="tokenizer-info-loading"><div class="tokenizer-info-spinner"></div></div>');

    $.ajax({
        url: '/tokenizer-info',
        method: 'GET',
        data: {
            model_id: modelId,
            is_custom: isCustom
        },
        success: function(response) {
            if (response.error) {
                showFailure(response.error);
            } else {
                currentTokenizerInfo = response;
                updateTokenizerInfoDisplay(response, isCustom);
            }
        },
        error: function(xhr) {
            // The endpoint answers failures with a JSON body carrying `error`;
            // prefer that over the generic text when it is available.
            const serverMessage = xhr.responseJSON && xhr.responseJSON.error;
            showFailure(serverMessage || 'Failed to load tokenizer information');
        }
    });
}
|
1407 |
+
|
1408 |
+
function updateResults(data) {
    // Paint an analysis response into the results panel: token chips,
    // truncation/preview notices, statistics, and tokenizer details.
    $('#results').show();

    // Rebuild the token chips from scratch.
    const chipHost = $('#tokenContainer');
    chipHost.empty();
    for (const tok of data.tokens) {
        const chip = $('<span>');
        chip.addClass('token');
        chip.css({
            'background-color': tok.colors.background,
            'color': tok.colors.text
        });
        // Hover tooltip carries the original token and its id.
        chip.attr('title', `Original token: ${tok.original} | Token ID: ${tok.token_id}`);
        chip.text(tok.display);

        chipHost.append(chip);
        if (tok.newline) {
            chipHost.append('<br>');
        }
    }

    // Display-limit notice, with the full token count when it is shown.
    const limitNotice = $('#displayLimitNotice');
    if (!data.display_limit_reached) {
        limitNotice.hide();
    } else {
        limitNotice.show();
        $('#totalTokenCount').text(data.total_tokens);
    }

    // Preview notice.
    $('#previewNotice').toggle(Boolean(data.preview_only));

    // Basic statistics.
    const basic = data.stats.basic_stats;
    $('#totalTokens').text(basic.total_tokens);
    $('#uniqueTokens').text(`${basic.unique_tokens} unique`);
    $('#uniquePercentage').text(basic.unique_percentage);
    $('#specialTokens').text(basic.special_tokens);
    $('#spaceTokens').text(basic.space_tokens);
    $('#spaceCount').text(basic.space_tokens);
    $('#newlineCount').text(basic.newline_tokens);
    $('#compressionRatio').text(basic.compression_ratio);

    // Token-length statistics.
    const lengths = data.stats.length_stats;
    $('#avgLength').text(lengths.avg_length);
    $('#medianLength').text(lengths.median_length);
    $('#stdDev').text(lengths.std_dev);

    // Tokenizer details ride along with some responses; refresh the tooltip if so.
    const tokenizerInfo = data.tokenizer_info;
    if (tokenizerInfo) {
        currentTokenizerInfo = tokenizerInfo;
        updateTokenizerInfoDisplay(tokenizerInfo, currentModelType === 'custom');
    }
}
|
1467 |
+
|
1468 |
+
// Handle text changes to detach file
|
1469 |
+
$('#textInput').on('input', function() {
    // Detach the uploaded file once the textarea no longer resembles its preview.

    // The first input event after an upload only consumes the flag.
    if (fileJustUploaded) {
        fileJustUploaded = false;
        return;
    }

    const typed = $(this).val();
    const hasAttachedFile = document.getElementById('fileInput').files.length > 0;
    if (!hasAttachedFile || originalTextContent === null) {
        return;
    }

    // Major change, case 1: the text shrank below 80% of the preview length.
    const shrunk = typed.length < originalTextContent.length * 0.8;

    // Major change, case 2: the text is not a prefix of the preview AND its
    // first (up to) 20 characters differ from the preview's.
    const headLength = Math.min(20, typed.length);
    const diverged = typed.length > 0
        && typed !== originalTextContent.substring(0, typed.length)
        && typed.substring(0, headLength) !== originalTextContent.substring(0, headLength);

    if (shrunk || diverged) {
        detachFile();
    }
});
|
1495 |
+
|
1496 |
+
// Function to detach file
|
1497 |
+
function detachFile() {
    // Drop the attached upload: empty the file input, hide the file badge,
    // and re-baseline the change tracker on the current textarea content.
    const fileField = $('#fileInput');
    const fileBadge = $('#fileInfo');

    fileField.val('');
    fileBadge.fadeOut(300);

    originalTextContent = $('#textInput').val();
    lastUploadedFileName = null;
}
|
1507 |
+
|
1508 |
+
// For model changes
|
1509 |
+
$('#modelSelect').change(function() {
    // A different predefined model was picked: sync the hidden field,
    // reload the tokenizer details, and re-run the analysis if there is text.
    const pickedModel = $(this).val();
    $('#modelInput').val(pickedModel);

    fetchTokenizerInfo(pickedModel, false);

    const hasText = Boolean($('#textInput').val().trim());
    if (!hasText) {
        return;
    }
    $('#analyzeForm').submit();
});
|
1521 |
+
|
1522 |
+
// File drop handling
|
1523 |
+
const fileDropZone = $('#fileDropZone');
const fileUploadIcon = $('#fileUploadIcon');
const dropZoneElement = fileDropZone.get(0);

// Cancel the browser's default handling of drag events on both the drop
// zone and the page body.
// NOTE(review): preventDefaults also stops propagation at <body>, so the
// document-level showDropZone listeners below presumably never see events
// that bubble up through body — confirm whether that is intended.
for (const eventName of ['dragenter', 'dragover', 'dragleave', 'drop']) {
    dropZoneElement.addEventListener(eventName, preventDefaults, false);
    document.body.addEventListener(eventName, preventDefaults, false);
}

function preventDefaults(e) {
    // Suppress the default action and keep the event from bubbling further.
    e.preventDefault();
    e.stopPropagation();
}

// Highlight the drop zone while something is dragged over the document.
for (const eventName of ['dragenter', 'dragover']) {
    document.addEventListener(eventName, showDropZone, false);
}

// Remove the highlight when the drag leaves the zone or the file is dropped.
for (const eventName of ['dragleave', 'drop']) {
    dropZoneElement.addEventListener(eventName, hideDropZone, false);
}

function showDropZone(e) {
    // Mark the drop zone as active.
    fileDropZone.addClass('active');
}

function hideDropZone() {
    // Clear the active mark from the drop zone.
    fileDropZone.removeClass('active');
}

// Handle dropped files
dropZoneElement.addEventListener('drop', handleDrop, false);

function handleDrop(e) {
    // Forward the dropped file list to the shared upload handler.
    handleFiles(e.dataTransfer.files);
}
|
1560 |
+
|
1561 |
+
// Also handle file selection via click on the icon
|
1562 |
+
fileUploadIcon.on('click', function() {
    // Open the native file picker through a throwaway <input type="file">
    // and pass whatever gets chosen to the shared upload handler.
    const picker = document.createElement('input');
    picker.type = 'file';
    picker.onchange = function(changeEvent) {
        handleFiles(changeEvent.target.files);
    };
    picker.click();
});
|
1570 |
+
|
1571 |
+
function handleFiles(files) {
    // Attach the first file of `files` (a FileList from a drop or a picker):
    // show its name and size, copy it into #fileInput so it is submitted with
    // the form, preview its first 8096 characters, then auto-submit.
    if (!files.length) {
        return;
    }

    const file = files[0];
    currentFile = file;
    lastUploadedFileName = file.name;
    fileJustUploaded = true; // Set flag to prevent immediate detachment

    // Detach button, built as elements so the handler is bound directly.
    const detachButton = $('<span>')
        .addClass('file-detach')
        .attr('id', 'fileDetach')
        .append($('<i>').addClass('fas fa-times'))
        .on('click', function(e) {
            e.stopPropagation(); // Prevent event bubbling
            detachFile();
            return false;
        });

    // The file name is arbitrary user-controlled text, so it goes in as a
    // text node rather than being concatenated into markup.
    const label = document.createTextNode(`${file.name} (${formatFileSize(file.size)}) `);
    $('#fileInfo').empty().append(label, detachButton).fadeIn(300);

    // Set the file to the file input
    const dataTransfer = new DataTransfer();
    dataTransfer.items.add(file);
    document.getElementById('fileInput').files = dataTransfer.files;

    // Preview in textarea (first 8096 chars)
    const reader = new FileReader();
    reader.onload = function(e) {
        const previewText = e.target.result.slice(0, 8096);
        $('#textInput').val(previewText);

        // Record the baseline only after the textarea has been filled, so
        // the change tracker does not detach the file straight away.
        setTimeout(() => {
            originalTextContent = previewText;
            // Automatically submit for analysis
            $('#analyzeForm').submit();
        }, 50);
    };
    reader.readAsText(file);
}
|
1610 |
+
|
1611 |
+
function formatFileSize(bytes) {
    // Format a byte count for display: plain bytes below 1 KiB, otherwise
    // KB or MB (1024-based) with one decimal place.
    const KIB = 1024;
    const MIB = 1048576;

    if (bytes >= MIB) {
        return (bytes / MIB).toFixed(1) + ' MB';
    }
    if (bytes >= KIB) {
        return (bytes / KIB).toFixed(1) + ' KB';
    }
    return bytes + ' bytes';
}
|
1616 |
+
|
1617 |
+
// Make sure to check if there's still a file when analyzing
|
1618 |
+
$('#analyzeForm').on('submit', function(e) {
    // Submit the analysis form over AJAX instead of a page reload and render
    // the JSON answer (or its error) in place.
    e.preventDefault();

    if (fileJustUploaded) {
        // First submission after an upload: keep the file, just clear the flag.
        fileJustUploaded = false;
    } else {
        // If the text shrank below 80% of the uploaded preview while a file
        // is still attached, detach it so the edited text is analysed.
        const textInput = $('#textInput').val();
        const fileInput = document.getElementById('fileInput');

        if (fileInput.files.length > 0 &&
            originalTextContent !== null &&
            textInput !== originalTextContent &&
            textInput.length < originalTextContent.length * 0.8) {
            detachFile();
        }
    }

    // Update the hidden inputs based on current model type
    if (currentModelType === 'custom') {
        $('#customModelInputHidden').val($('#customModelInput').val());
    } else {
        $('#modelInput').val($('#modelSelect').val());
    }

    const formData = new FormData(this);
    $('#analyzeButton').prop('disabled', true);

    $.ajax({
        url: '/',
        method: 'POST',
        data: formData,
        // FormData must be sent untouched so the browser sets the multipart boundary.
        processData: false,
        contentType: false,
        success: function(response) {
            if (response.error) {
                showError(response.error);
                return;
            }
            updateResults(response);

            // Show success badge if custom model
            if (currentModelType === 'custom') {
                $('#modelSuccessBadge').addClass('show');
                setTimeout(() => {
                    $('#modelSuccessBadge').removeClass('show');
                }, 3000);
            }
        },
        error: function(xhr) {
            // Failures arrive as a JSON body with an `error` field; show that
            // message rather than the raw JSON text when it is available.
            const serverMessage = xhr.responseJSON && xhr.responseJSON.error;
            showError(serverMessage || xhr.responseText || 'An error occurred while processing the text');
        },
        complete: function() {
            $('#analyzeButton').prop('disabled', false);
        }
    });
});
|
1678 |
+
|
1679 |
+
$('#expandButton').click(function() {
    // Flip the token list between collapsed and expanded and relabel the button.
    const tokenList = $('#tokenContainer');
    tokenList.toggleClass('expanded');

    const nowExpanded = tokenList.hasClass('expanded');
    $(this).text(nowExpanded ? 'Show Less' : 'Show More');
});
|
1686 |
+
|
1687 |
+
// Initialize tokenizer info for current model
|
1688 |
+
if (currentModelType === 'predefined') {
    // Predefined mode: load details for whatever the dropdown currently selects.
    fetchTokenizerInfo($('#modelSelect').val(), false);
} else {
    // Custom mode: only look up details when a model name is already filled in.
    const presetCustomModel = $('#customModelInput').val();
    if (presetCustomModel) {
        fetchTokenizerInfo(presetCustomModel, true);
    }
}
|
1693 |
+
|
1694 |
+
// Add event listener for custom model input
|
1695 |
+
$('#customModelInput').on('change', function() {
    // Once the custom model field is committed, load that model's tokenizer details.
    const enteredModel = $(this).val();
    if (!enteredModel) {
        return;
    }
    fetchTokenizerInfo(enteredModel, true);
});
|
1701 |
+
});
|
1702 |
+
</script>
|
1703 |
+
</body>
|
1704 |
+
</html>
|
1705 |
+
"""
|
1706 |
+
|
1707 |
+
@app.route('/tokenizer-info', methods=['GET'])
def tokenizer_info():
    """Return tokenizer metadata for a model as JSON, without processing text.

    Query parameters:
        model_id: Identifier passed unchanged to ``load_tokenizer``.
        is_custom: Sent by the front end but not consulted here, because
            predefined and custom identifiers are loaded the same way.

    Returns:
        The ``info`` value produced by ``load_tokenizer`` as JSON. When
        ``model_id`` is missing or ``load_tokenizer`` reports an error, a
        ``{"error": ...}`` body with status 400. When loading raises, a
        ``{"error": ...}`` body with status 500.
    """
    model_id = request.args.get('model_id', '')
    if not model_id:
        return jsonify({"error": "No model ID provided"}), 400

    try:
        # Only the metadata is needed; the tokenizer object itself is unused.
        _tokenizer, info, error = load_tokenizer(model_id)

        if error:
            return jsonify({"error": error}), 400

        return jsonify(info)
    except Exception as e:
        # Endpoint boundary: report any failure as JSON rather than an HTML 500 page.
        return jsonify({"error": f"Failed to get tokenizer info: {str(e)}"}), 500
|
1735 |
+
|
1736 |
+
@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the main page and handle analysis requests.

    GET renders the page. POST analyses either an uploaded file (form field
    ``file``) or the ``text`` form field with ``process_text``. The model is
    ``model`` when ``model_type`` is ``'predefined'`` and ``custom_model``
    otherwise; all three are read from the query string first, then the form.

    Requests carrying ``X-Requested-With: XMLHttpRequest`` get JSON: the
    ``process_text`` result on success, or ``{"error": ...}`` with status 400.
    Other requests get the rendered HTML template.
    """
    # Function-scope import so this block does not depend on the file's import list.
    import tempfile

    text = ""
    token_data = None
    error_message = ""
    selected_model = request.args.get('model', request.form.get('model', 'llama4'))
    custom_model = request.args.get('custom_model', request.form.get('custom_model', ''))
    model_type = request.args.get('model_type', request.form.get('model_type', 'predefined'))

    # Determine which model to use based on model_type
    model_to_use = selected_model if model_type == 'predefined' else custom_model
    is_ajax = request.headers.get('X-Requested-With') == 'XMLHttpRequest'

    def render_page(page_text, data, error):
        """Render the page template with the current model selection."""
        return render_template_string(
            HTML_TEMPLATE,
            text=page_text,
            token_data=data,
            models=TOKENIZER_MODELS,
            selected_model=selected_model,
            custom_model=custom_model,
            model_type=model_type,
            error=error,
        )

    if request.method == 'POST':
        uploaded_file = request.files.get('file')

        # Check if file upload
        if uploaded_file is not None and uploaded_file.filename:
            # The client-supplied filename is untrusted: using it as a path
            # allows traversal outside the upload folder and lets concurrent
            # uploads with the same name overwrite each other. Store the
            # upload under a unique server-chosen name, keeping only a plain
            # alphanumeric extension.
            suffix = os.path.splitext(os.path.basename(uploaded_file.filename))[1]
            if not suffix[1:].isalnum() or len(suffix) > 16:
                suffix = ""
            fd, file_path = tempfile.mkstemp(
                dir=app.config['UPLOAD_FOLDER'], suffix=suffix
            )
            os.close(fd)

            try:
                uploaded_file.save(file_path)

                # Read a small preview of the file
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    text = f.read(8096)

                # Process the file
                token_data = process_text(
                    "", model_to_use, is_full_file=True, file_path=file_path
                )

                # If request is AJAX, return JSON
                if is_ajax:
                    return jsonify(token_data)
            except Exception as e:
                error_message = str(e)
                if is_ajax:
                    return jsonify({"error": error_message}), 400
                return render_page(text, None, error_message)
            finally:
                # Remove the stored upload on every path, including failures
                # while saving or reading the preview.
                if os.path.exists(file_path):
                    os.remove(file_path)

        # Regular text processing
        else:
            text = request.form.get('text', '')
            if text:
                try:
                    token_data = process_text(text, model_to_use)

                    # If request is AJAX, return JSON
                    if is_ajax:
                        return jsonify(token_data)
                except Exception as e:
                    error_message = str(e)
                    if is_ajax:
                        return jsonify({"error": error_message}), 400
                    return render_page(text, None, error_message)
            elif is_ajax:
                # AJAX callers expect JSON; without this they would receive
                # the HTML page and fail while reading it as a result object.
                return jsonify({"error": "No text provided"}), 400

    return render_page(text, token_data, error_message)
|
1827 |
+
|
1828 |
+
if __name__ == "__main__":
    # Started as a script: listen on all interfaces, port 7860.
    app.run(port=7860, host='0.0.0.0')
|