PabloTJ committed
Commit 9d5d030 · verified · Parent: 6d2277b

Update app.py

Files changed (1): app.py +86 -22
app.py CHANGED
@@ -1,18 +1,56 @@
-
 import gradio as gr
-from transformers import pipeline
+from transformers import pipeline, set_seed
 import re
 import numpy as np
 import pandas as pd

-# Load models for generation and rating
-gen_model = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.1")
-rater_models = [
-    pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta"),
-    pipeline("text-generation", model="google/flan-t5-large")
+# Set a seed for reproducibility
+set_seed(42)
+
+# List of premium generation models (as suggested from the Vellum AI leaderboard)
+generation_model_names = [
+    "mistralai/Mistral-7B-v0.1",
+    "mistralai/Mixtral-8x7B-v0.1",
+    "meta-llama/Llama-4-Scout",
+    "meta-llama/Llama-4-Maverick",
+    "Qwen/Qwen2.5-72B",
+    "HuggingFaceH4/zephyr-7b-beta",
+    "01-ai/Yi-34B",
+    "deepseek-ai/deepseek-llm-67b-base",
+    "HuggingFaceH4/zephyr-7b-alpha",
+    "microsoft/Marcoroni-7B-v3"
+]
+
+# List of cost-effective grammar evaluation models
+grammar_model_names = [
+    "vennify/t5-base-grammar-correction",
+    "hassaanik/grammar-correction-model"
 ]

-# Language list
+# Load a generation pipeline given the model name.
+def load_generation_pipeline(model_name):
+    try:
+        return pipeline("text-generation", model=model_name)
+    except Exception as e:
+        print(f"Error loading generation model {model_name}: {e}")
+        return None
+
+# Load a grammar evaluation pipeline (text2text-generation)
+def load_grammar_pipeline(model_name):
+    try:
+        return pipeline("text2text-generation", model=model_name)
+    except Exception as e:
+        print(f"Error loading grammar model {model_name}: {e}")
+        return None
+
+# Pre-load grammar evaluator models (assumed to be cost-effective and stable)
+rater_models = []
+for model_name in grammar_model_names:
+    p = load_grammar_pipeline(model_name)
+    if p is not None:
+        rater_models.append(p)
+
+# Language dictionary
 languages = {
     "en": "English", "es": "Spanish", "fr": "French", "de": "German", "it": "Italian",
     "pt": "Portuguese", "ru": "Russian", "ar": "Arabic", "hi": "Hindi", "ja": "Japanese"
@@ -35,26 +73,40 @@ def extract_score(text):
         return min(max(score, 0), 100)
     return 0

-def run_benchmark():
+def run_benchmark(selected_model):
+    # Load the selected premium generation pipeline
+    gen_model = load_generation_pipeline(selected_model)
+    if gen_model is None:
+        return "Error loading generation model."
+
     results = []
     for code, lang in languages.items():
-        prompt = f'''Write the longest original palindrome you can in {lang}. It should be creative and not a known palindrome. If it is not a correct palindrome, you will lose points according to how correct it is.'''
-
-        gen_output = gen_model(prompt, max_new_tokens=100, do_sample=True)[0]['generated_text'].strip()
+        prompt = (
+            f"Write the longest original palindrome you can in {lang}. "
+            f"It should be creative and not a known palindrome. "
+            f"If it is not a correct palindrome, you will lose points according to how correct it is."
+        )
+        try:
+            gen_output = gen_model(prompt, max_new_tokens=100, do_sample=True)[0]['generated_text'].strip()
+        except Exception as e:
+            gen_output = f"Error generating text: {e}"
         valid = is_palindrome(gen_output)
         cleaned_len = len(clean_text(gen_output))
-
+
         scores = []
         for rater in rater_models:
             rprompt = grammar_prompt(gen_output, lang)
-            rtext = rater(rprompt, max_new_tokens=10)[0]['generated_text']
-            score = extract_score(rtext)
-            scores.append(score)
-
-        avg_score = np.mean(scores)
+            try:
+                # For a text2text model, we assume the output contains a number (0-100)
+                rtext = rater(rprompt, max_new_tokens=10)[0]['generated_text']
+                score = extract_score(rtext)
+                scores.append(score)
+            except Exception as e:
+                scores.append(0)
+        avg_score = np.mean(scores) if scores else 0
         penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
         final_score = round(cleaned_len * penalty, 2)
-
+
         results.append({
             "Language": lang,
             "Palindrome": gen_output,
@@ -63,9 +115,21 @@ def run_benchmark():
             "Grammar Score": avg_score,
             "Final Score": final_score
         })
-
+
     df = pd.DataFrame(results).sort_values(by="Final Score", ascending=False).reset_index(drop=True)
     return gr.Dataframe(df)

-iface = gr.Interface(fn=run_benchmark, inputs=[], outputs="dataframe", title="🔁 LLM Palindrome Benchmark")
-iface.launch()
+# Build the Gradio UI using Blocks (canvas layout)
+with gr.Blocks(title="LLM Palindrome Benchmark - Premium Generation Models") as demo:
+    gr.Markdown("# LLM Palindrome Benchmark")
+    gr.Markdown("Select one of the premium generation models below (for non-commercial, educational usage) and run the benchmark.")
+
+    with gr.Row():
+        model_dropdown = gr.Dropdown(choices=generation_model_names, label="Select Premium Generation Model")
+        run_button = gr.Button("Run Benchmark")
+
+    output_table = gr.Dataframe(label="Benchmark Results")
+
+    run_button.click(fn=run_benchmark, inputs=model_dropdown, outputs=output_table)
+
+demo.launch()
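
Note: the hunk headers reference four helpers (clean_text, is_palindrome, grammar_prompt, extract_score) that sit outside the changed ranges, so only the tail of extract_score is visible as context above. A minimal sketch of how they might be defined, inferred purely from the call sites in this diff; the actual bodies in app.py may differ:

import re

# Hypothetical reconstructions based only on how the diff calls these helpers.

def clean_text(text):
    # Strip everything except letters and digits so punctuation,
    # spacing, and case do not affect the palindrome check.
    return re.sub(r"[\W_]+", "", text).lower()

def is_palindrome(text):
    cleaned = clean_text(text)
    return cleaned == cleaned[::-1]

def grammar_prompt(text, lang):
    # Ask a rater model for a 0-100 grammaticality score.
    return (f"Rate the grammatical correctness of the following {lang} text "
            f"on a scale from 0 to 100. Answer with a number only.\n\n{text}")

def extract_score(text):
    # Pull the first integer from the rater's reply and clamp it to [0, 100];
    # the two context lines shown in the diff are the clamp and the 0 fallback.
    match = re.search(r"\d+", text)
    if match:
        score = int(match.group())
        return min(max(score, 0), 100)
    return 0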
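The scoring rule in run_benchmark weights the cleaned palindrome length by the average grammar score, and halves the result when the output is not actually a palindrome. A quick worked example of that arithmetic, with toy numbers:

# Toy values: 20 cleaned characters, average grammar score of 80.
cleaned_len, avg_score = 20, 80.0

for valid in (True, False):
    penalty = (avg_score / 100) if valid else (avg_score / 100) * 0.5
    print(valid, round(cleaned_len * penalty, 2))
# True 16.0  -- a valid palindrome keeps the full grammar-weighted length
# False 8.0  -- an invalid one earns half credit rather than zero

Note the design choice: invalid outputs are discounted by 50% rather than zeroed, matching the prompt's promise that partially correct palindromes still score points.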
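One caveat on the new model list: some entries (the meta-llama/* checkpoints, for example) are gated on the Hugging Face Hub, so load_generation_pipeline will fall into its except branch and return None unless the account running the Space has accepted the licenses and a token is supplied. A sketch of one way to pass one, assuming the token is exposed in an HF_TOKEN environment variable (the variable name here is illustrative):

import os
from transformers import pipeline

def load_generation_pipeline(model_name):
    try:
        # The token kwarg is forwarded to from_pretrained and unlocks
        # gated or private checkpoints the account has access to.
        return pipeline("text-generation", model=model_name,
                        token=os.environ.get("HF_TOKEN"))
    except Exception as e:
        print(f"Error loading generation model {model_name}: {e}")
        return None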