hymarog1 committed (verified)
Commit f5f246c · 1 Parent(s): 0925ca5

Update app.py

Files changed (1):
  1. app.py +226 -234
app.py CHANGED
@@ -48,7 +48,7 @@ if "last_prompt_hash" not in st.session_state:
     st.session_state.last_prompt_hash = None


-st.title("📄 Legal Document Summarizer (Alt Model w/o token doc Aug)")
+st.title("📄 Legal Document Summarizer (Document Augmentation RAG)")

 USER_AVATAR = "👤"
 BOT_AVATAR = "🤖"
@@ -187,53 +187,6 @@ def load_led():
 tokenizer_led, model_led = load_led()


-from transformers import pipeline
-
-@st.cache_resource
-def load_led_summarizer():
-    # Use “allenai/led-base-16384” (or “led-large-16384”)
-    return pipeline(
-        "summarization",
-        model="allenai/led-base-16384",
-        tokenizer="allenai/led-base-16384",
-        device=0 if torch.cuda.is_available() else -1
-    )
-
-led_summarizer = load_led_summarizer()
-
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-
-@st.cache_resource
-def load_paraphraser():
-
-    tok = AutoTokenizer.from_pretrained("google/flan-t5-small")
-    model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
-    return pipeline(
-        "text2text-generation",
-        model=model,
-        tokenizer=tok,
-        device=0 if torch.cuda.is_available() else -1,
-        max_length=256,
-        num_beams=4,
-        do_sample=False
-    )
-
-
-paraphraser = load_paraphraser()
-
-
-def humanize(text):
-    out = paraphraser(f"paraphrase: {text}",
-                      max_length=256,
-                      num_beams=4,
-                      do_sample=False)[0]["generated_text"]
-    return out
-
-# then at the end of rag_query_response:
-
-
-
-
 def legalbert_extractive_summary(text, top_ratio=0.2):
     sentences = sent_tokenize(text)
     top_k = max(3, int(len(sentences) * top_ratio))
@@ -329,18 +282,6 @@ def hybrid_summary_hierarchical(text, top_ratio=0.8):
     return structured_summary


-from sentence_transformers import SentenceTransformer
-
-@st.cache_resource
-def load_embedder():
-    return SentenceTransformer("all-MiniLM-L6-v2")
-
-embedder = load_embedder()
-
-import numpy as np
-
-
-
 def chunk_text_custom(text, n=1000, overlap=200):
     chunks = []
     for i in range(0, len(text), n - overlap):
@@ -357,118 +298,151 @@ def get_embedding(text, model="BAAI/bge-en-icl"):
     resp = client.embeddings.create(model=model, input=text)
     return np.array(resp.data[0].embedding)

-def create_embeddings(text_chunks, model="BAAI/bge-en-icl"):
-    """
-    Batch the get_embedding call over your chunks.
-    Returns a list of numpy arrays.
-    """
-    return [get_embedding(chunk, model=model) for chunk in text_chunks]
-


-def generate_questions(text_chunk, num_questions=5):
-    """
-    Use LED to generate a small set of probing questions
-    about this chunk that the final answer should address.
-    """
-    prompt = (
-        "You are a question-generation expert. "
-        "From the text below, generate "
-        f"{num_questions} concise questions:\n\n{text_chunk}"
-    )
-    out = led_summarizer(
-        prompt,
-        max_length=128,
-        min_length=32,
-        num_beams=4,
-        do_sample=False
-    )[0]["summary_text"]
-    # assume each question on its own line
-    questions = [q.strip() for q in out.split("\n") if q.strip()]
-    return questions[:num_questions]
-
-
-def process_document(document_text):
+def semantic_search(query, text_chunks, chunk_embeddings, k=5):
     """
-    1) chunk the document
-    2) embed each chunk with your SentenceTransformer
-    returns chunks, embeddings
+    Compute cosine similarity between the query embedding and each chunk embedding,
+    then pick the top-k chunks.
     """
-    chunks = chunk_text_custom(document_text, n=800, overlap=200)
-    embeddings = embedder.encode(chunks, convert_to_tensor=False)
-    return chunks, embeddings
+    q_emb = get_embedding(query)
+    # simple cosine:
+    def cosine(a, b): return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+    scores = [cosine(q_emb, emb) for emb in chunk_embeddings]
+    top_idxs = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
+    return [text_chunks[i] for i in top_idxs]


-def semantic_search(query, chunks, chunk_embeddings, k=5):
-    """
-    Score each chunk by cosine similarity to the query embed
-    and return the top-k chunks (in descending order).
-    """
-    q_emb = embedder.encode([query], convert_to_tensor=False)[0]
-    scores = [
-        float(np.dot(q_emb, emb) / (np.linalg.norm(q_emb) * np.linalg.norm(emb)))
-        for emb in chunk_embeddings
-    ]
-    top_idxs = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
-    return [chunks[i] for i in top_idxs]
+def generate_response(system_prompt, user_message, model="meta-llama/Llama-3.2-3B-Instruct"):
+    return client.chat.completions.create(
+        model=model,
+        temperature=0,
+        messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_message}]
+    ).choices[0].message.content


-def prepare_context(questions, chunks, chunk_embeddings, k_per_question=2):
-    """
-    For each generated question, pick its top-k supporting chunks,
-    then dedupe & concatenate into one context string.
-    """
-    selected = []
-    for q in questions:
-        best = semantic_search(q, chunks, chunk_embeddings, k=k_per_question)
-        selected.extend(best)
+def generate_questions(text_chunk, num_questions=5,
+                       model="meta-llama/Llama-3.2-3B-Instruct"):
+    system_prompt = (
+        "You are an expert at generating relevant questions from text. "
+        "Create concise questions that can be answered using only the provided text."
+    )
+    user_prompt = f"""
+    Based on the following text, generate {num_questions} different questions
+    that can be answered using only this text:
+
+    {text_chunk}

-    # dedupe while preserving order
+    Format your response as a numbered list of questions only.
+    """
+    resp = client.chat.completions.create(
+        model=model,
+        temperature=0.7,
+        messages=[
+            {"role":"system","content":system_prompt},
+            {"role":"user","content":user_prompt}
+        ]
+    )
+    raw = resp.choices[0].message.content.strip()
+    questions = []
+    for line in raw.split("\n"):
+        q = re.sub(r"^\d+\.\s*", "", line).strip()
+        if q.endswith("?"):
+            questions.append(q)
+    return questions
+
+# 2) EMBEDDINGS
+def create_embeddings(text, model="BAAI/bge-en-icl"):
+    resp = client.embeddings.create(model=model, input=text)
+    return resp.data[0].embedding
+
+def cosine_similarity(a,b):
+    return float(np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b)))
+
+# 3) VECTOR STORE
+class SimpleVectorStore:
+    def __init__(self):
+        self.items = []  # each item is dict {text, embedding, metadata}
+    def add_item(self, text, embedding, metadata):
+        self.items.append(dict(text=text, embedding=embedding, metadata=metadata))
+    def search(self, query, k=5):
+        q_emb = create_embeddings(query)
+        scores = [(i, cosine_similarity(q_emb, item["embedding"]))
+                  for i,item in enumerate(self.items)]
+        scores.sort(key=lambda x:x[1], reverse=True)
+        return [self.items[i] for i,_ in scores[:k]]
+
+# 4) DOCUMENT PROCESSOR
+def process_document(raw_text,
+                     chunk_size=1000,
+                     chunk_overlap=200,
+                     questions_per_chunk=5):
+    # chunk the text
+    chunks = []
+    for i in range(0, len(raw_text), chunk_size - chunk_overlap):
+        chunks.append(raw_text[i:i+chunk_size])
+    store = SimpleVectorStore()
+    for idx,chunk in enumerate(chunks):
+        # chunk embedding
+        emb = create_embeddings(chunk)
+        store.add_item(chunk, emb, {"type":"chunk","index":idx})
+        # generate Qs + their embeddings
+        qs = generate_questions(chunk, num_questions=questions_per_chunk)
+        for q in qs:
+            q_emb = create_embeddings(q)
+            store.add_item(q, q_emb, {
+                "type":"question",
+                "chunk_index":idx,
+                "original_chunk": chunk
+            })
+    return chunks, store
+
+# 5) CONTEXT BUILDER
+def prepare_context(results):
     seen = set()
-    context = []
-    for c in selected:
-        if c not in seen:
-            seen.add(c)
-            context.append(c)
+    ctx = []
+    # first direct chunks
+    for r in results:
+        m = r["metadata"]
+        if m["type"]=="chunk" and m["index"] not in seen:
+            seen.add(m["index"])
+            ctx.append(f"Chunk {m['index']}:\n{r['text']}")
+    # then referenced by questions
+    for r in results:
+        m = r["metadata"]
+        if m["type"]=="question":
+            ci = m["chunk_index"]
+            if ci not in seen:
+                seen.add(ci)
+                ctx.append(f"Chunk {ci} (via Q “{r['text']}”):\n{m['original_chunk']}")
+    return "\n\n".join(ctx)
+
+# 6) ANSWER GENERATOR (overrides your old generate_response)
+def generate_response_from_context(query, context,
+                                   model="meta-llama/Llama-3.2-3B-Instruct"):
+    sp = (
+        "You are an AI assistant that strictly answers based on the given context. "
+        "If the answer cannot be derived directly from the provided context, "
+        "respond with: 'I do not have enough information to answer that.'"
+    )
+    up = f"""
+    Context:
+    {context}

-    return "\n\n".join(f"• {c}" for c in context)
+    Question: {query}

-def rag_query_response(prompt, document_text):
+    Please answer the question based only on the context above.
     """
-    Document-Augmentation RAG:
-    1. generate probing sub-questions about the doc
-    2. process the doc (chunk + embed)
-    3. build minimal context via those questions
-    4. feed context + user prompt into LED
-    5. paraphrase (humanize)
-    """
-    # 1) Probing questions
-    questions = generate_questions(document_text, num_questions=5)
-
-    # 2) Chunk & embed the document
-    chunks, chunk_embs = process_document(document_text)
-
-    # 3) Assemble the distilled context
-    context = prepare_context(questions, chunks, chunk_embs, k_per_question=2)
-
-    # 4) Compose the LED input
-    led_input = (
-        "You are a knowledgeable legal assistant. "
-        "Answer the user’s question **using ONLY** the context below, "
-        "and speak in a friendly, conversational tone.\n\n"
-        f"Context:\n{context}\n\n"
-        f"Question: {prompt}\n\nAnswer:"
+    resp = client.chat.completions.create(
+        model=model,
+        temperature=0,
+        messages=[{"role":"system","content":sp},
+                  {"role":"user","content":up}]
     )
+    return resp.choices[0].message.content
+

-    raw = led_summarizer(
-        led_input,
-        max_length=512,
-        min_length=64,
-        do_sample=False
-    )[0]["summary_text"]

-    # 5) Humanize
-    return humanize(raw)

 #######################################################################################################################

@@ -548,13 +522,6 @@ def prepare_text_for_embedding(summary_dict):
     return "\n\n".join(combined_chunks)


-###################################################################################################################
-
-# Store cleaned text and FAISS index only when document is processed
-
-# Embedding for chunking
-
-
 ##############################################################################################################

 user_role = st.sidebar.selectbox(
@@ -583,8 +550,6 @@ def role_based_filter(section, summary, role):



-
-
 #########################################################################################################################


@@ -593,75 +558,110 @@ if uploaded_file:
     if file_hash != st.session_state.last_uploaded_hash or reprocess_btn:
         st.session_state.processed = False

-    # if is_new_file or reprocess_btn:
-    #     st.session_state.processed = False
-
     if not st.session_state.processed:
         start_time = time.time()
-        raw_text = extract_text(uploaded_file)
+
+        # 1) extract & summarize as before
+        raw_text = extract_text(uploaded_file)
         summary_dict = hybrid_summary_hierarchical(raw_text)
-        # timeline_data = extract_timeline(clean_text(raw_text))
         embedding_text = prepare_text_for_embedding(summary_dict)

-        # Generate and display RAG-based summary
+        # ─── NEW: document‐augmentation ingestion ───
+        chunks, store = process_document(raw_text,
+                                         chunk_size=1000,
+                                         chunk_overlap=200,
+                                         questions_per_chunk=5)
+        st.session_state.vector_store = store
+        # ────────────────────────────────────────────

+        # 2) generate your “role‐specific prompt” as before
         st.session_state.document_context = embedding_text
-
-        role_specific_prompt = f"As a {user_role}, summarize the legal document focusing on the most relevant aspects such as facts, arguments, and judgments tailored for your role. Include key legal reasoning and timeline of events where necessary."
-        rag_summary = rag_query_response(role_specific_prompt, embedding_text)
-
+
+        if user_role == "General":
+            role_specific_prompt = (
+                "Summarize the legal document focusing on the most relevant aspects "
+                "such as facts, arguments, and judgments. Include key legal reasoning "
+                "and a timeline of events where necessary."
+            )
+        else:
+            role_specific_prompt = (
+                f"As a {user_role}, summarize the legal document focusing on "
+                "the most relevant aspects such as facts, arguments, and judgments "
+                "tailored for your role. Include key legal reasoning and timeline of events."
+            )

-        st.session_state.messages.append({"role": "user", "content": f"📤 Uploaded **{uploaded_file.name}**"})
-        st.session_state.messages.append({"role": "assistant", "content": rag_summary})
+        # ─── REPLACE rag_query_response with doc‐augmentation RAG ───
+        results = store.search(role_specific_prompt, k=5)
+        context = prepare_context(results)
+        rag_summary = generate_response_from_context(role_specific_prompt, context)
+        #

+        st.session_state.messages.append({
+            "role": "user",
+            "content": f"📤 Uploaded **{uploaded_file.name}**"
+        })
+        st.session_state.messages.append({
+            "role": "assistant",
+            "content": rag_summary
+        })
         with st.chat_message("assistant", avatar=BOT_AVATAR):
             display_with_typing_effect(rag_summary)

         processing_time = round((time.time() - start_time) / 60, 2)
         st.info(f"⏱️ Response generated in **{processing_time} minutes**.")

-
-        st.session_state.generated_summary = rag_summary #for Evalution
+        st.session_state.generated_summary = rag_summary
         st.session_state.last_uploaded_hash = file_hash
         st.session_state.processed = True
        st.session_state.last_prompt_hash = None
         save_chat_history(st.session_state.messages)


+
 if prompt:
     words = prompt.split()
-    word_count = len(words)
-    prompt_hash = hashlib.md5(prompt.encode("utf-8")).hexdigest()
+    word_count = len(words)
+    prompt_hash = hashlib.md5(prompt.encode("utf-8")).hexdigest()

-    # 1) LONG prompts – echo first, then summarize
+    # 1) LONG prompts – echo & ingest like a “paste‐in” document
    if word_count > 30 and prompt_hash != st.session_state.last_prompt_hash:
-        # mark new prompt
         st.session_state.last_prompt_hash = prompt_hash

-        # raw_text is just the prompt text
         raw_text = prompt
-
         st.session_state.messages.append({
             "role": "user",
-            "content": f"📥 **Pasted Document Text:**\n\n{limit_text(raw_text, word_limit=500)}"
+            "content": f"📥 **Pasted Document Text:**\n\n{limit_text(raw_text,500)}"
         })
         with st.chat_message("user", avatar=USER_AVATAR):
-            st.markdown(limit_text(raw_text, word_limit=500))
+            st.markdown(limit_text(raw_text,500))

         start_time = time.time()
-        summary_dict = hybrid_summary_hierarchical(raw_text)
-        emb_text = prepare_text_for_embedding(summary_dict)
-
+        # summarization + emb_text as before
+        summary_dict = hybrid_summary_hierarchical(raw_text)
+        emb_text = prepare_text_for_embedding(summary_dict)
         st.session_state.document_context = emb_text
         st.session_state.processed = True

-        role_prompt = (
-            f"As a {user_role}, summarize the document focusing on facts, "
-            "arguments, judgments, plus timeline of events."
-        )
-        initial_summary = rag_query_response(role_prompt, emb_text)
+        # ─── NEW: ingest via document‐augmentation ───
+        chunks, store = process_document(raw_text)
+        st.session_state.vector_store = store

-        # 3️⃣ Append & display the assistant’s summary with typing effect
+        if user_role == "General":
+            role_prompt = (
+                "Summarize the document focusing on facts, arguments, judgments, "
+                "and include a timeline of events."
+            )
+        else:
+            role_prompt = (
+                f"As a {user_role}, summarize the document focusing on facts, "
+                "arguments, judgments, plus timeline of events."
+            )
+
+        # ─── doc‐augmentation RAG here too ───
+        results = store.search(role_prompt, k=5)
+        context = prepare_context(results)
+        initial_summary = generate_response_from_context(role_prompt, context)
+
         st.session_state.messages.append({
             "role": "assistant",
             "content": initial_summary
@@ -672,29 +672,40 @@ if prompt:
         st.info(f"⏱️ Summary generated in {round((time.time()-start_time)/60,2)} minutes")
         save_chat_history(st.session_state.messages)

-    # 2) SHORT prompts: normal RAG against last context
+
+    # 2) SHORT prompts – normal RAG against last ingested context
     elif word_count <= 30 and st.session_state.processed:
-
-        role_query = f"As a {user_role}, {prompt}"
-        answer = rag_query_response(role_query, st.session_state.document_context)
-        answer = rag_query_response(prompt, st.session_state.document_context)
-        st.session_state.messages.append({"role": "user", "content": prompt})
-        st.session_state.messages.append({"role": "assistant","content": answer})
+
+        with st.chat_message("user", avatar=USER_AVATAR):
+            st.markdown(prompt)
+
+        # 2) save to history
+        st.session_state.messages.append({"role": "user", "content": prompt})
+        store = st.session_state.vector_store
+
+        # ─── instead of rag_query_response, do doc‐augmentation RAG ───
+        results = store.search(prompt, k=5)
+        context = prepare_context(results)
+        answer = generate_response_from_context(prompt, context)
+
+        # st.session_state.messages.append({"role":"user", "content":prompt})
+        st.session_state.messages.append({"role":"assistant","content":answer})
         with st.chat_message("assistant", avatar=BOT_AVATAR):
             display_with_typing_effect(answer)
         save_chat_history(st.session_state.messages)

-    # 3) Ingest prompt to start
+
+    # 3) not enough input
     else:
         with st.chat_message("assistant", avatar=BOT_AVATAR):
             st.markdown("❗ Paste at least 30 words of your document to ingest it first.")


-################################Evaluation###########################
+######################################################################################################################
+
 # 📚 Imports
 import evaluate
 from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
-import streamlit as st

 # Load Evaluators Once
 @st.cache_resource
@@ -713,40 +724,21 @@ def evaluate_summary(generated_summary, ground_truth_summary):
     return rouge_result, bert_result

 def compute_bleu(prediction, ground_truth):
-    """Compute BLEU score for summaries."""
     reference = [ground_truth.strip().split()]
     candidate = prediction.strip().split()
     smoothie = SmoothingFunction().method4
     return sentence_bleu(reference, candidate, smoothing_function=smoothie)

-# 📥 Upload and Evaluate
-ground_truth_summary_file = st.file_uploader("📄 Upload Ground Truth Summary (.txt)", type=["txt"])
-
-if ground_truth_summary_file:
-    ground_truth_summary = ground_truth_summary_file.read().decode("utf-8").strip()
-
-    if "generated_summary" in st.session_state and st.session_state.generated_summary:
-        prediction = st.session_state.generated_summary
-
-        # Evaluate ROUGE and BERTScore
-        rouge_result, bert_result = evaluate_summary(prediction, ground_truth_summary)
-
-        # Display ROUGE and BERTScore
-        st.subheader("📊 Evaluation Results")
-        st.write("🔹 ROUGE Scores:")
-        st.json(rouge_result)
-        st.write("🔹 BERTScore:")
-        st.json(bert_result)
-
-        # Compute and Display BLEU Score
-        bleu = compute_bleu(prediction, ground_truth_summary)
-        st.subheader("🔵 BLEU Score")
-        st.write(f"BLEU Score: {bleu:.4f}")
-
-    else:
-        st.warning("⚠️ Please generate a summary first by uploading a document.")
+def evaluate_metrics(prediction, ground_truth):
+    """Only compute ROUGE, BLEU, and BERTScore."""
+    rouge_result, bert_result = evaluate_summary(prediction, ground_truth)
+    bleu_score = compute_bleu(prediction, ground_truth)
+    return {
+        "ROUGE": rouge_result,
+        "BERTScore": bert_result,
+        "BLEU Score": bleu_score
+    }

-######################################################################################################################


 # Run this along with streamlit run app.py to evaluate the model's performance on a test set
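For reference, a minimal sketch of how the pieces added in this commit fit together. This is illustrative only: the sample file name and query are assumptions, and it presumes the helpers run in the same scope as app.py, where the inference `client` used by create_embeddings and the chat completions calls is already configured.

# Hypothetical usage of the document-augmentation RAG flow introduced in this commit.
# Assumes process_document, prepare_context, generate_response_from_context, and the
# configured `client` are already defined in the surrounding app.py module.

raw_text = open("sample_judgment.txt", encoding="utf-8").read()  # hypothetical input file

# Chunk the document, embed each chunk, and index LLM-generated questions alongside the chunks.
chunks, store = process_document(
    raw_text,
    chunk_size=1000,
    chunk_overlap=200,
    questions_per_chunk=5,
)

# Retrieve the top matches: raw chunks, or chunks reached via their generated questions.
query = "What were the main arguments of the appellant?"  # hypothetical query
results = store.search(query, k=5)
context = prepare_context(results)

# Answer strictly from the retrieved context.
print(generate_response_from_context(query, context))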