Spaces:

hymarog1
/

LegalDoc

Sleeping

App Files Files Community

hymarog1 committed on Apr 30

Commit

0925ca5

verified ·

1 Parent(s): 812a461

Update app.py

Browse files

Files changed (1) hide show

app.py +204 -219

app.py CHANGED Viewed

@@ -48,7 +48,7 @@ if "last_prompt_hash" not in st.session_state:
     st.session_state.last_prompt_hash = None
-st.title("📄 Legal Document Summarizer (Document Augmentation RAG)")
 USER_AVATAR = "👤"
 BOT_AVATAR = "🤖"
@@ -187,6 +187,53 @@ def load_led():
 tokenizer_led, model_led = load_led()
 def legalbert_extractive_summary(text, top_ratio=0.2):
     sentences = sent_tokenize(text)
     top_k = max(3, int(len(sentences) * top_ratio))
@@ -282,6 +329,18 @@ def hybrid_summary_hierarchical(text, top_ratio=0.8):
     return structured_summary
 def chunk_text_custom(text, n=1000, overlap=200):
     chunks = []
     for i in range(0, len(text), n - overlap):
@@ -298,151 +357,118 @@ def get_embedding(text, model="BAAI/bge-en-icl"):
     resp = client.embeddings.create(model=model, input=text)
     return np.array(resp.data[0].embedding)
-def semantic_search(query, text_chunks, chunk_embeddings, k=5):
     """
-    Compute cosine similarity between the query embedding and each chunk embedding,
-    then pick the top-k chunks.
     """
-    q_emb = get_embedding(query)
-    # simple cosine:
-    def cosine(a, b): return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
-    scores = [cosine(q_emb, emb) for emb in chunk_embeddings]
-    top_idxs = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
-    return [text_chunks[i] for i in top_idxs]
-def generate_response(system_prompt, user_message, model="meta-llama/Llama-3.2-3B-Instruct"):
-    return client.chat.completions.create(
-        model=model,
-        temperature=0,
-        messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_message}]
-    ).choices[0].message.content
-def generate_questions(text_chunk, num_questions=5,
-                       model="meta-llama/Llama-3.2-3B-Instruct"):
-    system_prompt = (
-      "You are an expert at generating relevant questions from text. "
-      "Create concise questions that can be answered using only the provided text."
     )
-    user_prompt = f"""
-    Based on the following text, generate {num_questions} different questions
-    that can be answered using only this text:
-    {text_chunk}
-    Format your response as a numbered list of questions only.
     """
-    resp = client.chat.completions.create(
-      model=model,
-      temperature=0.7,
-      messages=[
-        {"role":"system","content":system_prompt},
-        {"role":"user","content":user_prompt}
-      ]
-    )
-    raw = resp.choices[0].message.content.strip()
-    questions = []
-    for line in raw.split("\n"):
-        q = re.sub(r"^\d+\.\s*", "", line).strip()
-        if q.endswith("?"):
-            questions.append(q)
-    return questions
-# 2) EMBEDDINGS
-def create_embeddings(text, model="BAAI/bge-en-icl"):
-    resp = client.embeddings.create(model=model, input=text)
-    return resp.data[0].embedding
-def cosine_similarity(a,b):
-    return float(np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b)))
-# 3) VECTOR STORE
-class SimpleVectorStore:
-    def __init__(self):
-        self.items = []  # each item is dict {text, embedding, metadata}
-    def add_item(self, text, embedding, metadata):
-        self.items.append(dict(text=text, embedding=embedding, metadata=metadata))
-    def search(self, query, k=5):
-        q_emb = create_embeddings(query)
-        scores = [(i, cosine_similarity(q_emb, item["embedding"]))
-                  for i,item in enumerate(self.items)]
-        scores.sort(key=lambda x:x[1], reverse=True)
-        return [self.items[i] for i,_ in scores[:k]]
-# 4) DOCUMENT PROCESSOR
-def process_document(raw_text,
-                     chunk_size=1000,
-                     chunk_overlap=200,
-                     questions_per_chunk=5):
-    # chunk the text
-    chunks = []
-    for i in range(0, len(raw_text), chunk_size - chunk_overlap):
-        chunks.append(raw_text[i:i+chunk_size])
-    store = SimpleVectorStore()
-    for idx,chunk in enumerate(chunks):
-        # chunk embedding
-        emb = create_embeddings(chunk)
-        store.add_item(chunk, emb, {"type":"chunk","index":idx})
-        # generate Qs + their embeddings
-        qs = generate_questions(chunk, num_questions=questions_per_chunk)
-        for q in qs:
-            q_emb = create_embeddings(q)
-            store.add_item(q, q_emb, {
-              "type":"question",
-              "chunk_index":idx,
-              "original_chunk": chunk
-            })
-    return chunks, store
-# 5) CONTEXT BUILDER
-def prepare_context(results):
     seen = set()
-    ctx = []
-    # first direct chunks
-    for r in results:
-        m = r["metadata"]
-        if m["type"]=="chunk" and m["index"] not in seen:
-            seen.add(m["index"])
-            ctx.append(f"Chunk {m['index']}:\n{r['text']}")
-    # then referenced by questions
-    for r in results:
-        m = r["metadata"]
-        if m["type"]=="question":
-            ci = m["chunk_index"]
-            if ci not in seen:
-                seen.add(ci)
-                ctx.append(f"Chunk {ci} (via Q “{r['text']}”):\n{m['original_chunk']}")
-    return "\n\n".join(ctx)
-# 6) ANSWER GENERATOR (overrides your old generate_response)
-def generate_response_from_context(query, context,
-                                   model="meta-llama/Llama-3.2-3B-Instruct"):
-    sp = (
-      "You are an AI assistant that strictly answers based on the given context. "
-      "If the answer cannot be derived directly from the provided context, "
-      "respond with: 'I do not have enough information to answer that.'"
-    )
-    up = f"""
-    Context:
-    {context}
-    Question: {query}
-    Please answer the question based only on the context above.
     """
-    resp = client.chat.completions.create(
-      model=model,
-      temperature=0,
-      messages=[{"role":"system","content":sp},
-                {"role":"user","content":up}]
     )
-    return resp.choices[0].message.content
 #######################################################################################################################
@@ -522,6 +548,13 @@ def prepare_text_for_embedding(summary_dict):
     return "\n\n".join(combined_chunks)
 ##############################################################################################################
 user_role = st.sidebar.selectbox(
@@ -550,6 +583,8 @@ def role_based_filter(section, summary, role):
 #########################################################################################################################
@@ -558,110 +593,75 @@ if uploaded_file:
     if file_hash != st.session_state.last_uploaded_hash or reprocess_btn:
         st.session_state.processed = False
     if not st.session_state.processed:
         start_time = time.time()
-        # 1) extract & summarize as before
-        raw_text     = extract_text(uploaded_file)
         summary_dict = hybrid_summary_hierarchical(raw_text)
         embedding_text = prepare_text_for_embedding(summary_dict)
-        # ─── NEW: document‐augmentation ingestion ───
-        chunks, store = process_document(raw_text,
-                                         chunk_size=1000,
-                                         chunk_overlap=200,
-                                         questions_per_chunk=5)
-        st.session_state.vector_store = store
-        # ────────────────────────────────────────────
-        # 2) generate your “role‐specific prompt” as before
         st.session_state.document_context = embedding_text
-        if user_role == "General":
-            role_specific_prompt = (
-            "Summarize the legal document focusing on the most relevant aspects "
-            "such as facts, arguments, and judgments. Include key legal reasoning "
-            "and a timeline of events where necessary."
-        )
-        else:
-            role_specific_prompt = (
-            f"As a {user_role}, summarize the legal document focusing on "
-            "the most relevant aspects such as facts, arguments, and judgments "
-            "tailored for your role. Include key legal reasoning and timeline of events."
-        )
-        # ─── REPLACE rag_query_response with doc‐augmentation RAG ───
-        results = store.search(role_specific_prompt, k=5)
-        context = prepare_context(results)
-        rag_summary = generate_response_from_context(role_specific_prompt, context)
-        #
-        st.session_state.messages.append({
-            "role": "user",
-            "content": f"📤 Uploaded **{uploaded_file.name}**"
-        })
-        st.session_state.messages.append({
-            "role": "assistant",
-            "content": rag_summary
-        })
         with st.chat_message("assistant", avatar=BOT_AVATAR):
             display_with_typing_effect(rag_summary)
         processing_time = round((time.time() - start_time) / 60, 2)
         st.info(f"⏱️ Response generated in **{processing_time} minutes**.")
-        st.session_state.generated_summary = rag_summary
         st.session_state.last_uploaded_hash = file_hash
         st.session_state.processed = True
         st.session_state.last_prompt_hash = None
         save_chat_history(st.session_state.messages)
 if prompt:
     words = prompt.split()
-    word_count   = len(words)
-    prompt_hash  = hashlib.md5(prompt.encode("utf-8")).hexdigest()
-    # 1) LONG prompts – echo & ingest like a “paste‐in” document
     if word_count > 30 and prompt_hash != st.session_state.last_prompt_hash:
         st.session_state.last_prompt_hash = prompt_hash
         raw_text = prompt
         st.session_state.messages.append({
             "role": "user",
-            "content": f"📥 **Pasted Document Text:**\n\n{limit_text(raw_text,500)}"
         })
         with st.chat_message("user", avatar=USER_AVATAR):
-            st.markdown(limit_text(raw_text,500))
         start_time = time.time()
-        # summarization + emb_text as before
-        summary_dict   = hybrid_summary_hierarchical(raw_text)
-        emb_text       = prepare_text_for_embedding(summary_dict)
         st.session_state.document_context = emb_text
         st.session_state.processed        = True
-        # ─── NEW: ingest via document‐augmentation ───
-        chunks, store = process_document(raw_text)
-        st.session_state.vector_store = store
-        if user_role == "General":
-            role_prompt = (
-                "Summarize the document focusing on facts, arguments, judgments, "
-                "and include a timeline of events."
-            )
-        else:
-            role_prompt = (
-                f"As a {user_role}, summarize the document focusing on facts, "
-                "arguments, judgments, plus timeline of events."
-            )
-        # ─── doc‐augmentation RAG here too ───
-        results = store.search(role_prompt, k=5)
-        context = prepare_context(results)
-        initial_summary = generate_response_from_context(role_prompt, context)
         st.session_state.messages.append({
             "role": "assistant",
             "content": initial_summary
@@ -672,38 +672,25 @@ if prompt:
         st.info(f"⏱️ Summary generated in {round((time.time()-start_time)/60,2)} minutes")
         save_chat_history(st.session_state.messages)
-    # 2) SHORT prompts – normal RAG against last ingested context
     elif word_count <= 30 and st.session_state.processed:
-        with st.chat_message("user", avatar=USER_AVATAR):
-            st.markdown(prompt)
-    # 2) save to history
-        st.session_state.messages.append({"role": "user", "content": prompt})
-        store = st.session_state.vector_store
-        # ─── instead of rag_query_response, do doc‐augmentation RAG ───
-        results = store.search(prompt, k=5)
-        context = prepare_context(results)
-        answer  = generate_response_from_context(prompt, context)
-        # st.session_state.messages.append({"role":"user",     "content":prompt})
-        st.session_state.messages.append({"role":"assistant","content":answer})
         with st.chat_message("assistant", avatar=BOT_AVATAR):
             display_with_typing_effect(answer)
         save_chat_history(st.session_state.messages)
-    # 3) not enough input
     else:
         with st.chat_message("assistant", avatar=BOT_AVATAR):
             st.markdown("❗ Paste at least 30 words of your document to ingest it first.")
 ################################Evaluation###########################
-######################################################################################################################
 # 📚 Imports
 import evaluate
 from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
@@ -759,8 +746,6 @@ if ground_truth_summary_file:
     else:
         st.warning("⚠️ Please generate a summary first by uploading a document.")
 ######################################################################################################################

     st.session_state.last_prompt_hash = None
+st.title("📄 Legal Document Summarizer (Alt Model w/o token doc Aug)")
 USER_AVATAR = "👤"
 BOT_AVATAR = "🤖"
 tokenizer_led, model_led = load_led()
+from transformers import pipeline
+@st.cache_resource
+def load_led_summarizer():
+    # Use “allenai/led-base-16384” (or “led-large-16384”)
+    return pipeline(
+        "summarization",
+        model="allenai/led-base-16384",
+        tokenizer="allenai/led-base-16384",
+        device=0 if torch.cuda.is_available() else -1
+    )
+led_summarizer = load_led_summarizer()
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+@st.cache_resource
+def load_paraphraser():
+    tok   = AutoTokenizer.from_pretrained("google/flan-t5-small")
+    model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
+    return pipeline(
+      "text2text-generation",
+      model=model,
+      tokenizer=tok,
+      device=0 if torch.cuda.is_available() else -1,
+      max_length=256,
+      num_beams=4,
+      do_sample=False
+    )
+paraphraser = load_paraphraser()
+def humanize(text):
+    out = paraphraser(f"paraphrase: {text}",
+                     max_length=256,
+                     num_beams=4,
+                     do_sample=False)[0]["generated_text"]
+    return out
+# humanize() is applied to the final answer at the end of rag_query_response.
 def legalbert_extractive_summary(text, top_ratio=0.2):
     sentences = sent_tokenize(text)
     top_k = max(3, int(len(sentences) * top_ratio))
     return structured_summary
+from sentence_transformers import SentenceTransformer
+@st.cache_resource
+def load_embedder():
+    return SentenceTransformer("all-MiniLM-L6-v2")
+embedder = load_embedder()
+import numpy as np
 def chunk_text_custom(text, n=1000, overlap=200):
     chunks = []
     for i in range(0, len(text), n - overlap):
     resp = client.embeddings.create(model=model, input=text)
     return np.array(resp.data[0].embedding)
+def create_embeddings(text_chunks, model="BAAI/bge-en-icl"):
     """
+    Batch the get_embedding call over your chunks.
+    Returns a list of numpy arrays.
     """
+    return [get_embedding(chunk, model=model) for chunk in text_chunks]
+def generate_questions(text_chunk, num_questions=5):
+    """
+    Use LED to generate a small set of probing questions
+    about this chunk that the final answer should address.
+    """
+    prompt = (
+        "You are a question-generation expert. "
+        "From the text below, generate "
+        f"{num_questions} concise questions:\n\n{text_chunk}"
     )
+    out = led_summarizer(
+        prompt,
+        max_length=128,
+        min_length=32,
+        num_beams=4,
+        do_sample=False
+    )[0]["summary_text"]
+    # assume each question on its own line
+    questions = [q.strip() for q in out.split("\n") if q.strip()]
+    return questions[:num_questions]
+def process_document(document_text):
+    """
+    1) chunk the document
+    2) embed each chunk with your SentenceTransformer
+    returns chunks, embeddings
+    """
+    chunks = chunk_text_custom(document_text, n=800, overlap=200)
+    embeddings = embedder.encode(chunks, convert_to_tensor=False)
+    return chunks, embeddings
+def semantic_search(query, chunks, chunk_embeddings, k=5):
     """
+    Score each chunk by cosine similarity to the query embed
+    and return the top-k chunks (in descending order).
+    """
+    q_emb = embedder.encode([query], convert_to_tensor=False)[0]
+    scores = [
+        float(np.dot(q_emb, emb) / (np.linalg.norm(q_emb) * np.linalg.norm(emb)))
+        for emb in chunk_embeddings
+    ]
+    top_idxs = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
+    return [chunks[i] for i in top_idxs]
+def prepare_context(questions, chunks, chunk_embeddings, k_per_question=2):
+    """
+    For each generated question, pick its top-k supporting chunks,
+    then dedupe & concatenate into one context string.
+    """
+    selected = []
+    for q in questions:
+        best = semantic_search(q, chunks, chunk_embeddings, k=k_per_question)
+        selected.extend(best)
+    # dedupe while preserving order
     seen = set()
+    context = []
+    for c in selected:
+        if c not in seen:
+            seen.add(c)
+            context.append(c)
+    return "\n\n".join(f"• {c}" for c in context)
+def rag_query_response(prompt, document_text):
     """
+    Document-Augmentation RAG:
+      1. generate probing sub-questions about the doc
+      2. process the doc (chunk + embed)
+      3. build minimal context via those questions
+      4. feed context + user prompt into LED
+      5. paraphrase (humanize)
+    """
+    # 1) Probing questions
+    questions = generate_questions(document_text, num_questions=5)
+    # 2) Chunk & embed the document
+    chunks, chunk_embs = process_document(document_text)
+    # 3) Assemble the distilled context
+    context = prepare_context(questions, chunks, chunk_embs, k_per_question=2)
+    # 4) Compose the LED input
+    led_input = (
+        "You are a knowledgeable legal assistant. "
+        "Answer the user’s question **using ONLY** the context below, "
+        "and speak in a friendly, conversational tone.\n\n"
+        f"Context:\n{context}\n\n"
+        f"Question: {prompt}\n\nAnswer:"
     )
+    raw = led_summarizer(
+        led_input,
+        max_length=512,
+        min_length=64,
+        do_sample=False
+    )[0]["summary_text"]
+    # 5) Humanize
+    return humanize(raw)
 #######################################################################################################################
     return "\n\n".join(combined_chunks)
+###################################################################################################################
+# Store cleaned text and FAISS index only when document is processed
+# Embedding for chunking
 ##############################################################################################################
 user_role = st.sidebar.selectbox(
 #########################################################################################################################
+    # A file whose hash differs from the last processed one, or an explicit
+    # reprocess request, invalidates the previous result.
     if file_hash != st.session_state.last_uploaded_hash or reprocess_btn:
         st.session_state.processed = False
+    # if is_new_file or reprocess_btn:
+    #     st.session_state.processed = False
+    # Run the extract -> summarize -> RAG pipeline only while the current
+    # upload is still marked as unprocessed.
     if not st.session_state.processed:
         start_time = time.time()
+        raw_text = extract_text(uploaded_file)
         summary_dict = hybrid_summary_hierarchical(raw_text)
+        # timeline_data = extract_timeline(clean_text(raw_text))
         embedding_text = prepare_text_for_embedding(summary_dict)
+        # Keep the prepared summary text as the document context for later
+        # prompts, then generate and display the RAG-based summary.
         st.session_state.document_context = embedding_text
+        # NOTE(review): the role is interpolated verbatim, so a role named
+        # "General" would read "As a General, ..." — confirm that is intended.
+        role_specific_prompt = f"As a {user_role}, summarize the legal document focusing on the most relevant aspects such as facts, arguments, and judgments tailored for your role. Include key legal reasoning and timeline of events where necessary."
+        rag_summary = rag_query_response(role_specific_prompt, embedding_text)
+        st.session_state.messages.append({"role": "user", "content": f"📤 Uploaded **{uploaded_file.name}**"})
+        st.session_state.messages.append({"role": "assistant", "content": rag_summary})
         with st.chat_message("assistant", avatar=BOT_AVATAR):
             display_with_typing_effect(rag_summary)
+        # Elapsed wall-clock time is reported in minutes, rounded to 2 decimals.
         processing_time = round((time.time() - start_time) / 60, 2)
         st.info(f"⏱️ Response generated in **{processing_time} minutes**.")
+        st.session_state.generated_summary = rag_summary   # kept for the Evaluation section
         st.session_state.last_uploaded_hash = file_hash
         st.session_state.processed = True
+        # Clear the remembered prompt hash so a previously pasted long prompt is
+        # no longer treated as already handled.
         st.session_state.last_prompt_hash = None
         save_chat_history(st.session_state.messages)
 if prompt:
     words = prompt.split()
+    word_count = len(words)
+    prompt_hash = hashlib.md5(prompt.encode("utf-8")).hexdigest()
+    # 1) LONG prompts – echo first, then summarize
     if word_count > 30 and prompt_hash != st.session_state.last_prompt_hash:
+        # mark new prompt
         st.session_state.last_prompt_hash = prompt_hash
+        # raw_text is just the prompt text
         raw_text = prompt
         st.session_state.messages.append({
             "role": "user",
+            "content": f"📥 **Pasted Document Text:**\n\n{limit_text(raw_text, word_limit=500)}"
         })
         with st.chat_message("user", avatar=USER_AVATAR):
+            st.markdown(limit_text(raw_text, word_limit=500))
         start_time = time.time()
+        summary_dict = hybrid_summary_hierarchical(raw_text)
+        emb_text     = prepare_text_for_embedding(summary_dict)
         st.session_state.document_context = emb_text
         st.session_state.processed        = True
+        role_prompt = (
+            f"As a {user_role}, summarize the document focusing on facts, "
+            "arguments, judgments, plus timeline of events."
+        )
+        initial_summary = rag_query_response(role_prompt, emb_text)
+        # 3️⃣ Append & display the assistant’s summary with typing effect
         st.session_state.messages.append({
             "role": "assistant",
             "content": initial_summary
         st.info(f"⏱️ Summary generated in {round((time.time()-start_time)/60,2)} minutes")
         save_chat_history(st.session_state.messages)
+    # 2) SHORT prompts: normal RAG against last context
     elif word_count <= 30 and st.session_state.processed:
+        role_query = f"As a {user_role}, {prompt}"
+        answer     = rag_query_response(role_query, st.session_state.document_context)
+        answer = rag_query_response(prompt, st.session_state.document_context)
+        st.session_state.messages.append({"role": "user",    "content": prompt})
+        st.session_state.messages.append({"role": "assistant","content": answer})
         with st.chat_message("assistant", avatar=BOT_AVATAR):
             display_with_typing_effect(answer)
         save_chat_history(st.session_state.messages)
+    # 3) Ingest prompt to start
     else:
         with st.chat_message("assistant", avatar=BOT_AVATAR):
             st.markdown("❗ Paste at least 30 words of your document to ingest it first.")
 ################################Evaluation###########################
 # 📚 Imports
 import evaluate
 from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
     else:
         st.warning("⚠️ Please generate a summary first by uploading a document.")
 ######################################################################################################################