saaketvarma commited on
Commit
74b73d4
·
1 Parent(s): 368155b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -41
app.py CHANGED
@@ -1,55 +1,96 @@
1
- # from dotenv import load_dotenv
2
  import streamlit as st
 
3
  from PyPDF2 import PdfReader
4
  from langchain.text_splitter import CharacterTextSplitter
5
- from langchain.embeddings.openai import OpenAIEmbeddings
6
  from langchain.vectorstores import FAISS
7
- from langchain.chains.question_answering import load_qa_chain
8
- from langchain.llms import OpenAI
9
- from langchain.callbacks import get_openai_callback
 
10
 
 
 
 
 
 
 
 
11
 
12
- def main():
13
- # load_dotenv()
14
- st.set_page_config(page_title="Ask your PDF")
15
- st.header("Ask your PDF 💬")
16
-
17
- # upload file
18
- pdf = st.file_uploader("Upload your PDF", type="pdf")
19
-
20
- # extract the text
21
- if pdf is not None:
22
- pdf_reader = PdfReader(pdf)
23
- text = ""
24
- for page in pdf_reader.pages:
25
- text += page.extract_text()
26
-
27
- # split into chunks
28
- text_splitter = CharacterTextSplitter(
29
  separator="\n",
30
  chunk_size=1000,
31
  chunk_overlap=200,
32
  length_function=len
33
- )
34
- chunks = text_splitter.split_text(text)
35
-
36
- # create embeddings
37
- embeddings = OpenAIEmbeddings()
38
- knowledge_base = FAISS.from_texts(chunks, embeddings)
39
-
40
- # show user input
41
- user_question = st.text_input("Ask a question about your PDF:")
42
- if user_question:
43
- docs = knowledge_base.similarity_search(user_question)
44
-
45
- llm = OpenAI()
46
- chain = load_qa_chain(llm, chain_type="stuff")
47
- with get_openai_callback() as cb:
48
- response = chain.run(input_documents=docs, question=user_question)
49
- print(cb)
50
-
51
- st.write(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  if __name__ == '__main__':
55
  main()
 
 
1
  import streamlit as st
2
+ from dotenv import load_dotenv
3
  from PyPDF2 import PdfReader
4
  from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
  from langchain.vectorstores import FAISS
7
+ from langchain.memory import ConversationBufferMemory
8
+ from langchain.chains import ConversationalRetrievalChain
9
+ from htmlTemplates import css, bot_template, user_template
10
+ from langchain.llms import HuggingFaceHub
11
 
12
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Args:
        pdf_docs: iterable of file-like objects readable by PyPDF2.PdfReader.

    Returns:
        One string with all page texts appended in upload/page order.
    """
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() may return None (e.g. image-only/scanned pages);
            # guard so concatenation cannot raise TypeError.
            text += page.extract_text() or ""
    return text
19
 
20
+
21
def get_text_chunks(text):
    """Split raw document text into overlapping chunks for embedding.

    Uses newline-separated chunks of up to 1000 characters with a
    200-character overlap so context is preserved across boundaries.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
30
+
31
+
32
def get_vectorstore(text_chunks):
    """Embed the text chunks with Instructor-XL and index them in FAISS.

    NOTE(review): instantiating HuggingFaceInstructEmbeddings downloads the
    model on first use — this is slow and happens per call.
    """
    instructor_embeddings = HuggingFaceInstructEmbeddings(
        model_name="hkunlp/instructor-xl"
    )
    return FAISS.from_texts(texts=text_chunks, embedding=instructor_embeddings)
36
+
37
+
38
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over the given vector store.

    The chain pairs a HuggingFace Hub LLM with a buffer memory so follow-up
    questions can reference earlier turns of the chat.
    """
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    hub_llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512},
    )
    return ConversationalRetrievalChain.from_llm(
        llm=hub_llm,
        retriever=vectorstore.as_retriever(),
        memory=chat_memory,
    )
49
+
50
+
51
def handle_userinput(user_question):
    """Run the user's question through the conversation chain and render chat.

    The chain's 'chat_history' alternates user (even index) and bot
    (odd index) messages; each is rendered with its HTML template.

    Args:
        user_question: the question string typed by the user.
    """
    # Guard: the chain only exists after PDFs have been processed. Without
    # this, asking a question first raises TypeError (None is not callable).
    if st.session_state.conversation is None:
        st.warning("Please upload and process your PDFs first.")
        return

    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    for i, message in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.write(user_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            st.write(bot_template.replace(
                "{{MSG}}", message.content), unsafe_allow_html=True)
62
+
63
+
64
def main():
    """Streamlit entry point: page setup, question box, and PDF sidebar."""
    # load_dotenv is imported at the top of the file but was never called;
    # without this call, HUGGINGFACEHUB_API_TOKEN (and any other settings in
    # a local .env) are never loaded into the environment.
    load_dotenv()

    st.set_page_config(page_title="Get Questions Related To Your PDF Answered",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Initialize session state so Streamlit reruns don't lose the chain
    # or the accumulated chat history.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Get Questions Related To Your PDF Answered :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your PDFs")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        if st.button("Process"):
            with st.spinner("Processing"):
                # Pipeline: raw text -> chunks -> vector index -> chat chain.
                raw_text = get_pdf_text(pdf_docs)
                text_chunks = get_text_chunks(raw_text)
                vectorstore = get_vectorstore(text_chunks)
                st.session_state.conversation = get_conversation_chain(
                    vectorstore)
94
 
95
# Standard script guard: launch the Streamlit app only when run directly.
if __name__ == '__main__':
    main()