Samarth991 committed
Commit 4708376 · 1 Parent(s): 7fef6fd

adding SMOL tool

app.py CHANGED
@@ -3,8 +3,8 @@ import streamlit as st
 from PIL import Image
 from pathlib import Path
 from QA_bot import tyre_synap_bot as bot
-from llm_service import get_llm
-from hub_prompts import PREFIX
+from llm.llm_service import get_llm
+from prompts.hub_prompts import PREFIX
 
 from extract_tools import get_all_tools
 from langchain.agents import AgentExecutor
@@ -15,15 +15,14 @@ from langchain.tools.render import render_text_description
 
 import logging
 import warnings
-warnings.filterwarnings("ignore")
 
+warnings.filterwarnings("ignore")
 logging.basicConfig(filename="newfile.log",
                     format='%(asctime)s %(message)s',
                     filemode='w')
 logger = logging.getLogger()
 
 llm = None
-tools = None
 cv_agent = None
 
 @st.cache_resource
@@ -32,14 +31,14 @@ def call_llmservice_model(option,api_key):
     return model
 
 @st.cache_resource
-def setup_agent_prompt():
+def setup_agent_prompt(_tools):
     prompt = hub.pull("hwchase17/react-json")
-    if len(tools) == 0 :
+    if len(_tools) == 0 :
         logger.error ("No Tools added")
     else :
         prompt = prompt.partial(
-            tools= render_text_description(tools),
-            tool_names= ", ".join([t.name for t in tools]),
+            tools = render_text_description(_tools),
+            tool_names= ", ".join([t.name for t in _tools]),
             additional_kwargs={
                 'system_message':PREFIX,
             }
@@ -48,7 +47,9 @@ def setup_agent_prompt():
 
 @st.cache_resource
 def agent_initalize():
-    agent_prompt = setup_agent_prompt()
+    agent_tools = get_all_tools()
+    logger.info("\tFound {} tools ".format(len(agent_tools)))
+    agent_prompt = setup_agent_prompt(_tools=agent_tools)
     lm_with_stop = llm.bind(stop=["\nObservation"])
     #### we can use create_react_agent https://github.com/langchain-ai/langchain/blob/master/libs/langchain/langchain/agents/react/agent.py
     agent = (
@@ -62,29 +63,9 @@ def agent_initalize():
     )
 
     # instantiate AgentExecutor
-    agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True,handle_parsing_errors=True)
+    agent_executor = AgentExecutor(agent=agent, tools=agent_tools, verbose=True,handle_parsing_errors=True)
     return agent_executor
 
-# def agent_initalize(tools,max_iterations=5):
-#     zero_shot_agent = initialize_agent(
-#         agent= AgentType.ZERO_SHOT_REACT_DESCRIPTION,
-#         tools = tools,
-#         llm = llm,
-#         verbose = True,
-#         max_iterations = max_iterations,
-#         memory = None,
-#         handle_parsing_errors=True,
-#         agent_kwargs={
-#             'system_message':PREFIX,
-#             # 'format_instructions':FORMAT_INSTRUCTIONS,
-#             # 'suffix':SUFFIX
-#         }
-#     )
-#     # sys_message = PREFIX
-#     # zero_shot_agent.agent.llm_chain.prompt.template = sys_message
-#     return zero_shot_agent
-
-
 def main():
     database_store = 'image_store'
     st.session_state.disabled = False
@@ -137,14 +118,10 @@ def main():
         global llm
         llm = call_llmservice_model(option=option,api_key=api_key)
        logger.info("\tLLM Service {} Active ... !".format(llm.get_name()))
-        ## extract tools
-        global tools
-        tools = get_all_tools()
-        logger.info("\tFound {} tools ".format(len(tools)))
+
         ## generate Agent
         global agent
         cv_agent = agent_initalize()
-        logger.info('\tAgent inintalized with {} tools '.format(len(tools)))
 
         with open(file_path, mode='wb') as w:
            w.write(uploaded_file.getvalue())
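The leading underscore in setup_agent_prompt(_tools) follows Streamlit's caching convention: parameters whose names begin with an underscore are excluded from the cache key, so unhashable objects such as LangChain tool instances can be passed into a function decorated with @st.cache_resource. A minimal sketch of that convention, using hypothetical names rather than code from this commit:

import streamlit as st

@st.cache_resource
def build_registry(_tools, registry_name: str):
    # _tools is skipped when Streamlit hashes the arguments, so unhashable
    # objects (e.g. LangChain Tool instances) can be passed in; only
    # registry_name contributes to the cache key.
    return {"name": registry_name, "count": len(_tools)}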
extract_tools.py CHANGED
@@ -4,7 +4,7 @@ import requests
 from PIL import Image
 import logging
 import torch
-from llm_service import get_llm
+from llm.llm_service import get_llm
 from langchain_core.tools import tool,Tool
 from langchain_community.tools import DuckDuckGoSearchResults
 from langchain_groq import ChatGroq
@@ -13,7 +13,7 @@ from typing import List
 from tool_utils.clip_segmentation import CLIPSEG
 from tool_utils.yolo_world import YoloWorld
 from tool_utils.image_qualitycheck import brightness_check,gaussian_noise_check,snr_check
-
+from tool_utils.image_description import SMOLVLM2
 try:
     from transformers import BlipProcessor, BlipForConditionalGeneration
     from transformers import AutoImageProcessor, Mask2FormerForUniversalSegmentation
@@ -73,37 +73,10 @@ def panoptic_image_segemntation(image_path:str)->str:
 def image_description(img_path:str)->str:
     "Use this tool to describe the image " \
     "The tool helps you to identify weather in the image as well "
-    hf_model = "Salesforce/blip-image-captioning-base"
-    text = ""
-    if img_path.startswith('https'):
-        image = Image.open(requests.get(img_path, stream=True).raw).convert('RGB')
-    else:
-        image = Image.open(img_path).convert('RGB')
-    try:
-        processor = BlipProcessor.from_pretrained(hf_model)
-        caption_model = BlipForConditionalGeneration.from_pretrained(hf_model).to(device)
-    except:
-        logging.error("unable to load the Blip model ")
-
-    logging.info("Image Caption model loaded ! ")
-
-    # unconditional image captioning
-    inputs = processor(image, return_tensors ='pt').to(device)
-    output = caption_model.generate(**inputs, max_new_tokens=50)
-    caption = processor.decode(output[0], skip_special_tokens=True)
-
-    # # conditional image captioning
-    # obj_text = "Total number of objects in image "
-    # inputs_2 = processor(image, obj_text ,return_tensors ='pt').to(device)
-    # out_2 = caption_model.generate(**inputs_2,max_new_tokens=50)
-    # object_caption = processor.decode(out_2[0], skip_special_tokens=True)
-
-    ## clear the GPU cache
-    with torch.no_grad():
-        torch.cuda.empty_cache()
-    text = caption + " ."
-    return text
-
+    smol_vlm = SMOLVLM2(memory_efficient=True)
+    query="Describe the image. Higlight the details in 2-3 lines"
+    response = smol_vlm.run_inference_on_image(image_path=img_path,query=query)
+    return response
 
 @tool
 def clipsegmentation_mask(input_data:str)->str:
@@ -163,12 +136,11 @@ def get_image_quality(image_path:str)->str:
 
     brightness_text = brightness_check(image)
     blurry_text = gaussian_noise_check(image)
-    snr_text = snr_check(image)
-    final_text = "Image properties are :\n{}\n{}\n{}".format(blurry_text, brightness_text,snr_text)
+    # snr_text = snr_check(image)
+    final_text = "Image properties are :\n{}\n{}".format(blurry_text, brightness_text)
     return final_text
 
 
-
 def get_all_tools():
     ## bind tools
     image_desc_tool = Tool(
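Since the other helpers in this module are exposed through LangChain's @tool decorator, the trimmed image-quality report (with the SNR check commented out) can be exercised directly through the tool interface. A minimal sketch, assuming get_image_quality is decorated with @tool like clipsegmentation_mask and that an image exists at the hypothetical path below:

from extract_tools import get_image_quality

# Single-string input maps to the tool's image_path argument; the path is a
# placeholder for any locally stored image.
report = get_image_quality.invoke("image_store/sample.jpg")
print(report)  # expected to list brightness and blur only, since the SNR check is commented out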
llm_service.py → llm/llm_service.py RENAMED
File without changes
hub_prompts.py → prompts/hub_prompts.py RENAMED
File without changes
tool_utils/image_description.py ADDED
@@ -0,0 +1,75 @@
+import os
+import cv2
+import torch
+import gc
+from transformers import AutoProcessor, AutoModelForImageTextToText
+import logging
+
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+torch.cuda.empty_cache()
+os.environ['PYTORCH_CUDA_ALLOC_CONF']= 'max_split_size_mb:1024'
+gc.collect()
+
+class SMOLVLM2:
+    def __init__(self,model_name = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct" , memory_efficient=True):
+        self.half = True
+        self.processor = AutoProcessor.from_pretrained(model_name)
+        if self.support_flash_attension(device_id=0):
+            self.model = AutoModelForImageTextToText.from_pretrained(
+                model_name,
+                torch_dtype=torch.float16,
+                _attn_implementation="flash_attention_2"
+            ).to(device)
+        else:
+            self.model = AutoModelForImageTextToText.from_pretrained(
+                model_name,
+                torch_dtype=torch.float16,
+            ).to(device)
+        logging.info("Model loaded")
+        self.print_gpu_memory()
+
+    @staticmethod
+    def print_gpu_memory():
+        logging.info(f"Allocated memory: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
+        logging.info(f"Cached memory: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
+
+    ## check for flash attension
+    @staticmethod
+    def support_flash_attension(device_id):
+        """ Check if GPU supports FalshAttension"""
+        support = False
+        major, minor = torch.cuda.get_device_capability(device_id)
+        if major<8:
+            print("GPU does not support Flash Attension")
+        else:
+            support = True
+        return support
+
+    def run_inference_on_image(self,image_path,query):
+        messages = [
+            {
+                "role":"user",
+                "content":[
+                    {"type":"image","path":image_path},
+                    {"type":"text","text":query}
+                ]
+            }
+        ]
+        inputs = self.processor.apply_chat_template(
+            messages,
+            add_generation_prompt = True,
+            tokenize = True,
+            return_dict = True,
+            return_tensors = 'pt'
+        )
+        if self.half:
+            inputs.to(torch.half).to(device)
+        else:
+            inputs.to(device)
+        generated_ids = self.model.generate(**inputs,do_sample = False , max_new_tokens = 1024)
+        generated_texts = self.processor.batch_decode(generated_ids,skip_special_tokens=True)
+        del inputs
+        torch.cuda.empty_cache()
+        return generated_texts[0].split('\n')[-1]
+
+
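A short usage sketch for the new SMOLVLM2 wrapper, mirroring how extract_tools.image_description now calls it; the image path is hypothetical, and a CUDA-capable GPU is assumed because the constructor probes torch.cuda.get_device_capability before loading the model:

from tool_utils.image_description import SMOLVLM2

vlm = SMOLVLM2(memory_efficient=True)  # loads HuggingFaceTB/SmolVLM2-500M-Video-Instruct in float16
caption = vlm.run_inference_on_image(
    image_path="image_store/sample_tyre.jpg",  # hypothetical local image
    query="Describe the image. Highlight the details in 2-3 lines",
)
print(caption)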