# (Removed scraped Hugging Face Spaces status banner residue — "Spaces: / Sleeping / Sleeping" was page chrome, not code.)
import ast | |
from huggingface_hub import InferenceClient | |
import os | |
from typing import List | |
import requests | |
from bs4 import BeautifulSoup | |
class ServerlessInference:
    '''
    Interface to the HF serverless inference API.

    Combines a text vector store and an image vector store with a hosted chat
    model to answer questions via retrieval-augmented generation (RAG).
    '''
    def __init__(self, vector_store_text=None, vector_store_images=None):
        # Chat model served by the HF serverless inference API.
        self.model: str = "HuggingFaceH4/zephyr-7b-beta"
        # NOTE(review): "HF_SERVELESS_API" looks misspelled ("SERVERLESS"),
        # but existing deployments set this exact env-var name, so it is
        # deliberately kept unchanged.
        self.client = InferenceClient(api_key=os.getenv("HF_SERVELESS_API"))
        self.vs_text = vector_store_text      # text-document vector store
        self.vs_images = vector_store_images  # image-caption vector store

    def _chat(self, messages: list, max_tokens: int = 500) -> str:
        '''Run a single chat completion and return the assistant's text.'''
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            max_tokens=max_tokens,
        )
        return completion.choices[0].message.content

    def test(self, query: str) -> str:
        '''Responds to generic query using llm'''
        messages: list = [
            {
                "role": "user",
                "content": query
            }
        ]
        return self._chat(messages)

    def perform_rag(self, query: str):
        '''Answer ``query`` with text RAG, then select supporting images.

        Returns:
            tuple: ``(response_text, response_images)`` where
            ``response_images`` is a list of ``(url, caption)`` pairs —
            empty when no relevant image is found or image selection fails.
        '''
        # --- Text retrieval ---
        retrieved_docs = self.vs_text.similarity_search(query=query, k=5)
        retrieved_docs_text = [doc.page_content for doc in retrieved_docs]  # We only need the text of the documents
        context = "\nExtracted documents:\n"
        context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])

        # --- Augmented generation: answer from the retrieved text context ---
        messages: list = [
            {
                "role": "system",
                "content": """Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
If the answer cannot be deduced from the context, do not give an answer. Instead say `Theres lack of information in document source.`""",
            },
            {
                "role": "user",
                "content": """Context:
{context}
---
Now here is the question you need to answer.
Question: {question}""".format(context=context, question=query),
            },
        ]
        response_text = self._chat(messages)

        # --- Image retrieval: find captions related to the user's query ---
        retrieved_image = self.vs_images.similarity_search(query=query, k=5)
        retrieved_docs_text = [doc.page_content for doc in retrieved_image]  # We only need the text of the documents
        context = "\nExtracted Images:\n"
        context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])
        messages: list = [
            {
                "role": "system",
                "content": """Using the information contained in the context about the images stored in the database,
give a list of identifiers of the image that best represent the kind of information seeked by the document description of user.
Respond only to the question asked. Provide only number(s) of the source images relevant to the question.
If the image is relevant to the question then output format should be a list [1, 3, 0]
otherwise just reply empty list that is []""",
            },
            {
                # One-shot example teaching the list-only reply format.
                "role": "user",
                "content": """Context:
Extracted Images:
Document 0:::
Rahuls playing football
Document 1:::
Rahul recieving award in Archery.
---
Now here is the question you need to answer.
Document Description: Rahul is excellent player of archery and great poet."""
            },
            {
                "role": "assistant",
                "content": "[1, ]"
            },
            {
                "role": "user",
                "content": """Context:
{context}
---
Now here is the question you need to answer.
Document Description: {response_text}""".format(context=context, response_text=response_text),
            },
        ]
        images_list_str: str = self._chat(messages)

        # --- Map the model's chosen indices back to (url, caption) pairs ---
        response_images = []
        try:
            images_list: list = parse(images_list_str)
            for idx in images_list:
                caption = retrieved_image[idx].page_content
                url = get_wiki_file_to_image_url(retrieved_image[idx].metadata["url"])
                response_images.append(
                    (url, caption)
                )
        except Exception:
            # Best effort: an out-of-range index or malformed model reply must
            # not break the text answer. Log the raw model output — the
            # original printed ``images_list``, which is unbound when the
            # failure happened before/at the parse, so the handler itself
            # raised UnboundLocalError and masked the real error.
            print("Error in parsing suggested images, ", images_list_str)
            response_images = []
        return response_text, response_images
def parse(value: str) -> List[int]:
    """
    Extract a list of numbers from the given string.

    Finds the first '[' and the first ']' occurring *after* it, then parses
    the enclosed text as a Python literal.

    Parameters:
        value (str): The input string containing the list of numbers.

    Returns:
        list: The parsed list if found and valid, otherwise an empty list.
    """
    try:
        # Find the substring that looks like a list.
        start = value.index('[')
        # Search for the closing bracket after the opening one — the original
        # first-']' lookup broke on inputs like "noise] ... [1, 2]".
        end = value.index(']', start)
        # Extract and parse it into a Python object.
        result = ast.literal_eval(value[start:end + 1])
        # Guard: literal_eval of "[...]" should be a list, but be defensive.
        return result if isinstance(result, list) else []
    except (ValueError, SyntaxError):
        # Return an empty list if no bracket pair exists or parsing fails.
        return []
def get_wiki_file_to_image_url(file_page_url: str) -> str:
    """
    Resolve a wiki "File:" description-page URL to the direct image URL.

    Fetches the page and extracts the direct media link (the anchor with
    class "internal", which on Wikimedia file pages carries a
    protocol-relative href to the raw file).

    Parameters:
        file_page_url (str): URL of the wiki file description page.

    Returns:
        str: The direct image URL, or ``file_page_url`` unchanged when the
        page cannot be fetched or no media link is found. (The original
        implicitly returned None on a non-200 response, handing callers a
        non-URL value.)
    """
    # Headers to mimic a browser — some wikis reject default library agents.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    # Step 1: Get the file page HTML (bounded wait so a hung request cannot
    # block the whole RAG pipeline).
    response = requests.get(file_page_url, headers=headers, timeout=10)
    if response.status_code == 200:
        # Parse the HTML content.
        soup = BeautifulSoup(response.content, "html.parser")
        # Step 2: Find the link to the image file.
        image_tag = soup.find("a", {"class": "internal"})
        if image_tag and "href" in image_tag.attrs:
            # href is protocol-relative ("//upload...."), so prefix a scheme.
            return "https:" + image_tag["href"]
        return file_page_url
    # Non-200 response: fall back to the page URL so callers always get a str.
    return file_page_url