File size: 6,245 Bytes
ec6359d
33080cc
 
2a28d9d
 
 
33080cc
 
3bfe553
 
 
2fe32bb
33080cc
 
2fe32bb
 
33080cc
 
3bfe553
f0c8373
33080cc
 
 
 
 
 
 
 
 
 
 
2fe32bb
 
 
 
060a333
 
 
 
 
 
 
f0c8373
060a333
 
 
 
 
 
 
 
f0c8373
060a333
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f0c8373
 
 
 
 
 
 
 
 
 
 
 
 
26c4757
f0c8373
 
 
 
 
f1fa604
f0c8373
 
 
f1fa604
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26c4757
f1fa604
 
 
 
 
 
 
f0c8373
 
 
 
 
 
 
 
 
 
26c4757
f0c8373
 
 
 
 
 
 
 
 
3bfe553
 
 
 
 
 
 
 
 
 
 
 
 
 
f0c8373
2a28d9d
 
 
 
ec6359d
 
 
 
 
 
 
 
 
2a28d9d
ec6359d
 
 
 
 
 
 
2a28d9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import ast
from huggingface_hub import InferenceClient
import os 
from typing import List
import requests
from bs4 import BeautifulSoup

class ServerlessInference:
    '''
    Interface to the HF serverless inference API.

    Combines a chat-completion client with two retrieval stores to answer a
    query (retrieval-augmented generation) and to suggest images that
    illustrate the answer. Each store only needs to expose
    `similarity_search(query=..., k=...)` returning objects that carry a
    `page_content` string; image results must also carry `metadata["url"]`.
    '''

    DEFAULT_MODEL = "HuggingFaceH4/zephyr-7b-beta"
    MAX_TOKENS = 500  # completion budget used for every chat call
    TOP_K = 5         # number of documents requested per similarity search

    # Prompt texts are spelled out line by line so that significant whitespace
    # (including the trailing spaces on two lines) is visible and preserved.
    _ANSWER_SYSTEM_PROMPT = (
        "Using the information contained in the context,\n\n"
        "give a comprehensive answer to the question.\n\n"
        "Respond only to the question asked, response should be concise and relevant to the question.\n\n"
        "If the answer cannot be deduced from the context, do not give an answer. "
        "Instead say `Theres lack of information in document source.`"
    )
    _ANSWER_USER_TEMPLATE = (
        "Context:\n\n"
        "{context}\n\n"
        "---\n\n"
        "Now here is the question you need to answer.\n\n"
        "Question: {question}"
    )
    _IMAGE_SYSTEM_PROMPT = (
        "Using the information contained in the context about the images stored in the database,\n\n"
        "give a list of identifiers of the image that best represent the kind of information seeked "
        "by the document description of user.\n\n"
        "Respond only to the question asked. Provide only number(s) of the source images relevant to the question. \n\n"
        "If the image is relevant to the question then output format should be a list [1, 3, 0] \n\n"
        "otherwise just reply empty list that is []"
    )
    # One worked example (few-shot) shown to the model before the real request.
    _IMAGE_EXAMPLE_USER = (
        "Context:\n"
        "Extracted Images:\n\n"
        "Document 0:::\n"
        "Rahuls playing football\n\n"
        "Document 1:::\n"
        "Rahul recieving award in Archery.\n\n"
        "---\n\n"
        "Now here is the question you need to answer.\n\n"
        "Document Description: Rahul is excellent player of archery and great poet."
    )
    _IMAGE_EXAMPLE_ASSISTANT = "[1, ]"
    _IMAGE_USER_TEMPLATE = (
        "Context:\n\n"
        "{context}\n\n"
        "---\n\n"
        "Now here is the question you need to answer.\n\n"
        "Document Description:  {response_text}"
    )

    def __init__(self, vector_store_text = None, vector_store_images = None, *, model: str = DEFAULT_MODEL):
        '''
        Parameters:
            vector_store_text: store searched for text passages in `perform_rag`.
            vector_store_images: store searched for image captions in `perform_rag`.
            model (str): chat model identifier sent with every completion request.
        '''
        self.model: str = model
        # NOTE(review): the variable name is spelled "SERVELESS"; it is kept
        # as-is because existing deployments presumably define it under this
        # exact name - confirm before renaming.
        self.client = InferenceClient(api_key=os.getenv("HF_SERVELESS_API"))
        self.vs_text = vector_store_text
        self.vs_images = vector_store_images

    def _chat(self, messages: list) -> str:
        '''Send `messages` to the chat endpoint and return the first choice's text.'''
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            max_tokens=self.MAX_TOKENS,
        )
        return completion.choices[0].message.content

    @staticmethod
    def _format_context(header: str, texts) -> str:
        '''
        Render `texts` as numbered "Document i:::" sections below `header`.

        Sections are separated by a blank line so the end of one document
        cannot run into the header of the next one.
        '''
        sections = [f"Document {i}:::\n{text}" for i, text in enumerate(texts)]
        return header + "\n\n".join(sections)

    @staticmethod
    def _collect_images(indices, documents, resolve_url) -> list:
        '''
        Build `(url, caption)` pairs for the documents selected by `indices`.

        Selection is best-effort per entry: an index that is not an integer in
        `range(len(documents))`, or a document whose URL cannot be resolved,
        is skipped without discarding the remaining images.

        Parameters:
            indices: candidate positions into `documents` (model output).
            documents: retrieved image documents (`page_content`, `metadata["url"]`).
            resolve_url: callable mapping the stored page URL to an image URL.
        '''
        if not isinstance(indices, (list, tuple)):
            return []

        pairs = []
        for idx in indices:
            # bool is a subclass of int, so it has to be rejected explicitly.
            if isinstance(idx, bool) or not isinstance(idx, int):
                continue
            if not 0 <= idx < len(documents):
                continue
            doc = documents[idx]
            try:
                page_url = doc.metadata["url"]
                # Fall back to the page itself if the resolver yields nothing.
                url = resolve_url(page_url) or page_url
            except Exception as exc:
                print("Error in resolving suggested image, ", idx, exc)
                continue
            pairs.append((url, doc.page_content))
        return pairs

    def test(self, query:str) -> str:
        '''Responds to generic query using llm'''
        return self._chat([{"role": "user", "content": query}])

    def perform_rag(self, query:str):
        '''
        Answer `query` from the text store and suggest matching images.

        Steps: retrieve text passages, ask the model to answer from them,
        retrieve image captions, then ask the model which captions fit the
        generated answer.

        Returns:
            tuple: `(response_text, response_images)` where `response_images`
            is a list of `(url, caption)` pairs; it is empty when the model's
            image selection cannot be used.
        '''
        # Retrieval + augmented generation over the text store.
        text_docs = self.vs_text.similarity_search(query=query, k=self.TOP_K)
        context = self._format_context(
            "\nExtracted documents:\n",
            [doc.page_content for doc in text_docs],  # only the text is needed
        )
        response_text = self._chat([
            {"role": "system", "content": self._ANSWER_SYSTEM_PROMPT},
            {
                "role": "user",
                "content": self._ANSWER_USER_TEMPLATE.format(context=context, question=query),
            },
        ])

        # Image retrieval: captions are matched against the generated answer.
        retrieved_image = self.vs_images.similarity_search(query=query, k=self.TOP_K)
        image_context = self._format_context(
            "\nExtracted Images:\n",
            [doc.page_content for doc in retrieved_image],
        )
        images_list_str = self._chat([
            {"role": "system", "content": self._IMAGE_SYSTEM_PROMPT},
            {"role": "user", "content": self._IMAGE_EXAMPLE_USER},
            {"role": "assistant", "content": self._IMAGE_EXAMPLE_ASSISTANT},
            {
                "role": "user",
                "content": self._IMAGE_USER_TEMPLATE.format(
                    context=image_context, response_text=response_text
                ),
            },
        ])

        # The model's reply is free text; a reply that cannot be parsed must
        # not cost the caller the text answer already produced above.
        try:
            images_list = parse(images_list_str)
        except Exception as exc:
            print("Error in parsing suggested images, ", repr(images_list_str), exc)
            images_list = []

        response_images = self._collect_images(
            images_list, retrieved_image, get_wiki_file_to_image_url
        )
        return response_text, response_images
    

def parse(value: str) -> List[int]:
    """
    Extracts a list of integers from the given string.

    The first '[' and the first ']' that follows it delimit the candidate
    list; that substring is evaluated as a Python literal. Surrounding text
    (for example a sentence produced by a language model) is ignored.

    Parameters:
        value (str): The input string containing the list of numbers.

    Returns:
        list: The integers found in the list, in order. Empty when `value` is
        not a string, contains no bracketed list, or the list cannot be
        parsed. Elements that are not plain integers are dropped.
    """
    if not isinstance(value, str):
        return []

    start = value.find('[')
    if start == -1:
        return []
    # Search for the closing bracket only after the opening one; a stray ']'
    # earlier in the text must not hide a valid list that follows it.
    end = value.find(']', start + 1)
    if end == -1:
        return []

    try:
        parsed = ast.literal_eval(value[start:end + 1])
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Return an empty list if parsing fails
        return []

    if not isinstance(parsed, list):
        return []
    # bool is a subclass of int, so it is excluded explicitly.
    return [item for item in parsed if isinstance(item, int) and not isinstance(item, bool)]


def get_wiki_file_to_image_url(file_page_url: str, timeout: float = 10.0) -> str:
    """
    Resolve a wiki file page URL to the URL of the image it presents.

    The page is downloaded and the first `<a class="internal">` anchor is
    taken as the link to the image file (presumably the "original file" link
    of the wiki pages this was written for - confirm against real pages).

    Parameters:
        file_page_url (str): URL of the file description page.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        str: The direct image URL when it can be determined; otherwise
        `file_page_url` unchanged (request failure, non-200 response, or no
        matching anchor), so the caller always receives a usable string.
    """
    from urllib.parse import urljoin  # local import: only needed here

    # Headers to mimic a browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    # Step 1: Get the file page HTML. A timeout keeps a stalled server from
    # blocking the caller indefinitely.
    try:
        response = requests.get(file_page_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print("Could not fetch wiki file page, ", file_page_url, exc)
        return file_page_url

    if response.status_code != 200:
        return file_page_url

    # Step 2: Find the link to the image file
    soup = BeautifulSoup(response.content, "html.parser")
    image_tag = soup.find("a", {"class": "internal"})
    href = image_tag.get("href") if image_tag else None
    if not href:
        # Page loaded but no usable link: same fallback as a failed request.
        return file_page_url

    if href.startswith("//"):
        # Protocol-relative link: pin it to https.
        return "https:" + href
    # Absolute links pass through unchanged; relative ones resolve against
    # the page URL.
    return urljoin(file_page_url, href)