Spaces: Running on CPU Upgrade
Commit · 2e2dda5
RAG fix
This view is limited to 50 files because it contains too many changes.
- .gitattributes +35 -0
- .gitignore +3 -0
- .streamlit/config.toml +21 -0
- RAG/bedrock_agent.py +146 -0
- RAG/generate_csv_for_tables.py +167 -0
- RAG/rag_DocumentLoader.py +395 -0
- RAG/rag_DocumentSearcher.py +338 -0
- README.md +13 -0
- app.py +125 -0
- figures/ukhousingstats/figure-1-1-resized.jpg +0 -0
- figures/ukhousingstats/figure-1-1.jpg +0 -0
- figures/ukhousingstats/figure-1-2-resized.jpg +0 -0
- figures/ukhousingstats/figure-1-2.jpg +0 -0
- figures/ukhousingstats/figure-2-3-resized.jpg +0 -0
- figures/ukhousingstats/figure-2-3.jpg +0 -0
- figures/ukhousingstats/figure-3-4-resized.jpg +0 -0
- figures/ukhousingstats/figure-3-4.jpg +0 -0
- figures/ukhousingstats/figure-3-5-resized.jpg +0 -0
- figures/ukhousingstats/figure-3-5.jpg +0 -0
- figures/ukhousingstats/figure-4-6-resized.jpg +0 -0
- figures/ukhousingstats/figure-4-6.jpg +0 -0
- figures/ukhousingstats/figure-4-7-resized.jpg +0 -0
- figures/ukhousingstats/figure-4-7.jpg +0 -0
- figures/ukhousingstats/figure-5-8-resized.jpg +0 -0
- figures/ukhousingstats/figure-5-8.jpg +0 -0
- figures/ukhousingstats/figure-6-10-resized.jpg +0 -0
- figures/ukhousingstats/figure-6-10.jpg +0 -0
- figures/ukhousingstats/figure-6-11-resized.jpg +0 -0
- figures/ukhousingstats/figure-6-11.jpg +0 -0
- figures/ukhousingstats/figure-6-12-resized.jpg +0 -0
- figures/ukhousingstats/figure-6-12.jpg +0 -0
- figures/ukhousingstats/figure-6-13-resized.jpg +0 -0
- figures/ukhousingstats/figure-6-13.jpg +0 -0
- figures/ukhousingstats/figure-6-14-resized.jpg +0 -0
- figures/ukhousingstats/figure-6-14.jpg +0 -0
- figures/ukhousingstats/figure-6-15-resized.jpg +0 -0
- figures/ukhousingstats/figure-6-15.jpg +0 -0
- figures/ukhousingstats/figure-6-16-resized.jpg +0 -0
- figures/ukhousingstats/figure-6-16.jpg +0 -0
- figures/ukhousingstats/figure-6-17-resized.jpg +0 -0
- figures/ukhousingstats/figure-6-17.jpg +0 -0
- figures/ukhousingstats/figure-6-18-resized.jpg +0 -0
- figures/ukhousingstats/figure-6-18.jpg +0 -0
- figures/ukhousingstats/figure-6-19-resized.jpg +0 -0
- figures/ukhousingstats/figure-6-19.jpg +0 -0
- figures/ukhousingstats/figure-6-20-resized.jpg +0 -0
- figures/ukhousingstats/figure-6-20.jpg +0 -0
- figures/ukhousingstats/figure-6-21-resized.jpg +0 -0
- figures/ukhousingstats/figure-6-21.jpg +0 -0
- figures/ukhousingstats/figure-6-22-resized.jpg +0 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,3 @@
**/__pycache__/
*.DS_Store
.streamlit/config.toml
ADDED
@@ -0,0 +1,21 @@

[client]
toolbarMode = "viewer"
showSidebarNavigation = false
showErrorDetails = true

[browser]
gatherUsageStats = false

[theme]
base="dark"
font="sans serif"
primaryColor="#e28743"
backgroundColor ="#000000"

[global]
disableWidgetStateDuplicationWarning = true
showWarningOnDirectExecution = false

[server]
enableXsrfProtection=false
RAG/bedrock_agent.py
ADDED
@@ -0,0 +1,146 @@
import boto3
import json
import time
import zipfile
from io import BytesIO
import uuid
import pprint
import logging
print(boto3.__version__)
from PIL import Image
import os
import base64
import re
import requests
import utilities.re_ranker as re_ranker
import utilities.invoke_models as invoke_models
import streamlit as st
import time as t
import botocore.exceptions

if "inputs_" not in st.session_state:
    st.session_state.inputs_ = {}

parent_dirname = "/".join((os.path.dirname(__file__)).split("/")[0:-1])
region = 'us-east-1'
print(region)
account_id = '445083327804'
# setting logger
logging.basicConfig(format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)
# getting boto3 clients for required AWS services

#bedrock_agent_client = boto3.client('bedrock-agent',region_name=region)
bedrock_agent_runtime_client = boto3.client(
    'bedrock-agent-runtime',
    aws_access_key_id=st.secrets['user_access_key'],
    aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1'
)
enable_trace:bool = True
end_session:bool = False

def delete_memory():
    response = bedrock_agent_runtime_client.delete_agent_memory(
        agentAliasId='TSTALIASID',
        agentId='B4Z7BTURC4'
    )

def query_(inputs):
    ## create a random id for session initiator id

    # invoke the agent API
    agentResponse = bedrock_agent_runtime_client.invoke_agent(
        inputText=inputs['shopping_query'],
        agentId='B4Z7BTURC4',
        agentAliasId='TSTALIASID',
        sessionId=st.session_state.session_id_,
        enableTrace=enable_trace,
        endSession= end_session
    )

    logger.info(pprint.pprint(agentResponse))
    print("***agent*****response*********")
    print(agentResponse)
    event_stream = agentResponse['completion']
    total_context = []
    last_tool = ""
    last_tool_name = ""
    agent_answer = ""
    try:
        for event in event_stream:
            print("***event*********")
            print(event)
            # if 'chunk' in event:
            #     data = event['chunk']['bytes']
            #     print("***chunk*********")
            #     print(data)
            #     logger.info(f"Final answer ->\n{data.decode('utf8')}")
            #     agent_answer_ = data.decode('utf8')
            #     print(agent_answer_)
            if 'trace' in event:
                print("trace*****total*********")
                print(event['trace'])
                if('orchestrationTrace' not in event['trace']['trace']):
                    continue
                orchestration_trace = event['trace']['trace']['orchestrationTrace']
                total_context_item = {}
                if('modelInvocationOutput' in orchestration_trace and '<tool_name>' in orchestration_trace['modelInvocationOutput']['rawResponse']['content']):
                    total_context_item['tool'] = orchestration_trace['modelInvocationOutput']['rawResponse']
                if('rationale' in orchestration_trace):
                    total_context_item['rationale'] = orchestration_trace['rationale']['text']
                if('invocationInput' in orchestration_trace):
                    total_context_item['invocationInput'] = orchestration_trace['invocationInput']['actionGroupInvocationInput']
                    last_tool_name = total_context_item['invocationInput']['function']
                if('observation' in orchestration_trace):
                    print("trace****observation******")
                    total_context_item['observation'] = event['trace']['trace']['orchestrationTrace']['observation']
                    tool_output_last_obs = event['trace']['trace']['orchestrationTrace']['observation']
                    print(tool_output_last_obs)
                    if(tool_output_last_obs['type'] == 'ACTION_GROUP'):
                        last_tool = tool_output_last_obs['actionGroupInvocationOutput']['text']
                    if(tool_output_last_obs['type'] == 'FINISH'):
                        agent_answer = tool_output_last_obs['finalResponse']['text']
                if('modelInvocationOutput' in orchestration_trace and '<thinking>' in orchestration_trace['modelInvocationOutput']['rawResponse']['content']):
                    total_context_item['thinking'] = orchestration_trace['modelInvocationOutput']['rawResponse']
                if(total_context_item!={}):
                    total_context.append(total_context_item)
                print("total_context------")
                print(total_context)
    except botocore.exceptions.EventStreamError as error:
        raise error
        # t.sleep(2)
        # query_(st.session_state.inputs_)

        # if 'chunk' in event:
        #     data = event['chunk']['bytes']
        #     final_ans = data.decode('utf8')
        #     print(f"Final answer ->\n{final_ans}")
        #     logger.info(f"Final answer ->\n{final_ans}")
        #     agent_answer = final_ans
        #     end_event_received = True
        #     # End event indicates that the request finished successfully
        # elif 'trace' in event:
        #     logger.info(json.dumps(event['trace'], indent=2))
        # else:
        #     raise Exception("unexpected event.", event)
        # except Exception as e:
        #     raise Exception("unexpected event.", e)
    return {'text':agent_answer,'source':total_context,'last_tool':{'name':last_tool_name,'response':last_tool}}

    ####### Re-Rank ########

    #print("re-rank")

    # if(st.session_state.input_is_rerank == True and len(total_context)):
    #     ques = [{"question":question}]
    #     ans = [{"answer":total_context}]

    #     total_context = re_ranker.re_rank('rag','Cross Encoder',"",ques, ans)

    #     llm_prompt = prompt_template.format(context=total_context[0],question=question)
    #     output = invoke_models.invoke_llm_model( "\n\nHuman: {input}\n\nAssistant:".format(input=llm_prompt) ,False)
    #     #print(output)
    #     if(len(images_2)==0):
    #         images_2 = images
    #     return {'text':output,'source':total_context,'image':images_2,'table':df}
RAG/generate_csv_for_tables.py
ADDED
@@ -0,0 +1,167 @@
import os
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint
from PyPDF2 import PdfWriter, PdfReader
import re
import shutil
import streamlit as st

file_content = {}
parent_dirname = "/".join((os.path.dirname(__file__)).split("/")[0:-1])
# if os.path.isdir(parent_dirname+"/split_pdf"):
#     shutil.rmtree(parent_dirname+"/split_pdf")
# os.mkdir(parent_dirname+"/split_pdf")

# if os.path.isdir(parent_dirname+"/split_pdf_csv"):
#     shutil.rmtree(parent_dirname+"/split_pdf_csv")
# os.mkdir(parent_dirname+"/split_pdf_csv")


def get_rows_columns_map(table_result, blocks_map):
    rows = {}
    #scores = []
    for relationship in table_result['Relationships']:
        if relationship['Type'] == 'CHILD':
            for child_id in relationship['Ids']:
                cell = blocks_map[child_id]
                if cell['BlockType'] == 'CELL':
                    row_index = cell['RowIndex']
                    col_index = cell['ColumnIndex']
                    if row_index not in rows:
                        # create new row
                        rows[row_index] = {}

                    # get confidence score
                    #scores.append(str(cell['Confidence']))

                    # get the text value
                    rows[row_index][col_index] = get_text(cell, blocks_map)
    return rows#, scores


def get_text(result, blocks_map):
    text = ''
    if 'Relationships' in result:
        for relationship in result['Relationships']:
            if relationship['Type'] == 'CHILD':
                for child_id in relationship['Ids']:
                    word = blocks_map[child_id]
                    if word['BlockType'] == 'WORD':
                        if "," in word['Text'] and word['Text'].replace(",", "").isnumeric():
                            text += '"' + word['Text'] + '"' +' '
                        else:
                            text += word['Text'] +' '
                    if word['BlockType'] == 'SELECTION_ELEMENT':
                        if word['SelectionStatus'] =='SELECTED':
                            text += 'X '
    return text


def split_pages(file_name):

    inputpdf = PdfReader(open(file_name, "rb"))
    file_name_short = re.sub('[^A-Za-z0-9]+', '', (file_name.split("/")[-1].split(".")[0]).lower())

    for i in range(len(inputpdf.pages)):

        output = PdfWriter()
        output.add_page(inputpdf.pages[i])
        split_file = parent_dirname+"/split_pdf/"+file_name_short+"%s.pdf" % i

        with open(split_file, "wb") as outputStream:
            output.write(outputStream)
        table_csv = get_table_csv_results(split_file)
        if(table_csv != "<b> NO Table FOUND </b>"):

            output_file = parent_dirname+"/split_pdf_csv/"+file_name_short+"%s.csv" % i
            file_content[output_file] = table_csv

            # replace content
            with open(output_file, "wt") as fout:
                fout.write(table_csv)

            # show the results
            print('CSV OUTPUT FILE: ', output_file)
    return file_content

def get_table_csv_results(file_name):

    with open(file_name, 'rb') as file:
        img_test = file.read()
        bytes_test = bytearray(img_test)
        #print('Image loaded', file_name)

    # process using image bytes
    # get the results
    #session = boto3.Session(profile_name='profile-name')
    client = boto3.client('textract',aws_access_key_id=st.secrets['user_access_key'],
        aws_secret_access_key=st.secrets['user_secret_key'], region_name = 'us-east-1')
    # {'S3Object': {
    #     'Bucket': 'ml-search-app-access',
    #     'Name': 'covid19_ie_removed.pdf'
    # }}

    response = client.analyze_document(Document={'Bytes': bytes_test}, FeatureTypes=['TABLES'])

    # Get the text blocks
    blocks=response['Blocks']
    #pprint(blocks)

    blocks_map = {}
    table_blocks = []
    for block in blocks:
        blocks_map[block['Id']] = block
        if block['BlockType'] == "TABLE":
            table_blocks.append(block)

    if len(table_blocks) <= 0:
        return "<b> NO Table FOUND </b>"

    csv = ''
    for index, table in enumerate(table_blocks):
        csv += generate_table_csv(table, blocks_map, index +1)
        csv += '\n\n'

    return csv

def generate_table_csv(table_result, blocks_map, table_index):
    rows = get_rows_columns_map(table_result, blocks_map)

    table_id = 'Table_' + str(table_index)

    # get cells.
    csv = ''#Table: {0}\n\n'.format(table_id)
    for row_index, cols in rows.items():
        for col_index, text in cols.items():
            col_indices = len(cols.items())
            csv += text.strip()+"`" #'{}'.format(text) + ","
        csv += '\n'

    # csv += '\n\n Confidence Scores % (Table Cell) \n'
    # cols_count = 0
    # for score in scores:
    #     cols_count += 1
    #     csv += score + ","
    #     if cols_count == col_indices:
    #         csv += '\n'
    #         cols_count = 0

    csv += '\n\n\n'
    return csv

def main_(file_name):
    table_csv = split_pages(file_name)
    #print(table_csv)
    return table_csv


# if __name__ == "__main__":
#     file_name = "/home/ubuntu/covid19_ie_removed.pdf"
#     main(file_name)
RAG/rag_DocumentLoader.py
ADDED
@@ -0,0 +1,395 @@
import boto3
import json
import os
import shutil
import time
from unstructured.partition.pdf import partition_pdf
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
import streamlit as st
from PIL import Image
import base64
import re
#import torch
import base64
import requests
from requests_aws4auth import AWS4Auth
import re_ranker
import utilities.invoke_models as invoke_models
from requests.auth import HTTPBasicAuth

import generate_csv_for_tables
from pdf2image import convert_from_bytes,convert_from_path
#import langchain

bedrock_runtime_client = boto3.client('bedrock-runtime',region_name='us-east-1')
textract_client = boto3.client('textract',region_name='us-east-1')

region = 'us-east-1'
service = 'es'

credentials = boto3.Session().get_credentials()
auth = HTTPBasicAuth('prasadnu',st.secrets['rag_shopping_assistant_os_api_access'])

ospy_client = OpenSearch(
    hosts = [{'host': 'search-opensearchservi-75ucark0bqob-bzk6r6h2t33dlnpgx2pdeg22gi.us-east-1.es.amazonaws.com', 'port': 443}],
    http_auth = auth,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection,
    pool_maxsize = 20
)


summary_prompt = """You are an assistant tasked with summarizing tables and text. \
Give a detailed summary of the table or text. Table or text chunk: {element} """

parent_dirname = "/".join((os.path.dirname(__file__)).split("/")[0:-1])


def generate_image_captions_(image_paths):
    images = []
    for image_path in image_paths:
        i_image = Image.open(image_path)
        if i_image.mode != "RGB":
            i_image = i_image.convert(mode="RGB")

        images.append(i_image)

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds


def load_docs(inp):

    print("input_doc")
    print(inp)
    extracted_elements_list = []

    data_dir = parent_dirname+"/pdfs"
    target_files = [os.path.join(data_dir,inp["key"])]

    Image.MAX_IMAGE_PIXELS = 100000000
    width = 2048
    height = 2048

    for target_file in target_files:
        tables_textract = generate_csv_for_tables.main_(target_file)
        #tables_textract = {}
        index_ = re.sub('[^A-Za-z0-9]+', '', (target_file.split("/")[-1].split(".")[0]).lower())
        st.session_state.input_index = index_

        if os.path.isdir(parent_dirname+'/figures/') == False:
            os.mkdir(parent_dirname+'/figures/')

        image_output_dir = parent_dirname+'/figures/'+st.session_state.input_index+"/"

        if os.path.isdir(image_output_dir):
            shutil.rmtree(image_output_dir)

        os.mkdir(image_output_dir)

        print("***")
        print(target_file)
        #image_output_dir_path = os.path.join(image_output_dir,target_file.split('/')[-1].split('.')[0])
        #os.mkdir(image_output_dir_path)

        # with open(target_file, "rb") as pdf_file:
        #     encoded_string_pdf = bytearray(pdf_file.read())

        #images_pdf = convert_from_path(target_file)

        # for index,image in enumerate(images_pdf):
        #     image.save(image_output_dir_pdf+"/"+st.session_state.input_index+"/"+str(index)+"_pdf.jpeg", 'JPEG')
        #     with open(image_output_dir_pdf+"/"+st.session_state.input_index+"/"+str(index)+"_pdf.jpeg", "rb") as read_img:
        #         input_encoded = base64.b64encode(read_img.read())
        # print(encoded_string_pdf)
        # tables_= textract_client.analyze_document(
        #     Document={'Bytes': encoded_string_pdf},
        #     FeatureTypes=['TABLES']
        # )

        # print(tables_)

        table_and_text_elements = partition_pdf(
            filename=target_file,
            extract_images_in_pdf=True,
            infer_table_structure=False,
            chunking_strategy="by_title", #Uses title elements to identify sections within the document for chunking
            max_characters=4000,
            new_after_n_chars=3800,
            combine_text_under_n_chars=2000,
            extract_image_block_output_dir=parent_dirname+'/figures/'+st.session_state.input_index+'/',
        )
        tables = []
        texts = []
        print(table_and_text_elements)

        for table in tables_textract.keys():
            print(table)
            #print(tables_textract[table])
            tables.append({'table_name':table,'raw':tables_textract[table],'summary':invoke_models.invoke_llm_model(summary_prompt.format(element=tables_textract[table]),False)})
            time.sleep(4)

        for element in table_and_text_elements:
            # if "unstructured.documents.elements.Table" in str(type(element)):
            #     tables.append({'raw':str(element),'summary':invoke_models.invoke_llm_model(summary_prompt.format(element=str(element)),False)})
            #     tables_source.append({'raw':element,'summary':invoke_models.invoke_llm_model(summary_prompt.format(element=str(element)),False)})

            if "unstructured.documents.elements.CompositeElement" in str(type(element)):
                texts.append(str(element))
        image_captions = {}

        for image_file in os.listdir(image_output_dir):
            print("image_processing")

            photo_full_path = image_output_dir+image_file
            photo_full_path_no_format = photo_full_path.replace('.jpg',"")

            with Image.open(photo_full_path) as image:
                image.verify()

            with Image.open(photo_full_path) as image:

                file_type = 'jpg'
                path = image.filename.rsplit(".", 1)[0]
                image.thumbnail((width, height))
                image.save(photo_full_path_no_format+"-resized.jpg")

            with open(photo_full_path_no_format+"-resized.jpg", "rb") as read_img:
                input_encoded = base64.b64encode(read_img.read()).decode("utf8")

            image_captions[image_file] = {"caption":invoke_models.generate_image_captions_llm(input_encoded, "What's in this image?"),
                                          "encoding":input_encoded
                                          }
        print("image_processing done")
        #print(image_captions)

        #print(os.path.join('figures',image_file))
    extracted_elements_list = []
    extracted_elements_list.append({
        'source': target_file,
        'tables': tables,
        'texts': texts,
        'images': image_captions
    })
    documents = []
    documents_mm = []
    for extracted_element in extracted_elements_list:
        print("prepping data")
        texts = extracted_element['texts']
        tables = extracted_element['tables']
        images_data = extracted_element['images']
        src_doc = extracted_element['source']
        for text in texts:
            embedding = invoke_models.invoke_model(text)
            document = prep_document(text,text,'text',src_doc,'none',embedding)
            documents.append(document)
        for table in tables:
            table_raw = table['raw']

            table_summary = table['summary']
            embedding = invoke_models.invoke_model(table_summary)

            document = prep_document(table_raw,table_summary,'table*'+table['table_name'],src_doc,'none',embedding)
            documents.append(document)
        for file_name in images_data.keys():
            embedding = invoke_models.invoke_model_mm(image_captions[file_name]['caption'],image_captions[file_name]['encoding'])
            document = prep_document(image_captions[file_name]['caption'],image_captions[file_name]['caption'],'image_'+file_name,src_doc,image_captions[file_name]['encoding'],embedding)
            documents_mm.append(document)

            embedding = invoke_models.invoke_model(image_captions[file_name]['caption'])
            document = prep_document(image_captions[file_name]['caption'],image_captions[file_name]['caption'],'image_'+file_name,src_doc,'none',embedding)
            documents.append(document)

    os_ingest(index_, documents)
    os_ingest_mm(index_, documents_mm)

def prep_document(raw_element,processed_element,doc_type,src_doc,encoding,embedding):
    if('image' in doc_type):
        img_ = doc_type.split("_")[1]
    else:
        img_ = "None"
    document = {
        "processed_element": re.sub(r"[^a-zA-Z0-9]+", ' ', processed_element) ,
        "raw_element_type": doc_type.split("*")[0],
        "raw_element": re.sub(r"[^a-zA-Z0-9]+", ' ', raw_element) ,
        "src_doc": src_doc.replace(","," "),
        "image": img_,

    }

    if(encoding!="none"):
        document["image_encoding"] = encoding
        document["processed_element_embedding_bedrock-multimodal"] = embedding
    else:
        document["processed_element_embedding"] = embedding

    if('table' in doc_type):
        document["table"] = doc_type.split("*")[1]

    return document


def os_ingest(index_,documents):
    print("ingesting data")
    #host = 'your collection id.region.aoss.amazonaws.com'
    if(ospy_client.indices.exists(index=index_)):
        ospy_client.indices.delete(index = index_)
    index_body = {
        "settings": {
            "index": {
                "knn": True,
                "default_pipeline": "rag-ingest-pipeline",
                "number_of_shards": 4
            }
        },
        "mappings": {
            "properties": {
                "processed_element": {
                    "type": "text"
                },
                "raw_element": {
                    "type": "text"
                },
                "processed_element_embedding": {
                    "type": "knn_vector",
                    "dimension":1536,
                    "method": {
                        "engine": "faiss",
                        "space_type": "l2",
                        "name": "hnsw",
                        "parameters": {}
                    }
                },
                # "processed_element_embedding_bedrock-multimodal": {
                #     "type": "knn_vector",
                #     "dimension": 1024,
                #     "method": {
                #         "engine": "faiss",
                #         "space_type": "l2",
                #         "name": "hnsw",
                #         "parameters": {}
                #     }
                # },
                # "image_encoding": {
                #     "type": "binary"
                # },
                "raw_element_type": {
                    "type": "text"
                },
                "processed_element_embedding_sparse": {
                    "type": "rank_features"
                },
                "src_doc": {
                    "type": "text"
                },
                "image":{ "type": "text"}

            }
        }
    }
    response = ospy_client.indices.create(index_, body=index_body)

    for doc in documents:
        print("----------doc------------")
        if(doc['image']!='None'):
            print("image insert")
            print(doc['image'])

        response = ospy_client.index(
            index = index_,
            body = doc,
        )


def os_ingest_mm(index_,documents_mm):
    #host = 'your collection id.region.aoss.amazonaws.com'
    index_ = index_+"_mm"
    if(ospy_client.indices.exists(index=index_)):
        ospy_client.indices.delete(index = index_)
    index_body = {
        "settings": {
            "index": {
                "knn": True,
                # "default_pipeline": "rag-ingest-pipeline",
                "number_of_shards": 4
            }
        },
        "mappings": {
            "properties": {
                "processed_element": {
                    "type": "text"
                },
                "raw_element": {
                    "type": "text"
                },

                "processed_element_embedding_bedrock-multimodal": {
                    "type": "knn_vector",
                    "dimension": 1024,
                    "method": {
                        "engine": "faiss",
                        "space_type": "l2",
                        "name": "hnsw",
                        "parameters": {}
                    }
                },
                "image_encoding": {
                    "type": "binary"
                },
                "raw_element_type": {
                    "type": "text"
                },

                "src_doc": {
                    "type": "text"
                },
                "image":{ "type": "text"}

            }
        }
    }
    response = ospy_client.indices.create(index_, body=index_body)

    for doc in documents_mm:
        #print("----------doc------------")
        #print(doc)

        response = ospy_client.index(
            index = index_,
            body = doc,
        )
RAG/rag_DocumentSearcher.py
ADDED
@@ -0,0 +1,338 @@
import boto3
import json
import os
import shutil
from unstructured.partition.pdf import partition_pdf
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
import streamlit as st
from PIL import Image
import base64
import re
#from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
import base64
import requests
import utilities.re_ranker as re_ranker
import utilities.invoke_models as invoke_models
#import langchain
headers = {"Content-Type": "application/json"}
host = "https://search-opensearchservi-75ucark0bqob-bzk6r6h2t33dlnpgx2pdeg22gi.us-east-1.es.amazonaws.com/"

parent_dirname = "/".join((os.path.dirname(__file__)).split("/")[0:-1])

def query_(awsauth,inputs, session_id,search_types):

    print("using index: "+st.session_state.input_index)

    question = inputs['query']

    k=1
    embedding = invoke_models.invoke_model_mm(question,"none")

    query_mm = {
        "size": k,
        "_source": {
            "exclude": [
                "processed_element_embedding_bedrock-multimodal","processed_element_embedding_sparse","image_encoding","processed_element_embedding"
            ]
        },
        "query": {
            "knn": {
                "processed_element_embedding_bedrock-multimodal": {
                    "vector": embedding,
                    "k": k}
            }
        }
    }

    path = st.session_state.input_index+"_mm/_search"
    url = host+path
    r = requests.get(url, auth=awsauth, json=query_mm, headers=headers)
    response_mm = json.loads(r.text)
    # response_mm = ospy_client.search(
    #     body = query_mm,
    #     index = st.session_state.input_index+"_mm"
    # )

    hits = response_mm['hits']['hits']
    context = []
    context_tables = []
    images = []

    for hit in hits:
        #context.append(hit['_source']['caption'])
        images.append({'file':hit['_source']['image'],'caption':hit['_source']['processed_element']})

    ####### SEARCH ########

    path = "_search/pipeline/rag-search-pipeline"
    url = host + path

    num_queries = len(search_types)

    weights = []

    searches = ['Keyword','Vector','NeuralSparse']
    equal_weight = (int(100/num_queries) )/100
    if(num_queries>1):
        for index,search in enumerate(search_types):

            if(index != (num_queries-1)):
                weight = equal_weight
            else:
                weight = 1-sum(weights)

            weights.append(weight)

    #print(weights)

    s_pipeline_payload = {
        "description": "Post processor for hybrid search",
        "phase_results_processors": [
            {
                "normalization-processor": {
                    "normalization": {
                        "technique": "min_max"
                    },
                    "combination": {
                        "technique": "arithmetic_mean",
                        "parameters": {
                            "weights": weights
                        }
                    }
                }
            }
        ]
    }

    r = requests.put(url, auth=awsauth, json=s_pipeline_payload, headers=headers)
    #print(r.status_code)
    #print(r.text)

    SIZE = 5

    hybrid_payload = {
        "_source": {
            "exclude": [
                "processed_element_embedding","processed_element_embedding_sparse"
            ]
        },
        "query": {
            "hybrid": {
                "queries": [

                    #1. keyword query
                    #2. vector search query
                    #3. Sparse query

                ]
            }
        },"size":SIZE,
    }

    if('Keyword Search' in search_types):

        keyword_payload = {
            "match": {
                "processed_element": {
                    "query": question
                }
            }
        }

        hybrid_payload["query"]["hybrid"]["queries"].append(keyword_payload)

    if('Vector Search' in search_types):

        embedding = embedding = invoke_models.invoke_model(question)

        vector_payload = {
            "knn": {
                "processed_element_embedding": {
                    "vector": embedding,
                    "k": 2}
            }
        }

        hybrid_payload["query"]["hybrid"]["queries"].append(vector_payload)

    if('Sparse Search' in search_types):

        #print("text expansion is enabled")
        sparse_payload = { "neural_sparse": {
            "processed_element_embedding_sparse": {
                "query_text": question,
                "model_id": "srrJ-owBQhe1aB-khx2n"
            }
        }}

        hybrid_payload["query"]["hybrid"]["queries"].append(sparse_payload)

        # path2 = "_plugins/_ml/models/srrJ-owBQhe1aB-khx2n/_predict"
        # url2 = host+path2
        # payload2 = {
        #     "parameters": {
        #         "inputs": question
        #     }
        # }
        # r2 = requests.post(url2, auth=awsauth, json=payload2, headers=headers)
        # sparse_ = json.loads(r2.text)
        # query_sparse = sparse_["inference_results"][0]["output"][0]["dataAsMap"]["response"][0]

    # print("hybrid_payload")
    # print("---------------")
    #print(hybrid_payload)
    hits = []
    if(num_queries>1):
        path = st.session_state.input_index+"/_search?search_pipeline=rag-search-pipeline"
    else:
        path = st.session_state.input_index+"/_search"
    url = host+path
    if(len(hybrid_payload["query"]["hybrid"]["queries"])==1):
        single_query = hybrid_payload["query"]["hybrid"]["queries"][0]
        del hybrid_payload["query"]["hybrid"]
        hybrid_payload["query"] = single_query
        r = requests.get(url, auth=awsauth, json=hybrid_payload, headers=headers)
        #print(r.status_code)
        response_ = json.loads(r.text)
        #print("-------------------------------------------------------------------")
        #print(r.text)
        hits = response_['hits']['hits']

    else:
        r = requests.get(url, auth=awsauth, json=hybrid_payload, headers=headers)
        #print(r.status_code)
        response_ = json.loads(r.text)
        #print("-------------------------------------------------------------------")
        #print(response_)
        hits = response_['hits']['hits']

    ##### GET reference tables separately like *_mm index search for images ######
    def lazy_get_table():
        #print("Forcing table analysis")
        table_ref = []
        any_table_exists = False
        for fname in os.listdir(parent_dirname+"/split_pdf_csv"):
            if fname.startswith(st.session_state.input_index):
                any_table_exists = True
                break
        if(any_table_exists):
            #################### Basic Match query #################
            # payload_tables = {
            #     "query": {
            #         "bool":{

            #             "must":{"match": {
            #                 "processed_element": question

            #             }},

            #             "filter":{"term":{"raw_element_type": "table"}}

            #     }}}

            #################### Neural Sparse query #################
            payload_tables = {"query":{"neural_sparse": {
                "processed_element_embedding_sparse": {
                    "query_text": question,
                    "model_id": "srrJ-owBQhe1aB-khx2n"
                }
            } } }

            r_ = requests.get(url, auth=awsauth, json=payload_tables, headers=headers)
            r_tables = json.loads(r_.text)

            for res_ in r_tables['hits']['hits']:
                if(res_["_source"]['raw_element_type'] == 'table'):
                    table_ref.append({'name':res_["_source"]['table'],'text':res_["_source"]['processed_element']})
                if(len(table_ref) == 2):
                    break

        return table_ref

    ########################### LLM Generation ########################
    prompt_template = """
    The following is a friendly conversation between a human and an AI.
    The AI is talkative and provides lots of specific details from its context.
    {context}
    Instruction: Based on the above documents, provide a detailed answer for, {question}. Answer "don't know",
    if not present in the context.
    Solution:"""

    idx = 0
    images_2 = []
    is_table_in_result = False
    df = []
    for hit in hits[0:3]:

        if(hit["_source"]["raw_element_type"] == 'table'):
            #print("Need to analyse table")
            is_table_in_result = True
            table_res = invoke_models.read_from_table(hit["_source"]["table"],question)
            df.append({'name':hit["_source"]["table"],'text':hit["_source"]["processed_element"]})
            context_tables.append(table_res+"\n\n"+hit["_source"]["processed_element"])

        else:
            if(hit["_source"]["image"]!="None"):
                with open(parent_dirname+'/figures/'+st.session_state.input_index+"/"+hit["_source"]["raw_element_type"].split("_")[1].replace(".jpg","")+"-resized.jpg", "rb") as read_img:
                    input_encoded = base64.b64encode(read_img.read()).decode("utf8")
                context.append(invoke_models.generate_image_captions_llm(input_encoded,question))
            else:
                context.append(hit["_source"]["processed_element"])

        if(hit["_source"]["image"]!="None"):
            images_2.append({'file':hit["_source"]["image"],'caption':hit["_source"]["processed_element"]})

        idx = idx +1
        #images.append(hit['_source']['image'])

    # if(is_table_in_result == False):
    #     df = lazy_get_table()
    #     print("forcefully selected top 2 tables")
    #     print(df)

    #     for pos,table in enumerate(df):
    #         table_res = invoke_models.read_from_table(table['name'],question)
    #         context_tables.append(table_res)#+"\n\n"+table['text']

    total_context = context_tables + context

    ####### Re-Rank ########

    #print("re-rank")

    if(st.session_state.input_is_rerank == True and len(total_context)):
        ques = [{"question":question}]
        ans = [{"answer":total_context}]

        total_context = re_ranker.re_rank('rag','Cross Encoder',"",ques, ans)

    llm_prompt = prompt_template.format(context=total_context[0],question=question)
    output = invoke_models.invoke_llm_model( "\n\nHuman: {input}\n\nAssistant:".format(input=llm_prompt) ,False)
    #print(output)
    if(len(images_2)==0):
        images_2 = images
    return {'text':output,'source':total_context,'image':images_2,'table':df}
README.md
ADDED
@@ -0,0 +1,13 @@
---
title: OpenSearch AI
emoji: 🔍
colorFrom: pink
colorTo: purple
sdk: streamlit
sdk_version: 1.41.1
app_file: app.py
pinned: false
license: apache-2.0
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,125 @@
import streamlit as st
from PIL import Image
import base64
import yaml
import os
import urllib.request
import tarfile
import subprocess
from yaml.loader import SafeLoader


st.set_page_config(

    #page_title="Semantic Search using OpenSearch",
    layout="wide",
    page_icon="/home/ubuntu/images/opensearch_mark_default.png"
)

st.markdown("""<style>
@import url('https://fonts.cdnfonts.com/css/amazon-ember');
</style>
""",unsafe_allow_html=True)

# with open('/home/ubuntu/AI-search-with-amazon-opensearch-service/OpenSearchApp/auth.yaml') as file:
#     config = yaml.load(file, Loader=SafeLoader)
# authenticator = Authenticate(
#     config['credentials'],
#     config['cookie']['name'],
#     config['cookie']['key'],
#     config['cookie']['expiry_days'],
#     config['preauthorized']
# )
# name, authentication_status, username = authenticator.login('Login', 'main')


AI_ICON = "images/opensearch-twitter-card.png"
col_0_1,col_0_2,col_0_3= st.columns([10,50,85])
with col_0_1:
    st.image(AI_ICON, use_container_width='always')
with col_0_2:
    st.markdown('<p style="fontSize:40px;color:#FF9900;fontFamily:\'Amazon Ember Display 500\', sans-serif;">OpenSearch AI demos</p>',unsafe_allow_html=True)
    #st.header("OpenSearch AI demos")#,divider = 'rainbow'
# with col_0_3:
#     st.markdown("<a style = 'font-size:150%;background-color: #e28743;color: white;padding: 5px 10px;text-align: center;text-decoration: none;margin: 10px 20px;border-radius: 12px;display: inline-block;' href = 'https://catalog.workshops.aws/opensearch-ml-search'>Workshop</a>",unsafe_allow_html=True)


#st.header(":rewind: Demos available")
st.write("")
#st.write("----")
#st.write("Choose a demo")
st.write("")
col_1_1,col_1_2,col_1_3 = st.columns([3,40,65])
with col_1_1:
    st.subheader(" ")
with col_1_2:
    st.markdown('<p style="fontSize:28px;color:#c5c3c0;fontFamily:\'Amazon Ember Cd RC 250\', sans-serif;">Neural Search</p>',unsafe_allow_html=True)
with col_1_3:
    demo_1 = st.button(":arrow_forward:",key = "demo_1")
if(demo_1):
    st.switch_page('pages/Semantic_Search.py')
st.write("")
#st.page_link("pages/1_Semantic_Search.py", label=":orange[1. Semantic Search] :arrow_forward:")
#st.button("1. Semantic Search")
# image_ = Image.open('/home/ubuntu/images/Semantic_SEarch.png')
# new_image = image_.resize((1500, 1000))
# new_image.save('images/semantic_search_resize.png')
# st.image("images/semantic_search_resize.png")
col_2_1,col_2_2,col_2_3 = st.columns([3,40,65])
with col_2_1:
    st.subheader(" ")
with col_2_2:
    st.markdown('<p style="fontSize:28px;color:#c5c3c0;fontFamily:\'Amazon Ember Cd RC 250\', sans-serif;">Multimodal Conversational Search</p>',unsafe_allow_html=True)

with col_2_3:
    demo_2 = st.button(":arrow_forward:",key = "demo_2")
if(demo_2):
    st.switch_page('pages/Multimodal_Conversational_Search.py')
st.write("")
#st.header("2. Multimodal Conversational Search")
# image_ = Image.open('images/RAG_.png')
# new_image = image_.resize((1500, 1000))
# new_image.save('images/RAG_resize.png')
# st.image("images/RAG_resize.png")

col_3_1,col_3_2,col_3_3 = st.columns([3,40,65])
with col_3_1:
    st.subheader(" ")
with col_3_2:
    st.markdown('<div style="fontSize:28px;color:#c5c3c0;fontFamily:\'Amazon Ember Cd RC 250\', sans-serif;">Agentic Shopping Assistant</div>',unsafe_allow_html=True)#<span style="fontSize:14px;color:#099ef3;fontWeight:bold;textDecorationLine:underline;textDecorationStyle: dashed;">New</span>
with col_3_3:
    demo_3 = st.button(":arrow_forward:",key = "demo_3")
if(demo_3):
    st.switch_page('pages/AI_Shopping_Assistant.py')
# with st.sidebar:
#     st.subheader("Choose a demo !")


# """
# <style>

# [data-testid="stHeader"]::after {
#     content: "My Company Name";
#     margin-left: 0px;
#     margin-top: 0px;
#     font-size: 30px;
#     position: relative;
#     left: 90%;
#     top: 30%;
# }
# </style>
# """,

isExist = os.path.exists("/home/user/images_retail")
if not isExist:
    os.makedirs("/home/user/images_retail")
    metadata_file = urllib.request.urlretrieve('https://aws-blogs-artifacts-public.s3.amazonaws.com/BDB-3144/products-data.yml', '/home/user/products.yaml')
    img_filename,headers= urllib.request.urlretrieve('https://aws-blogs-artifacts-public.s3.amazonaws.com/BDB-3144/images.tar.gz', '/home/user/images_retail/images.tar.gz')
    print(img_filename)
    file = tarfile.open('/home/user/images_retail/images.tar.gz')
    file.extractall('/home/user/images_retail/')
    file.close()
    #remove images.tar.gz
    os.remove('/home/user/images_retail/images.tar.gz')
figures/ukhousingstats/figure-1-1-resized.jpg ADDED
figures/ukhousingstats/figure-1-1.jpg ADDED
figures/ukhousingstats/figure-1-2-resized.jpg ADDED
figures/ukhousingstats/figure-1-2.jpg ADDED
figures/ukhousingstats/figure-2-3-resized.jpg ADDED
figures/ukhousingstats/figure-2-3.jpg ADDED
figures/ukhousingstats/figure-3-4-resized.jpg ADDED
figures/ukhousingstats/figure-3-4.jpg ADDED
figures/ukhousingstats/figure-3-5-resized.jpg ADDED
figures/ukhousingstats/figure-3-5.jpg ADDED
figures/ukhousingstats/figure-4-6-resized.jpg ADDED
figures/ukhousingstats/figure-4-6.jpg ADDED
figures/ukhousingstats/figure-4-7-resized.jpg ADDED
figures/ukhousingstats/figure-4-7.jpg ADDED
figures/ukhousingstats/figure-5-8-resized.jpg ADDED
figures/ukhousingstats/figure-5-8.jpg ADDED
figures/ukhousingstats/figure-6-10-resized.jpg ADDED
figures/ukhousingstats/figure-6-10.jpg ADDED
figures/ukhousingstats/figure-6-11-resized.jpg ADDED
figures/ukhousingstats/figure-6-11.jpg ADDED
figures/ukhousingstats/figure-6-12-resized.jpg ADDED
figures/ukhousingstats/figure-6-12.jpg ADDED
figures/ukhousingstats/figure-6-13-resized.jpg ADDED
figures/ukhousingstats/figure-6-13.jpg ADDED
figures/ukhousingstats/figure-6-14-resized.jpg ADDED
figures/ukhousingstats/figure-6-14.jpg ADDED
figures/ukhousingstats/figure-6-15-resized.jpg ADDED
figures/ukhousingstats/figure-6-15.jpg ADDED
figures/ukhousingstats/figure-6-16-resized.jpg ADDED
figures/ukhousingstats/figure-6-16.jpg ADDED
figures/ukhousingstats/figure-6-17-resized.jpg ADDED
figures/ukhousingstats/figure-6-17.jpg ADDED
figures/ukhousingstats/figure-6-18-resized.jpg ADDED
figures/ukhousingstats/figure-6-18.jpg ADDED
figures/ukhousingstats/figure-6-19-resized.jpg ADDED
figures/ukhousingstats/figure-6-19.jpg ADDED
figures/ukhousingstats/figure-6-20-resized.jpg ADDED
figures/ukhousingstats/figure-6-20.jpg ADDED
figures/ukhousingstats/figure-6-21-resized.jpg ADDED
figures/ukhousingstats/figure-6-21.jpg ADDED
figures/ukhousingstats/figure-6-22-resized.jpg ADDED