from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import requests
from AWSClaude import AWSClaude
import json
import concurrent.futures
import time

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

async def get_n_depth_results(url, input_query):
    # NOTE: declared async, but every network call below uses the blocking
    # `requests` library, so this coroutine blocks while it runs.
    all_content = {}

    def add_pdf_content(selected_pdf):
        # Download each selected PDF and forward it to the PDF-extraction
        # service, storing the parsed result under the PDF's URL.
        for pdf_url in selected_pdf:
            print(pdf_url)
            response = requests.get(pdf_url)
            # Save the content of the response as a PDF file
            pdf_path = "temp.pdf"
            with open(pdf_path, "wb") as file:
                file.write(response.content)
            print(f"PDF file saved as {pdf_path}")
            url = "http://localhost:5000/ask"
            # url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v2"
            data = {"processTables": "True"}
            headers = {"Origin": "http://localhost:8080"}
            with open(pdf_path, "rb") as file:
                file_contents = file.read()
            files = {
                "pdf": (
                    pdf_path,
                    file_contents,
                    "application/pdf",
                )
            }
            response = requests.post(url, files=files, data=data, headers=headers)
            all_content[pdf_url] = response.json()

    def scrapper(input_url):
        # Scrape a single page via the headless-Chromium service; store the
        # page content and return the list of URLs discovered on it.
        params = {'url': input_url}
        headers = {'accept': 'application/json'}
        url = 'https://chromium-qpxamiokfa-uc.a.run.app/get_scraped_data'
        response = requests.get(url, headers=headers, params=params)
        payload = response.json()  # parse the body once instead of calling .json() twice
        all_url = payload["URL"]
        all_content[input_url] = payload["Content"]
        return all_url

    pdf_urls = []

    def separate_pdf_and_nonPDF_links(urls):
        # Separate URLs into two lists
        pdf_links = [url for url in urls if url and url.endswith('.pdf')]
        if pdf_links:
            pdf_urls.append(pdf_links)
        return [url for url in urls if not (url and url.endswith('.pdf'))]  # other links for rescraping

    def call_llm_service(scraped_data, input_url, input_query, pdf):
        # Ask the LLM to pick the most relevant links from the scraped data
        # and return them as a bare JSON array of absolute URLs.
        query = f"""
        Here are my scraped links:
        {scraped_data}
        Correct hostname: {input_url}. Use this hostname for all other tasks.
        I need the full (www.hostname.com/...) {pdf} URLs for the links most relevant to "{input_query}".
        Use the correct hostname from the provided content. Respond with raw hyperlinks in JSON format only,
        with no extra text. The output must be a JSON array of full URLs only, without keys.
        """
        llm = "ClaudeHaiku"
        env = ""
        user_id = "KAusXF7jp0Q40urdZWtDLXEhrmA"
        thread_id = "hKxvoVgi7vRJCHhvMzH5"
        stream_id = "stream1"
        app_type = "sentinel"
        other_request_params = {
            "messages": [
                {"role": "user", "content": query},
            ]
        }
        return AWSClaude(llm, env, user_id, thread_id, stream_id, app_type, other_request_params).invoke()

    # Seed the crawl with the starting URL, encoded as a JSON array so each
    # round can be parsed the same way as the LLM's output.
    input_url = f'["{url}"]'
    for step in range(1, 3):
        print(f"=================={step} step of scraping to get selected URLs from LLM=================================")
        next_urls = []
        # Scrape up to two of the current URLs in parallel.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(scrapper, u) for u in json.loads(input_url)[:2]]
            for future in concurrent.futures.as_completed(futures):
                next_urls.append(separate_pdf_and_nonPDF_links(future.result()))
        # Ask the LLM which of the discovered links to follow in the next round.
        selected_links_from_llm = call_llm_service(next_urls, input_url, input_query, "")
        input_url = selected_links_from_llm
        print(json.loads(input_url)[:2])

    if not pdf_urls:
        print(pdf_urls)
        # return all_content.keys()
        return all_content
    else:
        # Ask the LLM to pick the relevant PDFs, then pull their parsed
        # content into all_content before returning.
        selected_pdf = json.loads(call_llm_service(pdf_urls, input_url, input_query, "only end with .pdf extension"))
        print(pdf_urls)
        print("selected pdf")
        print(selected_pdf)
        add_pdf_content(selected_pdf)
        # return all_content.keys()
        return all_content
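
# A minimal sketch of how this helper might be exposed over HTTP. The route
# path, query-parameter names, and response shape are assumptions: the original
# file imports FastAPI, HTTPException, and JSONResponse but defines no routes.
# Note that get_n_depth_results uses blocking `requests` calls, so this handler
# blocks the event loop while a crawl is in progress.
@app.get("/deep_scrape")
async def deep_scrape(url: str, query: str):
    try:
        results = await get_n_depth_results(url, query)
        return JSONResponse(content=results)
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))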

# # Example invocation with timing. get_n_depth_results is a coroutine, so it
# # has to be run on an event loop:
# import asyncio
# start_time = time.time()
# print(asyncio.run(get_n_depth_results("https://www.keells.com/", "Please analyse reports")))
# end_time = time.time()
# time_taken = end_time - start_time
# print(f"Time taken: {time_taken} seconds")
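
# To serve the API locally (assuming this file is saved as main.py):
#   uvicorn main:app --host 0.0.0.0 --port 8000
# The sketched endpoint above could then be exercised with, e.g.:
#   curl "http://localhost:8000/deep_scrape?url=https://www.keells.com/&query=Please+analyse+reports"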