from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import requests
from AWSClaude import AWSClaude
import json
import concurrent.futures
import time

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

async def get_n_depth_results(url, input_query):
    # NOTE: declared async, but every network call below uses the blocking
    # `requests` library, so this coroutine blocks while it runs.
    all_content = {}

    def add_pdf_content(selected_pdf):
        # Download each selected PDF and forward it to the PDF-extraction
        # service, storing the parsed result under the PDF's URL.
        for pdf_url in selected_pdf:
            print(pdf_url)
            response = requests.get(pdf_url)
            # Save the content of the response as a PDF file
            pdf_path = "temp.pdf"
            with open(pdf_path, "wb") as file:
                file.write(response.content)
            print(f"PDF file saved as {pdf_path}")
            url = "http://localhost:5000/ask"
            # url = "https://us-central1-neuralgap-1.cloudfunctions.net/scraperPDFDocxTables_v2"
            data = {"processTables": "True"}
            headers = {"Origin": "http://localhost:8080"}
            with open(pdf_path, "rb") as file:
                file_contents = file.read()
            files = {
                "pdf": (
                    pdf_path,
                    file_contents,
                    "application/pdf",
                )
            }
            response = requests.post(url, files=files, data=data, headers=headers)
            all_content[pdf_url] = response.json()

    def scrapper(input_url):
        # Scrape a single page via the headless-Chromium service; store the
        # page content and return the list of URLs discovered on it.
        params = {'url': input_url}
        headers = {'accept': 'application/json'}
        url = 'https://chromium-qpxamiokfa-uc.a.run.app/get_scraped_data'
        response = requests.get(url, headers=headers, params=params)
        payload = response.json()  # parse the body once instead of calling .json() twice
        all_url = payload["URL"]
        all_content[input_url] = payload["Content"]
        return all_url

    pdf_urls = []

    def separate_pdf_and_nonPDF_links(urls):
        # Separate URLs into two lists
        pdf_links = [url for url in urls if url and url.endswith('.pdf')]
        if pdf_links:
            pdf_urls.append(pdf_links)
        return [url for url in urls if not (url and url.endswith('.pdf'))]  # other links for rescraping

    def call_llm_service(scraped_data, input_url, input_query, pdf):
        # Ask the LLM to pick the most relevant links from the scraped data
        # and return them as a bare JSON array of absolute URLs.
        query = f"""
        Here are my scraped links:
        {scraped_data}
        Correct hostname: {input_url}. Use this hostname for all other tasks.
        I need the full (www.hostname.com/...) {pdf} URLs for the links most relevant to "{input_query}".
        Use the correct hostname from the provided content. Respond with raw hyperlinks in JSON format only,
        with no extra text. The output must be a JSON array of full URLs only, without keys.
        """
        llm = "ClaudeHaiku"
        env = ""
        user_id = "KAusXF7jp0Q40urdZWtDLXEhrmA"
        thread_id = "hKxvoVgi7vRJCHhvMzH5"
        stream_id = "stream1"
        app_type = "sentinel"
        other_request_params = {
            "messages": [
                {"role": "user", "content": query},
            ]
        }
        return AWSClaude(llm, env, user_id, thread_id, stream_id, app_type, other_request_params).invoke()

    # Seed the crawl with the starting URL, encoded as a JSON array so each
    # round can be parsed the same way as the LLM's output.
    input_url = f'["{url}"]'
    for step in range(1, 3):
        print(f"=================={step} step of scraping to get selected URLs from LLM=================================")
        next_urls = []
        # Scrape up to two of the current URLs in parallel.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(scrapper, u) for u in json.loads(input_url)[:2]]
            for future in concurrent.futures.as_completed(futures):
                next_urls.append(separate_pdf_and_nonPDF_links(future.result()))
        # Ask the LLM which of the discovered links to follow in the next round.
        selected_links_from_llm = call_llm_service(next_urls, input_url, input_query, "")
        input_url = selected_links_from_llm
        print(json.loads(input_url)[:2])

    if not pdf_urls:
        print(pdf_urls)
        # return all_content.keys()
        return all_content
    else:
        # Ask the LLM to pick the relevant PDFs, then pull their parsed
        # content into all_content before returning.
        selected_pdf = json.loads(call_llm_service(pdf_urls, input_url, input_query, "only end with .pdf extension"))
        print(pdf_urls)
        print("selected pdf")
        print(selected_pdf)
        add_pdf_content(selected_pdf)
        # return all_content.keys()
        return all_content
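
# A minimal sketch of how this helper might be exposed over HTTP. The route
# path, query-parameter names, and response shape are assumptions: the original
# file imports FastAPI, HTTPException, and JSONResponse but defines no routes.
# Note that get_n_depth_results uses blocking `requests` calls, so this handler
# blocks the event loop while a crawl is in progress.
@app.get("/deep_scrape")
async def deep_scrape(url: str, query: str):
    try:
        results = await get_n_depth_results(url, query)
        return JSONResponse(content=results)
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))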

# # Example invocation with timing. get_n_depth_results is a coroutine, so it
# # has to be run on an event loop:
# import asyncio
# start_time = time.time()
# print(asyncio.run(get_n_depth_results("https://www.keells.com/", "Please analyse reports")))
# end_time = time.time()
# time_taken = end_time - start_time
# print(f"Time taken: {time_taken} seconds")
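
# To serve the API locally (assuming this file is saved as main.py):
#   uvicorn main:app --host 0.0.0.0 --port 8000
# The sketched endpoint above could then be exercised with, e.g.:
#   curl "http://localhost:8000/deep_scrape?url=https://www.keells.com/&query=Please+analyse+reports"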