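# Page-header residue from the Hugging Face Spaces listing this file was
# captured from; the Space status shown there was "Runtime error".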
import datetime
import io
import json
import os
import random
import uuid
#import subprocess
#import urllib.request

import bs4
import gradio as gr
import lxml
import requests
from huggingface_hub import InferenceClient,HfApi
from pypdf import PdfReader

#from query import tasks
from agent import (
    PREFIX,
    COMPRESS_DATA_PROMPT,
    COMPRESS_DATA_PROMPT_SMALL,
    LOG_PROMPT,
    LOG_RESPONSE,
)
# Shared text-generation client used by run_gpt and get_mem.
client = InferenceClient(
    "mistralai/Mixtral-8x7B-Instruct-v0.1"
)
# Dataset repo that receives the uploaded memory files.
reponame="Omnibus/tmp"
# Base URL used to read raw files back from that dataset repo.
save_data=f'https://huggingface.co/datasets/{reponame}/raw/main/'
# Read at import time: raises KeyError when HF_TOKEN is not set in the environment.
token_self = os.environ['HF_TOKEN']
api=HfApi(token=token_self)
def find_all(url):
    """Fetch a web page and return its text plus all of its links as one string.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(ok, payload)`` tuple. On success ``ok`` is True and ``payload``
        is the ``str()`` of a list holding the page text, a marker string and
        one single-element list per ``<a>`` tag (title, href, string).
        On failure ``ok`` is False and ``payload`` is a short message: the
        non-200 status, the error text, or a request to enter a valid URL.
    """
    print(f"trying URL:: {url}")
    if url is None or url == "":
        print('passing')
        return False, "Enter Valid URL"
    try:
        # A timeout keeps a stalled server from blocking the request forever.
        source = requests.get(url, timeout=30)
        print(source.status_code)
        if source.status_code != 200:
            return False, f'Status:: {source.status_code}'
        print('trying')
        soup = bs4.BeautifulSoup(source.content, 'lxml')
        out = [f'RAW TEXT RETURNED: {soup.text}', "HTML fragments: "]
        for link in soup.find_all("a"):
            out.append([{
                "LINK TITLE": link.get('title'),
                "URL": link.get('href'),
                "STRING": link.string,
            }])
        rawp = str(out)
        print(f'rl:: {len(rawp)}')
        return True, rawp
    except Exception as e:
        # Any network or parsing failure is reported to the caller as text.
        print(e)
        return False, f'Error: {e}'
def read_txt(txt_path):
    """Return the full contents of the text file at ``txt_path``.

    The file is decoded as UTF-8 so the result does not depend on the
    platform's default encoding. The text is also echoed to stdout.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read()
    print(text)
    return text
def read_pdf(pdf_path):
    """Extract the text of every page of the PDF at ``pdf_path``.

    Returns one string in which each page's extracted text is preceded by a
    newline. The text is also echoed to stdout.
    """
    reader = PdfReader(f'{pdf_path}')
    text = "".join(f'\n{page.extract_text()}' for page in reader.pages)
    print(text)
    return text
# URLs that read_pdf_online could not download (non-200 response); summarize
# yields this list as one of its outputs.
error_box=[]
def read_pdf_online(url):
    """Download a PDF from ``url`` and return the text of all its pages.

    Args:
        url: Address of the PDF to download.

    Returns:
        * ``str`` - each page's extracted text preceded by a newline, when
          the download succeeds;
        * ``int`` - the HTTP status code when the response is not 200 (the
          URL is also appended to ``error_box``);
        * the caught exception object when downloading or parsing fails.
    """
    print(f"reading {url}")
    try:
        response = requests.get(url, stream=True, timeout=30)
        print(response.status_code)
        if response.status_code != 200:
            error_box.append(url)
            print(response.status_code)
            return response.status_code
        # Parse from memory instead of a shared "test.pdf" on disk, so
        # concurrent requests cannot overwrite each other's download.
        reader = PdfReader(io.BytesIO(response.content))
        print(len(reader.pages))
        text = "".join(f'\n{page.extract_text()}' for page in reader.pages)
        print(f"PDF_TEXT:: {text}")
        return text
    except Exception as e:
        # Callers embed the return value in a string, so the error is handed
        # back rather than raised.
        print(e)
        return e
# When True, run_gpt prints the full prompt and the model's response.
VERBOSE = True
# No active code visible in this file reads this value.
MAX_HISTORY = 100
# Used by compress_data, compress_data_og and save_memory to size the chunks
# of input sent to the model.
MAX_DATA = 20000
def format_prompt(message, history):
    """Build an ``[INST]``-tagged prompt from past turns and a new message.

    Args:
        message: The new user message, placed last.
        history: Iterable of ``(user_prompt, bot_response)`` pairs; ``None``
            is treated as an empty history.

    Returns:
        The prompt string, starting with ``<s>``.
    """
    parts = ["<s>"]
    for user_prompt, bot_response in history or []:
        parts.append(f"[INST] {user_prompt} [/INST] {bot_response}</s> ")
    parts.append(f"[INST] {message} [/INST]")
    # Join once instead of growing a string inside the loop.
    return "".join(parts)
def run_gpt(
    prompt_template,
    stop_tokens,
    max_tokens,
    seed,
    **prompt_kwargs,
):
    """Format a prompt, stream a completion from ``client`` and return it.

    Args:
        prompt_template: Format string filled with ``prompt_kwargs``.
        stop_tokens: Accepted but not forwarded to the client.
        max_tokens: Passed to the client as ``max_new_tokens``.
        seed: Sampling seed passed to the client.
        **prompt_kwargs: Values substituted into ``prompt_template``.

    Returns:
        The generated text, built by concatenating the streamed token texts.
    """
    print(seed)
    header = PREFIX.format(
        timestamp=datetime.datetime.now(),
        purpose="Compile the provided data and complete the users task"
    )
    content = header + prompt_template.format(**prompt_kwargs)
    if VERBOSE:
        print(LOG_PROMPT.format(content))
    sampling = {
        "temperature": 0.9,
        "max_new_tokens": max_tokens,
        "top_p": 0.95,
        "repetition_penalty": 1.0,
        "do_sample": True,
        "seed": seed,
    }
    stream = client.text_generation(content, **sampling, stream=True, details=True, return_full_text=False)
    resp = "".join(piece.token.text for piece in stream)
    if VERBOSE:
        print(LOG_RESPONSE.format(resp))
    return resp
def compress_data(c, instruct, history):
    """Summarise ``history`` chunk by chunk and return every chunk's summary.

    Args:
        c: Separator count computed by the caller. It is only logged and is
            kept so existing callers keep working.
        instruct: Instruction passed to the prompt as ``direction``.
        history: Sliceable data (a string in the caller) to summarise.

    Returns:
        A list with one model response per chunk, in input order.
    """
    seed = random.randint(1, 1000000000)
    print(c)
    # Chunks are slices of ``history``, so the number of chunks must come
    # from len(history). Deriving it from the separator count ``c`` covered
    # only the first ~c characters and divided by zero when c == 0.
    chunk = MAX_DATA
    starts = list(range(0, len(history), chunk)) or [0]
    print(f'chunk:: {chunk}')
    print(f'divi:: {len(starts)}')
    out = []
    for s in starts:
        e = s + chunk
        print(f's:e :: {s}:{e}')
        resp = run_gpt(
            COMPRESS_DATA_PROMPT_SMALL,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge="",
            history=history[s:e],
        )
        out.append(resp)
        print(resp)
    return out
def compress_data_og(c, instruct, history):
    """Fold ``history`` into a single report, one chunk at a time.

    Each chunk is sent to the model together with the previous response as
    ``knowledge``, so the last response carries forward what came before.

    Args:
        c: Separator count computed by the caller. It is only logged and is
            kept so existing callers keep working.
        instruct: Instruction passed to the prompt as ``direction``.
        history: Sliceable data (a string in the caller) to compile.

    Returns:
        The model's response for the final chunk.
    """
    seed = random.randint(1, 1000000000)
    print(c)
    # Chunk over the real length of ``history``. Using the separator count
    # ``c`` as a length dropped everything past the first ~c characters and
    # divided by zero when c == 0.
    chunk = MAX_DATA
    starts = list(range(0, len(history), chunk)) or [0]
    print(f'chunk:: {chunk}')
    print(f'divi:: {len(starts)}')
    new_history = ""
    resp = ""
    for s in starts:
        e = s + chunk
        print(f's:e :: {s}:{e}')
        resp = run_gpt(
            COMPRESS_DATA_PROMPT,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge=new_history,
            history=history[s:e],
        )
        # The running report becomes the knowledge for the next chunk.
        new_history = resp
        print(resp)
    print("final" + resp)
    return resp
# Prompt template used by get_mem; it is filled with {prompt} (the user's
# query) and {keywords} (the keyword list to choose from).
RECALL_MEMORY="""The user will give you a query and a list
Your duty is to choose the words from the list that are closely related to the search query.
If there are no relevant keywords found in the provided list return 'NONE'
Respond with only a list, or NONE
Respond only in this format:
[keyword1,keyword2,keyword3]
USER QUERY:
{prompt}
KEYWORD LIST:
{keywords}
"""
def get_mem(prompt,kw):
    """Ask the model which entries of ``kw`` relate to ``prompt``.

    Fills RECALL_MEMORY with the query and the keyword collection, streams
    the completion from ``client`` and returns the concatenated token text.
    The response is also echoed to stdout.
    """
    sampling = {
        "temperature": 0.6,
        "max_new_tokens": 1024,
        "top_p": 0.6,
        "repetition_penalty": 1.0,
        "do_sample": True,
        "seed": random.randint(1,1000000000),
    }
    content = RECALL_MEMORY.format(keywords=kw,prompt=prompt)
    stream = client.text_generation(content, **sampling, stream=True, details=True, return_full_text=False)
    resp = "".join(piece.token.text for piece in stream)
    print(resp)
    return resp
def summarize(inp,history,report_check,sum_check,mem_check,data=None,files=None,url=None,pdf_url=None,pdf_batch=None):
    """Gradio handler: gather the input data, then index and/or summarise it.

    Data sources are applied in this order, later ones replacing or extending
    earlier ones: pasted ``data``, ``pdf_batch`` (comma separated PDF URLs),
    ``pdf_url``, ``url`` (web page), then uploaded ``files`` (.pdf / .txt),
    which are appended.

    Args:
        inp: User instruction; an empty string becomes "Process this data".
        history: Incoming chat history; it is replaced, not extended.
        report_check: With ``sum_check``, also compile the summaries into one
            report via compress_data_og.
        sum_check: Summarise the data with compress_data.
        mem_check: Index the data with save_memory.
        data, files, url, pdf_url, pdf_batch: Optional data sources.

    Yields:
        ``("", chat_history, error_box, json_payload)`` twice: first a
        "Working on it..." placeholder, then the final result.
    """
    # Bound up front so every path below can reach the final yield.
    json_out = []
    rawp = ""
    if inp == "":
        inp = "Process this data"
    history = [(inp, "Working on it...")]
    yield "", history, error_box, json_out

    # The optional inputs default to None when this is called directly.
    data = "" if data is None else data
    url = url or ""
    pdf_url = pdf_url or ""
    pdf_batch = pdf_batch or ""

    if pdf_batch.startswith("http"):
        data = ""
        for batch_url in pdf_batch.split(","):
            batch_url = batch_url.strip()
            if not batch_url:
                continue
            # Guard each URL separately so one failure does not drop the rest.
            try:
                bb = read_pdf_online(batch_url)
            except Exception as e:
                print(e)
                continue
            data = f'{data}\nFile Name URL ({batch_url}):\n{bb}'
    if pdf_url.startswith("http"):
        print("PDF_URL")
        data = read_pdf_online(pdf_url)
    if url.startswith("http"):
        val, out = find_all(url)
        if val:
            data = out
        else:
            data = "Error"
            rawp = str(out)
    if files:
        for file in files:
            try:
                print(file)
                if file.endswith(".pdf"):
                    zz = read_pdf(file)
                elif file.endswith(".txt"):
                    zz = read_txt(file)
                else:
                    continue
                print(zz)
                data = f'{data}\nFile Name ({file}):\n{zz}'
            except Exception as e:
                data = f'{data}\nError opening File Name ({file})'
                print(e)

    if data != "Error" and data != "":
        print(inp)
        out = str(data)
        print(f'rl:: {len(out)}')
        c = 1 + sum(out.count(sep) for sep in (" ", ",", "\n"))
        print(f'c:: {c}')
        if mem_check:
            json_out = save_memory(inp, out)
            rawp = "Complete"
        if sum_check:
            json_out = compress_data(c, inp, out)
            out = str(json_out)
            if report_check:
                print(f'rl:: {len(out)}')
                c = 1 + sum(out.count(sep) for sep in (" ", ",", "\n"))
                print(f'c2:: {c}')
                rawp = compress_data_og(c, inp, out)
            else:
                rawp = out
        if mem_check or sum_check:
            # Model output is often not a valid literal; keep the raw list
            # rather than lose the whole result to a formatting error.
            try:
                json_out = format_json(json_out)
            except Exception as e:
                print(f'format_json failed: {e}')
        else:
            rawp = "Select Summarize and/or Memory to process the data"
    elif not rawp:
        # Keep a more specific message (e.g. the URL error) if one was set.
        rawp = "Provide a valid data source"
    history = [(inp, rawp)]
    yield "", history, error_box, json_out
# Prompt template used by save_memory; it is filled with {task} and
# {history} (the chunk of data to index).
SAVE_MEMORY = """
You are attempting to complete the task
task: {task}
Data:
{history}
Instructions:
Compile and categorize the data above into a JSON dictionary string
Include ALL text, datapoints, titles, descriptions, and source urls indexed into an easy to search JSON format
Required keys:
"keywords":["short", "list", "of", "important", "keywords", "found", "in", "this", "entry"],
"title":"title of entry",
"description":"A sentence summarizing the topic of this entry",
"content":"A brief paragraph summarizing the important datapoints found in this entry",
"url":"https://url.source"
"""
def format_json(inp):
    """Join text fragments into one literal and evaluate it.

    Args:
        inp: Iterable of string fragments, or a single string (which is
            split into lines first).

    Returns:
        The Python object obtained by evaluating the joined text.

    Raises:
        SyntaxError, NameError, ...: whatever evaluating the text raises
            when it is not a valid Python expression.
    """
    print("FORMATTING:::")
    print(type(inp))
    if isinstance(inp, str):
        # Iterating a str yields characters, which would strip all spaces.
        inp = inp.splitlines()
    # NOTE(review): "//" also matches URLs ("https://"), so any fragment
    # containing one is dropped whole. Confirm this is intended.
    skip_markers = ("```", "#", "//")
    kept = []
    for line in inp:
        line = line.strip()
        if not any(marker in line for marker in skip_markers):
            kept.append(line)
    new_str = "".join(kept)
    # Remove the end-of-sequence tag as a suffix. str.strip("</s>") treats
    # its argument as a character set and can eat real content.
    if new_str.endswith("</s>"):
        new_str = new_str[:-len("</s>")]
    # SECURITY: eval() executes arbitrary expressions, and this text comes
    # from model output built on scraped or uploaded data. Replace it with
    # json.loads / ast.literal_eval once the expected format is settled.
    out_json = eval(new_str)
    print(out_json)
    return out_json
def format_json_og(inp):
    """Collect known ``key: value`` lines into one dict per ``{ ... }`` group.

    A line containing ``{`` starts a new dict and a line containing ``}``
    appends the current dict to the result. In between, any line that
    contains one of the known key names and a colon stores the text after
    the first colon (trailing/leading commas removed) under that key.

    Args:
        inp: Iterable of text lines.

    Returns:
        A list of dicts with a subset of the keys ``keywords``, ``title``,
        ``description``, ``content`` and ``url``. Values are raw text and
        still carry any quotes present in the line.
    """
    known_keys = ("keywords", "title", "description", "content", "url")
    new_json = []
    start_json = {}
    print("FORMATTING:::")
    for line in inp:
        line = line.strip()
        if "{" in line:
            print(line)
            start_json = {}
        if ":" in line:
            # Split on the first colon only, so values that contain colons
            # (URLs, times, prose) are kept whole.
            value = line.split(":", 1)[1].strip(",")
            for key in known_keys:
                if key in line:
                    start_json[key] = value
                    print(line)
        if "}" in line:
            new_json.append(start_json)
    print(new_json)
    return new_json
def create_index():
    """Rebuild the keyword index from main.json and upload it.

    Downloads ``mem-test2/index.json`` and ``mem-test2/main.json`` from the
    dataset repo, adds every entry's ``file_name`` under each of its
    ``keywords`` in the index (no duplicates) and uploads the result as
    ``/mem-test2/index.json``. A missing file is treated as empty.

    Raises:
        Errors from ``requests`` or ``api.upload_file`` propagate.
    """
    r = requests.get(f'{save_data}mem-test2/index.json', timeout=30)
    print(f'status code index:: {r.status_code}')
    if r.status_code == 200:
        ind = json.loads(r.text)
        print(f'ind::\n{ind}')
    else:
        print("Create new IND")
        ind = [{}]
    # The index is stored as a one-element list holding the keyword dict.
    if not isinstance(ind, list) or not ind or not isinstance(ind[0], dict):
        ind = [{}]

    m = requests.get(f'{save_data}mem-test2/main.json', timeout=30)
    print(f'status code main:: {m.status_code}')
    # Check main.json's own status here, not the index request's.
    main = json.loads(m.text) if m.status_code == 200 else []

    index = ind[0]
    for ea in main:
        try:
            file_name = ea['file_name']
            for k in ea['keywords']:
                files = index.setdefault(k, [])
                if file_name not in files:
                    files.append(file_name)
        except (KeyError, TypeError, AttributeError) as e:
            # Skip malformed entries instead of aborting the rebuild.
            print(e)

    json_object = json.dumps(ind, indent=4)
    # Upload from memory so no temporary file is left behind.
    api.upload_file(
        path_or_fileobj=json_object.encode("utf-8"),
        path_in_repo=f"/mem-test2/index.json",
        repo_id=reponame,
        token=token_self,
        repo_type="dataset",
    )
def _memory_entries(resp, file_name, index):
    """Build main.json records from the keywords/description lines of ``resp``."""
    entries = []
    key_box = []
    desc = ""
    for line in resp.strip().split("\n"):
        if ":" not in line:
            continue
        print(f'line:: {line}')
        # Split on the first colon only so values containing colons survive.
        value = line.split(":", 1)[1]
        if "keywords" in line and "[" in value:
            inner = value.split("[", 1)[1].split("]", 1)[0]
            for raw in inner.split(","):
                # Keep letters, digits and spaces; drop quotes and punctuation.
                cleaned = "".join(ch for ch in raw.strip() if ch.isalnum() or ch == " ")
                if cleaned:
                    key_box.append(cleaned)
        if "description" in line:
            desc = value
        if key_box and desc:
            entries.append({
                "file_name": file_name,
                "keywords": key_box,
                "description": str(desc),
                "index": index,
            })
            key_box = []
            desc = ""
    return entries


def save_memory(purpose, history):
    """Index ``history`` with the model and store the result in the dataset repo.

    The data is cut into chunks of ``MAX_DATA`` characters. For each chunk
    the model's response is uploaded as its own file under ``/mem-test2/``,
    and records parsed from it (file name, keywords, description, character
    range) are appended to ``/mem-test2/main.json``. Once all chunks are
    done, ``create_index`` rebuilds the keyword index.

    Args:
        purpose: Passed to run_gpt as ``purpose``.
        history: Data to index; converted with ``str()``.

    Returns:
        A list with the (trimmed) model response for each chunk.
    """
    inp = str(history)
    print(f'rl:: {len(inp)}')
    seed = random.randint(1, 1000000000)
    # Chunk over the real length of the text. Using a separator count as a
    # length left the tail of the data unindexed.
    chunk = MAX_DATA
    starts = list(range(0, len(inp), chunk)) or [0]
    print(f'chunk:: {chunk}')
    print(f'divi:: {len(starts)}')
    task = 'Index this Data\n'
    out_box = []
    for s in starts:
        ee = s + chunk
        print(f's:e :: {s}:{ee}')
        resp = run_gpt(
            SAVE_MEMORY,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=4096,
            seed=seed,
            purpose=purpose,
            task=task,
            history=inp[s:ee],
        ).strip('\n')
        # Keep only the part that looks like the JSON payload, if there is one.
        _, marker, tail = resp.partition('[{')
        if marker:
            resp = '[{' + tail.split('</s>')[0]

        timestamp = str(datetime.datetime.now())
        timename = timestamp.replace(" ", "--").replace(":", "-").replace(".", "-")
        # Upload from memory so no temporary file is left behind.
        api.upload_file(
            path_or_fileobj=resp.encode("utf-8"),
            path_in_repo=f"/mem-test2/{timename}---{s}-{ee}.json",
            repo_id=reponame,
            token=token_self,
            repo_type="dataset",
        )

        r = requests.get(f'{save_data}mem-test2/main.json', timeout=30)
        print(f'status code main:: {r.status_code}')
        lod = json.loads(r.text) if r.status_code == 200 else []
        lod.extend(_memory_entries(resp, f"{timename}---{s}-{ee}", f"{s}:{ee}"))
        api.upload_file(
            path_or_fileobj=json.dumps(lod, indent=4).encode("utf-8"),
            path_in_repo=f"/mem-test2/main.json",
            repo_id=reponame,
            token=token_self,
            repo_type="dataset",
        )
        out_box.append(resp)
    # NOTE(review): indentation was lost in this source; running this once
    # after the loop is assumed, since it rebuilds from the whole main.json.
    create_index()
    return out_box
def valid_list(inp):
    """Parse a bracketed keyword list such as ``"[a, 'b']"`` into strings.

    Args:
        inp: Text containing a ``[...]`` list, or an existing list/tuple.

    Returns:
        The items with quotes and surrounding whitespace removed. A list or
        tuple is returned as a list of strings. Text without ``[`` (for
        example ``"NONE"``) and any other type yield an empty list.
    """
    if isinstance(inp, (list, tuple)):
        return [str(item) for item in inp]
    if not isinstance(inp, str) or "[" not in inp:
        return []
    # Take the text between the first "[" and the following "]".
    inner = inp.split("[", 1)[1].split("]", 1)[0]
    out_list = []
    for ea in inner.split(","):
        ea = ea.replace("'", "").replace('"', "").strip()
        if ea:
            out_list.append(ea)
    print(out_list)
    return out_list
def recall_memory(inp,history=None):
    """Gradio handler: ask the model which stored keywords match ``inp``.

    Loads ``mem-test2/index.json`` from the dataset repo and passes its
    keywords to get_mem together with the query.

    Args:
        inp: The user's query.
        history: Incoming chat history; it is replaced, not extended.
            Optional so the handler also works when wired with one input.

    Yields:
        One ``("", chat_history, error_text, json_payload)`` tuple. When the
        index cannot be loaded, the chat and error text carry
        "MEMORY FILE NOT FOUND".
    """
    error_box = ""
    json_out = {}
    r = requests.get(f'{save_data}mem-test2/index.json', timeout=30)
    print(f'status code main:: {r.status_code}')
    if r.status_code != 200:
        print("Create new IND")
        out = "MEMORY FILE NOT FOUND"
        # A generator must yield its outputs; a bare ``return value`` here
        # would end the generator without sending anything to the UI.
        yield "", [(inp, out)], out, json_out
        return
    mem = json.loads(r.text)
    print(f'ind::\n{mem}')
    # Pass a plain list so the prompt shows the keywords, not "dict_keys(...)".
    rawp = get_mem(inp, list(mem[0]))
    try:
        valid_list(rawp)
    except (IndexError, AttributeError) as e:
        # The parsed list is only logged, so a parse failure is not fatal.
        print(e)
    yield "", [(inp, rawp)], error_box, json_out
################################# | |
def clear_fn():
    """Return the reset values for the two outputs wired to the Clear button."""
    empty_prompt = ""
    blank_chat = [(None, None)]
    return empty_prompt, blank_chat
# Gradio UI: chatbot, instruction box with option checkboxes, one tab per data
# source, and the event wiring.
# NOTE(review): indentation was lost in this source, so the Row/Column/Tab
# nesting below is reconstructed. Confirm it against the intended layout.
with gr.Blocks() as app:
    gr.HTML("""<center><h1>Mixtral 8x7B TLDR Summarizer + Web</h1><h3>Summarize Data of unlimited length</h3>""")
    chatbot = gr.Chatbot(label="Mixtral 8x7B Chatbot",show_copy_button=True)
    with gr.Row():
        with gr.Column(scale=3):
            prompt=gr.Textbox(label = "Instructions (optional)")
        with gr.Column(scale=1):
            report_check=gr.Checkbox(label="Return Report", value=True)
            sum_check=gr.Checkbox(label="Summarize", value=True)
            mem_check=gr.Checkbox(label="Memory", value=True)
            button=gr.Button()
    with gr.Row():
        stop_button=gr.Button("Stop")
        clear_btn = gr.Button("Clear")
    with gr.Row():
        with gr.Tab("Text"):
            data=gr.Textbox(label="Input Data (paste text)", lines=6)
        with gr.Tab("File"):
            file=gr.Files(label="Input File(s) (.pdf .txt)")
        with gr.Tab("Raw HTML"):
            url = gr.Textbox(label="URL")
        with gr.Tab("PDF URL"):
            pdf_url = gr.Textbox(label="PDF URL")
        with gr.Tab("PDF Batch"):
            pdf_batch = gr.Textbox(label="PDF URL Batch (comma separated)")
        with gr.Tab("Memory"):
            mem_inp = gr.Textbox(label="Query")
            mem = gr.Button()
    json_out=gr.JSON()
    e_box=gr.Textbox()
    # recall_memory takes (inp, history), so the chatbot is passed as well.
    mem.click(recall_memory,[mem_inp,chatbot],[prompt,chatbot,e_box,json_out])
    clear_btn.click(clear_fn,None,[prompt,chatbot])
    go=button.click(summarize,[prompt,chatbot,report_check,sum_check,mem_check,data,file,url,pdf_url,pdf_batch],[prompt,chatbot,e_box,json_out])
    # The Stop button cancels a running summarize job.
    stop_button.click(None,None,None,cancels=[go])
app.queue(default_concurrency_limit=20).launch(show_api=False)