"""Tiny Gradio UI for scraping a web page with BeautifulSoup.

Three handlers are wired to the UI:
  * find_all  - list every tag name on the page.
  * find_it   - list text/grandparent for every tag matching a selector.
  * find_it2  - list the text of every anchor on the page.
"""

import urllib.request

import bs4
import gradio as gr
import requests


def find_all(url, q=None, num=None):
    """Return ``[[tag names...]]`` for every tag on the page at *url*.

    Parameters
    ----------
    url : str
        Page to fetch.
    q, num :
        Unused; accepted only so the Gradio ``click`` handler can pass the
        same component list to every callback.

    Returns
    -------
    list
        A single-element list wrapping the list of tag names, preserving
        document order (shape kept for compatibility with the JSON output).
    """
    source = urllib.request.urlopen(url).read()
    soup = bs4.BeautifulSoup(source, "lxml")
    return [[tag.name for tag in soup.find_all()]]


def find_it(url, q=None, num=None):
    """Find every ``<q>`` tag on the page and report its text and grandparent.

    Parameters
    ----------
    url : str
        Page to fetch.
    q : str
        Tag name to search for (e.g. ``"p"``).
    num :
        Unused; accepted for call-signature compatibility.

    Returns
    -------
    list
        One ``[{q: tag_text, "parent": grandparent_name}]`` entry per match.
    """
    out = []
    source = urllib.request.urlopen(url).read()
    soup = bs4.BeautifulSoup(source, "lxml")
    for tag in soup.find_all(q):
        # BUG FIX: the original dereferenced tag.parent.parent.name blindly,
        # which raises AttributeError for tags whose grandparent is None
        # (e.g. children of the document root).
        parent = tag.parent
        grandparent = parent.parent.name if parent is not None and parent.parent is not None else None
        out.append([{q: tag.string, "parent": grandparent}])
    return out


def find_it2(url):
    """Return a newline-separated listing of all anchor texts at *url*.

    On any failure the exception object is returned so the UI can display
    it (best-effort behavior preserved from the original).
    """
    try:
        # BUG FIX: the original called requests.get(url, a1=None, q2=None,
        # q3=None) — invalid keyword arguments that raise TypeError before
        # any request is made — and did so *outside* the try block.
        response = requests.get(url)
        response.raise_for_status()
        # BUG FIX: the original referenced bare `BeautifulSoup`, which is a
        # NameError since only `bs4` is imported.
        soup = bs4.BeautifulSoup(response.content, "lxml")
        # BUG FIX: the original used "URL Links:\n" as the *separator*
        # between every link; it is meant to be a one-time header.
        return "URL Links:\n" + "\n".join(a.text for a in soup.find_all("a"))
    except Exception as e:
        print(e)
        return e


with gr.Blocks() as app:
    with gr.Row():
        inp = gr.Textbox()
        q = gr.Textbox(value="p")
        num = gr.Number(value=1)
    with gr.Row():
        all_btn = gr.Button("Load")
        find_btn = gr.Button("Find")
    with gr.Row():
        rawp = gr.JSON()
        outp = gr.JSON()
    all_btn.click(find_all, [inp, q, num], [rawp])
    find_btn.click(find_it, [inp, q, num], [outp])

app.launch()