import urllib.request

import bs4
import gradio as gr
import lxml  # noqa: F401 -- not referenced directly; backs the 'lxml' parser name passed to BeautifulSoup
import requests


def _fetch_soup(url):
    """Download *url* with urllib and parse the body using the 'lxml' parser.

    NOTE(review): *url* comes straight from a UI textbox and is not validated
    here; confirm which URL schemes/hosts should be allowed before exposing
    this publicly.
    """
    # The context manager closes the HTTP response as soon as the body is read.
    with urllib.request.urlopen(url) as response:
        source = response.read()
    return bs4.BeautifulSoup(source, 'lxml')


def find_all(url, q=None, num=None):
    """Return one ``{tag name: tag text}`` dict for every tag in the page.

    Args:
        url: Address of the page to download.
        q: Unused; accepted because the UI passes the same three inputs to
            every handler.
        num: Unused; see ``q``.

    Returns:
        A list of single-entry dicts, in the order ``soup.find_all()`` yields
        the tags.

    Also prints the page title and a few of its attributes to stdout.
    """
    soup = _fetch_soup(url)

    title = soup.title
    print(title)
    # Pages without a <title> give None here; skip the attribute dump
    # instead of failing on None.name.
    if title is not None:
        print(title.name)
        print(title.string)
        print(title.parent.name)

    # A single traversal yields both the name and the text of each tag.
    return [{tag.name: tag.text} for tag in soup.find_all()]


def find_it(url, q=None, num=None):
    """Collect details about every tag named *q* in the page at *url*.

    Args:
        url: Address of the page to download.
        q: Tag name to search for. It is formatted into a string first, so
            ``None`` searches for a tag literally named ``"None"``.
        num: Unused; accepted because the UI passes the same three inputs to
            every handler.

    Returns:
        A list with one single-element list per match. Each inner list holds a
        dict with the keys ``q`` (the tag's ``.string``), ``"parent"``,
        ``"first-child"`` (result of selecting ``<name>:first-child`` on the
        whole document) and ``"content"`` (the tag itself).

    Also prints the ``href`` of every ``<a>`` tag to stdout.

    NOTE(review): ``"first-child"`` and ``"content"`` hold bs4 objects;
    confirm the consumer of this result can serialize them.
    """
    soup = _fetch_soup(url)

    # The selector only depends on the tag name, so run it once per name
    # rather than once per match.
    first_children = {}
    out = []
    for tag in soup.find_all(f'{q}'):
        if tag.name not in first_children:
            first_children[tag.name] = soup.select(f'{tag.name}:first-child')

        # NOTE(review): the "parent" key carries the *grandparent's* name, as
        # in the original code -- confirm that is intended. The grandparent
        # is None for the document's root tag, so guard the lookup.
        grandparent = tag.parent.parent if tag.parent is not None else None
        out.append([{
            q: tag.string,
            "parent": grandparent.name if grandparent is not None else None,
            "first-child": first_children[tag.name],
            "content": tag,
        }])

    for link in soup.find_all('a'):
        print(link.get('href'))
    return out


def find_it2(url):
    """Return the text of every ``<a>`` tag at *url* under a header line.

    Args:
        url: Address of the page to download.

    Returns:
        ``'URL Links:\\n'`` followed by the link texts, one per line. If the
        request fails or the server answers with an error status, the
        exception is printed and returned instead of being raised.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.RequestException as e:
        print(e)
        return e

    soup = bs4.BeautifulSoup(response.content, 'lxml')
    # The header is a prefix; the individual link texts are newline-separated.
    return 'URL Links:\n' + '\n'.join(a.text for a in soup.find_all('a'))


# UI wiring: a URL box, a tag-name box and a number box feed both handlers;
# each button writes its handler's result into its own JSON panel.
# NOTE(review): nesting reconstructed from a whitespace-collapsed file --
# confirm the row layout matches the intended UI.
with gr.Blocks() as app:
    with gr.Row():
        inp = gr.Textbox()
        q = gr.Textbox(value="p")
        num = gr.Number(value=1)
    with gr.Row():
        all_btn = gr.Button("Load")
        find_btn = gr.Button("Find")
    with gr.Row():
        rawp = gr.JSON()
        outp = gr.JSON()
    all_btn.click(find_all, [inp, q, num], [rawp])
    find_btn.click(find_it, [inp, q, num], [outp])
app.launch()