Spaces:
Runtime error
Runtime error
File size: 2,774 Bytes
d6afb45 7ee1b98 d6afb45 f586a70 56e3a34 d6afb45 b878468 d6afb45 17502e7 8a6be4c d43b917 9c6f370 8a6be4c 4d3da03 e9a6060 58f070b 31f6eb1 d43b917 4a4057c 58f070b 17502e7 64dcbf4 c83c737 58f070b e9a6060 43954cf 4a4057c 2b7c910 6005136 17502e7 d6afb45 f586a70 df0618d 07cfa54 f586a70 c69bac0 817d95e c69bac0 43954cf 817d95e 8eb0cc4 e832cec f586a70 289044f f586a70 6005136 d6afb45 56e3a34 f586a70 56e3a34 26f9624 792d4ad f586a70 176890c d6afb45 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import gradio as gr
import urllib.request
import requests
import bs4
import lxml
def find_all(url, q=None, num=None):
    """Fetch *url* and return every tag as a ``{tag_name: tag_text}`` dict.

    Parameters
    ----------
    url : str
        Page to download (fetched with ``urllib.request.urlopen``).
    q, num :
        Unused; kept so the Gradio ``click`` wiring that passes three
        inputs keeps working.

    Returns
    -------
    list[dict]
        One ``{name: text}`` entry per tag in document order.
    """
    source = urllib.request.urlopen(url).read()
    soup = bs4.BeautifulSoup(source, 'lxml')
    # Debug output about the page title. Guarded: a page without a
    # <title> made the original crash with AttributeError on None.
    if soup.title is not None:
        print(soup.title)
        print(soup.title.name)
        print(soup.title.string)
        print(soup.title.parent.name)
    # Build the result directly instead of two parallel lists + index loop.
    return [{tag.name: tag.text} for tag in soup.find_all()]
def find_it(url, q=None, num=None):
    """Fetch *url* and collect every ``<q>`` tag with some context.

    Parameters
    ----------
    url : str
        Page to download.
    q : str | None
        Tag name to search for (e.g. ``"p"``).
    num :
        Unused; kept for the Gradio ``click`` call signature.

    Returns
    -------
    list
        One single-element list per match, containing a dict with the
        tag's string, its grandparent's name, the ``:first-child``
        selection for that tag name, and the tag itself.
    """
    out = []
    source = urllib.request.urlopen(url).read()
    soup = bs4.BeautifulSoup(source, 'lxml')
    for p in soup.find_all(q):
        # BUG FIX: soup.select() returns a *list* of tags; the original
        # called .name on it, raising AttributeError on every iteration
        # (silently printed). Compute the selection once and reuse it.
        first_child = soup.select(f'{p.name}:first-child')
        try:
            # Tags directly under the root have no grandparent.
            parent_name = p.parent.parent.name
        except AttributeError as e:
            print(e)
            parent_name = None
        out.append([{q: p.string,
                     "parent": parent_name,
                     "first-child": first_child,
                     "content": p}])
    # Debug: dump every link target. Renamed loop variable so it no
    # longer shadows the `url` parameter.
    for link in soup.find_all('a'):
        print(link.get('href'))
    return out
def find_it2(url):
    """Fetch *url* and return its anchor texts joined with 'URL Links:\\n'.

    Returns the joined string on success; on any failure the exception
    object itself is returned (callers display it as-is).
    """
    try:
        # BUG FIX: the original passed bogus kwargs (a1/q2/q3) which make
        # requests.get raise TypeError — and the call sat *outside* the
        # try, so the error escaped. Also add a timeout so a dead host
        # cannot hang the app forever.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # BUG FIX: only `bs4` is imported at module level; the bare name
        # `BeautifulSoup` was a NameError.
        soup = bs4.BeautifulSoup(response.content, 'lxml')
        out = 'URL Links:\n'.join([p.text for p in soup.find_all('a')])
        return out
    except Exception as e:
        print(e)
        return e
# Gradio UI: a URL box, a tag-name query box, and a (currently unused)
# number input feed two buttons that render scrape results as JSON.
with gr.Blocks() as app:
    with gr.Row():
        inp = gr.Textbox()            # URL to scrape
        q = gr.Textbox(value="p")     # tag name passed to find_it (default: <p>)
        num = gr.Number(value=1)      # passed through but ignored by both handlers
    with gr.Row():
        all_btn = gr.Button("Load")   # dump every tag via find_all
        find_btn = gr.Button("Find")  # search for tag `q` via find_it
    with gr.Row():
        rawp = gr.JSON()              # output of find_all
        outp = gr.JSON()              # output of find_it
    # Wire buttons: each handler receives (url, q, num) and fills one JSON pane.
    all_btn.click(find_all, [inp, q, num], [rawp])
    find_btn.click(find_it, [inp, q, num], [outp])
app.launch()
|