File size: 2,013 Bytes
d6afb45
 
7ee1b98
d6afb45
 
 
f586a70
56e3a34
d6afb45
b878468
d6afb45
 
 
 
 
 
 
 
d73c7bb
7313962
f586a70
 
 
 
 
 
df0618d
07cfa54
f586a70
c69bac0
817d95e
7313962
c69bac0
 
 
43954cf
21a312e
8eb0cc4
7313962
21a312e
f586a70
 
 
289044f
f586a70
6005136
d6afb45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56e3a34
 
f586a70
 
 
56e3a34
26f9624
792d4ad
f586a70
 
 
 
176890c
d6afb45
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import gradio as gr
import urllib.request
import requests
import bs4
import lxml

def find_all(url, q=None, num=None):
    """Download ``url`` and print a short summary of the parsed document.

    Prints the ``<title>`` tag, its tag name, its string content and the
    name of its parent, followed by a list with the name of every tag in
    the document.

    Args:
        url: Address of the page to download and parse.
        q: Not used by this function.
        num: Not used by this function.

    Returns:
        An empty list; nothing is ever appended to it.
    """
    rawp = []
    # Use a context manager so the HTTP response is closed once it is read.
    with urllib.request.urlopen(url) as response:
        source = response.read()
    soup = bs4.BeautifulSoup(source, 'lxml')

    title = soup.title
    if title is None:
        # A document without <title> would otherwise raise AttributeError
        # on the attribute accesses below.
        print(None)
    else:
        # title of the page
        print(title)
        # get attributes:
        print(title.name)
        # get values:
        print(title.string)
        # beginning navigation:
        print(title.parent.name)

    print([tag.name for tag in soup.find_all()])
    return rawp


def find_it(url, q=None, num=None):
    """Download ``url`` and describe every tag named ``q`` found in it.

    Also prints the parent tag name of each match and the ``href`` of
    every ``<a>`` element in the document.

    Args:
        url: Address of the page to download and parse.
        q: Tag name to search for (for example ``"p"``).
        num: Not used by this function.

    Returns:
        A list with one single-element list per match. Each inner element
        is a dict with the keys ``q`` (the tag's string), ``"parent"``
        (parent tag name), ``"first-child"`` (name of the tag's first child
        element, or ``None`` when it has none) and ``"content"`` (the tag's
        markup as text).
    """
    out = []
    # Use a context manager so the HTTP response is closed once it is read.
    with urllib.request.urlopen(url) as response:
        source = response.read()
    soup = bs4.BeautifulSoup(source, 'lxml')

    for p in soup.find_all(f'{q}'):
        # select() returns a list of tags, so reading ``.name`` from its
        # result raised AttributeError on every match. Look up the first
        # direct child element of this tag instead.
        # NOTE(review): "first-child" is interpreted as the first child
        # element of the matched tag - confirm this is the intended meaning.
        first_child = p.find(True, recursive=False)
        parent_name = p.parent.name if p.parent is not None else None
        out.append([{
            q: p.string,
            "parent": parent_name,
            "first-child": first_child.name if first_child is not None else None,
            # Store markup text rather than the Tag object so the result
            # stays serialisable.
            "content": str(p),
        }])
        print(parent_name)

    # Loop variable renamed so it no longer shadows the ``url`` parameter.
    for link in soup.find_all('a'):
        print(link.get('href'))

    return out
    
def find_it2(url):
    """Fetch ``url`` with ``requests`` and list the text of its links.

    Args:
        url: Address of the page to download and parse.

    Returns:
        A string starting with ``"URL Links:"`` followed by the text of
        each ``<a>`` element on its own line. If the request fails, the
        caught ``requests.RequestException`` is printed and returned
        instead.
    """
    try:
        # requests.get() does not accept a1/q2/q3 keyword arguments; passing
        # them raised TypeError before the request was ever sent.
        response = requests.get(url)
        response.raise_for_status()
    except requests.RequestException as e:
        print(e)
        return e

    # Only the ``bs4`` module is imported, so the class must be qualified.
    soup = bs4.BeautifulSoup(response.content, 'lxml')
    # The header is a prefix, not the separator between link texts.
    return 'URL Links:\n' + '\n'.join(p.text for p in soup.find_all('a'))


# Build the UI: one row of inputs, one row of buttons, one row of JSON outputs.
with gr.Blocks() as app:
    with gr.Row():
        # Page address, tag name to look for, and a numeric option.
        inp = gr.Textbox()
        q = gr.Textbox(value="p")
        num = gr.Number(value=1)
    with gr.Row():
        all_btn = gr.Button("Load")
        find_btn = gr.Button("Find")
    with gr.Row():
        rawp = gr.JSON()
        outp = gr.JSON()

    # Both buttons pass the same three inputs; each writes to its own panel.
    all_btn.click(fn=find_all, inputs=[inp, q, num], outputs=[rawp])
    find_btn.click(fn=find_it, inputs=[inp, q, num], outputs=[outp])

# Start the web server for the interface.
app.launch()