File size: 2,396 Bytes
d6afb45
 
7ee1b98
d6afb45
 
 
f586a70
56e3a34
d6afb45
b878468
d6afb45
 
 
 
 
 
 
 
 
17502e7
 
8a6be4c
 
58f070b
8a6be4c
58f070b
8278c4a
4a4057c
58f070b
17502e7
df0618d
c83c737
58f070b
 
0ae403c
43954cf
 
4a4057c
 
6005136
17502e7
d6afb45
 
f586a70
 
 
 
 
 
df0618d
07cfa54
f586a70
39c8f59
f96a4c2
43954cf
4a4057c
8eb0cc4
e832cec
f586a70
 
 
289044f
f586a70
6005136
d6afb45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56e3a34
 
f586a70
 
 
56e3a34
26f9624
792d4ad
f586a70
 
 
 
176890c
d6afb45
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import gradio as gr
import urllib.request
import requests
import bs4
import lxml

def find_all(url, q=None, num=None):
    """Fetch *url* and return a list of ``{tag_name: tag_string}`` dicts,
    one per tag in the document.

    Parameters
    ----------
    url : str
        Page to fetch (passed to ``urllib.request.urlopen``).
    q, num :
        Unused; accepted only so the signature matches the Gradio
        ``click`` handler that also drives ``find_it``.

    Returns
    -------
    list[dict]
        One dict per tag; ``tag.string`` is None for tags with mixed or
        no direct text content.
    """
    rawp = []
    source = urllib.request.urlopen(url).read()
    soup = bs4.BeautifulSoup(source, 'lxml')

    # Debug output of basic page metadata; guarded because soup.title is
    # None on pages without a <title>, which previously crashed here.
    if soup.title is not None:
        print(soup.title)
        print(soup.title.name)
        print(soup.title.string)
        print(soup.title.parent.name)

    # BUG FIX: the original looped over `[soup.find_all()]` — a one-element
    # list whose single item was the whole ResultSet — so `tag` was the
    # ResultSet, not an individual tag, and `tag.name` raised
    # AttributeError. Iterate the ResultSet directly instead.
    for tag in soup.find_all():
        rawp.append({tag.name: tag.string})

    return rawp


def find_it(url, q=None, num=None):
    """Fetch *url* and report every ``<q>`` tag found in the page.

    Parameters
    ----------
    url : str
        Page to fetch (passed to ``urllib.request.urlopen``).
    q : str | None
        Tag name to search for (e.g. ``"p"``). None matches every tag.
    num :
        Unused; accepted for signature compatibility with the Gradio
        ``click`` handler.

    Returns
    -------
    list[list[dict]]
        For each match, ``[{q: tag_string, "parent": grandparent_name}]``.
    """
    out = []
    source = urllib.request.urlopen(url).read()
    soup = bs4.BeautifulSoup(source, 'lxml')

    # BUG FIX: was soup.find_all(f'{q}'), which stringifies q — with
    # q=None it searched for a literal '<None>' tag. Pass q through.
    for match in soup.find_all(q):
        print(match.findChildren())
        # Guard the parent chain: near the document root
        # .parent.parent can be None, which previously AttributeError'd.
        grandparent = match.parent.parent
        out.append([{q: match.string,
                     "parent": grandparent.name if grandparent is not None else None}])

    # Renamed from `url`, which shadowed the function parameter.
    for link in soup.find_all('a'):
        print(link.get('href'))

    return out
    
def find_it2(url):
    """Fetch *url* with ``requests`` and return the text of every ``<a>``
    tag, one per line, under a "URL Links:" header.

    Returns the joined string on success; on any failure the exception
    object itself is returned (preserving the original best-effort
    contract — callers display whatever comes back).
    """
    try:
        # BUG FIX: the original called requests.get(url, a1=None, q2=None,
        # q3=None) — bogus kwargs that raise TypeError — and did so
        # *outside* the try, so the error escaped the handler entirely.
        response = requests.get(url)
        response.raise_for_status()
        # BUG FIX: BeautifulSoup was referenced unqualified, but only the
        # `bs4` module is imported at file level → NameError. Qualify it.
        soup = bs4.BeautifulSoup(response.content, 'lxml')
        # BUG FIX: str.join puts the separator *between* elements, so the
        # "URL Links:" header was repeated before every link but the
        # first. Prefix it once and join with plain newlines.
        out = 'URL Links:\n' + '\n'.join(p.text for p in soup.find_all('a'))
        return out
    except Exception as e:
        print(e)
        return e


# Gradio UI: a URL box plus a tag-name query, with two actions —
# "Load" dumps every tag on the page (find_all), "Find" lists only the
# tags matching `q` (find_it). Results render as JSON side by side.
# NOTE(review): find_it2 is defined above but never wired to a button.
with gr.Blocks() as app:
    with gr.Row():
        inp = gr.Textbox()          # URL to fetch
        q = gr.Textbox(value="p")   # tag name to search for (default: <p>)
        num = gr.Number(value=1)    # passed to handlers but unused by them
    with gr.Row():
        all_btn = gr.Button("Load")
        find_btn = gr.Button("Find")
    with gr.Row():
        rawp = gr.JSON()  # output of find_all
        outp = gr.JSON()  # output of find_it
    
    all_btn.click(find_all,[inp,q,num],[rawp])
    find_btn.click(find_it,[inp,q,num],[outp])
    
app.launch()