File size: 2,787 Bytes
d6afb45
 
7ee1b98
d6afb45
 
 
f586a70
56e3a34
d6afb45
b878468
d6afb45
 
 
 
 
 
 
 
 
17502e7
 
8a6be4c
d43b917
 
8a6be4c
4d3da03
e9a6060
58f070b
d43b917
 
 
4a4057c
58f070b
17502e7
64dcbf4
c83c737
58f070b
 
e9a6060
43954cf
 
4a4057c
2b7c910
6005136
17502e7
d6afb45
 
f586a70
 
 
 
 
 
df0618d
07cfa54
f586a70
c69bac0
817d95e
 
c69bac0
 
 
43954cf
817d95e
8eb0cc4
e832cec
f586a70
 
 
289044f
f586a70
6005136
d6afb45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56e3a34
 
f586a70
 
 
56e3a34
26f9624
792d4ad
f586a70
 
 
 
176890c
d6afb45
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import gradio as gr
import urllib.request
import requests
import bs4
import lxml

def find_all(url,q=None,num=None):
    """Download *url* and list every tag on the page with its string.

    Args:
        url: Address of the page to fetch; handed unchanged to
            ``urllib.request.urlopen``.
        q: Unused. Accepted so the Gradio click handler can pass the same
            three inputs it passes to ``find_it``.
        num: Unused; accepted for the same reason as ``q``.

    Returns:
        A list containing one single-entry dict per tag, in document order,
        mapping the tag's name to its ``.string`` value (``None`` when
        BeautifulSoup cannot reduce the tag to a single string).

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
    """
    # NOTE(review): url comes straight from a UI textbox and urlopen also
    # accepts non-HTTP schemes such as file:// -- consider restricting the
    # allowed schemes before exposing this publicly.
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url) as response:
        source = response.read()
    soup = bs4.BeautifulSoup(source,'lxml')

    # Diagnostic output about the page title. soup.title is None for pages
    # without a <title>, so the attribute lookups are guarded.
    title = soup.title
    print(title)
    if title is not None:
        print(title.name)
        print(title.string)
        print(title.parent.name)

    # One pass over the document instead of three separate find_all() walks.
    return [{tag.name: tag.string} for tag in soup.find_all()]


def find_it(url,q=None,num=None):
    """Download *url* and describe every tag whose name matches *q*.

    Args:
        url: Address of the page to fetch; handed unchanged to
            ``urllib.request.urlopen``.
        q: Tag name to search for. It is formatted into a string before the
            lookup, so the default ``None`` searches for a tag literally
            named ``"None"`` and normally matches nothing.
        num: Unused; accepted so the Gradio click handler can pass the same
            three inputs to every handler.

    Returns:
        A list with one single-element list per matching tag. Each inner
        list holds a dict with the keys:

        * ``q`` -- the tag's ``.string`` value,
        * ``"parent"`` -- the name of the tag's *grandparent*
          (``p.parent.parent``), or ``None`` when there is none,
        * ``"first-child"`` -- the result of selecting
          ``<tag name>:first-child`` on the whole document,
        * ``"content"`` -- the tag itself.

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
    """
    # NOTE(review): url comes straight from a UI textbox and urlopen also
    # accepts non-HTTP schemes such as file:// -- consider restricting the
    # allowed schemes before exposing this publicly.
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url) as response:
        source = response.read()
    soup = bs4.BeautifulSoup(source,'lxml')

    out = []
    for p in soup.find_all(f'{q}'):
        # Evaluate the selector once per tag; select() returns a list-like
        # ResultSet, which has no .name attribute of its own.
        first_children = soup.select(f'{p.name}:first-child')

        # A top-level element (e.g. <html>) has the document as its parent
        # and no grandparent, so guard before reading .name.
        grandparent = p.parent.parent if p.parent is not None else None
        grandparent_name = grandparent.name if grandparent is not None else None

        out.append([{
            q: p.string,
            "parent": grandparent_name,
            "first-child": first_children,
            "content": p,
        }])

    # Diagnostic output: every link target found on the page.
    for link in soup.find_all('a'):
        print(link.get('href'))

    return out
    
def find_it2(url, timeout=10):
    """Fetch *url* with ``requests`` and return the text of every link.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled host cannot block the caller indefinitely.

    Returns:
        On success, a string that starts with ``"URL Links:\\n"`` followed by
        the text of each ``<a>`` element, one per line. On any failure the
        caught exception object is printed and returned instead of raised.
    """
    try:
        # requests.get() only accepts the keyword arguments understood by
        # requests.request(); unknown ones raise TypeError.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Only the bs4 module is imported in this file, so the class must be
        # referenced through it.
        soup = bs4.BeautifulSoup(response.content, 'lxml')
        # The header is a prefix, not the separator between link texts.
        out = 'URL Links:\n' + '\n'.join([p.text for p in soup.find_all('a')])
        return out
    except Exception as e:
        print (e)
        return e


# Gradio front end: three inputs feed two buttons, and each button shows its
# handler's return value in its own JSON panel.
with gr.Blocks() as app:
    # Row 1: the handlers' arguments (url, q, num), in that order.
    with gr.Row():
        inp = gr.Textbox()
        q = gr.Textbox(value="p")
        num = gr.Number(value=1)
    # Row 2: one trigger per handler.
    with gr.Row():
        all_btn = gr.Button("Load")
        find_btn = gr.Button("Find")
    # Row 3: result panels.
    with gr.Row():
        rawp = gr.JSON()
        outp = gr.JSON()

    # "Load" lists every tag on the page; "Find" describes the tags named q.
    all_btn.click(
        fn=find_all,
        inputs=[inp, q, num],
        outputs=[rawp],
    )
    find_btn.click(
        fn=find_it,
        inputs=[inp, q, num],
        outputs=[outp],
    )

app.launch()