# -*- coding: utf-8 -*-
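
"""Scrape daily COVID-19 case counts from the NHC bulletin list
(http://www.nhc.gov.cn/xcs/yqtb/list_gzbd.shtml) and write one CSV row per bulletin.

Pages are fetched with pyppeteer-driven Chromium rather than plain HTTP requests,
since the site appears to redirect via JavaScript before serving the content
(hence the waitForNavigation call below).
"""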

import sys
# Make the user-local site-packages directory importable (where the dependencies live).
sys.path.append('/Users/machi/Library/Python/3.8/lib/python/site-packages')

import os
import asyncio
# from pyppeteer import launcher
# # Remove --enable-automation from the default launch args before importing launch,
# # so pages cannot detect the browser as webdriver-controlled.
# launcher.AUTOMATION_ARGS.remove("--enable-automation")

from pyppeteer import launch
from bs4 import BeautifulSoup
import re
import time

async def pyppeteer_fetchUrl(url):
    """Open url in a pyppeteer-driven Chromium and return the rendered HTML."""
    browser = await launch({'headless': False, 'dumpio': True, 'autoClose': True})
    page = await browser.newPage()

    # await page.setDefaultNavigationTimeout(60000)
    await page.goto(url)
    # The site redirects once via JavaScript; wait for that navigation to settle.
    # (On newer Python versions asyncio.wait requires Tasks rather than bare coroutines.)
    await asyncio.wait([page.waitForNavigation()])
    html = await page.content()
    await browser.close()
    return html

def fetchUrl(url):
    """Synchronous wrapper: run the async fetch on the current event loop."""
    return asyncio.get_event_loop().run_until_complete(pyppeteer_fetchUrl(url))

def getPageUrl():
    """Yield the URLs of the bulletin list pages (pages 1 through 4)."""
    for page in range(1, 5):
        if page == 1:
            yield 'http://www.nhc.gov.cn/xcs/yqtb/list_gzbd.shtml'
        else:
            # Later list pages carry the page number in the file name.
            yield 'http://www.nhc.gov.cn/xcs/yqtb/list_gzbd_' + str(page) + '.shtml'

def getTitleUrl(html):
    """Yield (title, link, date) for every bulletin listed on a list page."""
    bsobj = BeautifulSoup(html, 'html.parser')
    titleList = bsobj.find('div', attrs={"class": "list"}).ul.find_all("li")
    for item in titleList:
        link = "http://www.nhc.gov.cn" + item.a["href"]
        title = item.a["title"]
        date = item.span.text
        yield title, link, date

def getInfo(pat, s):
    """Return the first captured group of pat in s, or '0' if there is no match."""
    res = re.search(pat, s)
    if res:
        return res.group(1)
    return '0'
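
# Illustrative examples (assumed bulletin wording, matching the patterns used below):
#   getInfo(r'新增确诊病例(\d+)例', '……新增确诊病例5659例……')  # -> '5659'
#   getInfo(r'新增死亡病例(\d+)例', '无新增死亡病例。')          # -> '0' (no digits matched)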

def getContent(html):
    """Extract the daily case counts from a bulletin page.

    Returns a list of strings in the order: new confirmed, new local confirmed,
    new deaths, new recoveries, new asymptomatic, new local asymptomatic.
    """
    bsobj = BeautifulSoup(html, 'html.parser')
    cnt = bsobj.find('div', attrs={"id": "xw_box"}).find_all("p")
    res = []

    if cnt:
        # Parse the first paragraph: new confirmed, local confirmed, and death counts.
        s = cnt[0].text
        res.append(getInfo(r'新增确诊病例(\d+)例', s))
        res.append(getInfo(r'本土病例(\d+)例', s))
        res.append(getInfo(r'新增死亡病例(\d+)例', s))

        # Parse the second paragraph: newly recovered and discharged cases.
        s = cnt[1].text
        res.append(getInfo(r'新增治愈出院病例(\d+)例', s))

        # Parse the fifth paragraph: new asymptomatic infections (total and local).
        s = cnt[4].text
        res.append(getInfo(r'新增无症状感染者(\d+)例', s))
        res.append(getInfo(r'本土(\d+)例', s))

    return res

def saveFile(path, filename, content):
    """Save content to <path><filename>.txt, creating the directory if needed."""
    if not os.path.exists(path):
        os.makedirs(path)

    # Write as UTF-8 text.
    with open(path + filename + ".txt", 'w', encoding='utf-8') as f:
        f.write(content)

if "__main__" == __name__:
    # print(getInfo(r'新增死亡病例(\d+)例', '无新增死亡病例。'))
    # s = '4月28日0—24时,31个省(自治区、直辖市)和新疆生产建设兵团报告新增确诊病例5659例。其中境外输入病例13例(广东3例,北京2例,上海2例,福建2例,黑龙江1例,浙江1例,广西1例,四川1例),含2例由无症状感染者转为确诊病例(浙江1例,福建1例);本土病例5646例(上海5487例,北京47例,吉林42例,浙江31例,山东7例,广东7例,黑龙江4例,江西4例,内蒙古3例,江苏3例,四川3例,河南2例,辽宁1例,福建1例,湖南1例,广西1例,重庆1例,云南1例),含5125例由无症状感染者转为确诊病例(上海5062例,吉林31例,浙江28例,辽宁1例,山东1例,河南1例,云南1例)。新增死亡病例52例,均为本土病例,在上海;无新增疑似病例。'
    # res = re.search( r'新增确诊病例(\d+)例', s)
    # print(res.group(1))
    #
    # res = re.search( r'本土病例.*),', s)
    # print(res.group())
    #
    # res = re.search( r'新增死亡病例\d+例', s)
    # print(res.group())
    #
    # res = re.search( r'新增治愈出院病例\d+例', s)
    # print(res.group())
    #
    # Output CSV; header columns: date, new confirmed, new local confirmed,
    # new deaths, new recoveries, new asymptomatic, new local asymptomatic.
    with open('/Users/machi/Desktop/covid.csv', 'w') as f:
        header = ','.join(['日期', '新增确诊病例', '本土新增确诊病例', '新增死亡病例', '新增治愈出院病例', '新增无症状感染者', '本土新增无症状感染者'])
        f.write(header + '\n')

        for url in getPageUrl():
            print(url)
            try:
                s = fetchUrl(url)
            except Exception:
                # Skip list pages that fail to load.
                continue

            for title, link, date in getTitleUrl(s):
                print(title, link)
                # time.sleep(5)
                try:
                    html = fetchUrl(link)
                    content = getContent(html)

                    row = ','.join([date] + content)
                    f.write(row + '\n')
                    print('%s written' % date)
                except Exception as e:
                    print('%s processing failed' % date, e)
                    continue

            # break