# -*- coding: utf-8 -*-
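
"""Scrape daily COVID-19 case counts from the NHC bulletin list
(http://www.nhc.gov.cn/xcs/yqtb/list_gzbd.shtml) and write one CSV row per bulletin.

Pages are fetched with pyppeteer-driven Chromium rather than plain HTTP requests,
since the site appears to redirect via JavaScript before serving the content
(hence the waitForNavigation call below).
"""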

import sys
# Make the user-local site-packages directory importable (where the dependencies live).
sys.path.append('/Users/machi/Library/Python/3.8/lib/python/site-packages')

import os
import asyncio
# from pyppeteer import launcher
# # Remove --enable-automation from the default launch args before importing launch,
# # so pages cannot detect the browser as webdriver-controlled.
# launcher.AUTOMATION_ARGS.remove("--enable-automation")

from pyppeteer import launch
from bs4 import BeautifulSoup
import re
import time

async def pyppeteer_fetchUrl(url):
    """Open url in a pyppeteer-driven Chromium and return the rendered HTML."""
    browser = await launch({'headless': False, 'dumpio': True, 'autoClose': True})
    page = await browser.newPage()

    # await page.setDefaultNavigationTimeout(60000)
    await page.goto(url)
    # The site redirects once via JavaScript; wait for that navigation to settle.
    # (On newer Python versions asyncio.wait requires Tasks rather than bare coroutines.)
    await asyncio.wait([page.waitForNavigation()])
    html = await page.content()
    await browser.close()
    return html

def fetchUrl(url):
    """Synchronous wrapper: run the async fetch on the current event loop."""
    return asyncio.get_event_loop().run_until_complete(pyppeteer_fetchUrl(url))

def getPageUrl():
    """Yield the URLs of the bulletin list pages (pages 1 through 4)."""
    for page in range(1, 5):
        if page == 1:
            yield 'http://www.nhc.gov.cn/xcs/yqtb/list_gzbd.shtml'
        else:
            # Later list pages carry the page number in the file name.
            yield 'http://www.nhc.gov.cn/xcs/yqtb/list_gzbd_' + str(page) + '.shtml'

def getTitleUrl(html):
    """Yield (title, link, date) for every bulletin listed on a list page."""
    bsobj = BeautifulSoup(html, 'html.parser')
    titleList = bsobj.find('div', attrs={"class": "list"}).ul.find_all("li")
    for item in titleList:
        link = "http://www.nhc.gov.cn" + item.a["href"]
        title = item.a["title"]
        date = item.span.text
        yield title, link, date

def getInfo(pat, s):
    """Return the first captured group of pat in s, or '0' if there is no match."""
    res = re.search(pat, s)
    if res:
        return res.group(1)
    return '0'
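
# Illustrative examples (assumed bulletin wording, matching the patterns used below):
#   getInfo(r'新增确诊病例(\d+)例', '……新增确诊病例5659例……')  # -> '5659'
#   getInfo(r'新增死亡病例(\d+)例', '无新增死亡病例。')          # -> '0' (no digits matched)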

def getContent(html):
    """Extract the daily case counts from a bulletin page.

    Returns a list of strings in the order: new confirmed, new local confirmed,
    new deaths, new recoveries, new asymptomatic, new local asymptomatic.
    """
    bsobj = BeautifulSoup(html, 'html.parser')
    cnt = bsobj.find('div', attrs={"id": "xw_box"}).find_all("p")
    res = []

    if cnt:
        # Parse the first paragraph: new confirmed, local confirmed, and death counts.
        s = cnt[0].text
        res.append(getInfo(r'新增确诊病例(\d+)例', s))
        res.append(getInfo(r'本土病例(\d+)例', s))
        res.append(getInfo(r'新增死亡病例(\d+)例', s))

        # Parse the second paragraph: newly recovered and discharged cases.
        s = cnt[1].text
        res.append(getInfo(r'新增治愈出院病例(\d+)例', s))

        # Parse the fifth paragraph: new asymptomatic infections (total and local).
        s = cnt[4].text
        res.append(getInfo(r'新增无症状感染者(\d+)例', s))
        res.append(getInfo(r'本土(\d+)例', s))

    return res

def saveFile(path, filename, content):
    """Save content to <path><filename>.txt, creating the directory if needed."""
    if not os.path.exists(path):
        os.makedirs(path)

    # Write as UTF-8 text.
    with open(path + filename + ".txt", 'w', encoding='utf-8') as f:
        f.write(content)

if "__main__" == __name__:
    # print(getInfo(r'新增死亡病例(\d+)例', '无新增死亡病例。'))
    # s = '4月28日0—24时,31个省(自治区、直辖市)和新疆生产建设兵团报告新增确诊病例5659例。其中境外输入病例13例(广东3例,北京2例,上海2例,福建2例,黑龙江1例,浙江1例,广西1例,四川1例),含2例由无症状感染者转为确诊病例(浙江1例,福建1例);本土病例5646例(上海5487例,北京47例,吉林42例,浙江31例,山东7例,广东7例,黑龙江4例,江西4例,内蒙古3例,江苏3例,四川3例,河南2例,辽宁1例,福建1例,湖南1例,广西1例,重庆1例,云南1例),含5125例由无症状感染者转为确诊病例(上海5062例,吉林31例,浙江28例,辽宁1例,山东1例,河南1例,云南1例)。新增死亡病例52例,均为本土病例,在上海;无新增疑似病例。'
    # res = re.search( r'新增确诊病例(\d+)例', s)
    # print(res.group(1))
    #
    # res = re.search( r'本土病例.*),', s)
    # print(res.group())
    #
    # res = re.search( r'新增死亡病例\d+例', s)
    # print(res.group())
    #
    # res = re.search( r'新增治愈出院病例\d+例', s)
    # print(res.group())
    #
    # Output CSV; header columns: date, new confirmed, new local confirmed,
    # new deaths, new recoveries, new asymptomatic, new local asymptomatic.
    with open('/Users/machi/Desktop/covid.csv', 'w') as f:
        header = ','.join(['日期', '新增确诊病例', '本土新增确诊病例', '新增死亡病例', '新增治愈出院病例', '新增无症状感染者', '本土新增无症状感染者'])
        f.write(header + '\n')

        for url in getPageUrl():
            print(url)
            try:
                s = fetchUrl(url)
            except Exception:
                # Skip list pages that fail to load.
                continue

            for title, link, date in getTitleUrl(s):
                print(title, link)
                # time.sleep(5)
                try:
                    html = fetchUrl(link)
                    content = getContent(html)

                    row = ','.join([date] + content)
                    f.write(row + '\n')
                    print('%s written' % date)
                except Exception as e:
                    print('%s processing failed' % date, e)
                    continue

            # break