|
|
|
|
|
# Scrape the daily COVID-19 bulletins published by the NHC (National Health
# Commission of China) and save the extracted counts to a CSV file.

import sys

# Make locally installed packages (pyppeteer, bs4) importable; adjust this path to your environment.
sys.path.append('/Users/machi/Library/Python/3.8/lib/python/site-packages')

import os
import asyncio
import re
import time

from pyppeteer import launch
from bs4 import BeautifulSoup
|
|
|
async def pyppeteer_fetchUrl(url):
    """Render a page in a pyppeteer-driven Chromium instance and return its HTML."""
    # headless is disabled so the browser window stays visible; set it to True to run silently.
    browser = await launch({'headless': False, 'dumpio': True, 'autoClose': True})
    page = await browser.newPage()
    await page.goto(url)
    # Wait for the follow-up navigation the site performs after the initial load
    # before reading the final markup.
    await asyncio.wait([page.waitForNavigation()])
    html = await page.content()
    await browser.close()
    return html
|
|
|
def fetchUrl(url):
    """Synchronous wrapper around pyppeteer_fetchUrl."""
    return asyncio.get_event_loop().run_until_complete(pyppeteer_fetchUrl(url))
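
# Example use of the synchronous wrapper (the URL below is the first list page
# produced by getPageUrl; shown for illustration only, not executed on import):
#
#     html = fetchUrl('http://www.nhc.gov.cn/xcs/yqtb/list_gzbd.shtml')
#     # html now holds the fully rendered markup of that list page as a string.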
|
|
|
def getPageUrl():
    """Yield the URLs of the first four bulletin list pages on the NHC site."""
    for page in range(1, 5):
        if page == 1:
            # The first list page has no page number in its file name.
            yield 'http://www.nhc.gov.cn/xcs/yqtb/list_gzbd.shtml'
        else:
            yield 'http://www.nhc.gov.cn/xcs/yqtb/list_gzbd_' + str(page) + '.shtml'
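
# For reference, the generator above yields these four list-page URLs:
#   http://www.nhc.gov.cn/xcs/yqtb/list_gzbd.shtml
#   http://www.nhc.gov.cn/xcs/yqtb/list_gzbd_2.shtml
#   http://www.nhc.gov.cn/xcs/yqtb/list_gzbd_3.shtml
#   http://www.nhc.gov.cn/xcs/yqtb/list_gzbd_4.shtml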
|
|
|
def getTitleUrl(html):
    """Parse a list page and yield (title, link, date) for each bulletin entry."""
    bsobj = BeautifulSoup(html, 'html.parser')
    titleList = bsobj.find('div', attrs={"class": "list"}).ul.find_all("li")
    for item in titleList:
        # Entry hrefs are site-relative, so prepend the site root.
        link = "http://www.nhc.gov.cn" + item.a["href"]
        title = item.a["title"]
        date = item.span.text
        yield title, link, date
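
# Each yielded tuple has the shape (bulletin title, absolute link, publication date);
# the date string comes straight from the list page and is reused as the CSV 日期 column below.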
|
|
|
def getInfo(pat, s):
    """Return the first captured group of pat in s, or '0' when there is no match."""
    res = re.search(pat, s)
    if res:
        return res.group(1)
    return '0'
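
# Worked example: getInfo(r'新增确诊病例(\d+)例', '新增确诊病例1280例') returns '1280';
# if the pattern does not occur in the sentence, the placeholder '0' is returned instead.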
|
|
|
def getContent(html):
    """Extract the daily counts from one bulletin page.

    When the expected paragraphs are present, returns six strings in the order
    used by the CSV header: new confirmed cases, new local confirmed cases,
    new deaths, newly cured and discharged cases, new asymptomatic infections,
    and new local asymptomatic infections.
    """
    bsobj = BeautifulSoup(html, 'html.parser')
    cnt = bsobj.find('div', attrs={"id": "xw_box"}).find_all("p")
    res = []

    if cnt:
        # First paragraph: new confirmed cases, local cases and deaths.
        s = cnt[0].text
        res.append(getInfo(r'新增确诊病例(\d+)例', s))
        res.append(getInfo(r'本土病例(\d+)例', s))
        res.append(getInfo(r'新增死亡病例(\d+)例', s))

        # Second paragraph: newly cured and discharged cases.
        s = cnt[1].text
        res.append(getInfo(r'新增治愈出院病例(\d+)例', s))

        # Fifth paragraph: new asymptomatic infections (total and local).
        s = cnt[4].text
        res.append(getInfo(r'新增无症状感染者(\d+)例', s))
        res.append(getInfo(r'本土(\d+)例', s))

    return res
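
# A successful parse produces six count strings, e.g. (hypothetical figures for illustration):
#   ['3200', '2086', '0', '1451', '19657', '18875']
# The main block below prefixes the bulletin date and writes the result as one CSV row.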
|
|
|
def saveFile(path, filename, content):
    """Write content to <path><filename>.txt, creating the directory if needed."""
    if not os.path.exists(path):
        os.makedirs(path)

    # path is concatenated directly, so it is expected to end with a path separator.
    with open(path + filename + ".txt", 'w', encoding='utf-8') as f:
        f.write(content)
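
# Note: the main block below writes parsed counts to a CSV and does not call saveFile;
# the helper can be used to dump raw bulletin text instead, e.g. (hypothetical path):
#     saveFile('/Users/machi/Desktop/bulletins/', '2022-04-02', html)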
|
|
|
if __name__ == "__main__":

    # Collect one row per bulletin into a single CSV file.
    with open('/Users/machi/Desktop/covid.csv', 'w') as f:
        header = ','.join(['日期', '新增确诊病例', '本土新增确诊病例', '新增死亡病例', '新增治愈出院病例', '新增无症状感染者', '本土新增无症状感染者'])
        f.write(header + '\n')
|
|
|
        for url in getPageUrl():
            print(url)
            try:
                s = fetchUrl(url)
            except Exception:
                # Skip list pages that fail to load or render.
                continue

            for title, link, date in getTitleUrl(s):
                print(title, link)

                try:
                    html = fetchUrl(link)
                    content = getContent(html)

                    # One CSV row: the bulletin date followed by the six extracted counts.
                    row = ','.join([date] + content)
                    f.write(row + '\n')
                    print('%s write finish' % date)
                except Exception as e:
                    print('%s process failed' % date, e)
                    continue
|
|
|
|
|
|