Spaces:
Runtime error
Runtime error
| import requests | |
| import random | |
| from functools import lru_cache | |
class GROBID_OFFLINE_EXCEPTION(Exception):
    """Signals that the GROBID service cannot be used."""
def get_avail_grobid_url(timeout=30):
    """Return the URL of a configured GROBID server that reports itself alive.

    One URL is picked at random from the ``GROBID_URLS`` configuration entry
    and probed through its ``/api/isalive`` endpoint. Only that single server
    is probed; no other configured server is tried if it fails.

    Args:
        timeout: Seconds to wait for the liveness request before giving up.
            Passed straight to ``requests.get``.

    Returns:
        The chosen URL without trailing slashes when the server answers with
        the exact text ``true``; ``None`` when no URL is configured, the
        server answers anything else, or the probe fails for any reason.
    """
    from toolbox import get_conf

    # get_conf is unpacked as a one-element sequence holding the URL list.
    GROBID_URLS, = get_conf('GROBID_URLS')
    if not GROBID_URLS:
        return None
    try:
        # Random choice spreads the load across the configured servers.
        _grobid_url = random.choice(GROBID_URLS).rstrip('/')
        # Without a timeout an unresponsive server would block the caller forever.
        res = requests.get(_grobid_url + '/api/isalive', timeout=timeout)
        return _grobid_url if res.text == 'true' else None
    except Exception:
        # Best-effort probe: any failure means "no server available", while
        # KeyboardInterrupt / SystemExit are still allowed to propagate.
        return None
def parse_pdf(pdf_path, grobid_url):
    """Parse a PDF with scipdf against the given GROBID server.

    Args:
        pdf_path: Location of the PDF, forwarded unchanged to
            ``scipdf.parse_pdf_to_dict``.
        grobid_url: Base URL of the GROBID server; trailing slashes are
            removed before use.

    Returns:
        Whatever ``scipdf.parse_pdf_to_dict`` returns (presumably a dict
        describing the article -- confirm against the scipdf documentation).

    Raises:
        GROBID_OFFLINE_EXCEPTION: If the parser raised that exception; it is
            re-raised with a message pointing at the GROBID_URL configuration.
        RuntimeError: If parsing failed for any other reason. The original
            error is attached as ``__cause__``.
    """
    import scipdf  # pip install scipdf_parser

    grobid_url = grobid_url.rstrip('/')
    try:
        article_dict = scipdf.parse_pdf_to_dict(pdf_path, grobid_url=grobid_url)
    except GROBID_OFFLINE_EXCEPTION as err:
        raise GROBID_OFFLINE_EXCEPTION("GROBID服务不可用,请修改config中的GROBID_URL,可修改成本地GROBID服务。") from err
    except Exception as err:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit are not disguised as a parsing failure.
        raise RuntimeError("解析PDF失败,请检查PDF是否损坏。") from err
    return article_dict