# Spaces: Running
import gradio as gr | |
import requests | |
import urllib.parse | |
import xml.etree.ElementTree as ET | |
from typing import List, Dict | |
# Module-level cache of the OPS bearer token. `update_tk()` overwrites it with a
# freshly issued token, and `search_from_abstract()` reads it to build the
# Authorization header.
# NOTE(review): the initial value is a hard-coded token literal; presumably it
# is stale and only serves as a placeholder until the first refresh — confirm.
athtk = "0dWGjl7XuVq54v012KjGLEhRSLjj"
def update_tk():
    """Request a new OPS access token and store it in the global ``athtk``.

    Posts a ``client_credentials`` grant to the OPS token endpoint and replaces
    the module-level ``athtk`` with the ``access_token`` field of the JSON
    reply.

    Raises:
        requests.HTTPError: If the token endpoint answers with a 4xx/5xx status.
        requests.Timeout: If the endpoint does not answer within the timeout.
        KeyError: If the JSON reply carries no ``access_token`` field.
    """
    global athtk

    url = "https://ops.epo.org/3.2/auth/accesstoken"
    payload = 'grant_type=client_credentials'
    headers = {
        # NOTE(review): the client credentials are hard-coded in source.
        # Consider loading them from a secret / environment variable instead.
        'authorization': 'Basic RU45NzZ3RE5UM09lZ3ZCUURwMHE0NlJXN0xwZE5CNjNFZTNxRGJ6UnJwNGFWQVBmOnhZR05uQzk1N0dKV0lvcnM2ZWV4TmVrcUFybUVtdzU1ZkxjbUZlcDdDR0w5ZzZCdGsyY0hMeVBHZTkwQTBXOEE=',
        'Content-Type': 'application/x-www-form-urlencoded',
    }

    # A timeout keeps a stalled token endpoint from hanging the caller forever.
    response = requests.post(url, headers=headers, data=payload, timeout=30)
    # Fail with the real HTTP error instead of a confusing KeyError/JSON error
    # when the credentials are rejected or the service is unavailable.
    response.raise_for_status()
    athtk = response.json()['access_token']
def extract_patent_abstracts(xml_content: str) -> List[Dict[str, str]]:
    """Extract English abstracts and basic identifiers from patent XML.

    Every ``exchange-document`` element (in the ``http://www.epo.org/exchange``
    namespace) that has an ``abstract`` with ``lang="en"`` yields one entry.
    Documents without an English abstract are skipped.

    Args:
        xml_content (str): XML content as a string.

    Returns:
        List[Dict[str, str]]: One dictionary per document with the keys
        ``country``, ``doc_number``, ``date`` and ``abstract``. ``country``,
        ``doc_number`` and ``date`` fall back to ``'N/A'`` when missing.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_content`` is not
            well-formed XML.
    """
    namespaces = {'default': 'http://www.epo.org/exchange'}
    # Publication date taken from the 'docdb' document-id of the
    # publication reference (other document-id types are ignored).
    date_path = (
        './default:bibliographic-data/default:publication-reference'
        '/default:document-id[@document-id-type="docdb"]/default:date'
    )

    root = ET.fromstring(xml_content)
    extracted_patents = []

    for doc in root.findall('.//default:exchange-document', namespaces):
        # Skip documents that have no English abstract at all.
        if (en_abstract := doc.find('.//default:abstract[@lang="en"]', namespaces)) is None:
            continue

        # itertext() keeps text that follows inline child markup inside a <p>,
        # which plain `.text` would silently drop.
        paragraphs = (
            ''.join(p.itertext()).strip()
            for p in en_abstract.findall('default:p', namespaces)
        )
        # Whitespace-only paragraphs are dropped so they do not add stray spaces.
        abstract_text = ' '.join(text for text in paragraphs if text)

        # findtext() returns '' for an empty <date/>; treat that like a missing
        # date so the value is always a string.
        date = doc.findtext(date_path, default='', namespaces=namespaces) or 'N/A'

        extracted_patents.append({
            'country': doc.get('country', 'N/A'),
            'doc_number': doc.get('doc-number', 'N/A'),
            'date': date,
            'abstract': abstract_text,
        })

    return extracted_patents
def search_from_abstract(query, retried=False):
    """Run a published-data search on EPO OPS and return the English abstracts.

    The query is URL-encoded and sent to the ``search/abstract`` endpoint for
    results 1-100, authenticated with the module-level ``athtk`` bearer token.
    On an HTTP 400 the token is refreshed once via ``update_tk()`` and the
    search is repeated (presumably this is how the service signals an invalid
    or expired token — confirm against the OPS documentation).

    Args:
        query: Search expression passed as the ``q`` parameter.
        retried: True when this call is the single retry made after a token
            refresh; prevents endless recursion.

    Returns:
        The list produced by ``extract_patent_abstracts`` for the response body.

    Raises:
        requests.HTTPError: For any response other than 200 (after the optional
            single retry).
        requests.Timeout: If the service does not answer within the timeout.
    """
    base_url = "https://ops.epo.org/3.2/rest-services/published-data/search"
    endpoint = "abstract"
    start_range = 1
    end_range = 100
    headers = {
        'accept': 'application/xml',
        'Authorization': f'Bearer {athtk}',
    }
    # The headers are intentionally not printed: they contain the bearer token.
    url = f"{base_url}/{endpoint}?q={urllib.parse.quote_plus(query)}&Range={start_range}-{end_range}"
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code == 400 and not retried:
        update_tk()
        return search_from_abstract(query, retried=True)
    if response.status_code == 200:
        return extract_patent_abstracts(response.text)

    print(f"Error: {response.status_code} - {response.text}")
    response.raise_for_status()
    # raise_for_status() only raises for 4xx/5xx; make every other non-200
    # status an explicit error instead of silently returning None.
    raise requests.HTTPError(
        f"Unexpected status code: {response.status_code}", response=response
    )
# Gradio front end: one text box in, one text box out, backed by
# search_from_abstract. The interface is launched as soon as the module runs.
app = gr.Interface(
    search_from_abstract,
    "text",
    "text",
    title="Patent Abstract Search",
    description="Search patents by abstract using the European Patent Office API.",
)
app.launch(share=True, mcp_server=True)