Spaces:

bbfizp
/

patent-mcp

Running

App Files Files Community

patent-mcp / app.py

bbfizp

Update app.py

8d3d168 verified about 1 month ago

raw

history blame contribute delete

3.57 kB

	import gradio as gr
	import requests
	import urllib.parse
	import xml.etree.ElementTree as ET
	from typing import List, Dict


	# Bearer token that search_from_abstract() puts in its Authorization header;
	# update_tk() rebinds this name to a freshly issued token.
	# NOTE(review): this literal is committed to source and is presumably stale or
	# short-lived — confirm, and consider loading it from configuration instead.
	athtk = "0dWGjl7XuVq54v012KjGLEhRSLjj"

	def update_tk():
	    """Request a new access token from EPO OPS and store it in ``athtk``.

	    Posts a ``client_credentials`` grant to the OPS token endpoint and rebinds
	    the module-level ``athtk`` to the ``access_token`` field of the JSON reply.

	    The Basic credential can be supplied through the ``EPO_OPS_BASIC_AUTH``
	    environment variable (full header value, i.e. ``"Basic <base64>"``); when
	    the variable is unset the credential embedded below is used.

	    Raises:
	        requests.HTTPError: If the token endpoint answers with a 4xx/5xx status.
	        requests.RequestException: On connection failure or timeout.
	        KeyError: If the JSON reply has no ``access_token`` field.
	    """
	    import os  # function-scope import keeps this block self-contained

	    global athtk

	    url = "https://ops.epo.org/3.2/auth/accesstoken"

	    # SECURITY: a client credential is embedded in source as the fallback.
	    # It is kept only so existing deployments keep working; it should be
	    # rotated and provided exclusively through the environment.
	    basic_auth = os.environ.get(
	        "EPO_OPS_BASIC_AUTH",
	        'Basic RU45NzZ3RE5UM09lZ3ZCUURwMHE0NlJXN0xwZE5CNjNFZTNxRGJ6UnJwNGFWQVBmOnhZR05uQzk1N0dKV0lvcnM2ZWV4TmVrcUFybUVtdzU1ZkxjbUZlcDdDR0w5ZzZCdGsyY0hMeVBHZTkwQTBXOEE=',
	    )

	    payload = 'grant_type=client_credentials'
	    headers = {
	        'authorization': basic_auth,
	        'Content-Type': 'application/x-www-form-urlencoded',
	    }

	    # requests applies no timeout by default; without one a stalled
	    # connection would block the calling search indefinitely.
	    response = requests.post(url, headers=headers, data=payload, timeout=30)

	    # Fail with the real HTTP error instead of a KeyError on an error body.
	    response.raise_for_status()
	    athtk = response.json()['access_token']

	def extract_patent_abstracts(xml_content: str) -> List[Dict[str, str]]:
	    """
	    Collect the English abstract and identifying data of each patent in an XML reply.

	    Every ``exchange-document`` element (EPO exchange namespace) below the root
	    that has an ``abstract`` with ``lang="en"`` yields one record; documents
	    without an English abstract are left out.

	    Args:
	        xml_content (str): XML document as a string.

	    Returns:
	        List[Dict[str, str]]: One dictionary per kept document with the keys
	        ``country``, ``doc_number``, ``date`` and ``abstract``. Missing
	        attributes and a missing docdb publication date are reported as 'N/A'.
	    """
	    ns = {'default': 'http://www.epo.org/exchange'}

	    # Publication date as given by the document-id of type "docdb".
	    date_path = (
	        './default:bibliographic-data'
	        '/default:publication-reference'
	        '/default:document-id[@document-id-type="docdb"]'
	        '/default:date'
	    )

	    records = []
	    tree_root = ET.fromstring(xml_content)
	    for document in tree_root.findall('.//default:exchange-document', ns):
	        abstract_node = document.find('.//default:abstract[@lang="en"]', ns)
	        if abstract_node is not None:
	            # Only the leading text of each direct <p> child is used;
	            # paragraphs without any text are ignored.
	            paragraphs = []
	            for para in abstract_node.findall('default:p', ns):
	                if para.text:
	                    paragraphs.append(para.text.strip())

	            date_node = document.find(date_path, ns)
	            if date_node is None:
	                published = 'N/A'
	            else:
	                published = date_node.text

	            records.append({
	                'country': document.get('country', 'N/A'),
	                'doc_number': document.get('doc-number', 'N/A'),
	                'date': published,
	                'abstract': ' '.join(paragraphs),
	            })

	    return records



	def search_from_abstract(query: str, retried: bool = False):
	    """
	    Search EPO OPS published data by abstract and return the English abstracts found.

	    Sends ``query`` to the OPS ``published-data/search/abstract`` endpoint,
	    requesting results 1-100, authenticated with the module-level ``athtk``
	    bearer token. If the request is rejected with 400 or 401, the token is
	    refreshed once via update_tk() and the search is repeated.

	    Args:
	        query (str): Search expression; it is URL-encoded and sent as ``q``.
	        retried (bool): True when this call is the single retry made after a
	            token refresh; prevents an endless refresh loop.

	    Returns:
	        The list of dictionaries built by extract_patent_abstracts() from a
	        200 response. None if the final status is neither 200 nor an error.

	    Raises:
	        requests.HTTPError: If the final response has a 4xx/5xx status.
	        requests.RequestException: On connection failure or timeout.
	    """
	    base_url = "https://ops.epo.org/3.2/rest-services/published-data/search"
	    endpoint = "abstract"
	    start_range = 1
	    end_range = 100

	    # The headers are intentionally not printed: they contain the bearer
	    # token, which must not end up in application logs.
	    headers = {
	        'accept': 'application/xml',
	        'Authorization': f'Bearer {athtk}',
	    }
	    url = f"{base_url}/{endpoint}?q={urllib.parse.quote_plus(query)}&Range={start_range}-{end_range}"

	    # requests has no default timeout; bound the wait so a stalled
	    # connection cannot hang the request handler forever.
	    response = requests.get(url, headers=headers, timeout=30)

	    # A rejected request is treated as a possibly expired token: refresh it
	    # and try exactly once more. 401 is handled like 400 because it is the
	    # standard status for invalid credentials.
	    if response.status_code in (400, 401) and not retried:
	        update_tk()
	        return search_from_abstract(query, retried=True)

	    if response.status_code == 200:
	        return extract_patent_abstracts(response.text)

	    print(f"Error: {response.status_code} - {response.text}")
	    response.raise_for_status()
	    # Reached only for a non-200 status that is not an HTTP error.
	    return None


	# Gradio front end: one text input feeding search_from_abstract, with the
	# function's return value shown through a text output.
	app = gr.Interface(
	    title="Patent Abstract Search",
	    description="Search patents by abstract using the European Patent Office API.",
	    fn=search_from_abstract,
	    inputs="text",
	    outputs="text",
	)

	# Start the app with a share link requested and the MCP server enabled.
	app.launch(share=True, mcp_server=True)