import streamlit as st
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
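
# Load the Harvard USPTO Patent Dataset (HUPD) 'sample' configuration.
# HUPD splits train/validation by filing-date windows rather than by a
# random shuffle; the date ranges below follow the dataset's example usage.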
dataset_dict = load_dataset(
    'HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)

print('Dataset contents:')
print(dataset_dict)

print('Dataset cache location:')
print(dataset_dict.cache_files)

train_dataset = dataset_dict["train"]
val_dataset = dataset_dict["validation"]
print(f'Train dataset shape: {train_dataset.shape}')
print(f'Validation dataset shape: {val_dataset.shape}')

print('Dataset fields:')
print(train_dataset.column_names)
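
# Tokenize the 'abstract' field with the RoBERTa tokenizer. truncation=True
# clips abstracts longer than the model's 512-token limit, and
# padding='max_length' pads every example to that fixed length.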
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

train_dataset = train_dataset.map(
    lambda e: tokenizer(e['abstract'], truncation=True, padding='max_length'),
    batched=True,
    desc="Tokenizing training files"
)
val_dataset = val_dataset.map(
    lambda e: tokenizer(e['abstract'], truncation=True, padding='max_length'),
    batched=True,
    desc="Tokenizing validation files"
)
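
# datasets caches each .map() result as an Arrow file on disk, so the
# cache listing below should now include the tokenized copies.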
print('Dataset cache location after tokenization:')
print(train_dataset.cache_files)

print('Dataset fields after tokenization:')
print(train_dataset.column_names)
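
# The scoring model below is BERT with a fresh 6-way classification head.
# AutoModelForSequenceClassification initializes that head with random
# weights, so unless a checkpoint fine-tuned on HUPD decisions is loaded
# in place of 'bert-base-cased', the scores are for demonstration only.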
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=6)

def get_text_data(filing_number):
    """Return the abstract and claims text for a training example."""
    # Guard against indices outside the training set.
    if filing_number >= len(train_dataset) or filing_number < 0:
        return None, None

    data = train_dataset[filing_number]

    abstract = data.get('abstract', None)
    claims = data.get('claims', None)

    return abstract, claims
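
# Example: get_text_data(0) returns the abstract and claims strings for the
# first training example, or (None, None) if the index is out of range.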

st.markdown("Link to app - [Patentability app](https://huggingface.co/spaces/mvasani/Patentatbility_score_app)")

def main():
    st.title("Patentability Score App")

    filing_number = st.selectbox("Select Application Filing Number", range(len(train_dataset)))

    abstract, claims = get_text_data(filing_number)
    st.subheader("Abstract:")
    st.text_area("Abstract Text", abstract, height=200, key='abstract_text')
    st.subheader("Claims:")
    st.text_area("Claims Text", claims, height=400, key='claims_text')

    if st.button("Submit"):
        # Encode the abstract and claims as a single sentence pair.
        inputs = tokenizer(abstract, claims, return_tensors="pt", padding=True, truncation=True)

        # Run the classifier without tracking gradients (inference only).
        with torch.no_grad():
            logits = model(**inputs).logits

        # Softmax turns the six logits into probabilities that sum to 1.
        score = torch.softmax(logits, dim=1).tolist()[0]
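
        # The index-to-label mapping below follows HUPD's six decision
        # categories; with an un-fine-tuned head this ordering is a demo
        # convention rather than something the model has learned.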
        st.subheader("Patentability Score:")
        st.write("REJECTED:", score[0])
        st.write("ACCEPTED:", score[1])
        st.write("PENDING:", score[2])
        st.write("CONT-REJECTED:", score[3])
        st.write("CONT-ACCEPTED:", score[4])
        st.write("CONT-PENDING:", score[5])

if __name__ == "__main__":
    main()