import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from datasets import load_dataset
# ----- Data Loading ------
dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)
# Here we can see the `train` and `val` splits, along with the
# location of the cached data files
print('Dataset contents:')
print(dataset_dict)
print('Dataset cache location:')
print(dataset_dict.cache_files)
# Data
train_dataset = dataset_dict["train"]
val_dataset = dataset_dict["validation"]
print(f'Train dataset shape: {train_dataset.shape}')
print(f'Validation dataset shape: {val_dataset.shape}')
# List all available fields
print(f'Dataset fields:')
print(train_dataset.column_names)
# Example: preprocess the abstract field of the dataset
# using HF tokenizers
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# We tokenize in batches, so tokenization is quite fast
train_dataset = train_dataset.map(
    lambda e: tokenizer(e['abstract'], truncation=True, padding='max_length'),
    batched=True,
    desc="Tokenizing training files"
)
val_dataset = val_dataset.map(
    lambda e: tokenizer(e['abstract'], truncation=True, padding='max_length'),
    batched=True,
    desc="Tokenizing validation files"
)
# Since we've tokenized the dataset, we have a new cache location
print('Dataset cache location after tokenization:')
print(train_dataset.cache_files)
# And we have added some fields to our dataset
print('Dataset fields after tokenization:')
print(train_dataset.column_names)
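# Note: the tokenized columns produced above are a preprocessing demo only.
# The Streamlit app below reads the raw 'abstract' and 'claims' text fields
# and re-tokenizes them with the BERT tokenizer at inference time.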
# Load the BERT tokenizer and model for sequence classification
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=6)
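# This BERT tokenizer/model pair replaces the RoBERTa tokenizer used above for
# the preprocessing demo. Note that the 6-way classification head is newly
# initialized by from_pretrained, so the scores below are only meaningful if a
# checkpoint fine-tuned on HUPD decision labels is loaded here instead.
model.eval()  # disable dropout for inference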
# Function to retrieve abstract and claims text based on filing number
def get_text_data(filing_number):
    # Check if the filing number exists in the dataset
    if filing_number >= len(train_dataset) or filing_number < 0:
        return None, None  # Return None if the filing number is out of range or negative
    # Access the data corresponding to the filing number
    data = train_dataset[filing_number]
    # Retrieve the abstract and claims text from the data
    abstract = data.get('abstract', None)
    claims = data.get('claims', None)
    return abstract, claims
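# Example usage (assuming the sample split loaded above):
#   abstract, claims = get_text_data(0)  # text fields of the first training example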
# Streamlit app
st.markdown("Link to app - [Patentabiity app](https://huggingface.co/spaces/mvasani/Patentatbility_score_app)")
def main():
    st.title("Patentability Score App")
    # Dropdown menu to select the application filing number
    filing_number = st.selectbox("Select Application Filing Number", range(len(train_dataset)))
    # Display abstract and claims text boxes based on the selected filing number
    abstract, claims = get_text_data(filing_number)
    st.subheader("Abstract:")
    st.text_area("Abstract Text", abstract, height=200, key='abstract_text')
    st.subheader("Claims:")
    st.text_area("Claims Text", claims, height=400, key='claims_text')
    # Submit button to calculate and display the patentability score
    if st.button("Submit"):
        # Tokenize the abstract and claims texts
        inputs = tokenizer(abstract, claims, return_tensors="pt", padding=True, truncation=True)
        # Perform inference with the model to get the logits
        with torch.no_grad():
            logits = model(**inputs).logits
        # Calculate the patentability score
        score = torch.softmax(logits, dim=1).tolist()[0]
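        # Note: the index-to-label mapping below follows the HUPD decision
        # categories; it assumes the (fine-tuned) model was trained with the
        # labels in exactly this order.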
        # Display the patentability score
        st.subheader("Patentability Score:")
        st.write("REJECTED:", score[0])
        st.write("ACCEPTED:", score[1])
        st.write("PENDING:", score[2])
        st.write("CONT-REJECTED:", score[3])
        st.write("CONT-ACCEPTED:", score[4])
        st.write("CONT-PENDING:", score[5])
if __name__ == "__main__":
    main()