import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pprint import pprint
from datasets import load_dataset

# ----- Data Loading ------
dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather", 
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)
# Here we can see the `train` and `val` splits, along with the
# location of the cached data files
print('Dataset contents:')
print(dataset_dict)

print('Dataset cache location:')
print(dataset_dict.cache_files)

# Data
train_dataset = dataset_dict["train"]
val_dataset = dataset_dict["validation"]
print(f'Train dataset shape: {train_dataset.shape}')
print(f'Validation dataset shape: {val_dataset.shape}')

# List all available fields
print('Dataset fields:')
print(train_dataset.column_names)
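
# Illustrative peek at one record's short fields, using pprint from the
# imports above. The field names here assume the HUPD schema listed by the
# column_names print; adjust if the schema differs.
pprint({key: train_dataset[0][key] for key in ('title', 'decision')})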

# Example: preprocess the abstract field of the dataset using an HF
# tokenizer. Note that this roberta-base tokenizer is only used for this
# preprocessing example; the app itself re-tokenizes input with the BERT
# tokenizer loaded further below.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# We tokenize in batches, so tokenization is quite fast
train_dataset = train_dataset.map(
    lambda e: tokenizer(e['abstract'], truncation=True, padding='max_length'),
    batched=True,
    desc="Tokenizing training files"
)
val_dataset = val_dataset.map(
    lambda e: tokenizer(e['abstract'], truncation=True, padding='max_length'),
    batched=True,
    desc="Tokenizing training files"
)

# Since we've tokenized the dataset, we have a new cache location
print('Dataset cache location after tokenization:')
print(train_dataset.cache_files)

# And we have added some fields to our dataset
print('Dataset fields after tokenization:')
print(train_dataset.column_names)
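
# Optional sketch, assuming a PyTorch training loop downstream: expose the
# tokenized columns as torch tensors while keeping the raw text columns
# accessible (output_all_columns=True), so get_text_data below still works.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'],
                         output_all_columns=True)
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'],
                       output_all_columns=True)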


# Load the BERT tokenizer and model for sequence classification.
# Note: this re-binds `tokenizer` (previously roberta-base), and the 6-way
# classification head is randomly initialized rather than fine-tuned on HUPD.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=6)
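
# Make the label order explicit. This ordering is an assumption of this
# app's display convention (see main() below); the pretrained model itself
# carries no HUPD-specific labels, since its head is untrained.
DECISION_LABELS = ['REJECTED', 'ACCEPTED', 'PENDING',
                   'CONT-REJECTED', 'CONT-ACCEPTED', 'CONT-PENDING']
model.config.id2label = {i: label for i, label in enumerate(DECISION_LABELS)}
model.config.label2id = {label: i for i, label in enumerate(DECISION_LABELS)}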

# Function to retrieve abstract and claims text based on filing number
def get_text_data(filing_number):
    # Return (None, None) if the filing number is out of range
    if filing_number < 0 or filing_number >= len(train_dataset):
        return None, None
    
    # Access the data corresponding to the filing number
    data = train_dataset[filing_number]
    
    # Retrieve the abstract and claims text from the data
    abstract = data.get('abstract', None)
    claims = data.get('claims', None)
    
    return abstract, claims
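
# Quick usage check of the helper above: filing number 0 is simply the
# first record in the training split.
example_abstract, _ = get_text_data(0)
if example_abstract is not None:
    print(f'Sample abstract (first 100 chars): {example_abstract[:100]}')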



# Streamlit app

st.markdown("Link to app - [Patentabiity app](https://huggingface.co/spaces/mvasani/Patentatbility_score_app)")
def main():
    st.title("Patentability Score App")
    
    # Dropdown menu to select the application filing number
    filing_number = st.selectbox("Select Application Filing Number", range(len(train_dataset)))
    
    # Display abstract and claims text boxes based on selected filing number
    abstract, claims = get_text_data(filing_number)
    st.subheader("Abstract:")
    st.text_area("Abstract Text", abstract, height=200, key='abstract_text')
    st.subheader("Claims:")
    st.text_area("Claims Text", claims, height=400, key='claims_text')
    
    # Submit button to calculate and display the patentability score
    if st.button("Submit"):
        # Tokenize the abstract and claims texts
        inputs = tokenizer(abstract, claims, return_tensors="pt", padding=True, truncation=True)
        
        # Perform inference with the model to get the logits
        with torch.no_grad():
            logits = model(**inputs).logits
        
        # Calculate the patentability score
        score = torch.softmax(logits, dim=1).tolist()[0]
        
        # Display the patentability score
        st.subheader("Patentability Score:")
        st.write("REJECTED:", score[0])
        st.write("ACCEPTED:", score[1])
        st.write("PENDING:", score[2])
        st.write("CONT-REJECTED:", score[3])
        st.write("CONT-ACCEPTED:", score[4])
        st.write("CONT-PENDING:", score[5])

if __name__ == "__main__":
    main()