Spaces:
Paused
Paused
Commit · 501f2e5
1 Parent(s): 0fdb130
Tested the model on the entire dataset
Browse files
- app.py +0 -60
- test_models/create_setfit_model.py +2 -2
- test_models/models/linear_head.pth +1 -1
- test_models/models/linear_head.safetensors +1 -1
- test_models/test_model.py +4 -3
- test_models/train_head.py +3 -3
app.py
CHANGED
|
@@ -12,54 +12,6 @@ def read_and_split_file(filename, chunk_size=1200, chunk_overlap=200):
|
|
| 12 |
return texts
|
| 13 |
|
| 14 |
|
| 15 |
-
def get_label_prediction(selected_predictor, texts):
|
| 16 |
-
predicted_labels = []
|
| 17 |
-
replies = []
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
emdedding_model_name = predictors[selected_predictor]['embedding_model']
|
| 21 |
-
emdedding_model = SentenceTransformer(emdedding_model_name)
|
| 22 |
-
|
| 23 |
-
texts_str = [text.page_content for text in texts]
|
| 24 |
-
embeddings = emdedding_model.encode(texts_str, show_progress_bar=True).tolist()
|
| 25 |
-
|
| 26 |
-
# dataset = load_dataset(predictors[selected_predictor]['dataset_name'])
|
| 27 |
-
label_encoder = LabelEncoder()
|
| 28 |
-
encoded_labels = label_encoder.fit_transform([label.upper() for label in labels])
|
| 29 |
-
|
| 30 |
-
input_size = predictors[selected_predictor]['embedding_dim']
|
| 31 |
-
hidden_size = 256
|
| 32 |
-
output_size = len(label_encoder.classes_)
|
| 33 |
-
dropout_rate = 0.5
|
| 34 |
-
batch_size = 8
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
model = MLP(input_size, hidden_size, output_size, dropout_rate)
|
| 38 |
-
load_model(model, predictors[selected_predictor]['mlp_model'])
|
| 39 |
-
|
| 40 |
-
embeddings_tensor = torch.tensor(embeddings)
|
| 41 |
-
|
| 42 |
-
data = TensorDataset(embeddings_tensor)
|
| 43 |
-
dataloader = DataLoader(data, batch_size=batch_size, shuffle=True)
|
| 44 |
-
|
| 45 |
-
with torch.no_grad():
|
| 46 |
-
model.eval()
|
| 47 |
-
for inputs in dataloader:
|
| 48 |
-
# st.write(inputs[0])
|
| 49 |
-
outputs = model(inputs[0])
|
| 50 |
-
|
| 51 |
-
# _, predicted = torch.max(outputs, 1)
|
| 52 |
-
|
| 53 |
-
probabilities = F.softmax(outputs, dim=1)
|
| 54 |
-
predicted_indices = torch.argmax(probabilities, dim=1).tolist()
|
| 55 |
-
predicted_labels_list = label_encoder.inverse_transform(predicted_indices)
|
| 56 |
-
for pred_label in predicted_labels_list:
|
| 57 |
-
predicted_labels.append(pred_label)
|
| 58 |
-
# st.write(pred_label)
|
| 59 |
-
|
| 60 |
-
predicted_labels_counter = Counter(predicted_labels)
|
| 61 |
-
predicted_label = predicted_labels_counter.most_common(1)[0][0]
|
| 62 |
-
return predicted_label
|
| 63 |
|
| 64 |
|
| 65 |
|
|
@@ -68,20 +20,8 @@ def get_label_prediction(selected_predictor, texts):
|
|
| 68 |
if __name__ == '__main__':
|
| 69 |
# Comments and ideas to implement:
|
| 70 |
# 1. Try sending list of inputs to the Inference API.
|
| 71 |
-
|
| 72 |
|
| 73 |
|
| 74 |
-
from config import (
|
| 75 |
-
labels, headers_inference_api, headers_inference_endpoint,
|
| 76 |
-
# summarization_prompt_template,
|
| 77 |
-
prompt_template,
|
| 78 |
-
# task_explain_for_predictor_model,
|
| 79 |
-
summarizers, predictors, summary_scores_template,
|
| 80 |
-
summarization_system_msg, summarization_user_prompt, prediction_user_prompt, prediction_system_msg,
|
| 81 |
-
# prediction_prompt,
|
| 82 |
-
chat_prompt, instruction_prompt
|
| 83 |
-
)
|
| 84 |
-
|
| 85 |
import streamlit as st
|
| 86 |
from sys import exit
|
| 87 |
from pprint import pprint
|
|
|
|
| 12 |
return texts
|
| 13 |
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
|
|
|
|
| 20 |
if __name__ == '__main__':
|
| 21 |
# Comments and ideas to implement:
|
| 22 |
# 1. Try sending list of inputs to the Inference API.
|
|
|
|
| 23 |
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
import streamlit as st
|
| 26 |
from sys import exit
|
| 27 |
from pprint import pprint
|
test_models/create_setfit_model.py
CHANGED
|
@@ -47,10 +47,10 @@ model_head = MLP(class_weights=class_weights)
|
|
| 47 |
|
| 48 |
if __name__ == '__main__' or __name__ == 'create_setfit_model':
|
| 49 |
model_body = SentenceTransformer('financial-roberta')
|
| 50 |
-
load_model(model_head, f'models/linear_head.
|
| 51 |
elif __name__ == 'test_models.create_setfit_model':
|
| 52 |
model_body = SentenceTransformer('test_models/financial-roberta')
|
| 53 |
-
load_model(model_head, f'/test_models/models/linear_head.
|
| 54 |
|
| 55 |
|
| 56 |
model = SetFitModel(model_body=model_body,
|
|
|
|
| 47 |
|
| 48 |
if __name__ == '__main__' or __name__ == 'create_setfit_model':
|
| 49 |
model_body = SentenceTransformer('financial-roberta')
|
| 50 |
+
load_model(model_head, f'models/linear_head.safetensors')
|
| 51 |
elif __name__ == 'test_models.create_setfit_model':
|
| 52 |
model_body = SentenceTransformer('test_models/financial-roberta')
|
| 53 |
+
load_model(model_head, f'/test_models/models/linear_head.safetensors')
|
| 54 |
|
| 55 |
|
| 56 |
model = SetFitModel(model_body=model_body,
|
test_models/models/linear_head.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 10800
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:908720c6263171369062dcc107a2c1003e8ae14914e49f748eb5b48b5112a541
|
| 3 |
size 10800
|
test_models/models/linear_head.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 9380
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca4c505b1c00d424f85e5e60fd9268ce56eb517b2dfd59d0cf1e715d664adbb2
|
| 3 |
size 9380
|
test_models/test_model.py
CHANGED
|
@@ -22,9 +22,10 @@ labels_dir = dataset_dir + '/csvs/'
|
|
| 22 |
df = get_labels_df(labels_dir)
|
| 23 |
texts_dir = dataset_dir + '/txts/'
|
| 24 |
texts = get_texts(texts_dir)
|
| 25 |
-
df = df.iloc[
|
| 26 |
-
print(df.loc[:, 'Label'])
|
| 27 |
-
texts = [texts[0]] + [texts[13]] + [texts[113]]
|
|
|
|
| 28 |
print(len(df), len(texts))
|
| 29 |
print(mean(list(map(len, texts))))
|
| 30 |
|
|
|
|
| 22 |
df = get_labels_df(labels_dir)
|
| 23 |
texts_dir = dataset_dir + '/txts/'
|
| 24 |
texts = get_texts(texts_dir)
|
| 25 |
+
# df = df.iloc[:20, :]
|
| 26 |
+
# print(df.loc[:, 'Label'])
|
| 27 |
+
# texts = [texts[0]] + [texts[13]] + [texts[113]]
|
| 28 |
+
# texts = texts[:20]
|
| 29 |
print(len(df), len(texts))
|
| 30 |
print(mean(list(map(len, texts))))
|
| 31 |
|
test_models/train_head.py
CHANGED
|
@@ -83,7 +83,7 @@ if __name__ == '__main__':
|
|
| 83 |
|
| 84 |
|
| 85 |
class_weights = torch.tensor(compute_class_weight('balanced', classes=[0, 1, 2], y=dataset['train']['labels']), dtype=torch.float) ** .5
|
| 86 |
-
model = MLP(input_size=input_size,
|
| 87 |
|
| 88 |
|
| 89 |
criterion = model.get_loss_fn()
|
|
@@ -114,13 +114,13 @@ if __name__ == '__main__':
|
|
| 114 |
|
| 115 |
test_data = TensorDataset(torch.tensor(dataset['test']['embeddings']), torch.tensor(dataset['test']['labels']))
|
| 116 |
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)
|
| 117 |
-
loss, accuracy = eval_model(model, criterion, test_loader, test_data, show=
|
| 118 |
# save_as_filename=f'plots/confusion_matrix_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.png'
|
| 119 |
)
|
| 120 |
|
| 121 |
# torch.save(model.state_dict(), f'models/linear_head.pth')
|
| 122 |
# save_model(model, f'models/linear_head.safetensors')
|
| 123 |
-
# load_model(model, f'models/
|
| 124 |
# print(model)
|
| 125 |
# dataset.push_to_hub(f'CabraVC/vector_dataset_stratified_ttv_split_{datetime.now().strftime("%Y-%m-%d_%H-%M")}', private=True)
|
| 126 |
|
|
|
|
| 83 |
|
| 84 |
|
| 85 |
class_weights = torch.tensor(compute_class_weight('balanced', classes=[0, 1, 2], y=dataset['train']['labels']), dtype=torch.float) ** .5
|
| 86 |
+
model = MLP(input_size=input_size, class_weights=class_weights)
|
| 87 |
|
| 88 |
|
| 89 |
criterion = model.get_loss_fn()
|
|
|
|
| 114 |
|
| 115 |
test_data = TensorDataset(torch.tensor(dataset['test']['embeddings']), torch.tensor(dataset['test']['labels']))
|
| 116 |
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)
|
| 117 |
+
loss, accuracy = eval_model(model, criterion, test_loader, test_data, show=False
|
| 118 |
# save_as_filename=f'plots/confusion_matrix_{datetime.now().strftime("%Y-%m-%d_%H-%M")}.png'
|
| 119 |
)
|
| 120 |
|
| 121 |
# torch.save(model.state_dict(), f'models/linear_head.pth')
|
| 122 |
# save_model(model, f'models/linear_head.safetensors')
|
| 123 |
+
# load_model(model, f'models/linear_head.safetensors')
|
| 124 |
# print(model)
|
| 125 |
# dataset.push_to_hub(f'CabraVC/vector_dataset_stratified_ttv_split_{datetime.now().strftime("%Y-%m-%d_%H-%M")}', private=True)
|
| 126 |
|