import torch
from torch import nn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


def get_eval_metric(y_pred, y_test):
    # Weighted averages account for class imbalance; the confusion matrix
    # is row-normalized so each true class sums to 1.
    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        'recall': recall_score(y_test, y_pred, average='weighted'),
        'f1': f1_score(y_test, y_pred, average='weighted'),
        'confusion_mat': confusion_matrix(y_test, y_pred, normalize='true'),
    }
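
# Usage sketch (hypothetical labels, three classes; not executed by the script):
#   metrics = get_eval_metric(y_pred=[0, 2, 2, 1], y_test=[0, 1, 2, 1])
#   metrics['accuracy']       # 0.75
#   metrics['confusion_mat']  # 3x3, each row sums to 1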

class MLP(nn.Module):
    def __init__(self, input_size=768, hidden_size=256, output_size=3, dropout_rate=.2, class_weights=None):
        super().__init__()
        self.class_weights = class_weights
        self.activation = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Features may arrive as a dict with a 'sentence_embedding' key
        # (as SetFit passes them) or as a plain tensor.
        input_is_dict = False
        if isinstance(x, dict):
            assert 'sentence_embedding' in x
            input_is_dict = True
            x = x['sentence_embedding']
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.fc2(x)
        if input_is_dict:
            return {'logits': x}
        return x
    def predict(self, x):
        # Class index of the highest logit per row.
        _, predicted = torch.max(self.forward(x), 1)
        return predicted

    def predict_proba(self, x):
        # Softmax over the logits so the method returns probabilities,
        # as its name promises.
        return torch.softmax(self.forward(x), dim=1)

    def get_loss_fn(self):
        return nn.CrossEntropyLoss(weight=self.class_weights, reduction='mean')
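
# Smoke-test sketch (illustrative only; random embeddings, default sizes):
#   head = MLP().eval()
#   out = head({'sentence_embedding': torch.randn(4, 768)})
#   out['logits'].shape  # torch.Size([4, 3])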

if __name__ == '__main__':
    from setfit import SetFitModel, Trainer, TrainingArguments
    from datasets import Dataset, load_dataset, DatasetDict
    from sentence_transformers import SentenceTransformer, models, util
    from sentence_transformers.losses import BatchAllTripletLoss, BatchHardSoftMarginTripletLoss, BatchHardTripletLoss, BatchSemiHardTripletLoss
    from sklearn.linear_model import LogisticRegression
    import sys
    import os
    import warnings
    import torch.nn.functional as F
    from datetime import datetime
    import torch.optim as optim
    from statistics import mean
    from pprint import pprint
    from torch.utils.data import DataLoader, TensorDataset
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from safetensors.torch import load_model, save_model
    from itertools import chain
    from time import perf_counter
    from tqdm import trange
    from collections import Counter
    from sklearn.utils.class_weight import compute_class_weight
    import numpy as np
    import matplotlib.pyplot as plt

    warnings.filterwarnings("ignore")

    SEED = 1003200212 + 1
    torch.manual_seed(SEED)  # SEED was defined but never used; seed torch for reproducibility
    DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(DEVICE)
    start = perf_counter()

    # Make the project root and the financial_dataset package importable.
    sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
    dataset_dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..', 'financial_dataset'))
    sys.path.append(dataset_dir)
    from load_test_data import get_labels_df, get_texts
    from train_classificator import plot_labels_distribution

    def split_text(text, chunk_size=1200, chunk_overlap=200):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap,
            length_function=len, separators=[" ", ",", "\n"]
        )
        text_chunks = text_splitter.create_documents([text])
        return text_chunks
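
    # Usage sketch (hypothetical input; same parameters as the call below):
    #   chunks = split_text('word ' * 2_000, chunk_size=3_200, chunk_overlap=200)
    #   [len(c.page_content) for c in chunks]  # each chunk at most ~3200 chars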

    labels_dir = dataset_dir + '/csvs/'
    df = get_labels_df(labels_dir)
    texts_dir = dataset_dir + '/txts/'
    texts = get_texts(texts_dir)
    print(len(df), len(texts))
    print(mean(list(map(len, texts))))  # average document length in characters

    # Chunk each document before embedding.
    documents = [split_text(text, chunk_size=3_200, chunk_overlap=200) for text in texts]
    docs_chunks = [[doc.page_content for doc in document] for document in documents]

    model = SentenceTransformer('financial-roberta')
    model = model.to(DEVICE)  # was hard-coded to 'cuda:0'; use the detected device

    # Embed every chunk, then flatten; each chunk inherits its document's label.
    doc_embeddings = [model.encode(doc_chunks, show_progress_bar=True).tolist() for doc_chunks in docs_chunks]
    embeddings = [embedding for doc_embedding in doc_embeddings for embedding in doc_embedding]
    texts = [text for doc_chunks in docs_chunks for text in doc_chunks]
    labels = np.repeat(df['Label'], [len(document) for document in documents]).tolist()
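    # e.g. np.repeat(['A', 'B'], [2, 1]) -> ['A', 'A', 'B'], so a document
    # split into N chunks contributes its label N times.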

    dataset = Dataset.from_dict({
        'texts': texts,
        'labels': labels,
        'embeddings': embeddings,
    })
    dataset = dataset.class_encode_column('labels')
    print(len(dataset))

    # Stratified 80/10/10 train/val/test split.
    train_test_dataset = dataset.train_test_split(test_size=.2, stratify_by_column='labels')
    val_test_dataset = train_test_dataset['test'].train_test_split(test_size=.5, stratify_by_column='labels')
    dataset = DatasetDict({
        'train': train_test_dataset['train'],
        'val': val_test_dataset['train'],
        'test': val_test_dataset['test'],
    })
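
    # Sketch (assumption: how class_weights for MLP.get_loss_fn could be built
    # from the train split with the imported compute_class_weight; not called here):
    #   y_train = np.array(dataset['train']['labels'])
    #   weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    #   head = MLP(class_weights=torch.tensor(weights, dtype=torch.float32))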

    plot_labels_distribution(dataset, save_as_filename='plots/finetuned_st_label_distr.png')
    dataset.push_to_hub("CabraVC/vector_dataset_roberta-fine-tuned", private=True)
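
    # start was captured above but never read; report elapsed time (assumed intent).
    print(f'Done in {perf_counter() - start:.1f} s')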