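# FastAPI service that evaluates a fine-tuned sequence-classification
# checkpoint (sgugger/glue-mrpc) on the GLUE MRPC validation split and
# returns the metric results as JSON.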
from pathlib import Path
import io
import json
import math
import statistics
import sys
import time

import numpy as np
import pandas as pd
import uvicorn
from datasets import Dataset, concatenate_datasets, load_dataset
from evaluate import load
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from huggingface_hub import hf_hub_url
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
# GLUE MRPC metric (reports accuracy and F1).
metric = load("glue", "mrpc")

app = FastAPI()
def preprocess_function(examples, tokenizer):
    # Tokenize the sentence pairs; dynamic padding is handled later by the Trainer.
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)
def compute_model_card_evaluation_results(model_checkpoint, raw_datasets, metric):
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    tokenized_datasets = raw_datasets.map(
        lambda examples: preprocess_function(examples, tokenizer), batched=True
    )
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    batch_size = 16
    args = TrainingArguments(
        "test-glue",
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        seed=42,
        lr_scheduler_type="linear",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=False,
        metric_for_best_model="accuracy",
        report_to="none",
    )
    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    # Only evaluation is run here; the checkpoint is assumed to be already fine-tuned.
    result = trainer.evaluate()
    return result
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Route decorator is not present in the original file; "/" is an assumed path.
@app.get("/")
def api_home():
    return {"detail": "Welcome to Bastions Model evaluation!"}
# Route path assumed for illustration; the checkpoint and dataset name are
# currently hard-coded rather than taken as parameters (see the commented
# signature below).
@app.get("/evaluate")
def return_output():  # model_checkpoint, dataset_name
    model_checkpoint = "sgugger/glue-mrpc"
    dataset_name = "nyu-mll/glue"
    print(model_checkpoint, dataset_name, metric)
    raw_datasets = load_dataset(dataset_name, "mrpc")
    # tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    output = compute_model_card_evaluation_results(model_checkpoint, raw_datasets, metric)
    return output
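

# Entry-point sketch: the unused uvicorn import above suggests the app is
# served directly. The host and port are assumptions (7860 is the default
# port for Hugging Face Spaces), not taken from the original file.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)

# Example request against the assumed route, once the server is running:
#   curl http://localhost:7860/evaluate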