awacke1 committed on
Commit
88471fd
·
verified ·
1 Parent(s): b8e68cf

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -0
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from datasets import load_dataset
3
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
4
+ import torch
5
+ import pandas as pd
6
+
7
def load_orca_dataset():
    """Load the ``microsoft/orca-agentinstruct-1M-v1`` dataset.

    An info banner is shown in the Streamlit page before the (potentially
    slow) ``load_dataset`` call is made.

    Returns:
        The object returned by ``load_dataset`` for that dataset name.
    """
    st.info("Loading dataset... This may take a while.")
    orca = load_dataset("microsoft/orca-agentinstruct-1M-v1")
    return orca
10
+
11
@st.cache_resource
def load_model_and_tokenizer(model_name):
    """Load and cache a sequence-classification model with its tokenizer.

    ``st.cache_resource`` is used instead of ``st.cache_data`` because the
    returned objects are large, non-data resources: ``cache_data`` would
    serialize and copy them on every call, whereas ``cache_resource`` keeps
    one shared instance per ``model_name``.

    Args:
        model_name: Hugging Face model identifier passed to
            ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return tokenizer, model
16
+
17
def evaluate_model(ds, tokenizer, model, max_samples, text_field):
    """Run a text-classification pipeline over the first examples of ``ds``.

    Args:
        ds: Iterable of examples; each example is indexed by ``text_field``.
        tokenizer: Tokenizer handed to the ``text-classification`` pipeline.
        model: Model handed to the ``text-classification`` pipeline.
        max_samples: Maximum number of examples to classify. Values below
            one yield an empty result list.
        text_field: Key of the field in each example that is classified.

    Returns:
        A list of dicts with keys ``"input"``, ``"label"`` and ``"score"``,
        one per classified example, in iteration order.
    """
    from itertools import islice

    st.info("Evaluating the model...")
    device = 0 if torch.cuda.is_available() else -1
    classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        device=device,
    )

    # islice rejects negative stops, so clamp to keep the "no samples" result.
    limit = max(0, int(max_samples))

    results = []
    for example in islice(ds, limit):
        input_text = example[text_field]
        # truncation=True keeps inputs longer than the model's maximum
        # sequence length from raising inside the forward pass.
        prediction = classifier(input_text, truncation=True)[0]
        results.append(
            {
                "input": input_text,
                "label": prediction["label"],
                "score": prediction["score"],
            }
        )
    return results
29
+
30
def main():
    """Render the Streamlit page: dataset browser plus model evaluator.

    The dataset is loaded on demand via the sidebar button and kept in
    ``st.session_state["dataset"]`` so it survives Streamlit reruns. Once a
    dataset is present, the page lets the user pick a split and a text
    field, previews a few shuffled rows, and can evaluate a Hugging Face
    text-classification model on a shuffled sample, offering the results
    as a CSV download.
    """
    st.title("Orca Dataset Browser and Model Evaluator")

    st.sidebar.header("Configuration")
    if st.sidebar.button("Load Dataset"):
        st.session_state["dataset"] = load_orca_dataset()

    # Nothing else to render until a dataset has been loaded.
    if "dataset" not in st.session_state:
        return

    dataset = st.session_state["dataset"]

    # List available splits
    available_splits = list(dataset.keys())
    st.sidebar.subheader("Available Dataset Splits")
    selected_split = st.sidebar.selectbox("Select Split", available_splits)
    split_ds = dataset[selected_split]

    st.subheader("Dataset Explorer")
    st.write(f"Displaying information for split: `{selected_split}`")
    st.write(split_ds.info)

    # Determine available fields from the schema rather than from row 0,
    # so an empty split does not raise an IndexError.
    st.sidebar.subheader("Available Fields in Dataset")
    available_fields = list(split_ds.column_names)
    st.sidebar.write(available_fields)
    text_field = st.sidebar.selectbox("Select Text Field", available_fields)

    num_rows = len(split_ds)
    if num_rows == 0:
        st.warning("The selected split is empty; nothing to display or evaluate.")
        return

    # Shuffle once and reuse for both the preview and the evaluation sample.
    shuffled = split_ds.shuffle(seed=42)

    sample_size = st.slider("Number of Samples to Display", min_value=1, max_value=20, value=5)
    # Clamp so select() never asks for more rows than the split holds.
    preview = shuffled.select(range(min(sample_size, num_rows)))
    # Render the rows explicitly instead of handing the Dataset object to
    # st.write, whose output for that type is not a guaranteed table.
    st.dataframe(preview.to_pandas())

    st.subheader("Model Evaluator")
    model_name = st.text_input("Enter Hugging Face Model Name", value="distilbert-base-uncased-finetuned-sst-2-english")
    max_samples = st.number_input("Number of Samples to Evaluate", min_value=1, max_value=100, value=10)
    eval_count = min(int(max_samples), num_rows)

    if st.button("Load Model and Evaluate"):
        tokenizer, model = load_model_and_tokenizer(model_name)

        eval_ds = shuffled.select(range(eval_count))
        results = evaluate_model(eval_ds, tokenizer, model, eval_count, text_field)

        st.subheader("Evaluation Results")
        st.write(results)

        st.download_button(
            label="Download Results as CSV",
            data=pd.DataFrame(results).to_csv(index=False),
            file_name="evaluation_results.csv",
            mime="text/csv",
        )
80
+
81
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    main()