Spaces:
Runtime error
Runtime error
Initial commit for perplexity lenses
Browse files- app.py +141 -0
- data.py +28 -0
- perplexity.py +37 -0
- requirements.txt +10 -0
app.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from functools import partial
|
| 3 |
+
from typing import Callable, Optional
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import streamlit as st
|
| 7 |
+
from bokeh.plotting import Figure
|
| 8 |
+
from embedding_lenses.data import uploaded_file_to_dataframe
|
| 9 |
+
from embedding_lenses.dimensionality_reduction import (get_tsne_embeddings,
|
| 10 |
+
get_umap_embeddings)
|
| 11 |
+
from embedding_lenses.embedding import embed_text, load_model
|
| 12 |
+
from embedding_lenses.utils import encode_labels
|
| 13 |
+
from embedding_lenses.visualization import draw_interactive_scatter_plot
|
| 14 |
+
from sentence_transformers import SentenceTransformer
|
| 15 |
+
|
| 16 |
+
from data import hub_dataset_to_dataframe
|
| 17 |
+
from perplexity import KenlmModel
|
| 18 |
+
|
| 19 |
+
# Application-wide logging: INFO level, one module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Sentence-embedding models offered in the UI (first entry is the default).
EMBEDDING_MODELS = [
    "distiluse-base-multilingual-cased-v1",
    "all-mpnet-base-v2",
    "flax-sentence-embeddings/all_datasets_v3_mpnet-base",
]

# Dimensionality-reduction algorithms offered in the UI.
DIMENSIONALITY_REDUCTION_ALGORITHMS = ["UMAP", "t-SNE"]

# Language codes selectable for the KenLM model; the UI preselects index 12 ("es").
LANGUAGES = [
    "af", "ar", "az", "be", "bg", "bn", "ca", "cs",
    "da", "de", "el", "en", "es", "et", "fa", "fi",
    "fr", "gu", "he", "hi", "hr", "hu", "hy", "id",
    "is", "it", "ja", "ka", "kk", "km", "kn", "ko",
    "lt", "lv", "mk", "ml", "mn", "mr", "my", "ne",
    "nl", "no", "pl", "pt", "ro", "ru", "uk", "zh",
]

# Seed shared by sampling, shuffling and the dimensionality-reduction step.
SEED = 0
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def generate_plot(
    df: pd.DataFrame,
    text_column: str,
    label_column: str,
    sample: Optional[int],
    dimensionality_reduction_function: Callable,
    model: SentenceTransformer,
) -> Figure:
    """Embed the texts of ``df``, project them to 2D and draw a scatter plot.

    Args:
        df: Input data. It is never modified in place.
        text_column: Name of the column holding the texts to embed.
        label_column: Name of the column used for the labels. When the column
            is missing, a constant label ``0`` is used for every row.
        sample: Maximum number of rows to keep (sampled with ``SEED``). A falsy
            value disables sampling.
        dimensionality_reduction_function: Callable applied to the embeddings;
            its result is indexed as ``[:, 0]`` and ``[:, 1]``.
        model: Embedding model handed to ``embed_text``.

    Returns:
        The figure produced by ``draw_interactive_scatter_plot``.

    Raises:
        ValueError: If ``text_column`` is not a column of ``df``.
    """
    if text_column not in df.columns:
        raise ValueError(f"The specified column name doesn't exist. Columns available: {df.columns.values}")
    if label_column not in df.columns:
        # Work on a copy so the caller's DataFrame does not silently gain a column.
        df = df.copy()
        df[label_column] = 0
    df = df.dropna(subset=[text_column, label_column])
    if sample:
        # Never request more rows than are available.
        df = df.sample(min(sample, df.shape[0]), random_state=SEED)
    with st.spinner(text="Embedding text..."):
        embeddings = embed_text(df[text_column].values.tolist(), model)
    logger.info("Encoding labels")
    encoded_labels = encode_labels(df[label_column])
    with st.spinner("Reducing dimensionality..."):
        embeddings_2d = dimensionality_reduction_function(embeddings)
    logger.info("Generating figure")
    plot = draw_interactive_scatter_plot(
        df[text_column].values,
        embeddings_2d[:, 0],
        embeddings_2d[:, 1],
        encoded_labels.values,
        df[label_column].values,
        text_column,
        label_column,
    )
    return plot
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# ---- Streamlit page: inputs -------------------------------------------------
st.title("Perplexity Lenses")
st.write("Visualize text embeddings in 2D using colors to represent perplexity values.")
uploaded_file = st.file_uploader("Choose an csv/tsv file...", type=["csv", "tsv"])
st.write("Alternatively, select a dataset from the [hub](https://huggingface.co/datasets)")

# Older Streamlit releases (presumably including the 0.84.1 pinned in
# requirements.txt — TODO confirm) only expose the beta-prefixed
# `beta_columns`; fall back to it when `st.columns` is unavailable.
make_columns = getattr(st, "columns", None) or st.beta_columns
col1, col2, col3 = make_columns(3)
with col1:
    hub_dataset = st.text_input("Dataset name", "mc4")
with col2:
    hub_dataset_config = st.text_input("Dataset configuration", "es")
with col3:
    hub_dataset_split = st.text_input("Dataset split", "train")

text_column = st.text_input("Text column name", "text")
language = st.selectbox("Language", LANGUAGES, 12)
sample = st.number_input("Maximum number of documents to use", 1, 100000, 1000)
dimensionality_reduction = st.selectbox("Dimensionality Reduction algorithm", DIMENSIONALITY_REDUCTION_ALGORITHMS, 0)
model_name = st.selectbox("Sentence embedding model", EMBEDDING_MODELS, 0)

# ---- Models -----------------------------------------------------------------
with st.spinner(text="Loading embedding model..."):
    model = load_model(model_name)
dimensionality_reduction_function = (
    partial(get_umap_embeddings, random_state=SEED) if dimensionality_reduction == "UMAP" else partial(get_tsne_embeddings, random_state=SEED)
)

with st.spinner(text="Loading KenLM model..."):
    kenlm_model = KenlmModel.from_pretrained(language)

# ---- Data loading and plotting ----------------------------------------------
if uploaded_file or hub_dataset:
    with st.spinner("Loading dataset..."):
        if uploaded_file:
            df = uploaded_file_to_dataframe(uploaded_file)
            # Only score when the column exists; otherwise generate_plot raises
            # its descriptive ValueError instead of a bare KeyError here.
            if text_column in df.columns:
                # Perplexity comes from the KenLM model (`model` is the embedding
                # model), and each mapped value is already the text of one row.
                # Rows without text are dropped first since they cannot be scored.
                df = df.dropna(subset=[text_column]).assign(
                    perplexity=lambda frame: frame[text_column].map(kenlm_model.get_perplexity)
                )
        else:
            df = hub_dataset_to_dataframe(hub_dataset, hub_dataset_config, hub_dataset_split, sample, text_column, kenlm_model, seed=SEED)
    plot = generate_plot(df, text_column, "perplexity", sample, dimensionality_reduction_function, model)
    logger.info("Displaying plot")
    st.bokeh_chart(plot)
    logger.info("Done")
|
data.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from functools import partial
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
+
|
| 7 |
+
from perplexity import KenlmModel
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def hub_dataset_to_dataframe(path: str, name: str, split: str, sample: int, text_column: str, model: KenlmModel, seed: int = 0) -> pd.DataFrame:
    """Stream a hub dataset, score each text with ``model`` and collect rows.

    Args:
        path: Dataset path passed to ``load_dataset``.
        name: Dataset configuration; only forwarded when truthy.
        split: Dataset split; only forwarded when truthy.
        sample: Number of rows after which iteration stops.
        text_column: Key of the text field in each streamed example.
        model: Object whose ``get_perplexity`` scores each text.
        seed: Seed for the streaming shuffle.

    Returns:
        A DataFrame built from the collected examples.
    """
    load_kwargs = {"path": path}
    if name:
        load_kwargs["name"] = name
    if split:
        load_kwargs["split"] = split

    def add_perplexity(example):
        # Emit the text together with its perplexity score.
        text = example[text_column]
        return {text_column: text, "perplexity": model.get_perplexity(text)}

    stream = load_dataset(streaming=True, **load_kwargs)
    stream = stream.shuffle(buffer_size=10000, seed=seed)
    stream = stream.map(add_perplexity)

    rows = []
    for taken, row in enumerate(tqdm(stream, total=sample), start=1):
        rows.append(row)
        if taken == sample:
            break
    return pd.DataFrame(rows)
|
perplexity.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import urllib.request
|
| 3 |
+
|
| 4 |
+
import kenlm
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class KenlmModel:
    """Thin wrapper around a KenLM model that scores documents by perplexity."""

    def __init__(self, language):
        """Make sure the files for ``language`` are present, then load the model."""
        download_kenlm_model(language)
        binary_path = f"{language}.arpa.bin"
        self.model = kenlm.Model(binary_path)

    @classmethod
    def from_pretrained(cls, language: str):
        """Alternate constructor: build the wrapper for ``language``."""
        instance = cls(language)
        return instance

    def get_perplexity(self, doc: str):
        """Return the perplexity of ``doc``, scored line by line.

        Line scores are summed and used as a base-10 exponent, normalised by
        the total length: each line counts its whitespace-separated words plus
        one (presumably for the end-of-sentence token — TODO confirm).
        """
        lines = doc.split("\n")
        total_log_score = 0
        for line in lines:
            total_log_score += self.model.score(line)
        total_length = sum(len(line.split()) + 1 for line in lines)
        return 10.0 ** (-total_log_score / total_length)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _download_if_missing(url: str, destination: str) -> None:
    """Fetch ``url`` into ``destination`` unless that file already exists."""
    if os.path.isfile(destination):
        return
    # Download under a temporary name and rename only on success, so an
    # interrupted transfer never leaves a truncated file that later runs
    # would mistake for a complete download.
    partial_path = f"{destination}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, destination)
    finally:
        # After a successful rename the temporary file no longer exists.
        if os.path.exists(partial_path):
            os.remove(partial_path)


def download_kenlm_model(language: str):
    """Download the KenLM files for ``language`` into the working directory.

    Fetches ``{language}.arpa.bin`` and ``{language}.sp.model`` from the
    cc_net model location. A file that is already present is not downloaded
    again.

    Args:
        language: Language code used to build the file names and URLs.
    """
    root_url = "http://dl.fbaipublicfiles.com/cc_net/lm"
    for file_name in (f"{language}.arpa.bin", f"{language}.sp.model"):
        _download_if_missing(f"{root_url}/{file_name}", file_name)
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
huggingface-hub==0.0.17
|
| 2 |
+
streamlit==0.84.1
|
| 3 |
+
transformers==4.11.3
|
| 4 |
+
watchdog==2.1.3
|
| 5 |
+
sentence-transformers==2.0.0
|
| 6 |
+
bokeh==2.2.2
|
| 7 |
+
umap-learn==0.5.1
|
| 8 |
+
numpy==1.20.0
|
| 9 |
+
embedding-lenses==0.2.0
|
| 10 |
+
https://github.com/kpu/kenlm/archive/master.zip
|