Upload folder using huggingface_hub

- README.md +7 -6
- app.py +169 -0
- data/slogan.csv +0 -0
- logic/cleaning.py +96 -0
- logic/search.py +45 -0
- requirements.txt +9 -0
- runtime.txt +1 -0
README.md
CHANGED

```diff
@@ -1,12 +1,13 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Slogan Finder
+emoji: 🏷️
+colorFrom: yellow
+colorTo: green
 sdk: gradio
-sdk_version:
+sdk_version: "4.0.0"
 app_file: app.py
 pinned: false
 ---
 
-
+# Slogan Finder
+Search **real slogans** (SBERT + FAISS) and get **1 AI-generated** suggestion.
```
app.py
ADDED

```python
import os, json, numpy as np, pandas as pd
import gradio as gr
import faiss

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from logic.cleaning import clean_dataframe
from logic.search import SloganSearcher

ASSETS_DIR = "assets"
DATA_PATH = "data/slogan.csv"

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
NORMALIZE = True

GEN_MODEL_NAME = "google/flan-t5-base"
NUM_GEN_CANDIDATES = 6
MAX_NEW_TOKENS = 24
TEMPERATURE = 0.9
TOP_P = 0.95
NOVELTY_SIM_THRESHOLD = 0.80

META_PATH = os.path.join(ASSETS_DIR, "meta.json")
PARQUET_PATH = os.path.join(ASSETS_DIR, "slogans_clean.parquet")
INDEX_PATH = os.path.join(ASSETS_DIR, "faiss.index")
EMB_PATH = os.path.join(ASSETS_DIR, "embeddings.npy")

def _log(m): print(f"[SLOGAN-SPACE] {m}", flush=True)

def _build_assets():
    if not os.path.exists(DATA_PATH):
        raise FileNotFoundError(f"Dataset not found at {DATA_PATH} (CSV with columns: 'tagline', 'description').")
    os.makedirs(ASSETS_DIR, exist_ok=True)

    _log(f"Loading dataset: {DATA_PATH}")
    df = pd.read_csv(DATA_PATH)

    _log(f"Rows before cleaning: {len(df)}")
    df = clean_dataframe(df)
    _log(f"Rows after cleaning: {len(df)}")

    if "description" in df.columns and df["description"].notna().any():
        texts = df["description"].fillna(df["tagline"]).astype(str).tolist()
        text_col, fallback_col = "description", "tagline"
    else:
        texts = df["tagline"].astype(str).tolist()
        text_col, fallback_col = "tagline", "tagline"

    _log(f"Encoding with {MODEL_NAME} (normalize={NORMALIZE}) …")
    encoder = SentenceTransformer(MODEL_NAME)
    emb = encoder.encode(texts, batch_size=64, convert_to_numpy=True, normalize_embeddings=NORMALIZE)

    dim = emb.shape[1]
    # Unit-normalized embeddings + inner product == cosine similarity; otherwise use L2.
    index = faiss.IndexFlatIP(dim) if NORMALIZE else faiss.IndexFlatL2(dim)
    index.add(emb)

    _log("Persisting assets …")
    df.to_parquet(PARQUET_PATH, index=False)
    faiss.write_index(index, INDEX_PATH)
    np.save(EMB_PATH, emb)

    meta = {
        "model_name": MODEL_NAME,
        "dim": int(dim),
        "normalized": NORMALIZE,
        "metric": "ip" if NORMALIZE else "l2",
        "row_count": int(len(df)),
        "text_col": text_col,
        "fallback_col": fallback_col,
    }
    with open(META_PATH, "w") as f:
        json.dump(meta, f, indent=2)
    _log("Assets built successfully.")

def _ensure_assets():
    need = False
    for p in (META_PATH, PARQUET_PATH, INDEX_PATH):
        if not os.path.exists(p):
            _log(f"Missing asset: {p}")
            need = True
    if need:
        _log("Building assets from scratch …")
        _build_assets()
        return
    try:
        pd.read_parquet(PARQUET_PATH)
    except Exception as e:
        _log(f"Parquet read failed ({e}); rebuilding assets.")
        _build_assets()

_ensure_assets()

searcher = SloganSearcher(assets_dir=ASSETS_DIR, use_rerank=False)

meta = json.load(open(META_PATH))
_encoder = SentenceTransformer(meta["model_name"])

_gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
_gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)

# ---- Prompt (adjust the wording to taste) ----
def _prompt_for(description: str) -> str:
    return (
        "You are a professional slogan writer. "
        "Write ONE original, catchy startup slogan under 8 words, Title Case, no punctuation. "
        "Do not copy examples. Description:\n"
        f"{description}\nSlogan:"
    )

def _generate_candidates(description: str, n: int = NUM_GEN_CANDIDATES):
    prompt = _prompt_for(description)
    # Tokenize the prompt once; num_return_sequences=n already yields n samples.
    # (Batching [prompt]*n AND setting num_return_sequences=n would return n*n outputs.)
    inputs = _gen_tokenizer(prompt, return_tensors="pt", truncation=True)
    outputs = _gen_model.generate(
        **inputs,
        do_sample=True,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        num_return_sequences=n,
        max_new_tokens=MAX_NEW_TOKENS,
        eos_token_id=_gen_tokenizer.eos_token_id,
    )
    texts = _gen_tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return [t.replace("Slogan:", "").strip().strip('"') for t in texts if t.strip()]

def _pick_most_novel(candidates, retrieved_texts):
    if not candidates:
        return None
    R = _encoder.encode(retrieved_texts, convert_to_numpy=True, normalize_embeddings=True) if retrieved_texts else None
    best, best_novelty = None, -1e9
    for c in candidates:
        c_emb = _encoder.encode([c], convert_to_numpy=True, normalize_embeddings=True)
        if R is None:
            max_sim = 0.0
        else:
            sims = np.dot(R, c_emb[0])  # cosine similarity: embeddings are unit-normalized
            max_sim = float(np.max(sims))
        novelty = 1.0 - max_sim
        # Keep the most novel candidate that clears the similarity threshold;
        # always accept the first candidate so non-empty input never returns None.
        if (max_sim < NOVELTY_SIM_THRESHOLD and novelty > best_novelty) or best is None:
            best, best_novelty = c, novelty
    return best

def run_pipeline(user_description: str):
    if not user_description or not user_description.strip():
        return "Please enter a description."
    retrieved_df = searcher.search(user_description, top_k=3, rerank_top_n=10)
    retrieved_texts = retrieved_df["display"].tolist() if not retrieved_df.empty else []
    gens = _generate_candidates(user_description, NUM_GEN_CANDIDATES)
    generated = _pick_most_novel(gens, retrieved_texts) or (gens[0] if gens else "—")
    lines = []
    lines.append("### 🔎 Top 3 similar slogans")
    if retrieved_texts:
        for i, s in enumerate(retrieved_texts, 1):
            lines.append(f"{i}. {s}")
    else:
        lines.append("_No similar slogans found._")
    lines.append("\n### ✨ AI-generated suggestion")
    lines.append(generated)
    return "\n".join(lines)

with gr.Blocks(title="Slogan Finder") as demo:
    gr.Markdown("# 🔎 Slogan Finder\nDescribe your product/company; get 3 similar slogans + 1 AI-generated suggestion.")
    query = gr.Textbox(label="Describe your product/company", placeholder="AI-powered patient financial navigation platform...")
    btn = gr.Button("Get slogans", variant="primary")
    out = gr.Markdown()
    btn.click(run_pipeline, inputs=[query], outputs=out)

demo.queue(max_size=64).launch()
```
data/slogan.csv
ADDED
The diff for this file is too large to render. See raw diff.
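The dataset itself is not rendered, but `_build_assets` in app.py documents the expected schema: a CSV with `tagline` and `description` columns. A hypothetical two-row file for local testing (values invented; tagline lengths chosen to pass the 20-60 character filter in logic/cleaning.py):

```python
import pandas as pd

# Invented rows; the real data/slogan.csv contents are not shown in this diff.
sample = pd.DataFrame({
    "tagline": [
        "Fresh Ideas Delivered Daily Straight To You",
        "Banking Made Beautifully Simple For Everyone",
    ],
    "description": [
        "A subscription service that mails creative writing prompts to teams.",
        "A mobile-first bank account for freelancers and small businesses.",
    ],
})
sample.to_csv("data/slogan.csv", index=False)
```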
logic/cleaning.py
ADDED

```python
import pandas as pd
import re, unicodedata
from html import unescape

# Filter thresholds for keeping a tagline.
MIN_LEN = 20
MAX_LEN = 60
KEEP_ASCII_ONLY = False
MIN_ALPHA_RATIO = 0.60
DROP_IF_ALL_CAPS = False

# Marketing clichés that disqualify a tagline.
BUZZY = {
    "synergy","cutting edge","cutting-edge","best in class","best-in-class",
    "world class","world-class","state of the art","state-of-the-art",
    "revolutionary","disruptive platform","next generation","next-gen",
    "leading provider","scalable solution"
}

URL_RE = re.compile(r"(https?://|www\.)\S+", re.I)
EMAIL_RE = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.I)
PHONE_RE = re.compile(r"(\+?\d[\d\-\s()]{6,}\d)")
WS_RE = re.compile(r"\s+")
PUNCT_RE = re.compile(r"[^\w\s]+")
TM_RE = re.compile(r"[®©™]")

def _nfkc(s): return unicodedata.normalize("NFKC", s)

def _clean_text(s: str) -> str:
    s = "" if s is None else str(s)
    s = unescape(s)
    s = _nfkc(s)
    s = s.replace("\n", " ").replace("\r", " ")
    s = TM_RE.sub("", s)
    s = WS_RE.sub(" ", s).strip()
    return s

def _alpha_ratio(s: str) -> float:
    if not s: return 0.0
    letters = sum(ch.isalpha() for ch in s)
    return letters / max(1, len(s))

def _looks_shouty(s: str) -> bool:
    letters = [ch for ch in s if ch.isalpha()]
    if not letters: return False
    uppers = sum(ch.isupper() for ch in letters)
    return uppers / len(letters) >= 0.85

def _contains_buzzy(s: str) -> bool:
    lo = s.lower()
    return any(term in lo for term in BUZZY)

def _has_junk(s: str) -> bool:
    return bool(URL_RE.search(s) or EMAIL_RE.search(s) or PHONE_RE.search(s))

def _ascii_only(s: str) -> bool:
    try:
        s.encode("ascii"); return True
    except Exception:
        return False

def _dupe_key(s: str) -> str:
    # Normalize case, punctuation and whitespace so near-duplicates collide.
    s = s.lower()
    s = PUNCT_RE.sub(" ", s)
    s = WS_RE.sub(" ", s).strip()
    return s

def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    if "tagline" not in df.columns:
        raise ValueError("Input must contain a 'tagline' column.")
    df = df.copy()
    if "description" not in df.columns:
        df["description"] = df["tagline"]

    df["tagline"] = df["tagline"].map(_clean_text)
    df["description"] = df["description"].map(_clean_text)

    df = df[df["tagline"].str.len() > 0]
    mask_junk = df["tagline"].map(_has_junk) | df["description"].map(_has_junk)
    df = df[~mask_junk]

    if KEEP_ASCII_ONLY:
        df = df[df["tagline"].map(_ascii_only)]

    df = df[df["tagline"].map(_alpha_ratio) >= MIN_ALPHA_RATIO]
    df = df[df["tagline"].str.len().between(MIN_LEN, MAX_LEN)]

    if DROP_IF_ALL_CAPS:
        df = df[~df["tagline"].map(_looks_shouty)]

    df = df[~df["tagline"].map(_contains_buzzy)]

    # Deduplicate on a normalized key, keeping the first occurrence.
    key = df["tagline"].map(_dupe_key)
    df = df.loc[~key.duplicated()].reset_index(drop=True)

    df.loc[df["description"].str.len() == 0, "description"] = df["tagline"]
    return df
```
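A quick way to sanity-check the filter chain, with a toy DataFrame (rows invented; expected survivors noted inline):

```python
import pandas as pd
from logic.cleaning import clean_dataframe

# Invented rows: one clean, one with a URL (junk filter), one under MIN_LEN.
raw = pd.DataFrame({
    "tagline": [
        "Fresh Ideas Delivered Daily",      # 27 chars, passes every filter
        "Visit www.example.com for deals",  # dropped by the URL regex
        "Too short",                        # dropped by the 20-60 length bound
    ],
    "description": ["Creative prompts for teams", "", ""],
})

clean = clean_dataframe(raw)
print(clean["tagline"].tolist())  # ['Fresh Ideas Delivered Daily']
```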
logic/search.py
ADDED

```python
import json, os
import numpy as np, pandas as pd
import faiss
from sentence_transformers import SentenceTransformer, CrossEncoder

class SloganSearcher:
    """FAISS-backed semantic search over the cleaned slogan table."""

    def __init__(self, assets_dir="assets", use_rerank=False, rerank_model="cross-encoder/stsb-roberta-base"):
        meta_path = os.path.join(assets_dir, "meta.json")
        if not os.path.exists(meta_path):
            raise FileNotFoundError(f"Missing {meta_path}. Build assets first.")
        with open(meta_path, "r") as f:
            self.meta = json.load(f)

        self.df = pd.read_parquet(os.path.join(assets_dir, "slogans_clean.parquet"))
        self.index = faiss.read_index(os.path.join(assets_dir, "faiss.index"))
        self.encoder = SentenceTransformer(self.meta["model_name"])

        self.use_rerank = use_rerank
        self.reranker = CrossEncoder(rerank_model) if use_rerank else None

        self.text_col = self.meta.get("text_col", "description")
        self.fallback_col = self.meta.get("fallback_col", "tagline")
        self.norm = bool(self.meta.get("normalized", True))

    def search(self, query: str, top_k=5, rerank_top_n=20):
        if not isinstance(query, str) or len(query.strip()) == 0:
            return pd.DataFrame(columns=["display", "score"] + (["rerank_score"] if self.use_rerank else []))
        q = self.encoder.encode([query], convert_to_numpy=True, normalize_embeddings=self.norm)
        # Over-fetch rerank_top_n candidates when reranking, otherwise just top_k.
        sims, idxs = self.index.search(q, max(int(top_k), int(rerank_top_n) if self.use_rerank else int(top_k)))
        idxs = idxs[0].tolist()
        sims = sims[0].tolist()
        results = self.df.iloc[idxs].copy()
        results["score"] = sims
        if self.use_rerank:
            texts = results[self.text_col].fillna(results[self.fallback_col]).astype(str).tolist()
            pairs = [[query, t] for t in texts]
            rr = self.reranker.predict(pairs)
            results["rerank_score"] = rr
            results = results.sort_values("rerank_score", ascending=False).head(int(top_k))
        else:
            results = results.head(int(top_k))
        results["display"] = results[self.fallback_col]
        cols = ["display", "score"] + (["rerank_score"] if self.use_rerank else [])
        return results[cols]
```
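Assuming app.py has already built the assets, the searcher can be exercised on its own; a minimal sketch:

```python
from logic.search import SloganSearcher

# Needs assets/meta.json, assets/slogans_clean.parquet, and assets/faiss.index,
# which app.py's _ensure_assets() creates on first launch.
searcher = SloganSearcher(assets_dir="assets", use_rerank=False)

hits = searcher.search("AI-powered patient financial navigation platform", top_k=3)
for _, row in hits.iterrows():
    print(f"{row['score']:.3f}  {row['display']}")
```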
requirements.txt
ADDED

```
gradio>=4.0.0
huggingface_hub>=0.23.0
sentence-transformers>=2.6.0
faiss-cpu>=1.8.0
pandas>=2.1.0
numpy>=1.26.0
pyarrow>=14.0.1
torch
transformers>=4.40.0
```
runtime.txt
ADDED

```
python-3.10
```