yair319732 committed
Commit 446fd19 · verified · 1 Parent(s): e3fb00d

Upload folder using huggingface_hub
Files changed (7)
  1. README.md +7 -6
  2. app.py +169 -0
  3. data/slogan.csv +0 -0
  4. logic/cleaning.py +96 -0
  5. logic/search.py +45 -0
  6. requirements.txt +9 -0
  7. runtime.txt +1 -0
README.md CHANGED
@@ -1,12 +1,13 @@
 ---
-title: Slogan2
-emoji: 🚀
-colorFrom: purple
-colorTo: indigo
+title: Slogan Finder
+emoji: 🏷️
+colorFrom: yellow
+colorTo: green
 sdk: gradio
-sdk_version: 5.43.1
+sdk_version: "4.0.0"
 app_file: app.py
 pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Slogan Finder
+Search **real slogans** (SBERT + FAISS) and get **1 AI-generated** suggestion.
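
Before the full `app.py` diff below, here is the retrieval half of that pipeline in isolation: a minimal sketch using the same encoder and index type as the app. The two-slogan corpus and the query are purely illustrative.

```python
import faiss
from sentence_transformers import SentenceTransformer

# Same model and metric as app.py; the tiny corpus is illustrative only.
encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
slogans = ["Banking Made Simple For Everyone", "Quality Coffee For Busy Mornings"]
emb = encoder.encode(slogans, convert_to_numpy=True, normalize_embeddings=True)

index = faiss.IndexFlatIP(emb.shape[1])  # inner product == cosine on unit vectors
index.add(emb)

q = encoder.encode(["a mobile-first bank"], convert_to_numpy=True, normalize_embeddings=True)
scores, ids = index.search(q, 1)
print(slogans[ids[0][0]], float(scores[0][0]))
```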
app.py ADDED
@@ -0,0 +1,169 @@
+import os, json, numpy as np, pandas as pd
+import gradio as gr
+import faiss
+
+from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+from logic.cleaning import clean_dataframe
+from logic.search import SloganSearcher
+
+ASSETS_DIR = "assets"
+DATA_PATH = "data/slogan.csv"
+
+MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+NORMALIZE = True
+
+GEN_MODEL_NAME = "google/flan-t5-base"
+NUM_GEN_CANDIDATES = 6
+MAX_NEW_TOKENS = 24
+TEMPERATURE = 0.9
+TOP_P = 0.95
+NOVELTY_SIM_THRESHOLD = 0.80
+
+META_PATH = os.path.join(ASSETS_DIR, "meta.json")
+PARQUET_PATH = os.path.join(ASSETS_DIR, "slogans_clean.parquet")
+INDEX_PATH = os.path.join(ASSETS_DIR, "faiss.index")
+EMB_PATH = os.path.join(ASSETS_DIR, "embeddings.npy")
+
+def _log(m): print(f"[SLOGAN-SPACE] {m}", flush=True)
+
+def _build_assets():
+    if not os.path.exists(DATA_PATH):
+        raise FileNotFoundError(f"Dataset not found at {DATA_PATH} (CSV with columns: 'tagline', 'description').")
+    os.makedirs(ASSETS_DIR, exist_ok=True)
+
+    _log(f"Loading dataset: {DATA_PATH}")
+    df = pd.read_csv(DATA_PATH)
+
+    _log(f"Rows before cleaning: {len(df)}")
+    df = clean_dataframe(df)
+    _log(f"Rows after cleaning: {len(df)}")
+
+    if "description" in df.columns and df["description"].notna().any():
+        texts = df["description"].fillna(df["tagline"]).astype(str).tolist()
+        text_col, fallback_col = "description", "tagline"
+    else:
+        texts = df["tagline"].astype(str).tolist()
+        text_col, fallback_col = "tagline", "tagline"
+
+    _log(f"Encoding with {MODEL_NAME} (normalize={NORMALIZE}) …")
+    encoder = SentenceTransformer(MODEL_NAME)
+    emb = encoder.encode(texts, batch_size=64, convert_to_numpy=True, normalize_embeddings=NORMALIZE)
+
+    dim = emb.shape[1]
+    index = faiss.IndexFlatIP(dim) if NORMALIZE else faiss.IndexFlatL2(dim)
+    index.add(emb)
+
+    _log("Persisting assets …")
+    df.to_parquet(PARQUET_PATH, index=False)
+    faiss.write_index(index, INDEX_PATH)
+    np.save(EMB_PATH, emb)
+
+    meta = {
+        "model_name": MODEL_NAME,
+        "dim": int(dim),
+        "normalized": NORMALIZE,
+        "metric": "ip" if NORMALIZE else "l2",
+        "row_count": int(len(df)),
+        "text_col": text_col,
+        "fallback_col": fallback_col,
+    }
+    with open(META_PATH, "w") as f:
+        json.dump(meta, f, indent=2)
+    _log("Assets built successfully.")
+
+def _ensure_assets():
+    need = False
+    for p in (META_PATH, PARQUET_PATH, INDEX_PATH):
+        if not os.path.exists(p):
+            _log(f"Missing asset: {p}")
+            need = True
+    if need:
+        _log("Building assets from scratch …")
+        _build_assets()
+        return
+    try:
+        pd.read_parquet(PARQUET_PATH)
+    except Exception as e:
+        _log(f"Parquet read failed ({e}); rebuilding assets.")
+        _build_assets()
+
+_ensure_assets()
+
+searcher = SloganSearcher(assets_dir=ASSETS_DIR, use_rerank=False)
+
+with open(META_PATH) as f:
+    meta = json.load(f)
+_encoder = SentenceTransformer(meta["model_name"])
+
+_gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
+_gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)
+
+# ---- Prompt (adjust if you want your exact wording) ----
+def _prompt_for(description: str) -> str:
+    return (
+        "You are a professional slogan writer. "
+        "Write ONE original, catchy startup slogan under 8 words, Title Case, no punctuation. "
+        "Do not copy examples. Description:\n"
+        f"{description}\nSlogan:"
+    )
+
+def _generate_candidates(description: str, n: int = NUM_GEN_CANDIDATES):
+    prompt = _prompt_for(description)
+    # Encode the prompt once; num_return_sequences=n samples n continuations.
+    # (Tokenizing [prompt]*n AND passing num_return_sequences=n would yield n*n outputs.)
+    inputs = _gen_tokenizer(prompt, return_tensors="pt", truncation=True)
+    outputs = _gen_model.generate(
+        **inputs,
+        do_sample=True,
+        temperature=TEMPERATURE,
+        top_p=TOP_P,
+        num_return_sequences=n,
+        max_new_tokens=MAX_NEW_TOKENS,
+        eos_token_id=_gen_tokenizer.eos_token_id,
+    )
+    texts = _gen_tokenizer.batch_decode(outputs, skip_special_tokens=True)
+    return [t.replace("Slogan:", "").strip().strip('"') for t in texts if t.strip()]
+
+def _pick_most_novel(candidates, retrieved_texts):
+    if not candidates:
+        return None
+    R = _encoder.encode(retrieved_texts, convert_to_numpy=True, normalize_embeddings=True) if retrieved_texts else None
+    best, best_novelty = None, -1e9
+    for c in candidates:
+        c_emb = _encoder.encode([c], convert_to_numpy=True, normalize_embeddings=True)
+        if R is None:
+            max_sim = 0.0
+        else:
+            sims = np.dot(R, c_emb[0])  # cosine similarity; embeddings are unit-normalized
+            max_sim = float(np.max(sims))
+        novelty = 1.0 - max_sim
+        # Prefer candidates under the similarity threshold; the "best is None" clause
+        # guarantees a fallback so non-empty input never returns None.
+        if (max_sim < NOVELTY_SIM_THRESHOLD or best is None) and novelty > best_novelty:
+            best, best_novelty = c, novelty
+    return best
+
+def run_pipeline(user_description: str):
+    if not user_description or not user_description.strip():
+        return "Please enter a description."
+    retrieved_df = searcher.search(user_description, top_k=3, rerank_top_n=10)
+    retrieved_texts = retrieved_df["display"].tolist() if not retrieved_df.empty else []
+    gens = _generate_candidates(user_description, NUM_GEN_CANDIDATES)
+    generated = _pick_most_novel(gens, retrieved_texts) or (gens[0] if gens else "—")
+    lines = ["### 🔎 Top 3 similar slogans"]
+    if retrieved_texts:
+        for i, s in enumerate(retrieved_texts, 1):
+            lines.append(f"{i}. {s}")
+    else:
+        lines.append("_No similar slogans found._")
+    lines.append("\n### ✨ AI-generated suggestion")
+    lines.append(generated)
+    return "\n".join(lines)
+
+with gr.Blocks(title="Slogan Finder") as demo:
+    gr.Markdown("# 🔎 Slogan Finder\nDescribe your product/company; get 3 similar slogans + 1 AI-generated suggestion.")
+    query = gr.Textbox(label="Describe your product/company", placeholder="AI-powered patient financial navigation platform...")
+    btn = gr.Button("Get slogans", variant="primary")
+    out = gr.Markdown()
+    btn.click(run_pipeline, inputs=[query], outputs=out)
+
+demo.queue(max_size=64).launch()
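
The one non-obvious step in `app.py` is the novelty filter in `_pick_most_novel`. Here is the same idea as a standalone sketch; the encoder name comes from the file above, and all strings are illustrative:

```python
import numpy as np
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
retrieved = ["Banking Made Simple For Everyone"]                 # illustrative
candidates = ["Banking Made Easy For All", "Your Money On Your Terms"]

R = encoder.encode(retrieved, convert_to_numpy=True, normalize_embeddings=True)
for c in candidates:
    c_emb = encoder.encode([c], convert_to_numpy=True, normalize_embeddings=True)[0]
    max_sim = float(np.max(R @ c_emb))  # cosine similarity on unit vectors
    print(f"{c!r}: novelty = {1.0 - max_sim:.3f}")
# app.py keeps the most novel candidate whose max_sim stays below 0.80,
# falling back to the first candidate if none qualifies.
```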
data/slogan.csv ADDED
The diff for this file is too large to render. See raw diff
 
logic/cleaning.py ADDED
@@ -0,0 +1,96 @@
+import pandas as pd
+import re, unicodedata
+from html import unescape
+
+MIN_LEN = 20
+MAX_LEN = 60
+KEEP_ASCII_ONLY = False
+MIN_ALPHA_RATIO = 0.60
+DROP_IF_ALL_CAPS = False
+
+BUZZY = {
+    "synergy","cutting edge","cutting-edge","best in class","best-in-class",
+    "world class","world-class","state of the art","state-of-the-art",
+    "revolutionary","disruptive platform","next generation","next-gen",
+    "leading provider","scalable solution"
+}
+
+URL_RE = re.compile(r"(https?://|www\.)\S+", re.I)
+EMAIL_RE = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}", re.I)
+PHONE_RE = re.compile(r"(\+?\d[\d\-\s()]{6,}\d)")
+WS_RE = re.compile(r"\s+")
+PUNCT_RE = re.compile(r"[^\w\s]+")
+TM_RE = re.compile(r"[®©™]")
+
+def _nfkc(s): return unicodedata.normalize("NFKC", s)
+
+def _clean_text(s: str) -> str:
+    # Treat None and NaN (from pandas CSV parsing) as empty, not the string "nan".
+    s = "" if s is None or (isinstance(s, float) and pd.isna(s)) else str(s)
+    s = unescape(s)
+    s = _nfkc(s)
+    s = s.replace("\n", " ").replace("\r", " ")
+    s = TM_RE.sub("", s)
+    s = WS_RE.sub(" ", s).strip()
+    return s
+
+def _alpha_ratio(s: str) -> float:
+    if not s: return 0.0
+    letters = sum(ch.isalpha() for ch in s)
+    return letters / max(1, len(s))
+
+def _looks_shouty(s: str) -> bool:
+    letters = [ch for ch in s if ch.isalpha()]
+    if not letters: return False
+    uppers = sum(ch.isupper() for ch in letters)
+    return uppers / len(letters) >= 0.85
+
+def _contains_buzzy(s: str) -> bool:
+    lo = s.lower()
+    return any(term in lo for term in BUZZY)
+
+def _has_junk(s: str) -> bool:
+    return bool(URL_RE.search(s) or EMAIL_RE.search(s) or PHONE_RE.search(s))
+
+def _ascii_only(s: str) -> bool:
+    try:
+        s.encode("ascii"); return True
+    except Exception:
+        return False
+
+def _dupe_key(s: str) -> str:
+    s = s.lower()
+    s = PUNCT_RE.sub(" ", s)
+    s = WS_RE.sub(" ", s).strip()
+    return s
+
+def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+    if "tagline" not in df.columns:
+        raise ValueError("Input must contain a 'tagline' column.")
+    df = df.copy()
+    if "description" not in df.columns:
+        df["description"] = df["tagline"]
+
+    df["tagline"] = df["tagline"].map(_clean_text)
+    df["description"] = df["description"].map(_clean_text)
+
+    # Drop empty taglines and rows containing URLs, emails, or phone numbers.
+    df = df[df["tagline"].str.len() > 0]
+    mask_junk = df["tagline"].map(_has_junk) | df["description"].map(_has_junk)
+    df = df[~mask_junk]
+
+    if KEEP_ASCII_ONLY:
+        df = df[df["tagline"].map(_ascii_only)]
+
+    df = df[df["tagline"].map(_alpha_ratio) >= MIN_ALPHA_RATIO]
+    df = df[df["tagline"].str.len().between(MIN_LEN, MAX_LEN)]
+
+    if DROP_IF_ALL_CAPS:
+        df = df[~df["tagline"].map(_looks_shouty)]
+
+    df = df[~df["tagline"].map(_contains_buzzy)]
+
+    # Deduplicate on a punctuation- and case-insensitive key.
+    key = df["tagline"].map(_dupe_key)
+    df = df.loc[~key.duplicated()].reset_index(drop=True)
+
+    df.loc[df["description"].str.len() == 0, "description"] = df["tagline"]
+    return df
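
A quick way to sanity-check these filters is a hypothetical smoke test; both rows below are made up:

```python
import pandas as pd
from logic.cleaning import clean_dataframe

df = pd.DataFrame({
    "tagline": ["Fresh Bread Baked Every Morning", "Best In Class Synergy Platform"],
    "description": ["", "enterprise software"],
})
print(clean_dataframe(df))
# -> one row survives: the second tagline trips the BUZZY filter, and the
#    first row's empty description is backfilled from its tagline.
```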
logic/search.py ADDED
@@ -0,0 +1,45 @@
+import json, os
+import numpy as np, pandas as pd
+import faiss
+from sentence_transformers import SentenceTransformer, CrossEncoder
+
+class SloganSearcher:
+    def __init__(self, assets_dir="assets", use_rerank=False, rerank_model="cross-encoder/stsb-roberta-base"):
+        meta_path = os.path.join(assets_dir, "meta.json")
+        if not os.path.exists(meta_path):
+            raise FileNotFoundError(f"Missing {meta_path}. Build assets first.")
+        with open(meta_path, "r") as f:
+            self.meta = json.load(f)
+
+        self.df = pd.read_parquet(os.path.join(assets_dir, "slogans_clean.parquet"))
+        self.index = faiss.read_index(os.path.join(assets_dir, "faiss.index"))
+        self.encoder = SentenceTransformer(self.meta["model_name"])
+
+        self.use_rerank = use_rerank
+        self.reranker = CrossEncoder(rerank_model) if use_rerank else None
+
+        self.text_col = self.meta.get("text_col", "description")
+        self.fallback_col = self.meta.get("fallback_col", "tagline")
+        self.norm = bool(self.meta.get("normalized", True))
+
+    def search(self, query: str, top_k=5, rerank_top_n=20):
+        if not isinstance(query, str) or len(query.strip()) == 0:
+            return pd.DataFrame(columns=["display", "score"] + (["rerank_score"] if self.use_rerank else []))
+        q = self.encoder.encode([query], convert_to_numpy=True, normalize_embeddings=self.norm)
+        # Fetch a wider pool when reranking, then cut back to top_k afterwards.
+        k = max(int(top_k), int(rerank_top_n)) if self.use_rerank else int(top_k)
+        sims, idxs = self.index.search(q, k)
+        results = self.df.iloc[idxs[0]].copy()
+        results["score"] = sims[0]
+        if self.use_rerank:
+            texts = results[self.text_col].fillna(results[self.fallback_col]).astype(str).tolist()
+            pairs = [[query, t] for t in texts]
+            results["rerank_score"] = self.reranker.predict(pairs)
+            results = results.sort_values("rerank_score", ascending=False).head(int(top_k))
+        else:
+            results = results.head(int(top_k))
+        results["display"] = results[self.fallback_col]
+        cols = ["display", "score"] + (["rerank_score"] if self.use_rerank else [])
+        return results[cols]
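
Typical usage of the searcher, assuming `assets/` has already been built by `app.py` on first launch:

```python
from logic.search import SloganSearcher

searcher = SloganSearcher(assets_dir="assets", use_rerank=False)
hits = searcher.search("AI-powered patient financial navigation platform", top_k=3)
print(hits)  # columns: display (tagline) and score (cosine similarity)
```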
requirements.txt ADDED
@@ -0,0 +1,9 @@
+gradio>=4.0.0
+huggingface_hub>=0.23.0
+sentence-transformers>=2.6.0
+faiss-cpu>=1.8.0
+pandas>=2.1.0
+numpy>=1.26.0
+pyarrow>=14.0.1
+torch
+transformers>=4.40.0
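
For local development, a hypothetical post-install sanity check that the pinned stack imports together (after `pip install -r requirements.txt`):

```python
# Verify the core dependencies import and report their versions.
import faiss, gradio, torch, transformers, sentence_transformers
print("gradio", gradio.__version__, "| transformers", transformers.__version__)
```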
runtime.txt ADDED
@@ -0,0 +1 @@
+python-3.10