"""NetCom → WooCommerce transformer (Try 3 schema → meta-days calc, sorted attributes,
deduped AI sections, persistent cache, 100-parallel, duplicate-safe, relative-logo paths,
cache-preload)
==============================================================================
Accept a NetCom schedule (CSV/XLSX) and **optionally** a *previous* WooCommerce
CSV; output the fresh WooCommerce CSV.

New in this revision
--------------------
* **Meta days** automatically calculated as the inclusive span (in days) between
  the earliest and latest course dates for each Course ID.
* **Attribute 1 (Date)** lists are now guaranteed to be sorted chronologically.
* All AI-generated sections (descriptions, objectives, agenda, prerequisites)
  are post-processed to **deduplicate any repeated lines** inside each section.
* Everything else (persistent cache in `/data`, 100-parallel semaphore,
  in-flight de-duplication, pandas compatibility fix) remains unchanged.
"""
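
# Runtime notes (summarizing the code below): OPENAI_API_KEY must be set for the
# enrichment calls, and cached responses live under /data/ai_response_cache when the
# persistent /data volume exists, otherwise under ./ai_response_cache.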

from __future__ import annotations

import asyncio
import hashlib
import json
import os
import tempfile
from io import BytesIO
from pathlib import Path
from typing import List

import gradio as gr
import gradio_client.utils
import openai
import pandas as pd

# ── Gradio bool-schema hot-patch ─────────────────────────────────────────────
_original = gradio_client.utils._json_schema_to_python_type


def _fixed_json_schema_to_python_type(schema, defs=None):  # type: ignore
    if isinstance(schema, bool):
        return "any"
    return _original(schema, defs)


gradio_client.utils._json_schema_to_python_type = _fixed_json_schema_to_python_type  # type: ignore
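
# Why the patch exists (assumption, inferred from the workaround itself rather than a
# pinned gradio issue): some component schemas carry a bare boolean where a dict is
# expected, e.g. {"type": "object", "additionalProperties": True}, and the stock
# _json_schema_to_python_type trips over it while building the API docs. Mapping bare
# booleans to "any" lets the Interface start up normally.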

# ── Persistent disk cache (HF Spaces uses /data) ─────────────────────────────
_PERSISTENT_ROOT = Path("/data")
CACHE_DIR = (_PERSISTENT_ROOT if _PERSISTENT_ROOT.exists() else Path(".")) / "ai_response_cache"
CACHE_DIR.mkdir(parents=True, exist_ok=True)


def _cache_path(p: str) -> Path:
    return CACHE_DIR / f"{hashlib.md5(p.encode()).hexdigest()}.json"


def _get_cached(p: str) -> str | None:
    try:
        return json.loads(_cache_path(p).read_text("utf-8"))["response"]
    except Exception:
        return None


def _set_cache(p: str, r: str) -> None:
    try:
        _cache_path(p).write_text(json.dumps({"prompt": p, "response": r}), "utf-8")
    except Exception:
        pass
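
# Example (illustrative): the cache key is the MD5 hex digest of the full prompt, so
#     _cache_path("hello") → CACHE_DIR / "5d41402abc4b2a76b9719d911017c592.json"
# Any change to a prompt string therefore produces a new cache entry.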

# ── Helpers ──────────────────────────────────────────────────────────────────
def _dedup_lines(txt: str) -> str:
    """Remove duplicated lines while preserving order inside a block of text."""
    seen = set()
    out: List[str] = []
    for raw in txt.splitlines():
        line = raw.rstrip()
        if line and line not in seen:
            out.append(line)
            seen.add(line)
    return "\n".join(out)
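
# Example (illustrative): _dedup_lines("• A\n• B\n• A\n") == "• A\n• B".
# Blank lines are dropped as well, since only non-empty, previously unseen lines survive.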

# ── OpenAI helpers: 100-parallel + de-dup ────────────────────────────────────
_SEM = asyncio.Semaphore(100)              # ≤100 concurrent OpenAI calls
_inflight: dict[str, asyncio.Future] = {}  # prompt → Future


async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
    cached = _get_cached(prompt)
    if cached is not None:
        return cached
    running = _inflight.get(prompt)
    if running is not None:
        return await running
    loop = asyncio.get_running_loop()

    async def _call_api() -> str:
        async with _SEM:
            try:
                msg = await client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0,
                )
                text = msg.choices[0].message.content
            except Exception as exc:
                text = f"Error: {exc}"
            _set_cache(prompt, text)
            return text

    task = loop.create_task(_call_api())
    _inflight[prompt] = task
    try:
        return await task
    finally:
        _inflight.pop(prompt, None)
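
# Concurrency sketch (illustrative): concurrent awaiters of the same prompt share one
# in-flight request instead of issuing duplicates:
#     r1, r2 = await asyncio.gather(_gpt_async(client, p), _gpt_async(client, p))
#     # → a single chat.completions.create() call; both awaiters receive its text.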

async def _batch_async(lst, instruction: str, client):
    out = ["" for _ in lst]
    idx, prompts = [], []
    for i, txt in enumerate(lst):
        if isinstance(txt, str) and txt.strip():
            idx.append(i)
            prompts.append(f"{instruction}\n\nText: {txt}")
    if not prompts:
        return out
    responses = await asyncio.gather(*[_gpt_async(client, p) for p in prompts])
    for j, val in enumerate(responses):
        out[idx[j]] = _dedup_lines(val)
    return out
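
# Example (illustrative): _batch_async(["foo", "", "bar"], DESC_SHORT, client) returns a
# three-element list whose middle entry stays ""; blank inputs never reach the API and
# every response keeps the position of its source row.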

# ── Instructions (reuse across preload & gen) ────────────────────────────────
DESC_SHORT = "Create a concise 250-character summary of this course description:"
DESC_LONG = "Condense this description to a maximum of 750 characters in paragraph format, with clean formatting:"
OBJECTIVES = "Format these objectives into a bullet list with clean formatting. Start each bullet with '• ':"
AGENDA = "Format this agenda into a bullet list with clean formatting. Start each bullet with '• ':"
PREREQ = "Format these prerequisites into a bullet list with clean formatting. Start each bullet with '• ':"

# ── Logo map (vendor → logo URL, with common aliases) ────────────────────────
logos = {
    "Amazon Web Services": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/aws.png",
    "AWS": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/aws.png",
    "Cisco": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/cisco-e1738593292198-1.webp",
    "Microsoft": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/Microsoft-e1737494120985-1.png",
    "Google Cloud": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/Google_Cloud.png",
    "EC Council": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/Ec_Council.png",
    "ITIL": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/ITIL.webp",
    "PMI": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/PMI.png",
    "Comptia": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/Comptia.png",
    "Autodesk": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/autodesk.png",
    "ISC2": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/ISC2.png",
    "AICerts": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/aicerts-logo-1.png",
}
DEFAULT_PREREQ = (
    "No specific prerequisites are required for this course. Basic computer literacy and "
    "familiarity with fundamental concepts in the subject area are recommended for the best "
    "learning experience."
)

# ── Cache-preload from previous WooCommerce CSV ──────────────────────────────
def _preload_cache(prev_csv: str, df_new: pd.DataFrame, dcol, ocol, pcol, acol):
    """Seed the on-disk cache with completions from an earlier WooCommerce CSV."""
    try:
        prev = pd.read_csv(prev_csv, encoding="utf-8-sig")
    except Exception:
        return
    prev_parent = prev[prev["Type"].str.startswith("variable", na=False)]
    prev_map = {row["SKU"]: row for _, row in prev_parent.iterrows()}  # SKU == Course ID

    def _old_text(prev_row, col: str) -> str:
        # Guard against missing columns / NaN cells in the previous export.
        val = prev_row.get(col, "")
        return "" if pd.isna(val) else str(val)

    for _, row in df_new.iterrows():
        cid = row["Course ID"]
        if cid not in prev_map:
            continue
        old = prev_map[cid]
        desc = str(row.get(dcol, ""))
        obj = str(row.get(ocol, ""))
        ag = str(row.get(acol, ""))
        pre = str(row.get(pcol, ""))
        _set_cache(f"{DESC_SHORT}\n\nText: {desc}", _dedup_lines(_old_text(old, "Short description")))
        _set_cache(f"{DESC_LONG}\n\nText: {desc}", _dedup_lines(_old_text(old, "Description")))
        _set_cache(f"{OBJECTIVES}\n\nText: {obj}", _dedup_lines(_old_text(old, "Meta: objectives")))
        _set_cache(f"{AGENDA}\n\nText: {ag}", _dedup_lines(_old_text(old, "Meta: agenda")))
        if pre.strip():
            _set_cache(f"{PREREQ}\n\nText: {pre}", _dedup_lines(_old_text(old, "Meta: prerequisites")))
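
# Note on the seeding effect: the keys written above are exactly the prompts that
# _gpt_async will later build for unchanged courses, so those rows are answered from the
# disk cache on the next run and never reach the OpenAI API.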

# ── Helper: read user file (CSV or Excel) ────────────────────────────────────
def _read(path: str) -> pd.DataFrame:
    if path.lower().endswith((".xlsx", ".xls")):
        return pd.read_excel(path)
    return pd.read_csv(path, encoding="latin1")

# ── Enrichment step (async batched LLM) ──────────────────────────────────────
async def _enrich_dataframe(df, dcol, ocol, pcol, acol):
    def _col_list(col) -> list:
        # Fall back to blanks when a column is missing, so a None column name
        # never reaches .fillna() and crashes with an AttributeError.
        if col is None or col not in df.columns:
            return [""] * len(df)
        return df[col].fillna("").tolist()

    async with openai.AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) as client:
        sdesc, ldesc, fobj, fout = await asyncio.gather(
            _batch_async(_col_list(dcol), DESC_SHORT, client),
            _batch_async(_col_list(dcol), DESC_LONG, client),
            _batch_async(_col_list(ocol), OBJECTIVES, client),
            _batch_async(_col_list(acol), AGENDA, client),
        )
        prereq_raw = _col_list(pcol)
        fpre = []
        for req in prereq_raw:
            if not str(req).strip():
                fpre.append(DEFAULT_PREREQ)
            else:
                out = await _batch_async([req], PREREQ, client)
                fpre.append(out[0])

    # Ensure everything is deduped (safety).
    sdesc = [_dedup_lines(t) for t in sdesc]
    ldesc = [_dedup_lines(t) for t in ldesc]
    fobj = [_dedup_lines(t) for t in fobj]
    fout = [_dedup_lines(t) for t in fout]
    fpre = [_dedup_lines(t) for t in fpre]
    return sdesc, ldesc, fobj, fout, fpre

# ── Main converter ───────────────────────────────────────────────────────────
def convert(schedule_path: str, prev_csv_path: str | None = None) -> BytesIO:
    df = _read(schedule_path)
    df.columns = df.columns.str.strip()

    first_col = lambda *cand: next((c for c in cand if c in df.columns), None)
    dcol = first_col("Description", "Decription")
    ocol = first_col("Objectives", "objectives")
    pcol = first_col("RequiredPrerequisite", "Required Pre-requisite")
    acol = first_col("Outline")
    dur = first_col("Duration") or "Duration"  # kept for backward-compat (unused)
    sid = first_col("Course SID")

    # optional cache preload
    if prev_csv_path:
        _preload_cache(prev_csv_path, df, dcol, ocol, pcol, acol)

    # async-enrich via LLM
    sdesc, ldesc, fobj, fout, fpre = asyncio.run(
        _enrich_dataframe(df, dcol, ocol, pcol, acol)
    )
    df["Short_Description"] = sdesc
    df["Condensed_Description"] = ldesc
    df["Formatted_Objectives"] = fobj
    df["Formatted_Agenda"] = fout
    df["Formatted_Prerequisites"] = fpre

    # schedule aggregation & meta-days calculation
    df["Course Start Date"] = pd.to_datetime(df["Course Start Date"], errors="coerce")
    # %-m/%-d is glibc-style (fine on the Linux runtime); Windows would need %#m/%#d.
    df["Date_fmt"] = df["Course Start Date"].dt.strftime("%-m/%-d/%Y")
    dsorted = df.sort_values(["Course ID", "Course Start Date"])

    # "MetaDays" = inclusive span between earliest & latest dates per Course ID
    meta_days = (
        dsorted.groupby("Course ID")["Course Start Date"]
        .agg(lambda s: (s.max() - s.min()).days + 1)
        .reset_index(name="MetaDays")
    )
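    # Worked example: a Course ID with sessions on 4/1, 4/2 and 4/4 of one year gets
    # MetaDays = (4/4 - 4/1).days + 1 = 4; the span counts calendar days inclusively,
    # not the number of scheduled sessions (3 here).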

    # Attribute 1 list: ensure chronological order
    d_agg = (
        dsorted.groupby("Course ID")["Date_fmt"]
        .apply(lambda s: ",".join(sorted(s.dropna().unique(), key=lambda x: pd.to_datetime(x))))
        .reset_index(name="Dates")
    )
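    # The explicit sort key matters: Date_fmt values are strings such as "10/6/2025",
    # which would sort lexically before "9/8/2025"; re-parsing with pd.to_datetime keeps
    # the Attribute 1 list in true calendar order.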

    t_agg = (
        dsorted.groupby("Course ID", group_keys=False)
        .apply(
            lambda g: ",".join(
                f"{st}-{et} {tz}"
                for st, et, tz in zip(
                    g["Course Start Time"], g["Course End Time"], g["Time Zone"]
                )
            ),
            include_groups=False,
        )
        .reset_index(name="Times")
    )
    parents = (
        dsorted.drop_duplicates("Course ID")
        .merge(d_agg)
        .merge(t_agg)
        .merge(meta_days)
    )
    # propagate MetaDays to each schedule row
    dsorted = dsorted.merge(meta_days, on="Course ID", how="left")

    parent = pd.DataFrame(
        {
            "Type": "variable",
            "SKU": parents["Course ID"],
            "Name": parents["Course Name"],
            "Published": 1,
            "Visibility in catalog": "visible",
            "Short description": parents["Short_Description"],
            "Description": parents["Condensed_Description"],
            "Tax status": "taxable",
            "In stock?": 1,
            "Stock": 1,
            "Sold individually?": 1,
            "Regular price": parents["SRP Pricing"].replace("[\\$,]", "", regex=True),
            "Categories": "courses",
            "Images": parents["Vendor"].map(logos).fillna(""),
            "Parent": "",
            "Brands": parents["Vendor"],
            "Attribute 1 name": "Date",
            "Attribute 1 value(s)": parents["Dates"],
            "Attribute 1 visible": "visible",
            "Attribute 1 global": 1,
            "Attribute 2 name": "Location",
            "Attribute 2 value(s)": "Virtual",
            "Attribute 2 visible": "visible",
            "Attribute 2 global": 1,
            "Attribute 3 name": "Time",
            "Attribute 3 value(s)": parents["Times"],
            "Attribute 3 visible": "visible",
            "Attribute 3 global": 1,
            "Meta: outline": parents["Formatted_Agenda"],
            "Meta: days": parents["MetaDays"],
            "Meta: location": "Virtual",
            "Meta: overview": parents["Target Audience"],
            "Meta: objectives": parents["Formatted_Objectives"],
            "Meta: prerequisites": parents["Formatted_Prerequisites"],
            "Meta: agenda": parents["Formatted_Agenda"],
        }
    )

    child = pd.DataFrame(
        {
            "Type": "variation, virtual",
            "SKU": dsorted[sid].astype(str).str.strip(),
            "Name": dsorted["Course Name"],
            "Published": 1,
            "Visibility in catalog": "visible",
            "Short description": dsorted["Short_Description"],
            "Description": dsorted["Condensed_Description"],
            "Tax status": "taxable",
            "In stock?": 1,
            "Stock": 1,
            "Sold individually?": 1,
            "Regular price": dsorted["SRP Pricing"].replace("[\\$,]", "", regex=True),
            "Categories": "courses",
            "Images": dsorted["Vendor"].map(logos).fillna(""),
            "Parent": dsorted["Course ID"],
            "Brands": dsorted["Vendor"],
            "Attribute 1 name": "Date",
            "Attribute 1 value(s)": dsorted["Date_fmt"],
            "Attribute 1 visible": "visible",
            "Attribute 1 global": 1,
            "Attribute 2 name": "Location",
            "Attribute 2 value(s)": "Virtual",
            "Attribute 2 visible": "visible",
            "Attribute 2 global": 1,
            "Attribute 3 name": "Time",
            "Attribute 3 value(s)": dsorted.apply(
                lambda r: f"{r['Course Start Time']}-{r['Course End Time']} {r['Time Zone']}",
                axis=1,
            ),
            "Attribute 3 visible": "visible",
            "Attribute 3 global": 1,
            "Meta: outline": dsorted["Formatted_Agenda"],
            "Meta: days": dsorted["MetaDays"],
            "Meta: location": "Virtual",
            "Meta: overview": dsorted["Target Audience"],
            "Meta: objectives": dsorted["Formatted_Objectives"],
            "Meta: prerequisites": dsorted["Formatted_Prerequisites"],
            "Meta: agenda": dsorted["Formatted_Agenda"],
        }
    )

    all_rows = pd.concat([parent, child], ignore_index=True)
    order = [
        "Type", "SKU", "Name", "Published", "Visibility in catalog", "Short description", "Description",
        "Tax status", "In stock?", "Stock", "Sold individually?", "Regular price", "Categories", "Images",
        "Parent", "Brands", "Attribute 1 name", "Attribute 1 value(s)", "Attribute 1 visible", "Attribute 1 global",
        "Attribute 2 name", "Attribute 2 value(s)", "Attribute 2 visible", "Attribute 2 global",
        "Attribute 3 name", "Attribute 3 value(s)", "Attribute 3 visible", "Attribute 3 global",
        "Meta: outline", "Meta: days", "Meta: location", "Meta: overview", "Meta: objectives",
        "Meta: prerequisites", "Meta: agenda",
    ]
    out = BytesIO()
    all_rows[order].to_csv(out, index=False, encoding="utf-8-sig")
    out.seek(0)
    return out
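
# Programmatic usage sketch (file names are placeholders, not files shipped with the app):
#     buf = convert("netcom_schedule.xlsx", "previous_woocommerce.csv")
#     Path("woocommerce_import.csv").write_bytes(buf.getvalue())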

# ── Gradio interface ─────────────────────────────────────────────────────────
def process_files(schedule: gr.File, previous: gr.File | None) -> str:
    csv_bytes = convert(schedule.name, previous.name if previous else None)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        tmp.write(csv_bytes.getvalue())
    return tmp.name


ui = gr.Interface(
    fn=process_files,
    inputs=[
        gr.File(label="Upload NetCom schedule (.csv/.xlsx/.xls)", file_types=[".csv", ".xlsx", ".xls"]),
        gr.File(label="Previous WooCommerce CSV (optional)", file_types=[".csv"]),
    ],
    outputs=gr.File(label="Download WooCommerce CSV"),
    title="NetCom → WooCommerce CSV Processor (Try 3)",
    description=(
        "1. Upload the **latest NetCom schedule** file.\n"
        "2. *(Optional)* Upload the **WooCommerce CSV** generated by a previous run to "
        "pre-load the cache and skip already-processed courses."
    ),
    analytics_enabled=False,
)

if __name__ == "__main__":
    if not os.getenv("OPENAI_API_KEY"):
        print("⚠️  OPENAI_API_KEY not set – AI features will error")
    ui.launch()