Spaces:

codys12
/

NetCom-to-WooComerce

Runtime error

App Files Files Community

codys12 committed on May 14

Commit

e4dba80

verified ·

1 Parent(s): cf087b0

Update app.py

Browse files

Files changed (1) hide show

app.py +126 -154

app.py CHANGED Viewed

@@ -1,12 +1,17 @@
-"""NetCom → WooCommerce transformer (Try 2 schema — 100-parallel, de-dupe, pandas-fix)
-=====================================================================================
-*Accept CSV **or** Excel schedule files and output the WooCommerce CSV.*
-Latest tweak
-------------
-• **Logo map** now contains both `"Amazon Web Services"` *and* `"AWS"` keys
-  so either value in the *Vendor* column resolves to the same upload path.
-  (Everything else is untouched.)
 """
 from __future__ import annotations
@@ -24,58 +29,46 @@ import gradio_client.utils
 import openai
 import pandas as pd
-# -------- Gradio bool-schema hot-patch --------------------------------------
 _original = gradio_client.utils._json_schema_to_python_type
 def _fixed_json_schema_to_python_type(schema, defs=None):  # type: ignore
     if isinstance(schema, bool):
         return "any"
     return _original(schema, defs)
-gradio_client.utils._json_schema_to_python_type = (  # type: ignore
-    _fixed_json_schema_to_python_type
-)
-# -------- Tiny disk cache ----------------------------------------------------
-CACHE_DIR = Path("ai_response_cache")
-CACHE_DIR.mkdir(exist_ok=True)
 def _cache_path(p: str) -> Path:
     return CACHE_DIR / f"{hashlib.md5(p.encode()).hexdigest()}.json"
 def _get_cached(p: str) -> str | None:
     try:
         return json.loads(_cache_path(p).read_text("utf-8"))["response"]
     except Exception:
         return None
 def _set_cache(p: str, r: str) -> None:
     try:
         _cache_path(p).write_text(json.dumps({"prompt": p, "response": r}), "utf-8")
     except Exception:
         pass
-# -------- Async GPT helpers --------------------------------------------------
-_SEM = asyncio.Semaphore(100)  # ≤100 concurrent OpenAI calls
 _inflight: dict[str, asyncio.Future] = {}  # prompt → Future
 async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
-    """Single LLM call with cache, concurrency cap, and de-duplication."""
     cached = _get_cached(prompt)
     if cached is not None:
         return cached
-    # de-dup identical prompts already in-flight
-    existing = _inflight.get(prompt)
-    if existing is not None:
-        return await existing
     loop = asyncio.get_running_loop()
@@ -100,12 +93,8 @@ async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
     finally:
         _inflight.pop(prompt, None)
-async def _batch_async(
-    lst: list[str], instruction: str, client: openai.AsyncOpenAI
-) -> list[str]:
-    """Vectorised helper — returns an output list matching *lst* length."""
-    out: list[str] = ["" for _ in lst]
     idx, prompts = [], []
     for i, txt in enumerate(lst):
         if isinstance(txt, str) and txt.strip():
@@ -119,109 +108,122 @@ async def _batch_async(
         out[idx[j]] = val
     return out
-# -------- Core converter -----------------------------------------------------
 DEFAULT_PREREQ = (
     "No specific prerequisites are required for this course. Basic computer literacy and "
     "familiarity with fundamental concepts in the subject area are recommended for the best "
     "learning experience."
 )
 def _read(path: str) -> pd.DataFrame:
     if path.lower().endswith((".xlsx", ".xls")):
         return pd.read_excel(path)
     return pd.read_csv(path, encoding="latin1")
-async def _enrich_dataframe(
-    df: pd.DataFrame, dcol: str, ocol: str, pcol: str, acol: str
-) -> tuple[list[str], list[str], list[str], list[str], list[str]]:
-    """Run all LLM batches concurrently and return the five enrichment columns."""
     async with openai.AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) as client:
         sdesc, ldesc, fobj, fout = await asyncio.gather(
-            _batch_async(
-                df.get(dcol, "").fillna("").tolist(),
-                "Create a concise 250-character summary of this course description:",
-                client,
-            ),
-            _batch_async(
-                df.get(dcol, "").fillna("").tolist(),
-                "Condense this description to a maximum of 750 characters in paragraph format, with clean formatting:",
-                client,
-            ),
-            _batch_async(
-                df.get(ocol, "").fillna("").tolist(),
-                "Format these objectives into a bullet list with clean formatting. Start each bullet with '• ':",
-                client,
-            ),
-            _batch_async(
-                df.get(acol, "").fillna("").tolist(),
-                "Format this agenda into a bullet list with clean formatting. Start each bullet with '• ':",
-                client,
-            ),
         )
-        # prerequisites
         prereq_raw = df.get(pcol, "").fillna("").tolist()
-        fpre: list[str] = []
         for req in prereq_raw:
             if not str(req).strip():
                 fpre.append(DEFAULT_PREREQ)
             else:
-                formatted = await _batch_async(
-                    [req],
-                    "Format these prerequisites into a bullet list with clean formatting. Start each bullet with '• ':",
-                    client,
-                )
-                fpre.append(formatted[0])
     return sdesc, ldesc, fobj, fout, fpre
-def convert(path: str) -> BytesIO:
-    logos = {
-        "Amazon Web Services": "/wp-content/uploads/2025/04/aws.png",
-        "AWS": "/wp-content/uploads/2025/04/aws.png",
-        "Cisco": "/wp-content/uploads/2025/04/cisco-e1738593292198-1.webp",
-        "Microsoft": "/wp-content/uploads/2025/04/Microsoft-e1737494120985-1.png",
-        "Google Cloud": "/wp-content/uploads/2025/04/Google_Cloud.png",
-        "EC Council": "/wp-content/uploads/2025/04/Ec_Council.png",
-        "ITIL": "/wp-content/uploads/2025/04/ITIL.webp",
-        "PMI": "/wp-content/uploads/2025/04/PMI.png",
-        "Comptia": "/wp-content/uploads/2025/04/Comptia.png",
-        "Autodesk": "/wp-content/uploads/2025/04/autodesk.png",
-        "ISC2": "/wp-content/uploads/2025/04/ISC2.png",
-        "AICerts": "/wp-content/uploads/2025/04/aicerts-logo-1.png",
-    }
-    df = _read(path)
     df.columns = df.columns.str.strip()
-    first_col = lambda *candidates: next((c for c in candidates if c in df.columns), None)
     dcol = first_col("Description", "Decription")
     ocol = first_col("Objectives", "objectives")
     pcol = first_col("RequiredPrerequisite", "Required Pre-requisite")
     acol = first_col("Outline")
-    dur = first_col("Duration") or "Duration"
-    sid = first_col("Course SID", "ï»¿Course SID")
     if dur not in df.columns:
         df[dur] = ""
-    # ---------- LLM enrichment (async) -------------------------------------
     sdesc, ldesc, fobj, fout, fpre = asyncio.run(
         _enrich_dataframe(df, dcol, ocol, pcol, acol)
     )
-    df["Short_Description"] = sdesc
-    df["Condensed_Description"] = ldesc
-    df["Formatted_Objectives"] = fobj
-    df["Formatted_Agenda"] = fout
-    df["Formatted_Prerequisites"] = fpre
-    # ---------- Schedule aggregation --------------------------------------
     df["Course Start Date"] = pd.to_datetime(df["Course Start Date"], errors="coerce")
     df["Date_fmt"] = df["Course Start Date"].dt.strftime("%-m/%-d/%Y")
@@ -231,7 +233,6 @@ def convert(path: str) -> BytesIO:
         .apply(lambda s: ",".join(s.dropna().unique()))
         .reset_index(name="Dates")
     )
     t_agg = (
         dsorted.groupby("Course ID", group_keys=False)
         .apply(
@@ -245,10 +246,8 @@ def convert(path: str) -> BytesIO:
         )
         .reset_index(name="Times")
     )
     parents = dsorted.drop_duplicates("Course ID").merge(d_agg).merge(t_agg)
-    # ---------- Parent / child product rows --------------------------------
     parent = pd.DataFrame(
         {
             "Type": "variable",
@@ -334,66 +333,39 @@ def convert(path: str) -> BytesIO:
     all_rows = pd.concat([parent, child], ignore_index=True)
     order = [
-        "Type",
-        "SKU",
-        "Name",
-        "Published",
-        "Visibility in catalog",
-        "Short description",
-        "Description",
-        "Tax status",
-        "In stock?",
-        "Stock",
-        "Sold individually?",
-        "Regular price",
-        "Categories",
-        "Images",
-        "Parent",
-        "Brands",
-        "Attribute 1 name",
-        "Attribute 1 value(s)",
-        "Attribute 1 visible",
-        "Attribute 1 global",
-        "Attribute 2 name",
-        "Attribute 2 value(s)",
-        "Attribute 2 visible",
-        "Attribute 2 global",
-        "Attribute 3 name",
-        "Attribute 3 value(s)",
-        "Attribute 3 visible",
-        "Attribute 3 global",
-        "Meta: outline",
-        "Meta: days",
-        "Meta: location",
-        "Meta: overview",
-        "Meta: objectives",
-        "Meta: prerequisites",
-        "Meta: agenda",
     ]
     out = BytesIO()
     all_rows[order].to_csv(out, index=False, encoding="utf-8-sig")
     out.seek(0)
     return out
-# -------- Gradio wrappers ----------------------------------------------------
-def process_file(upload: gr.File) -> str:
-    csv_bytes = convert(upload.name)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
         tmp.write(csv_bytes.getvalue())
-        path = tmp.name
-    return path
 ui = gr.Interface(
-    fn=process_file,
-    inputs=gr.File(
-        label="Upload NetCom CSV / Excel", file_types=[".csv", ".xlsx", ".xls"]
-    ),
     outputs=gr.File(label="Download WooCommerce CSV"),
     title="NetCom → WooCommerce CSV Processor (Try 2)",
-    description="Upload NetCom schedule (.csv/.xlsx) to get the Try 2-formatted WooCommerce CSV.",
     analytics_enabled=False,
 )

+"""NetCom → WooCommerce transformer (Try 2 schema — persistent cache, 100-parallel,
+duplicate-safe, relative-logo paths, cache-preload)
+==================================================================================
+*Accept a NetCom schedule (CSV/XLSX) and **optionally** a *previous* WooCommerce
+CSV; output the fresh WooCommerce CSV.*
+New in this revision
+--------------------
+* **Relative** image paths kept (WooCommerce resolves them to your own domain).
+* Second optional file-input lets you *pre-load* the on-disk cache from a prior
+  run, so already-processed courses skip OpenAI completely.
+* Everything else (persistent cache in `/data`, 100-parallel semaphore,
+  in-flight de-duplication, pandas compatibility fix) remains unchanged.
 """
 from __future__ import annotations
 import openai
 import pandas as pd
+# ── Gradio bool-schema hot-patch ─────────────────────────────────────────────
 _original = gradio_client.utils._json_schema_to_python_type
 def _fixed_json_schema_to_python_type(schema, defs=None):  # type: ignore
     if isinstance(schema, bool):
         return "any"
     return _original(schema, defs)
+gradio_client.utils._json_schema_to_python_type = _fixed_json_schema_to_python_type  # type: ignore
+# ── Persistent disk cache (HF Spaces uses /data) ─────────────────────────────
+_PERSISTENT_ROOT = Path("/data")
+CACHE_DIR = (_PERSISTENT_ROOT if _PERSISTENT_ROOT.exists() else Path(".")) / "ai_response_cache"
+CACHE_DIR.mkdir(parents=True, exist_ok=True)
 def _cache_path(p: str) -> Path:
     """Return the cache file for prompt *p*: ``<md5 hex digest of p>.json`` in CACHE_DIR."""
     digest = hashlib.md5(p.encode()).hexdigest()
     return CACHE_DIR / f"{digest}.json"
 def _get_cached(p: str) -> str | None:
     """Return the cached response for prompt *p*, or ``None`` on a miss.

     A missing, unreadable, or malformed cache file counts as a miss. So does
     an entry whose stored ``"response"`` is not a string (for example a JSON
     ``NaN`` or ``null``), so callers only ever receive text.
     """
     try:
         response = json.loads(_cache_path(p).read_text("utf-8"))["response"]
     except (OSError, ValueError, KeyError, TypeError):
         # OSError: file absent or unreadable; ValueError: invalid UTF-8 or
         # invalid JSON; KeyError/TypeError: valid JSON of the wrong shape.
         return None
     return response if isinstance(response, str) else None
 def _set_cache(p: str, r: str) -> None:
     """Store response *r* for prompt *p* as a JSON file; failures are ignored.

     The file holds ``{"prompt": p, "response": r}`` encoded as UTF-8. Any
     error while building or writing it is swallowed, so a cache write can
     never abort the caller.
     """
     try:
         target = _cache_path(p)
         payload = json.dumps({"prompt": p, "response": r})
         target.write_text(payload, "utf-8")
     except Exception:
         return
+# ── OpenAI helpers: 100-parallel + de-dup ────────────────────────────────────
+_SEM = asyncio.Semaphore(100)              # ≤100 concurrent OpenAI calls
 _inflight: dict[str, asyncio.Future] = {}  # prompt → Future
 async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
     cached = _get_cached(prompt)
     if cached is not None:
         return cached
+    running = _inflight.get(prompt)
+    if running is not None:
+        return await running
     loop = asyncio.get_running_loop()
     finally:
         _inflight.pop(prompt, None)
+async def _batch_async(lst, instruction: str, client):
+    out = ["" for _ in lst]
     idx, prompts = [], []
     for i, txt in enumerate(lst):
         if isinstance(txt, str) and txt.strip():
         out[idx[j]] = val
     return out
+# ── Instructions (reuse across preload & gen) ────────────────────────────────
+DESC_SHORT = "Create a concise 250-character summary of this course description:"
+DESC_LONG  = "Condense this description to a maximum of 750 characters in paragraph format, with clean formatting:"
+OBJECTIVES = "Format these objectives into a bullet list with clean formatting. Start each bullet with '• ':"
+AGENDA     = "Format this agenda into a bullet list with clean formatting. Start each bullet with '• ':"
+PREREQ     = "Format these prerequisites into a bullet list with clean formatting. Start each bullet with '• ':"
+# ── Logo map (relative paths, with common aliases) ───────────────────────────
+logos = {
+    "Amazon Web Services": "/wp-content/uploads/2025/04/aws.png",
+    "AWS":                 "/wp-content/uploads/2025/04/aws.png",
+    "Cisco":               "/wp-content/uploads/2025/04/cisco-e1738593292198-1.webp",
+    "Microsoft":           "/wp-content/uploads/2025/04/Microsoft-e1737494120985-1.png",
+    "Google Cloud":        "/wp-content/uploads/2025/04/Google_Cloud.png",
+    "EC Council":          "/wp-content/uploads/2025/04/Ec_Council.png",
+    "ITIL":                "/wp-content/uploads/2025/04/ITIL.webp",
+    "PMI":                 "/wp-content/uploads/2025/04/PMI.png",
+    "Comptia":             "/wp-content/uploads/2025/04/Comptia.png",
+    "Autodesk":            "/wp-content/uploads/2025/04/autodesk.png",
+    "ISC2":                "/wp-content/uploads/2025/04/ISC2.png",
+    "AICerts":             "/wp-content/uploads/2025/04/aicerts-logo-1.png",
+}
 DEFAULT_PREREQ = (
     "No specific prerequisites are required for this course. Basic computer literacy and "
     "familiarity with fundamental concepts in the subject area are recommended for the best "
     "learning experience."
 )
+# ── Cache-preload from previous WooCommerce CSV ──────────────────────────────
+def _preload_cache(prev_csv: str, df_new: pd.DataFrame, dcol, ocol, pcol, acol):
+    """Seed the on-disk cache with completions from an earlier WooCommerce CSV."""
+    try:
+        prev = pd.read_csv(prev_csv, encoding="utf-8-sig")
+    except Exception:
+        return
+    prev_parent = prev[prev["Type"].str.startswith("variable", na=False)]
+    prev_map = {row["SKU"]: row for _, row in prev_parent.iterrows()}  # SKU == Course ID
+    for _, row in df_new.iterrows():
+        cid = row["Course ID"]
+        if cid not in prev_map:
+            continue
+        old = prev_map[cid]
+        desc = str(row[dcol])
+        obj  = str(row[ocol])
+        ag   = str(row[acol])
+        pre  = str(row[pcol])
+        _set_cache(f"{DESC_SHORT}\n\nText: {desc}", old.get("Short description", ""))
+        _set_cache(f"{DESC_LONG}\n\nText: {desc}", old.get("Description", ""))
+        _set_cache(f"{OBJECTIVES}\n\nText: {obj}",    old.get("Meta: objectives", ""))
+        _set_cache(f"{AGENDA}\n\nText: {ag}",         old.get("Meta: agenda", ""))
+        if pre.strip():
+            _set_cache(f"{PREREQ}\n\nText: {pre}",    old.get("Meta: prerequisites", ""))
+# ── Helper: read user file (CSV or Excel) ────────────────────────────────────
 def _read(path: str) -> pd.DataFrame:
     if path.lower().endswith((".xlsx", ".xls")):
         return pd.read_excel(path)
     return pd.read_csv(path, encoding="latin1")
+# ── Enrichment step (async batched LLM) ──────────────────────────────────────
+async def _enrich_dataframe(df, dcol, ocol, pcol, acol):
     async with openai.AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY")) as client:
         sdesc, ldesc, fobj, fout = await asyncio.gather(
+            _batch_async(df.get(dcol, "").fillna("").tolist(), DESC_SHORT, client),
+            _batch_async(df.get(dcol, "").fillna("").tolist(), DESC_LONG,  client),
+            _batch_async(df.get(ocol, "").fillna("").tolist(), OBJECTIVES, client),
+            _batch_async(df.get(acol, "").fillna("").tolist(), AGENDA,     client),
         )
         prereq_raw = df.get(pcol, "").fillna("").tolist()
+        fpre = []
         for req in prereq_raw:
             if not str(req).strip():
                 fpre.append(DEFAULT_PREREQ)
             else:
+                out = await _batch_async([req], PREREQ, client)
+                fpre.append(out[0])
     return sdesc, ldesc, fobj, fout, fpre
+# ── Main converter ───────────────────────────────────────────────────────────
+def convert(schedule_path: str, prev_csv_path: str | None = None) -> BytesIO:
+    df = _read(schedule_path)
     df.columns = df.columns.str.strip()
+    first_col = lambda *cand: next((c for c in cand if c in df.columns), None)
     dcol = first_col("Description", "Decription")
     ocol = first_col("Objectives", "objectives")
     pcol = first_col("RequiredPrerequisite", "Required Pre-requisite")
     acol = first_col("Outline")
+    dur  = first_col("Duration") or "Duration"
+    sid  = first_col("Course SID", "ï»¿Course SID")
     if dur not in df.columns:
         df[dur] = ""
+    # optional cache preload
+    if prev_csv_path:
+        _preload_cache(prev_csv_path, df, dcol, ocol, pcol, acol)
+    # async-enrich via LLM
     sdesc, ldesc, fobj, fout, fpre = asyncio.run(
         _enrich_dataframe(df, dcol, ocol, pcol, acol)
     )
+    df["Short_Description"]      = sdesc
+    df["Condensed_Description"]  = ldesc
+    df["Formatted_Objectives"]   = fobj
+    df["Formatted_Agenda"]       = fout
+    df["Formatted_Prerequisites"]= fpre
+    # schedule aggregation
     df["Course Start Date"] = pd.to_datetime(df["Course Start Date"], errors="coerce")
     df["Date_fmt"] = df["Course Start Date"].dt.strftime("%-m/%-d/%Y")
         .apply(lambda s: ",".join(s.dropna().unique()))
         .reset_index(name="Dates")
     )
     t_agg = (
         dsorted.groupby("Course ID", group_keys=False)
         .apply(
         )
         .reset_index(name="Times")
     )
     parents = dsorted.drop_duplicates("Course ID").merge(d_agg).merge(t_agg)
     parent = pd.DataFrame(
         {
             "Type": "variable",
     all_rows = pd.concat([parent, child], ignore_index=True)
     order = [
+        "Type","SKU","Name","Published","Visibility in catalog","Short description","Description",
+        "Tax status","In stock?","Stock","Sold individually?","Regular price","Categories","Images",
+        "Parent","Brands","Attribute 1 name","Attribute 1 value(s)","Attribute 1 visible","Attribute 1 global",
+        "Attribute 2 name","Attribute 2 value(s)","Attribute 2 visible","Attribute 2 global",
+        "Attribute 3 name","Attribute 3 value(s)","Attribute 3 visible","Attribute 3 global",
+        "Meta: outline","Meta: days","Meta: location","Meta: overview","Meta: objectives",
+        "Meta: prerequisites","Meta: agenda",
     ]
     out = BytesIO()
     all_rows[order].to_csv(out, index=False, encoding="utf-8-sig")
     out.seek(0)
     return out
+# ── Gradio interface ─────────────────────────────────────────────────────────
+def process_files(schedule: gr.File, previous: gr.File | None) -> str:
+    csv_bytes = convert(schedule.name, previous.name if previous else None)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
         tmp.write(csv_bytes.getvalue())
+        return tmp.name
 ui = gr.Interface(
+    fn=process_files,
+    inputs=[
+        gr.File(label="Upload NetCom schedule (.csv/.xlsx/.xls)", file_types=[".csv", ".xlsx", ".xls"]),
+        gr.File(label="Previous WooCommerce CSV (optional)", file_types=[".csv"], optional=True),
+    ],
     outputs=gr.File(label="Download WooCommerce CSV"),
     title="NetCom → WooCommerce CSV Processor (Try 2)",
+    description=(
+        "1. Upload the **latest NetCom schedule** file.\n"
+        "2. *(Optional)* Upload the **WooCommerce CSV** generated by a previous run to "
+        "pre-load the cache and skip already-processed courses."
+    ),
     analytics_enabled=False,
 )