Update app.py
app.py CHANGED
@@ -1,17 +1,19 @@
  1 | - """NetCom → WooCommerce transformer (Try
  2 | - duplicate
  3 | -
  4 |
  5 | -
  6 | - CSV; output the fresh WooCommerce CSV
  7 |
  8 | - New
  9 |   --------------------
 10 | - * **
 11 | -
 12 | -
 13 | - *
 14 | -
 15 |   """
 16 |
 17 |   from __future__ import annotations
@@ -23,44 +25,64 @@ import os
 23 |   import tempfile
 24 |   from io import BytesIO
 25 |   from pathlib import Path
 26 |
 27 |   import gradio as gr
 28 |   import gradio_client.utils
 29 |   import openai
 30 |   import pandas as pd
 31 |
 32 | - # ── Gradio bool
 33 |   _original = gradio_client.utils._json_schema_to_python_type
 34 |   def _fixed_json_schema_to_python_type(schema, defs=None):  # type: ignore
 35 |       if isinstance(schema, bool):
 36 |           return "any"
 37 |       return _original(schema, defs)
 38 |   gradio_client.utils._json_schema_to_python_type = _fixed_json_schema_to_python_type  # type: ignore
 39 |
 40 | - # ── Persistent disk cache (HF Spaces uses /data)
 41 |   _PERSISTENT_ROOT = Path("/data")
 42 |   CACHE_DIR = (_PERSISTENT_ROOT if _PERSISTENT_ROOT.exists() else Path(".")) / "ai_response_cache"
 43 |   CACHE_DIR.mkdir(parents=True, exist_ok=True)
 44 |
 45 |   def _cache_path(p: str) -> Path:
 46 |       return CACHE_DIR / f"{hashlib.md5(p.encode()).hexdigest()}.json"
 47 |
 48 |   def _get_cached(p: str) -> str | None:
 49 |       try:
 50 |           return json.loads(_cache_path(p).read_text("utf-8"))["response"]
 51 |       except Exception:
 52 |           return None
 53 |
 54 |   def _set_cache(p: str, r: str) -> None:
 55 |       try:
 56 |           _cache_path(p).write_text(json.dumps({"prompt": p, "response": r}), "utf-8")
 57 |       except Exception:
 58 |           pass
 59 |
 60 | - # ──
 61 |   _SEM = asyncio.Semaphore(100)               # ≤100 concurrent OpenAI calls
 62 |   _inflight: dict[str, asyncio.Future] = {}   # prompt → Future
 63 |
 64 |   async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
 65 |       cached = _get_cached(prompt)
 66 |       if cached is not None:
@@ -93,6 +115,7 @@ async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
 93 |       finally:
 94 |           _inflight.pop(prompt, None)
 95 |
 96 |   async def _batch_async(lst, instruction: str, client):
 97 |       out = ["" for _ in lst]
 98 |       idx, prompts = [], []
@@ -105,30 +128,30 @@
105 |
106 |       responses = await asyncio.gather(*[_gpt_async(client, p) for p in prompts])
107 |       for j, val in enumerate(responses):
108 | -         out[idx[j]] = val
109 |       return out
110 |
111 | - # ── Instructions (reuse across preload & gen)
112 |   DESC_SHORT = "Create a concise 250-character summary of this course description:"
113 |   DESC_LONG = "Condense this description to a maximum of 750 characters in paragraph format, with clean formatting:"
114 |   OBJECTIVES = "Format these objectives into a bullet list with clean formatting. Start each bullet with '• ':"
115 |   AGENDA = "Format this agenda into a bullet list with clean formatting. Start each bullet with '• ':"
116 |   PREREQ = "Format these prerequisites into a bullet list with clean formatting. Start each bullet with '• ':"
117 |
118 | - # ── Logo map (relative paths, with common aliases)
119 |   logos = {
120 | -     "Amazon Web Services": "/wp-content/uploads/2025/04/aws.png",
121 | -     "AWS": "/wp-content/uploads/2025/04/aws.png",
122 | -     "Cisco": "/wp-content/uploads/2025/04/cisco-e1738593292198-1.webp",
123 | -     "Microsoft": "/wp-content/uploads/2025/04/Microsoft-e1737494120985-1.png",
124 | -     "Google Cloud": "/wp-content/uploads/2025/04/Google_Cloud.png",
125 | -     "EC Council": "/wp-content/uploads/2025/04/Ec_Council.png",
126 | -     "ITIL": "/wp-content/uploads/2025/04/ITIL.webp",
127 | -     "PMI": "/wp-content/uploads/2025/04/PMI.png",
128 | -     "Comptia": "/wp-content/uploads/2025/04/Comptia.png",
129 | -     "Autodesk": "/wp-content/uploads/2025/04/autodesk.png",
130 | -     "ISC2": "/wp-content/uploads/2025/04/ISC2.png",
131 | -     "AICerts": "/wp-content/uploads/2025/04/aicerts-logo-1.png",
132 |   }
133 |
134 |   DEFAULT_PREREQ = (
@@ -137,9 +160,10 @@ DEFAULT_PREREQ = (
137 |       "learning experience."
138 |   )
139 |
140 | - # ── Cache
141 |   def _preload_cache(prev_csv: str, df_new: pd.DataFrame, dcol, ocol, pcol, acol):
142 | -     """Seed the on
143 |       try:
144 |           prev = pd.read_csv(prev_csv, encoding="utf-8-sig")
145 |       except Exception:
@@ -159,14 +183,15 @@ def _preload_cache(prev_csv: str, df_new: pd.DataFrame, dcol, ocol, pcol, acol):
159 |           ag = str(row[acol])
160 |           pre = str(row[pcol])
161 |
162 | -         _set_cache(f"{DESC_SHORT}\n\nText: {desc}", old.get("Short description", ""))
163 | -         _set_cache(f"{DESC_LONG}\n\nText: {desc}",
164 | -         _set_cache(f"{OBJECTIVES}\n\nText: {obj}", old.get("Meta: objectives", ""))
165 | -         _set_cache(f"{AGENDA}\n\nText: {ag}", old.get("Meta: agenda", ""))
166 |           if pre.strip():
167 | -             _set_cache(f"{PREREQ}\n\nText: {pre}", old.get("Meta: prerequisites", ""))
168 |
169 | - # ── Helper: read user file (CSV or Excel) ────────────────────────────────────
170 |   def _read(path: str) -> pd.DataFrame:
171 |       if path.lower().endswith((".xlsx", ".xls")):
172 |           return pd.read_excel(path)
@@ -191,9 +216,17 @@ async def _enrich_dataframe(df, dcol, ocol, pcol, acol):
191 |           out = await _batch_async([req], PREREQ, client)
192 |           fpre.append(out[0])
193 |
194 |       return sdesc, ldesc, fobj, fout, fpre
195 |
196 | - # ── Main converter
197 |   def convert(schedule_path: str, prev_csv_path: str | None = None) -> BytesIO:
198 |       df = _read(schedule_path)
199 |       df.columns = df.columns.str.strip()
@@ -203,36 +236,43 @@ def convert(schedule_path: str, prev_csv_path: str | None = None) -> BytesIO:
203 |       ocol = first_col("Objectives", "objectives")
204 |       pcol = first_col("RequiredPrerequisite", "Required Pre-requisite")
205 |       acol = first_col("Outline")
206 | -     dur = first_col("Duration") or "Duration"
207 |       sid = first_col("Course SID", "Course SID")
208 |
209 | -     if dur not in df.columns:
210 | -         df[dur] = ""
211 | -
212 |       # optional cache preload
213 |       if prev_csv_path:
214 |           _preload_cache(prev_csv_path, df, dcol, ocol, pcol, acol)
215 |
216 | -     # async
217 |       sdesc, ldesc, fobj, fout, fpre = asyncio.run(
218 |           _enrich_dataframe(df, dcol, ocol, pcol, acol)
219 |       )
220 | -     df["Short_Description"] = sdesc
221 | -     df["Condensed_Description"] = ldesc
222 | -     df["Formatted_Objectives"] = fobj
223 | -     df["Formatted_Agenda"] = fout
224 | -     df["Formatted_Prerequisites"]= fpre
225 |
226 | -
227 |       df["Course Start Date"] = pd.to_datetime(df["Course Start Date"], errors="coerce")
228 |       df["Date_fmt"] = df["Course Start Date"].dt.strftime("%-m/%-d/%Y")
229 |
230 |       dsorted = df.sort_values(["Course ID", "Course Start Date"])
231 |       d_agg = (
232 |           dsorted.groupby("Course ID")["Date_fmt"]
233 | -         .apply(lambda s: ",".join(s.dropna().unique()))
234 |           .reset_index(name="Dates")
235 |       )
236 |       t_agg = (
237 |           dsorted.groupby("Course ID", group_keys=False)
238 |           .apply(
@@ -246,7 +286,16 @@ def convert(schedule_path: str, prev_csv_path: str | None = None) -> BytesIO:
246 |           )
247 |           .reset_index(name="Times")
248 |       )
249 | -
250 |
251 |       parent = pd.DataFrame(
252 |           {
@@ -279,7 +328,7 @@ def convert(schedule_path: str, prev_csv_path: str | None = None) -> BytesIO:
279 |               "Attribute 3 visible": "visible",
280 |               "Attribute 3 global": 1,
281 |               "Meta: outline": parents["Formatted_Agenda"],
282 | -             "Meta: days": parents[
283 |               "Meta: location": "Virtual",
284 |               "Meta: overview": parents["Target Audience"],
285 |               "Meta: objectives": parents["Formatted_Objectives"],
@@ -322,7 +371,7 @@ def convert(schedule_path: str, prev_csv_path: str | None = None) -> BytesIO:
322 |               "Attribute 3 visible": "visible",
323 |               "Attribute 3 global": 1,
324 |               "Meta: outline": dsorted["Formatted_Agenda"],
325 | -             "Meta: days": dsorted[
326 |               "Meta: location": "Virtual",
327 |               "Meta: overview": dsorted["Target Audience"],
328 |               "Meta: objectives": dsorted["Formatted_Objectives"],
@@ -346,7 +395,8 @@ def convert(schedule_path: str, prev_csv_path: str | None = None) -> BytesIO:
346 |       out.seek(0)
347 |       return out
348 |
349 | - # ── Gradio interface
350 |   def process_files(schedule: gr.File, previous: gr.File | None) -> str:
351 |       csv_bytes = convert(schedule.name, previous.name if previous else None)
352 |       with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
@@ -360,7 +410,7 @@ ui = gr.Interface(
360 |           gr.File(label="Previous WooCommerce CSV (optional)", file_types=[".csv"]),
361 |       ],
362 |       outputs=gr.File(label="Download WooCommerce CSV"),
363 | -     title="NetCom → WooCommerce CSV Processor (Try
364 |       description=(
365 |           "1. Upload the **latest NetCom schedule** file.\n"
366 |           "2. *(Optional)* Upload the **WooCommerce CSV** generated by a previous run to "
  1 | + """NetCom → WooCommerce transformer (Try 3 schema → meta-days calc, sorted attributes, deduped AI sections, persistent cache, 100-parallel,
  2 | + duplicate-safe, relative-logo paths, cache-preload)
  3 | + ==============================================================================
  4 |
  5 | + Accept a NetCom schedule (CSV/XLSX) and **optionally** a *previous* WooCommerce
  6 | + CSV; output the fresh WooCommerce CSV.
  7 |
  8 | + New in this revision
  9 |   --------------------
 10 | + * **Meta days** automatically calculated as the inclusive span (in days) between
 11 | +   the earliest and latest course dates for each Course ID.
 12 | + * **Attribute 1 (Date)** lists are now guaranteed to be sorted chronologically.
 13 | + * All AI-generated sections (descriptions, objectives, agenda, prerequisites)
 14 | +   are post-processed to **deduplicate any repeated lines** inside each section.
 15 | + * Everything else (persistent cache in `/data`, 100-parallel semaphore,
 16 | +   in-flight de-duplication, pandas compatibility fix) remains unchanged.
 17 |   """
 18 |
 19 |   from __future__ import annotations
 25 |   import tempfile
 26 |   from io import BytesIO
 27 |   from pathlib import Path
 28 | + from typing import List
 29 |
 30 |   import gradio as gr
 31 |   import gradio_client.utils
 32 |   import openai
 33 |   import pandas as pd
 34 |
 35 | + # ── Gradio bool-schema hot-patch ─────────────────────────────────────────────
 36 |   _original = gradio_client.utils._json_schema_to_python_type
 37 | +
 38 |   def _fixed_json_schema_to_python_type(schema, defs=None):  # type: ignore
 39 |       if isinstance(schema, bool):
 40 |           return "any"
 41 |       return _original(schema, defs)
 42 | +
 43 |   gradio_client.utils._json_schema_to_python_type = _fixed_json_schema_to_python_type  # type: ignore
 44 |
 45 | + # ── Persistent disk cache (HF Spaces uses /data) ─────────────────────────────
 46 |   _PERSISTENT_ROOT = Path("/data")
 47 |   CACHE_DIR = (_PERSISTENT_ROOT if _PERSISTENT_ROOT.exists() else Path(".")) / "ai_response_cache"
 48 |   CACHE_DIR.mkdir(parents=True, exist_ok=True)
 49 |
 50 | +
 51 |   def _cache_path(p: str) -> Path:
 52 |       return CACHE_DIR / f"{hashlib.md5(p.encode()).hexdigest()}.json"
 53 |
 54 | +
 55 |   def _get_cached(p: str) -> str | None:
 56 |       try:
 57 |           return json.loads(_cache_path(p).read_text("utf-8"))["response"]
 58 |       except Exception:
 59 |           return None
 60 |
 61 | +
 62 |   def _set_cache(p: str, r: str) -> None:
 63 |       try:
 64 |           _cache_path(p).write_text(json.dumps({"prompt": p, "response": r}), "utf-8")
 65 |       except Exception:
 66 |           pass
 67 |
 68 | + # ── Helpers ──────────────────────────────────────────────────────────────────
 69 | +
 70 | + def _dedup_lines(txt: str) -> str:
 71 | +     """Remove duplicated lines while preserving order inside a block of text."""
 72 | +     seen = set()
 73 | +     out: List[str] = []
 74 | +     for raw in txt.splitlines():
 75 | +         line = raw.rstrip()
 76 | +         if line and line not in seen:
 77 | +             out.append(line)
 78 | +             seen.add(line)
 79 | +     return "\n".join(out)
 80 | +
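(Illustration of the new helper, not a line of app.py: it keeps only the first occurrence of each non-empty line, preserving order.)

    >>> _dedup_lines("• Overview\n• Overview\n• Labs")
    '• Overview\n• Labs'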
+
# ββ OpenAI helpers: 100βparallel + deβdup βββββββββββββββββββββββββββββββββββ
|
| 82 |
_SEM = asyncio.Semaphore(100) # β€100 concurrent OpenAI calls
|
| 83 |
_inflight: dict[str, asyncio.Future] = {} # prompt β Future
|
| 84 |
|
| 85 |
+
|
| 86 |
async def _gpt_async(client: openai.AsyncOpenAI, prompt: str) -> str:
|
| 87 |
cached = _get_cached(prompt)
|
| 88 |
if cached is not None:
|
|
|
|
115 |       finally:
116 |           _inflight.pop(prompt, None)
117 |
118 | +
119 |   async def _batch_async(lst, instruction: str, client):
120 |       out = ["" for _ in lst]
121 |       idx, prompts = [], []
128 |
129 |       responses = await asyncio.gather(*[_gpt_async(client, p) for p in prompts])
130 |       for j, val in enumerate(responses):
131 | +         out[idx[j]] = _dedup_lines(val)
132 |       return out
133 |
134 | + # ── Instructions (reuse across preload & gen) ────────────────────────────────
135 |   DESC_SHORT = "Create a concise 250-character summary of this course description:"
136 |   DESC_LONG = "Condense this description to a maximum of 750 characters in paragraph format, with clean formatting:"
137 |   OBJECTIVES = "Format these objectives into a bullet list with clean formatting. Start each bullet with '• ':"
138 |   AGENDA = "Format this agenda into a bullet list with clean formatting. Start each bullet with '• ':"
139 |   PREREQ = "Format these prerequisites into a bullet list with clean formatting. Start each bullet with '• ':"
140 |
141 | + # ── Logo map (relative paths, with common aliases) ───────────────────────────
142 |   logos = {
143 | +     "Amazon Web Services": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/aws.png",
144 | +     "AWS": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/aws.png",
145 | +     "Cisco": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/cisco-e1738593292198-1.webp",
146 | +     "Microsoft": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/Microsoft-e1737494120985-1.png",
147 | +     "Google Cloud": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/Google_Cloud.png",
148 | +     "EC Council": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/Ec_Council.png",
149 | +     "ITIL": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/ITIL.webp",
150 | +     "PMI": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/PMI.png",
151 | +     "Comptia": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/Comptia.png",
152 | +     "Autodesk": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/autodesk.png",
153 | +     "ISC2": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/ISC2.png",
154 | +     "AICerts": "https://staging.greathorizonslearning.com/wp-content/uploads/2025/04/aicerts-logo-1.png",
155 |   }
156 |
157 |   DEFAULT_PREREQ = (
160 |       "learning experience."
161 |   )
162 |
163 | + # ── Cache-preload from previous WooCommerce CSV ──────────────────────────────
164 | +
165 |   def _preload_cache(prev_csv: str, df_new: pd.DataFrame, dcol, ocol, pcol, acol):
166 | +     """Seed the on-disk cache with completions from an earlier WooCommerce CSV."""
167 |       try:
168 |           prev = pd.read_csv(prev_csv, encoding="utf-8-sig")
169 |       except Exception:
183 |           ag = str(row[acol])
184 |           pre = str(row[pcol])
185 |
186 | +         _set_cache(f"{DESC_SHORT}\n\nText: {desc}", _dedup_lines(old.get("Short description", "")))
187 | +         _set_cache(f"{DESC_LONG}\n\nText: {desc}", _dedup_lines(old.get("Description", "")))
188 | +         _set_cache(f"{OBJECTIVES}\n\nText: {obj}", _dedup_lines(old.get("Meta: objectives", "")))
189 | +         _set_cache(f"{AGENDA}\n\nText: {ag}", _dedup_lines(old.get("Meta: agenda", "")))
190 |           if pre.strip():
191 | +             _set_cache(f"{PREREQ}\n\nText: {pre}", _dedup_lines(old.get("Meta: prerequisites", "")))
192 | +
193 | + # ── Helper: read user file (CSV or Excel) ────────────────────────────────────
194 |
195 |   def _read(path: str) -> pd.DataFrame:
196 |       if path.lower().endswith((".xlsx", ".xls")):
197 |           return pd.read_excel(path)
216 |           out = await _batch_async([req], PREREQ, client)
217 |           fpre.append(out[0])
218 |
219 | +     # Ensure everything is deduped (safety).
220 | +     sdesc = [_dedup_lines(t) for t in sdesc]
221 | +     ldesc = [_dedup_lines(t) for t in ldesc]
222 | +     fobj = [_dedup_lines(t) for t in fobj]
223 | +     fout = [_dedup_lines(t) for t in fout]
224 | +     fpre = [_dedup_lines(t) for t in fpre]
225 | +
226 |       return sdesc, ldesc, fobj, fout, fpre
227 |
228 | + # ── Main converter ───────────────────────────────────────────────────────────
229 | +
230 |   def convert(schedule_path: str, prev_csv_path: str | None = None) -> BytesIO:
231 |       df = _read(schedule_path)
232 |       df.columns = df.columns.str.strip()
236 |       ocol = first_col("Objectives", "objectives")
237 |       pcol = first_col("RequiredPrerequisite", "Required Pre-requisite")
238 |       acol = first_col("Outline")
239 | +     dur = first_col("Duration") or "Duration"   # kept for backward-compat (unused)
240 |       sid = first_col("Course SID", "Course SID")
241 |
242 |       # optional cache preload
243 |       if prev_csv_path:
244 |           _preload_cache(prev_csv_path, df, dcol, ocol, pcol, acol)
245 |
246 | +     # async-enrich via LLM
247 |       sdesc, ldesc, fobj, fout, fpre = asyncio.run(
248 |           _enrich_dataframe(df, dcol, ocol, pcol, acol)
249 |       )
250 |
251 | +     df["Short_Description"] = sdesc
252 | +     df["Condensed_Description"] = ldesc
253 | +     df["Formatted_Objectives"] = fobj
254 | +     df["Formatted_Agenda"] = fout
255 | +     df["Formatted_Prerequisites"] = fpre
256 | +
257 | +     # schedule aggregation & meta-days calculation
258 |       df["Course Start Date"] = pd.to_datetime(df["Course Start Date"], errors="coerce")
259 |       df["Date_fmt"] = df["Course Start Date"].dt.strftime("%-m/%-d/%Y")
260 |
261 |       dsorted = df.sort_values(["Course ID", "Course Start Date"])
262 | +
263 | +     # "MetaDays" = inclusive span between earliest & latest dates per Course ID
264 | +     meta_days = (
265 | +         dsorted.groupby("Course ID")["Course Start Date"].agg(lambda s: (s.max() - s.min()).days + 1)
266 | +         .reset_index(name="MetaDays")
267 | +     )
268 | +
269 | +     # Attribute 1 list → ensure chronological order
270 |       d_agg = (
271 |           dsorted.groupby("Course ID")["Date_fmt"]
272 | +         .apply(lambda s: ",".join(sorted(s.dropna().unique(), key=lambda x: pd.to_datetime(x))))
273 |           .reset_index(name="Dates")
274 |       )
275 | +
276 |       t_agg = (
277 |           dsorted.groupby("Course ID", group_keys=False)
278 |           .apply(
286 |           )
287 |           .reset_index(name="Times")
288 |       )
289 | +
290 | +     parents = (
291 | +         dsorted.drop_duplicates("Course ID")
292 | +         .merge(d_agg)
293 | +         .merge(t_agg)
294 | +         .merge(meta_days)
295 | +     )
296 | +
297 | +     # propagate MetaDays to each schedule row
298 | +     dsorted = dsorted.merge(meta_days, on="Course ID", how="left")
299 |
300 |       parent = pd.DataFrame(
301 |           {
328 |               "Attribute 3 visible": "visible",
329 |               "Attribute 3 global": 1,
330 |               "Meta: outline": parents["Formatted_Agenda"],
331 | +             "Meta: days": parents["MetaDays"],
332 |               "Meta: location": "Virtual",
333 |               "Meta: overview": parents["Target Audience"],
334 |               "Meta: objectives": parents["Formatted_Objectives"],
371 |               "Attribute 3 visible": "visible",
372 |               "Attribute 3 global": 1,
373 |               "Meta: outline": dsorted["Formatted_Agenda"],
374 | +             "Meta: days": dsorted["MetaDays"],
375 |               "Meta: location": "Virtual",
376 |               "Meta: overview": dsorted["Target Audience"],
377 |               "Meta: objectives": dsorted["Formatted_Objectives"],
395 |       out.seek(0)
396 |       return out
397 |
398 | + # ── Gradio interface ─────────────────────────────────────────────────────────
399 | +
400 |   def process_files(schedule: gr.File, previous: gr.File | None) -> str:
401 |       csv_bytes = convert(schedule.name, previous.name if previous else None)
402 |       with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
410 |           gr.File(label="Previous WooCommerce CSV (optional)", file_types=[".csv"]),
411 |       ],
412 |       outputs=gr.File(label="Download WooCommerce CSV"),
413 | +     title="NetCom → WooCommerce CSV Processor (Try 3)",
414 |       description=(
415 |           "1. Upload the **latest NetCom schedule** file.\n"
416 |           "2. *(Optional)* Upload the **WooCommerce CSV** generated by a previous run to "
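For reference, a minimal standalone sketch (not from app.py) of the two date calculations this commit adds, using toy rows but the same pandas calls and column names as the convert() hunks above; the %-m/%-d strftime assumes a Linux strftime, as on Spaces:

    import pandas as pd

    rows = pd.DataFrame({
        "Course ID": ["AWS-101"] * 3,
        "Course Start Date": ["2025-05-05", "2025-05-01", "2025-05-03"],
    })
    rows["Course Start Date"] = pd.to_datetime(rows["Course Start Date"], errors="coerce")
    rows["Date_fmt"] = rows["Course Start Date"].dt.strftime("%-m/%-d/%Y")
    dsorted = rows.sort_values(["Course ID", "Course Start Date"])

    # Inclusive span per Course ID: (latest - earliest) + 1  ->  5/1 to 5/5 is 5 days
    meta_days = (
        dsorted.groupby("Course ID")["Course Start Date"]
        .agg(lambda s: (s.max() - s.min()).days + 1)
        .reset_index(name="MetaDays")
    )

    # Attribute 1 (Date) values, deduplicated and chronologically sorted
    d_agg = (
        dsorted.groupby("Course ID")["Date_fmt"]
        .apply(lambda s: ",".join(sorted(s.dropna().unique(), key=lambda x: pd.to_datetime(x))))
        .reset_index(name="Dates")
    )

    print(meta_days["MetaDays"].iloc[0])   # 5
    print(d_agg["Dates"].iloc[0])          # 5/1/2025,5/3/2025,5/5/2025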