Spaces:

eaglelandsonce
/

BOW_Workflow

Sleeping

App Files Files Community

eaglelandsonce committed 26 days ago

Commit

3dd5cd9

verified ·

1 Parent(s): b041b2e

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -65

app.py CHANGED Viewed

@@ -5,12 +5,20 @@ from typing import List, Tuple, Dict
 import gradio as gr
 import nltk
-# Ensure NLTK resources are available at startup (safe to call repeatedly)
 def _ensure_nltk():
     try:
         nltk.data.find("tokenizers/punkt")
     except LookupError:
         nltk.download("punkt", quiet=True)
     try:
         nltk.data.find("corpora/stopwords")
     except LookupError:
@@ -22,39 +30,37 @@ from nltk.tokenize import sent_tokenize, word_tokenize
 from nltk.corpus import stopwords
 # ---------- Helpers ----------
-def read_text_input(text: str, file_obj) -> str:
     """
     Priority: if a file is provided, read it; otherwise use text box.
     Supports .txt and .docx (not legacy .doc).
     """
-    if file_obj is not None:
-        path = file_obj.name if hasattr(file_obj, "name") else str(file_obj)
         ext = os.path.splitext(path)[1].lower()
         if ext == ".txt":
-            with open(path, "r", encoding="utf-8", errors="ignore") as f:
-                return f.read()
         elif ext == ".docx":
             try:
                 from docx import Document
             except Exception as e:
-                return f"ERROR: python-docx not installed or failed to import: {e}"
             try:
                 doc = Document(path)
-                return "\n".join(p.text for p in doc.paragraphs)
             except Exception as e:
-                return f"ERROR reading .docx: {e}"
         else:
-            return "ERROR: Unsupported file type. Please upload .txt or .docx."
-    return text or ""
 def preprocess_tokens(tokens: List[str], clean: bool) -> List[str]:
-    """
-    Optionally lowercases and removes English stopwords.
-    Leaves punctuation/nums as-is (tokenizer keeps them); the Bag of Words
-    will reflect exactly what remains after stopword filtering.
-    """
     if not clean:
         return tokens
     stops = set(stopwords.words("english"))
@@ -90,16 +96,10 @@ def tokenize_pipeline(
 def build_sentence_vector(
     tokenized_sentences: List[List[str]], vocabulary: List[str], idx: int
 ) -> Dict[str, int]:
-    """
-    Count occurrences of each vocab term inside the selected sentence.
-    Returns a {word: count} mapping (only non-zero entries for clarity).
-    """
     if not tokenized_sentences or not vocabulary:
         return {}
     if idx < 0 or idx >= len(tokenized_sentences):
         return {}
     sent_tokens = tokenized_sentences[idx]
     counts = Counter(sent_tokens)
     vector = {word: counts[word] for word in vocabulary if counts[word] > 0}
@@ -107,7 +107,6 @@ def build_sentence_vector(
 # ---------- Gradio App ----------
 SAMPLE_TEXT = """NLTK is a powerful library for text processing.
 Text processing is essential for NLP tasks.
 Bag of Words is a fundamental concept in NLP.
@@ -124,15 +123,17 @@ with gr.Blocks(title="NLTK: Tokenize → Bag of Words → Sentence Vector") as d
     gr.Markdown(
         """
 # NLTK Mini-Workbench
-Type/paste text or drop a **.txt** / **.docx** file.
-Then click **Process** to:
 1) Install NLTK (auto-checked at startup)
 2) Tokenize sentences into words
 3) Count word occurrences (Bag of Words)
 4) Build a word-frequency vector for any selected sentence
-**Option:** Toggle *Stopword removal + lowercasing* to get a cleaner Bag of Words.
-> Note: Legacy `.doc` files are not supported—please convert to `.docx`.
 """
     )
@@ -177,7 +178,6 @@ Then click **Process** to:
             headers=["word", "count"],
             label="Bag of Words (sorted by count desc)",
             interactive=False,
-            wrap=True,
         )
     with gr.Tab("Sentence Vector"):
@@ -185,52 +185,74 @@ Then click **Process** to:
             headers=["word", "count"],
             label="Word-frequency vector for selected sentence",
             interactive=False,
-            wrap=True,
         )
-    # --------- Events ---------
     def on_process(text, file, clean):
-        # Ensure required NLTK bits exist (esp. for fresh environments)
-        _ensure_nltk()
-        raw_text = read_text_input(text, file)
-        # If read_text_input returned an error string, pass it through gracefully
-        if raw_text.startswith("ERROR"):
             return (
                 gr.update(choices=[], value=None),
                 [],
                 [],
                 [],
                 [],
                 [],
             )
-        sentences, tokenized_sentences, bow, vocab = tokenize_pipeline(raw_text, clean)
-        # Prepare UI artifacts
-        # Sentence dropdown: "1: <first 60 chars>"
-        dd_choices = [f"{i+1}: {s[:60].strip()}{'...' if len(s) > 60 else ''}" for i, s in enumerate(sentences)]
-        dd_value = dd_choices[0] if dd_choices else None
-        tokenized_json = {f"Sentence {i+1}": tokens for i, tokens in enumerate(tokenized_sentences)}
-        bow_rows = sorted(bow.items(), key=lambda kv: (-kv[1], kv[0]))
-        # Build initial vector for sentence 1 if available
-        vector_rows = []
-        if tokenized_sentences and vocab:
-            vec_map = build_sentence_vector(tokenized_sentences, vocab, 0)
-            vector_rows = [[w, c] for w, c in vec_map.items()]
-        return (
-            gr.update(choices=dd_choices, value=dd_value),
-            tokenized_json,
-            [[w, c] for w, c in bow_rows],
-            vector_rows,
-            sentences,
-            tokenized_sentences,
-            vocab,
-        )
     process_btn.click(
         fn=on_process,
@@ -243,6 +265,7 @@ Then click **Process** to:
             st_sentences,       # state: sentences
             st_tokenized,       # state: tokenized sentences
             st_vocab,           # state: vocabulary
         ],
     )
@@ -250,7 +273,6 @@ Then click **Process** to:
         if not choice or not tokenized_sentences or not vocabulary:
             return []
         try:
-            # Choice looks like "3: <preview>"
             idx = int(choice.split(":")[0]) - 1
         except Exception:
             return []
@@ -264,5 +286,4 @@ Then click **Process** to:
     )
 if __name__ == "__main__":
-    # Launch on http://127.0.0.1:7860
     demo.launch()

 import gradio as gr
 import nltk
+# ---------- NLTK bootstrap ----------
 def _ensure_nltk():
+    # NLTK 3.9+ needs both 'punkt' and 'punkt_tab'
     try:
         nltk.data.find("tokenizers/punkt")
     except LookupError:
         nltk.download("punkt", quiet=True)
+    try:
+        nltk.data.find("tokenizers/punkt_tab")
+    except LookupError:
+        try:
+            nltk.download("punkt_tab", quiet=True)
+        except Exception:
+            pass  # old NLTK won't have punkt_tab; 'punkt' is enough there
     try:
         nltk.data.find("corpora/stopwords")
     except LookupError:
 from nltk.corpus import stopwords
 # ---------- Helpers ----------
+def read_text_input(text: str, file_obj) -> Tuple[str, str]:
     """
     Priority: if a file is provided, read it; otherwise use text box.
     Supports .txt and .docx (not legacy .doc).
+    Returns (content, error_message). If error_message != "", content may be empty.
     """
+    if file_obj:
+        path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", str(file_obj))
         ext = os.path.splitext(path)[1].lower()
         if ext == ".txt":
+            try:
+                with open(path, "r", encoding="utf-8", errors="ignore") as f:
+                    return f.read(), ""
+            except Exception as e:
+                return "", f"❌ Error reading .txt: {e}"
         elif ext == ".docx":
             try:
                 from docx import Document
             except Exception as e:
+                return "", f"❌ python-docx import failed: {e}. Did you install requirements?"
             try:
                 doc = Document(path)
+                return "\n".join(p.text for p in doc.paragraphs), ""
             except Exception as e:
+                return "", f"❌ Error reading .docx: {e}"
         else:
+            return "", "❌ Unsupported file type. Please upload .txt or .docx (not legacy .doc)."
+    return (text or "", "")
 def preprocess_tokens(tokens: List[str], clean: bool) -> List[str]:
     if not clean:
         return tokens
     stops = set(stopwords.words("english"))
 def build_sentence_vector(
     tokenized_sentences: List[List[str]], vocabulary: List[str], idx: int
 ) -> Dict[str, int]:
     if not tokenized_sentences or not vocabulary:
         return {}
     if idx < 0 or idx >= len(tokenized_sentences):
         return {}
     sent_tokens = tokenized_sentences[idx]
     counts = Counter(sent_tokens)
     vector = {word: counts[word] for word in vocabulary if counts[word] > 0}
 # ---------- Gradio App ----------
 SAMPLE_TEXT = """NLTK is a powerful library for text processing.
 Text processing is essential for NLP tasks.
 Bag of Words is a fundamental concept in NLP.
     gr.Markdown(
         """
 # NLTK Mini-Workbench
+Type/paste text or drop a **.txt** / **.docx** file.
+**Pipeline**
 1) Install NLTK (auto-checked at startup)
 2) Tokenize sentences into words
 3) Count word occurrences (Bag of Words)
 4) Build a word-frequency vector for any selected sentence
+**Option:** Toggle *Stopword removal + lowercasing* for a cleaner Bag of Words.
+> Tip: Legacy `.doc` files are not supported—please convert to `.docx`.
 """
     )
             headers=["word", "count"],
             label="Bag of Words (sorted by count desc)",
             interactive=False,
         )
     with gr.Tab("Sentence Vector"):
             headers=["word", "count"],
             label="Word-frequency vector for selected sentence",
             interactive=False,
         )
+    status_md = gr.Markdown("", label="Status / Errors")
+    # --------- Events ---------
     def on_process(text, file, clean):
+        try:
+            _ensure_nltk()
+            raw_text, read_err = read_text_input(text, file)
+            if read_err:
+                return (
+                    gr.update(choices=[], value=None),
+                    {},
+                    [],
+                    [],
+                    [],
+                    [],
+                    [],
+                    f"**Status:** {read_err}",
+                )
+            sentences, tokenized_sentences, bow, vocab = tokenize_pipeline(raw_text, clean)
+            dd_choices = [f"{i+1}: {s[:60].strip()}{'...' if len(s) > 60 else ''}" for i, s in enumerate(sentences)]
+            dd_value = dd_choices[0] if dd_choices else None
+            tokenized_json = {f"Sentence {i+1}": tokens for i, tokens in enumerate(tokenized_sentences)}
+            bow_rows = sorted(bow.items(), key=lambda kv: (-kv[1], kv[0]))
+            vector_rows = []
+            if tokenized_sentences and vocab:
+                vec_map = build_sentence_vector(tokenized_sentences, vocab, 0)
+                vector_rows = [[w, c] for w, c in vec_map.items()]
+            status = f"✅ Processed {len(sentences)} sentence(s). Vocabulary size: {len(vocab)}."
+            return (
+                gr.update(choices=dd_choices, value=dd_value),
+                tokenized_json,
+                [[w, c] for w, c in bow_rows],
+                vector_rows,
+                sentences,
+                tokenized_sentences,
+                vocab,
+                status,
+            )
+        except LookupError as e:
+            # Common NLTK resource errors (e.g., punkt_tab)
             return (
                 gr.update(choices=[], value=None),
+                {},
                 [],
                 [],
                 [],
                 [],
                 [],
+                f"❌ NLTK resource error: {e}\n\nTry running:\n\n```\npython -m nltk.downloader punkt punkt_tab stopwords\n```",
+            )
+        except Exception as e:
+            return (
+                gr.update(choices=[], value=None),
+                {},
+                [],
+                [],
+                [],
+                [],
+                [],
+                f"❌ Unexpected error: {type(e).__name__}: {e}",
             )
     process_btn.click(
         fn=on_process,
             st_sentences,       # state: sentences
             st_tokenized,       # state: tokenized sentences
             st_vocab,           # state: vocabulary
+            status_md,          # status/errors
         ],
     )
         if not choice or not tokenized_sentences or not vocabulary:
             return []
         try:
             idx = int(choice.split(":")[0]) - 1
         except Exception:
             return []
     )
 if __name__ == "__main__":
     demo.launch()