Spaces:

eaglelandsonce
/

BOW_Workflow

Sleeping

App Files Files Community

eaglelandsonce committed on Aug 17

Commit

6a3bf81

verified ·

1 Parent(s): 2d115e8

Update app.py

Browse files

Files changed (1) hide show

app.py +238 -291

app.py CHANGED Viewed

@@ -1,321 +1,268 @@
-import io
 import os
-from typing import List, Tuple, Union
 import gradio as gr
 import nltk
-# -----------------------------------------------------------------------------
-# Force NLTK data into a local folder to avoid permissions/network issues
-# -----------------------------------------------------------------------------
-NLTK_DATA_DIR = os.path.join(os.path.dirname(__file__), "nltk_data")
-os.makedirs(NLTK_DATA_DIR, exist_ok=True)
-os.environ["NLTK_DATA"] = NLTK_DATA_DIR
-if NLTK_DATA_DIR not in nltk.data.path:
-    nltk.data.path.insert(0, NLTK_DATA_DIR)
-# Cover old/new resource names across recent NLTK releases
-NLTK_PACKAGES = [
-    # Tokenizers
-    "punkt", "punkt_tab",
-    # Stopwords / Lemmas
-    "stopwords", "wordnet", "omw-1.4",
-    # POS taggers (old and new english-specific)
-    "averaged_perceptron_tagger", "averaged_perceptron_tagger_eng",
-    # NE chunkers (old and new)
-    "maxent_ne_chunker", "maxent_ne_chunker_tab",
-    # Word lists used by NE chunker
-    "words",
-]
-def ensure_nltk_resources() -> str:
-    msgs = []
-    for pkg in NLTK_PACKAGES:
-        try:
-            # idempotent; will skip if already present
-            ok = nltk.download(pkg, download_dir=NLTK_DATA_DIR, quiet=True)
-            msgs.append(f"OK: {pkg}" if ok else f"Skipped: {pkg}")
-        except Exception as e:
-            msgs.append(f"Failed {pkg}: {e}")
-    return " | ".join(msgs) if msgs else "Resources checked."
-# Import after setting up data path
-from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
-from nltk.stem import PorterStemmer, WordNetLemmatizer
-from nltk import pos_tag
-from nltk.chunk import ne_chunk
-# -----------------------------------------------------------------------------
-# File reading helpers
-# -----------------------------------------------------------------------------
-def _read_bytes(path: str) -> bytes:
-    with open(path, "rb") as f:
-        return f.read()
-def _extract_from_docx_bytes(b: bytes) -> str:
-    try:
-        import docx  # python-docx
-    except ImportError:
-        return "ERROR: python-docx not installed. Add 'python-docx' to requirements.txt."
-    f = io.BytesIO(b)
-    doc = docx.Document(f)
-    return "\n".join(p.text for p in doc.paragraphs)
-def _extract_from_doc_bytes(b: bytes) -> str:
     """
-    Best-effort .doc (binary) support:
-    - If 'textract' is installed, use it.
-    - Otherwise, return a clear message telling the user to convert to .docx.
     """
-    try:
-        import textract  # optional
-    except Exception:
-        return ("ERROR: .doc files require optional dependency 'textract' "
-                "and system tools. Either `pip install textract` or convert "
-                "the file to .docx and try again.")
-    try:
-        text = textract.process(io.BytesIO(b))  # may still fail if system tools missing
-        return text.decode("utf-8", errors="replace")
-    except Exception as e:
-        return (f"ERROR: Could not extract text from .doc with textract: {e}. "
-                "Please convert the file to .docx and try again.")
-def read_file(upload: Union[str, dict, "gr.File", None]) -> str:
     """
-    Reads text from Gradio's File input. Supports .txt, .docx, and (optionally) .doc.
-    Works if `upload` is a path (str), a dict, or a file-like with .name/.read().
     """
-    if upload is None:
-        return ""
-    # Normalize to name/path/bytes
-    name, path, content = None, None, None
-    if isinstance(upload, str):
-        path = upload
-        name = os.path.basename(path)
-        content = _read_bytes(path)
-    elif isinstance(upload, dict):
-        # gradio sometimes passes {'name': '/tmp/..', 'orig_name': 'foo.txt', ...}
-        path = upload.get("name") or upload.get("path")
-        name = upload.get("orig_name") or (os.path.basename(path) if path else "")
-        if path and os.path.exists(path):
-            content = _read_bytes(path)
-    else:
-        # file-like
-        name = getattr(upload, "name", "") or ""
-        path = getattr(upload, "name", None)
-        try:
-            if path and os.path.exists(path):
-                content = _read_bytes(path)
-            else:
-                content = upload.read()
-        except Exception:
-            if path and os.path.exists(path):
-                content = _read_bytes(path)
-    if not name:
-        name = "(uploaded)"
-    if content is None:
-        return "ERROR: Could not read uploaded file."
-    ext = os.path.splitext(name)[1].lower()
-    if ext == ".txt":
-        # try common encodings
-        for enc in ("utf-8", "utf-16", "latin-1"):
-            try:
-                return content.decode(enc)
-            except UnicodeDecodeError:
-                continue
-        return "ERROR: Could not decode text file. Try UTF-8/plain text."
-    if ext == ".docx":
-        return _extract_from_docx_bytes(content)
-    if ext == ".doc":
-        return _extract_from_doc_bytes(content)
-    return f"Unsupported file type: {ext}. Please upload .txt, .docx, or .doc."
-# -----------------------------------------------------------------------------
-# NLP helpers
-# -----------------------------------------------------------------------------
-def extract_ner(ne_tree) -> List[Tuple[str, str]]:
-    entities = []
-    for subtree in ne_tree:
-        if hasattr(subtree, "label"):
-            label = subtree.label()
-            text = " ".join(token for token, _ in subtree.leaves())
-            entities.append((text, label))
-    return entities
-def process_text(raw_text: str, steps: List[str]) -> str:
-    if not raw_text or raw_text.strip() == "":
-        return "⚠️ No text provided."
-    # Ensure data locally (quiet)
-    ensure_nltk_resources()
-    report_lines = []
-    text = raw_text
-    # 1) Tokenize (required by later steps)
-    tokens = None
-    if "Tokenize text." in steps or any(
-        s in steps for s in [
-            "Remove stopwords.", "Stem words.", "Lemmatize words.",
-            "Tag parts of speech.", "Extract named entities."
-        ]
-    ):
-        tokens = word_tokenize(text)
-        if "Tokenize text." in steps:
-            report_lines.append("### Tokens")
-            report_lines.append(f"`{tokens}`\n")
-    # 2) Stopwords
-    filtered_tokens = tokens
-    if "Remove stopwords." in steps:
-        sw = set(stopwords.words("english"))
-        filtered_tokens = [w for w in (tokens or []) if w.lower() not in sw]
-        report_lines.append("### After Stopword Removal")
-        report_lines.append(f"`{filtered_tokens}`\n")
-    # 3) Stemming
-    stemmed_tokens = filtered_tokens
-    if "Stem words." in steps:
-        stemmer = PorterStemmer()
-        stemmed_tokens = [stemmer.stem(w) for w in (filtered_tokens or [])]
-        report_lines.append("### Stemmed Tokens (Porter)")
-        report_lines.append(f"`{stemmed_tokens}`\n")
-    # 4) Lemmatization
-    lemmatized_tokens = stemmed_tokens if stemmed_tokens is not None else filtered_tokens
-    if "Lemmatize words." in steps:
-        lemmatizer = WordNetLemmatizer()
-        lemmatized_tokens = [lemmatizer.lemmatize(w) for w in (filtered_tokens or [])]
-        report_lines.append("### Lemmatized Tokens (WordNet)")
-        report_lines.append(f"`{lemmatized_tokens}`\n")
-    # 5) POS Tagging
-    pos_tags_val = None
-    if "Tag parts of speech." in steps or "Extract named entities." in steps:
-        base_for_tagging = lemmatized_tokens if lemmatized_tokens is not None else (tokens or [])
-        pos_tags_val = pos_tag(base_for_tagging)
-        if "Tag parts of speech." in steps:
-            report_lines.append("### Part-of-Speech Tags")
-            rows = ["| Token | POS |", "|---|---|"]
-            rows += [f"| {t} | {p} |" for (t, p) in pos_tags_val]
-            report_lines.append("\n".join(rows) + "\n")
-    # 6) NER
-    if "Extract named entities." in steps:
-        if not pos_tags_val:
-            base_for_tagging = lemmatized_tokens if lemmatized_tokens is not None else (tokens or [])
-            pos_tags_val = pos_tag(base_for_tagging)
-        ne_tree = ne_chunk(pos_tags_val, binary=False)
-        ner_pairs = extract_ner(ne_tree)
-        report_lines.append("### Named Entities")
-        if ner_pairs:
-            rows = ["| Entity | Label |", "|---|---|"]
-            rows += [f"| {ent} | {lbl} |" for (ent, lbl) in ner_pairs]
-            report_lines.append("\n".join(rows) + "\n")
-        else:
-            report_lines.append("_No named entities found._\n")
-    return "\n".join(report_lines).strip() or "No steps selected."
-# -----------------------------------------------------------------------------
-# Gradio UI
-# -----------------------------------------------------------------------------
-MENU = [
-    "Install and download required resources.",
-    "Tokenize text.",
-    "Remove stopwords.",
-    "Stem words.",
-    "Lemmatize words.",
-    "Tag parts of speech.",
-    "Extract named entities.",
-]
-DEFAULT_TEXT = (
-    "NLTK is a powerful library for text processing. "
-    "Barack Obama served as the 44th President of the United States and lived in Washington, D.C."
-)
-with gr.Blocks(title="NLTK Text Processing Toolkit") as demo:
-    gr.Markdown("# NLTK Text Processing Toolkit")
     gr.Markdown(
-        "Type or paste text, or drop a `.txt`/`.docx`/`.doc` file. "
-        "Select steps and click **Process**. Use **Install/Download Resources** first if needed."
     )
     with gr.Row():
-        with gr.Column():
-            text_in = gr.Textbox(
-                label="Text Input",
-                lines=10,
-                value=DEFAULT_TEXT,
-                placeholder="Type or paste text here..."
-            )
-            file_in = gr.File(
-                label="...or drop a .txt / .docx / .doc file",
-                file_types=[".txt", ".docx", ".doc"]
-            )
-            steps_in = gr.CheckboxGroup(
-                choices=MENU,
-                value=[
-                    "Tokenize text.",
-                    "Remove stopwords.",
-                    "Lemmatize words.",
-                    "Tag parts of speech.",
-                    "Extract named entities.",
-                ],
-                label="Menu (choose one or more)"
-            )
-            with gr.Row():
-                install_btn = gr.Button("Install/Download Resources")
-                process_btn = gr.Button("Process", variant="primary")
-                clear_btn = gr.Button("Clear")
-        with gr.Column():
-            status_out = gr.Textbox(label="Status / Logs", interactive=False)
-            result_out = gr.Markdown(label="Results")
-    # Button callbacks
-    def on_install():
-        try:
-            return ensure_nltk_resources()
-        except Exception as e:
-            return f"Install error: {e}"
-    def on_process(text, file, steps):
-        try:
-            text = (text or "").strip()
-            file_text = read_file(file) if file is not None else ""
-            if not text and file_text:
-                text = file_text
-            if file_text.startswith("ERROR:") or file_text.startswith("Unsupported file type:"):
-                return file_text
-            return process_text(text, steps or [])
         except Exception:
-            import traceback
-            return "### Error\n```\n" + "".join(traceback.format_exc()) + "\n```"
-    def on_clear():
-        return "", ""
-    install_btn.click(fn=on_install, inputs=None, outputs=status_out)
-    process_btn.click(fn=on_process, inputs=[text_in, file_in, steps_in], outputs=result_out)
-    clear_btn.click(fn=on_clear, inputs=None, outputs=[status_out, result_out])
 if __name__ == "__main__":
-    # If you need external access, set server_name="0.0.0.0"
     demo.launch()

 import os
+from collections import Counter
+from typing import List, Tuple, Dict
 import gradio as gr
 import nltk
+# Ensure NLTK resources are available at startup (safe to call repeatedly)
+def _ensure_nltk():
+    """
+    Download the NLTK resources this app relies on when they are missing.
+
+    Each resource is probed with ``nltk.data.find`` and only downloaded on a
+    ``LookupError``, so calling this repeatedly is safe.
+
+    ``punkt_tab`` is requested alongside ``punkt``: newer NLTK releases load
+    the sentence/word tokenizer data under that name, while older releases
+    still use ``punkt``. Requesting both covers either install.
+    """
+    required = (
+        ("tokenizers/punkt", "punkt"),
+        ("tokenizers/punkt_tab", "punkt_tab"),
+        ("corpora/stopwords", "stopwords"),
+    )
+    for resource_path, package in required:
+        try:
+            nltk.data.find(resource_path)
+        except LookupError:
+            # quiet=True keeps the downloader's progress output out of the logs.
+            nltk.download(package, quiet=True)
+_ensure_nltk()
+from nltk.tokenize import sent_tokenize, word_tokenize
 from nltk.corpus import stopwords
+# ---------- Helpers ----------
+def read_text_input(text: str, file_obj) -> str:
+    """
+    Return the text to process, preferring an uploaded file over the text box.
+
+    Supports .txt and .docx (not legacy .doc).
+
+    Args:
+        text: Text-box contents; used only when ``file_obj`` is None.
+        file_obj: None, a path string, or an object whose ``name`` attribute
+            holds the path of the uploaded file.
+
+    Returns:
+        The file's text, or ``text`` (``""`` when it is falsy). Failures are
+        reported as a string starting with "ERROR" rather than raised, so the
+        caller can detect them with ``startswith("ERROR")``.
+    """
+    if file_obj is None:
+        return text or ""
+    path = file_obj.name if hasattr(file_obj, "name") else str(file_obj)
+    ext = os.path.splitext(path)[1].lower()
+    if ext == ".txt":
+        try:
+            # utf-8-sig reads plain UTF-8 unchanged but drops a leading BOM,
+            # which would otherwise be glued to the first token as "\ufeff".
+            with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
+                return f.read()
+        except OSError as e:
+            # Same convention as the .docx branch: report, do not raise.
+            return f"ERROR reading .txt: {e}"
+    if ext == ".docx":
+        try:
+            from docx import Document
+        except Exception as e:
+            return f"ERROR: python-docx not installed or failed to import: {e}"
+        try:
+            doc = Document(path)
+            return "\n".join(p.text for p in doc.paragraphs)
+        except Exception as e:
+            return f"ERROR reading .docx: {e}"
+    return "ERROR: Unsupported file type. Please upload .txt or .docx."
+def preprocess_tokens(tokens: List[str], clean: bool) -> List[str]:
     """
+    Normalise a token list ahead of Bag-of-Words counting.
+    When ``clean`` is false the input list is handed back untouched.
+    Otherwise every token is lowercased and English stopwords are dropped;
+    punctuation and numeric tokens are not specifically filtered.
     """
+    if clean:
+        english_stops = set(stopwords.words("english"))
+        lowered = (tok.lower() for tok in tokens)
+        return [tok for tok in lowered if tok not in english_stops]
+    return tokens
+def tokenize_pipeline(
+    raw_text: str, clean: bool
+) -> Tuple[List[str], List[List[str]], Counter, List[str]]:
     """
+    Turn raw text into sentences, per-sentence tokens and a Bag of Words.
+    - Sentences come from sent_tokenize, words from word_tokenize
+    - Each token list is passed through preprocess_tokens with ``clean``
+    - The Bag of Words counts tokens across all sentences
+    Returns: sentences, tokenized_sentences, bow_counter, vocabulary_list
+    (vocabulary is the sorted set of counted tokens). Blank input yields
+    empty containers.
     """
+    if not raw_text.strip():
+        return [], [], Counter(), []
+    sentences = sent_tokenize(raw_text)
+    tokenized_sentences = [
+        preprocess_tokens(word_tokenize(sentence), clean=clean)
+        for sentence in sentences
+    ]
+    bow = Counter()
+    for sentence_tokens in tokenized_sentences:
+        bow.update(sentence_tokens)
+    vocabulary = sorted(bow)
+    return sentences, tokenized_sentences, bow, vocabulary
+def build_sentence_vector(
+    tokenized_sentences: List[List[str]], vocabulary: List[str], idx: int
+) -> Dict[str, int]:
+    """
+    Build the word-frequency mapping for one sentence.
+    Only vocabulary terms that occur in sentence ``idx`` are included, ordered
+    by descending count and then alphabetically. An empty mapping is returned
+    when either input is empty or ``idx`` is out of range.
+    """
+    if not (tokenized_sentences and vocabulary) or not (
+        0 <= idx < len(tokenized_sentences)
+    ):
+        return {}
+    frequency = Counter(tokenized_sentences[idx])
+    present = [(term, frequency[term]) for term in vocabulary if frequency[term] > 0]
+    present.sort(key=lambda pair: (-pair[1], pair[0]))
+    return dict(present)
+# ---------- Gradio App ----------
+SAMPLE_TEXT = """NLTK is a powerful library for text processing.
+Text processing is essential for NLP tasks.
+Bag of Words is a fundamental concept in NLP.
+Tokenization splits sentences into words.
+We can count word occurrences in text.
+Word frequency vectors represent sentences numerically.
+Vectorization helps in transforming text for machine learning.
+Machine learning models can use BOW as input.
+NLP tasks include classification and sentiment analysis.
+Word frequency counts provide insight into text structure.
+"""
+with gr.Blocks(title="NLTK: Tokenize → Bag of Words → Sentence Vector") as demo:
     gr.Markdown(
+        """
+# NLTK Mini-Workbench
+Type/paste text or drop a **.txt** / **.docx** file.
+Then click **Process** to:
+1) Install NLTK (auto-checked at startup)
+2) Tokenize sentences into words
+3) Count word occurrences (Bag of Words)
+4) Build a word-frequency vector for any selected sentence
+**Option:** Toggle *Stopword removal + lowercasing* to get a cleaner Bag of Words.
+> Note: Legacy `.doc` files are not supported—please convert to `.docx`.
+"""
     )
     with gr.Row():
+        text_in = gr.Textbox(
+            label="Input Text",
+            value=SAMPLE_TEXT,
+            lines=12,
+            placeholder="Paste text here, or upload a file instead...",
+        )
+        file_in = gr.File(
+            label="Or upload a file (.txt or .docx)",
+            file_types=[".txt", ".docx"],
+            type="filepath",
+        )
+    clean_opt = gr.Checkbox(
+        label="Stopword removal + lowercasing",
+        value=True,
+        info='Removes common English stopwords (e.g., "is", "for", "the") and lowercases tokens.',
+    )
+    process_btn = gr.Button("Process", variant="primary")
+    # Hidden state to carry processed artifacts between events
+    st_sentences = gr.State([])
+    st_tokenized = gr.State([])
+    st_vocab = gr.State([])
+    with gr.Row():
+        sentence_dropdown = gr.Dropdown(
+            choices=[],
+            label="Select a sentence to vectorize",
+            interactive=True,
+        )
+    with gr.Tab("Tokenized Sentences"):
+        tokenized_out = gr.JSON(label="Tokens per sentence")
+    with gr.Tab("Bag of Words"):
+        bow_df = gr.Dataframe(
+            headers=["word", "count"],
+            label="Bag of Words (sorted by count desc)",
+            interactive=False,
+            wrap=True,
+        )
+    with gr.Tab("Sentence Vector"):
+        vec_df = gr.Dataframe(
+            headers=["word", "count"],
+            label="Word-frequency vector for selected sentence",
+            interactive=False,
+            wrap=True,
+        )
+    # --------- Events ---------
+    def on_process(text, file, clean):
+        """
+        Run the full pipeline for the Process button.
+
+        Returns seven values, one per output wired to the click event:
+        dropdown update, per-sentence tokens, Bag-of-Words rows, vector rows
+        for the first sentence, then the sentences, tokenized sentences and
+        vocabulary that are kept in state.
+        """
+        # Ensure required NLTK bits exist (esp. for fresh environments)
+        _ensure_nltk()
+        raw_text = read_text_input(text, file)
+        # read_text_input signals file problems with an "ERROR..." string. Only
+        # honour that when a file was supplied, so typed text that happens to
+        # start with "ERROR" is still processed.
+        if file is not None and raw_text.startswith("ERROR"):
+            # Seven values are required here too. The message goes to the JSON
+            # panel so the failure is visible instead of a silently blank UI.
+            return (
+                gr.update(choices=[], value=None),
+                {"error": raw_text},
+                [],
+                [],
+                [],
+                [],
+                [],
+            )
+        sentences, tokenized_sentences, bow, vocab = tokenize_pipeline(raw_text, clean)
+        # Sentence dropdown: "1: <first 60 chars>"
+        dd_choices = [
+            f"{i+1}: {s[:60].strip()}{'...' if len(s) > 60 else ''}"
+            for i, s in enumerate(sentences)
+        ]
+        dd_value = dd_choices[0] if dd_choices else None
+        tokenized_json = {
+            f"Sentence {i+1}": tokens for i, tokens in enumerate(tokenized_sentences)
+        }
+        bow_rows = sorted(bow.items(), key=lambda kv: (-kv[1], kv[0]))
+        # Build initial vector for sentence 1 if available
+        vector_rows = []
+        if tokenized_sentences and vocab:
+            vec_map = build_sentence_vector(tokenized_sentences, vocab, 0)
+            vector_rows = [[w, c] for w, c in vec_map.items()]
+        return (
+            gr.update(choices=dd_choices, value=dd_value),
+            tokenized_json,
+            [[w, c] for w, c in bow_rows],
+            vector_rows,
+            sentences,
+            tokenized_sentences,
+            vocab,
+        )
+    process_btn.click(
+        fn=on_process,
+        inputs=[text_in, file_in, clean_opt],
+        outputs=[
+            sentence_dropdown,  # dropdown choices + value
+            tokenized_out,      # JSON tokens
+            bow_df,             # BOW table
+            vec_df,             # initial vector table
+            st_sentences,       # state: sentences
+            st_tokenized,       # state: tokenized sentences
+            st_vocab,           # state: vocabulary
+        ],
+    )
+    def on_select_sentence(choice: str, tokenized_sentences, vocabulary):
+        """
+        Rebuild the vector table rows for the sentence picked in the dropdown.
+        Returns an empty list when nothing usable is selected or stored.
+        """
+        if not (choice and tokenized_sentences and vocabulary):
+            return []
+        try:
+            # Labels are "<1-based index>: <preview>"; recover the 0-based index.
+            number_text = choice.split(":")[0]
+            idx = int(number_text) - 1
         except Exception:
+            return []
+        vec_map = build_sentence_vector(tokenized_sentences, vocabulary, idx)
+        return [[word, count] for word, count in vec_map.items()]
+    sentence_dropdown.change(
+        fn=on_select_sentence,
+        inputs=[sentence_dropdown, st_tokenized, st_vocab],
+        outputs=[vec_df],
+    )
 if __name__ == "__main__":
+    # Launch on http://127.0.0.1:7860
     demo.launch()