import os
import logging
import csv
import shutil
import subprocess

import nltk
import pandas as pd
from tqdm import tqdm
import gradio as gr
from datasets import Dataset
from transformers import pipeline
from huggingface_hub import HfApi

# ---------------------- Logging Setup ----------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger(__name__)


# ---------------------- NLTK Setup ----------------------
def download_nltk():
    """Fetch the NLTK resources this script depends on (word list + tokenizer)."""
    nltk.download("words")
    nltk.download("punkt")
    # Newer NLTK releases moved the tokenizer tables to "punkt_tab";
    # on older versions this download just logs an error and returns False.
    nltk.download("punkt_tab")
    logger.info("NLTK resources downloaded.")


download_nltk()


# ---------------------- Data Preparation ----------------------
def get_all_words():
    """Return the full English word list from the NLTK 'words' corpus."""
    from nltk.corpus import words as nltk_words

    all_words = nltk_words.words()
    logger.info("Got %d words from NLTK.", len(all_words))
    return all_words


def generate_meaning(word, generator):
    """Generate a one-sentence definition of *word* using *generator*.

    Returns an empty string on failure so one bad word does not abort
    the whole run (best-effort by design).
    """
    prompt = f"Define the word '{word}' in one concise sentence."
    try:
        result = generator(prompt, max_length=50)[0]["generated_text"]
        return result.strip()
    except Exception as e:  # deliberate broad catch: log and keep going
        logger.error("Error generating meaning for '%s': %s", word, e)
        return ""


def process_words(model_name, limit=None):
    """Tokenize and define up to *limit* NLTK words with *model_name*.

    Returns a list of dicts with keys 'tokenizer', 'words', 'meaning'.
    A falsy *limit* (None, 0, "") means "process all words".
    """
    logger.info("Initializing Hugging Face text2text-generation pipeline...")
    generator = pipeline("text2text-generation", model=model_name, device=-1)
    words_list = get_all_words()
    if limit:
        words_list = words_list[:limit]
    data = []
    for word in tqdm(words_list, desc="Processing words"):
        tokens = nltk.word_tokenize(word)
        meaning = generate_meaning(word, generator)
        data.append({
            "tokenizer": tokens,
            "words": word,
            "meaning": meaning,
        })
    logger.info("Finished processing words.")
    return data


def save_to_csv(data, filename="output.csv"):
    """Write the processed rows to *filename* and return that path."""
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    # Bug fix: the original logged the literal placeholder "(unknown)"
    # instead of the actual file name.
    logger.info("Saved CSV to %s.", filename)
    return filename


# ---------------------- Push to Hugging Face ----------------------
def push_dataset(csv_file, repo_id="katsukiai/DeepFocus-X3"):
    """Clone the dataset repo (once), copy the CSV in, commit and push.

    Uses subprocess argument lists (no shell) with ``cwd=`` instead of
    os.system + os.chdir, so the process working directory is never
    mutated and clone/add/push failures raise instead of being ignored.
    """
    repo_local_dir = "DeepFocus-X3_repo"
    if not os.path.exists(repo_local_dir):
        subprocess.run(
            ["git", "clone", f"https://huggingface.co/{repo_id}", repo_local_dir],
            check=True,
        )
        logger.info("Repository cloned locally.")
    # basename() keeps the copy inside the repo even if csv_file is a path.
    shutil.copy(csv_file, os.path.join(repo_local_dir, os.path.basename(csv_file)))
    subprocess.run(["git", "add", "."], cwd=repo_local_dir, check=True)
    # "git commit" exits nonzero when there is nothing to commit; keep it
    # best-effort (as the original os.system call effectively was).
    commit = subprocess.run(
        ["git", "commit", "-m", "Update dataset"], cwd=repo_local_dir
    )
    if commit.returncode != 0:
        logger.warning("git commit returned %d (possibly nothing to commit).",
                       commit.returncode)
    subprocess.run(["git", "push"], cwd=repo_local_dir, check=True)
    logger.info("Pushed dataset to Hugging Face repository.")


def generate_all(model_name, word_limit):
    """End-to-end run: process words, save the CSV, push it; return the CSV path."""
    try:
        word_limit = int(word_limit)
    except (TypeError, ValueError):
        # Empty or non-numeric input means "process all words".
        word_limit = None
    data = process_words(model_name, limit=word_limit)
    csv_file = save_to_csv(data)
    push_dataset(csv_file)
    return csv_file


# ---------------------- Gradio Interface Functions ----------------------
def run_generate(model_name, word_limit):
    """Gradio callback: run the full pipeline and report the CSV produced."""
    output_csv = generate_all(model_name, word_limit)
    return f"Generated and pushed CSV: {output_csv}"


def about_tab_content():
    """Markdown body for the About tab."""
    about_text = (
        "## DeepFocus-X3 Dataset Generator\n\n"
        "This tool downloads all available words from the NLTK corpus, "
        "generates concise meanings using a Hugging Face text-to-text generation model, "
        "and converts the data into a CSV file. Finally, it pushes the CSV to the "
        "[katsukiai/DeepFocus-X3](https://huggingface.co/datasets/katsukiai/DeepFocus-X3) repository."
    )
    return about_text


def settings_tab_content():
    """Markdown body for the Settings tab."""
    settings_text = (
        "**Current Settings**\n\n"
        "- Model: `google/flan-t5-xl`\n"
        "- Word Limit: 50 (set to empty to process all words)\n"
        "\nYou can update these settings in the Generate tab."
    )
    return settings_text


# ---------------------- Gradio App ----------------------
with gr.Blocks() as demo:
    gr.Markdown("## DeepFocus-X3 Dataset Generator")
    with gr.Tabs():
        # About Tab
        with gr.Tab("About"):
            gr.Markdown(about_tab_content())
        # Generate All Tab
        with gr.Tab("Generate all"):
            model_name_input = gr.Textbox(
                value="google/flan-t5-xl",
                label="Hugging Face Model Name for Means",
            )
            word_limit_input = gr.Textbox(
                value="50",
                label="Word Limit (Leave empty for all)",
            )
            generate_button = gr.Button("Generate and Push Dataset")
            generate_output = gr.Textbox(label="Output")
            generate_button.click(
                run_generate,
                inputs=[model_name_input, word_limit_input],
                outputs=generate_output,
            )
        # Settings Tab
        with gr.Tab("Settings"):
            gr.Markdown(settings_tab_content())

# Guard the launch so importing this module does not start a web server.
if __name__ == "__main__":
    demo.launch()