import csv
import logging
import random
import string

import gradio as gr
import nltk
from huggingface_hub import HfApi, create_repo
from nltk.corpus import wordnet as wn

# Download only the NLTK resources actually used (WordNet definitions)
nltk.download('wordnet')
nltk.download('omw-1.4')

# Set up logging to a file so the Logs tab can display it
LOG_FILE = 'dataset_generation.log'
logging.basicConfig(level=logging.INFO, filename=LOG_FILE)
logger = logging.getLogger(__name__)

# Function to generate random lowercase "words" of varying length
def generate_random_words(num_words=100):
    words = []
    for _ in range(num_words):
        word_length = random.randint(3, 10)
        word = ''.join(random.choices(string.ascii_lowercase, k=word_length))
        words.append(word)
    return words

# Function to get meanings of words using NLTK WordNet
# (most randomly generated strings will have no definition)
def get_word_meanings(words):
    meanings = {}
    for word in words:
        synsets = wn.synsets(word)
        if synsets:
            meanings[word] = synsets[0].definition()
        else:
            meanings[word] = "No definition found."
    return meanings

# Function to write the word/meaning pairs to a CSV file
def convert_to_csv(data, filename='dataset.csv'):
    fieldnames = ['word', 'meaning']
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        for word, meaning in data.items():
            writer.writerow({'word': word, 'meaning': meaning})

# Function to create a dataset repository on the Hugging Face Hub and upload the CSV.
# Requires a valid Hugging Face token (e.g. via `huggingface-cli login`).
def create_and_push_dataset(csv_file='dataset.csv', repo_name='DeepFocus-X3'):
    # Create a new *dataset* repository on Hugging Face (repo_type must match the upload)
    create_repo(repo_name, repo_type='dataset', exist_ok=True)
    api = HfApi()
    api.upload_file(
        path_or_fileobj=csv_file,
        path_in_repo=csv_file,
        repo_id=repo_name,
        repo_type='dataset'
    )
    logger.info(f"Dataset {repo_name} created and file {csv_file} uploaded.")

# Gradio interface functions
def generate_words_interface():
    num_words = random.randint(50, 200)
    words = generate_random_words(num_words)
    meanings = get_word_meanings(words)
    convert_to_csv(meanings)
    # Push the CSV to the Hub; generation still succeeds locally if the upload fails
    try:
        create_and_push_dataset()
        message = f"Generated {num_words} random words, saved to dataset.csv, and uploaded to Hugging Face."
    except Exception as e:
        logger.error(f"Upload to Hugging Face failed: {e}")
        message = f"Generated {num_words} random words and saved to dataset.csv (upload failed: {e})."
    logger.info(message)
    return message

def about_interface():
    return ("This is a dataset generation tool that creates a dataset of random words "
            "and their meanings, then uploads it to Hugging Face.")

def logs_interface():
    try:
        with open(LOG_FILE, 'r') as file:
            return file.read()
    except FileNotFoundError:
        return "No logs yet."

# Gradio app setup
with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.Tab("About"):
            about_text = gr.Markdown(about_interface())
        with gr.Tab("Generate"):
            generate_button = gr.Button("Generate Dataset")
            generate_output = gr.Textbox()
            generate_button.click(generate_words_interface, outputs=generate_output)
        with gr.Tab("Logs"):
            logs_output = gr.Textbox(value=logs_interface(), interactive=False)

# Run the Gradio app
if __name__ == "__main__":
    demo.launch()