# src/data_processing.py
import os
import json
import csv
from pdfminer.high_level import extract_text
import pandas as pd

from utils import tokenize, build_vocab, save_vocab

def read_txt(file_path):
    # Plain-text files are read in full as a single string.
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def read_pdf(file_path):
    # pdfminer extracts the text layer from the PDF.
    return extract_text(file_path)


def read_json(file_path):
    # JSON files are parsed into Python objects (dicts or lists).
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


def read_csv(file_path):
    df = pd.read_csv(file_path)
    # Concatenate all cell values (as strings) into a single text blob.
    text = ' '.join(df.astype(str).values.flatten())
    return text

def process_file(file_path):
    _, ext = os.path.splitext(file_path)
    ext = ext.lower()
    if ext == '.txt':
        return read_txt(file_path)
    elif ext == '.pdf':
        return read_pdf(file_path)
    elif ext == '.json':
        return read_json(file_path)
    elif ext == '.csv':
        return read_csv(file_path)
    else:
        print(f"Unsupported file format: {ext}")
        return None

def load_data(raw_data_dir='data/raw'):
    # Walk the raw data directory and read every supported file.
    all_data = []
    for root, dirs, files in os.walk(raw_data_dir):
        for file in files:
            file_path = os.path.join(root, file)
            data = process_file(file_path)
            # Skip files that were unsupported or yielded no content.
            if data:
                all_data.append(data)
    return all_data

def prepare_training_data(processed_data, vocab_path='vocab.json'):
    # Tokenize every string entry; entries that parsed into lists are
    # flattened one level, while other structures (e.g. JSON dicts) are skipped.
    tokenized_texts = []
    for entry in processed_data:
        if isinstance(entry, str):
            tokens = tokenize(entry)
            tokenized_texts.append(tokens)
        elif isinstance(entry, list):
            for item in entry:
                if isinstance(item, str):
                    tokens = tokenize(item)
                    tokenized_texts.append(tokens)
    vocab = build_vocab(tokenized_texts)
    save_vocab(vocab, vocab_path)
    return tokenized_texts, vocab

def save_tokenized_data(tokenized_texts, filepath='data/processed/tokenized_data.json'):
    os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)  # create output dir if missing
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(tokenized_texts, f, ensure_ascii=False, indent=4)


def save_processed_data(processed_data, filepath='data/processed/processed_data.json'):
    os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, ensure_ascii=False, indent=4)

if __name__ == "__main__":
    print("Loading raw data...")
    data = load_data()
    print(f"Loaded {len(data)} data entries.")

    print("Preparing training data...")
    tokenized_texts, vocab = prepare_training_data(data)
    save_tokenized_data(tokenized_texts)
    save_processed_data(data)

    print("Data processing complete.")
    print(f"Vocabulary size: {len(vocab)}")