"""Download a Hugging Face dataset, normalize its text column, and save it as CSV."""

from pathlib import Path
from typing import Any

import pandas as pd
from datasets import load_dataset


def clean_text(text: Any) -> Any:
    """Return *text* with newlines replaced by spaces and outer whitespace removed.

    Non-string values (for example ``None`` or NaN for a missing cell) are
    returned unchanged, so one missing row does not abort the whole run.
    """
    if not isinstance(text, str):
        return text
    return text.replace("\n", " ").strip()


def preprocess_and_save(
    dataset_name: str,
    output_path: str,
    split: str = "train",
    text_column: str = "text",
) -> pd.DataFrame:
    """Load a dataset split, clean its text column, and write it to CSV.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        output_path: Destination file for the CSV. It is used verbatim; no
            extension is appended.
        split: Name of the split to convert.
        text_column: Name of the column that ``clean_text`` is applied to.

    Returns:
        The cleaned DataFrame that was written to ``output_path``.
    """
    # Load the dataset from Hugging Face.
    dataset = load_dataset(dataset_name)

    # Convert the requested split to a pandas DataFrame.
    df = pd.DataFrame(dataset[split])

    # Clean text (basic preprocessing).
    df[text_column] = df[text_column].apply(clean_text)

    # Create the destination directory first: to_csv does not create missing
    # parent directories and would fail on a fresh checkout.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Save the preprocessed data without the index column.
    df.to_csv(output_path, index=False)
    return df


if __name__ == "__main__":
    output_path = "app/processed_dataset"
    preprocess_and_save("macadeliccc/US-FederalLaws", output_path)
    print(f"Dataset saved to {output_path}")