from __future__ import annotations

import bz2
from typing import Literal

import pandas as pd

from app.constants import (
    AMAZONREVIEWS_PATH,
    AMAZONREVIEWS_URL,
    IMDB50K_PATH,
    IMDB50K_URL,
    SENTIMENT140_PATH,
    SENTIMENT140_URL,
)

__all__ = ["load_data"]


def load_sentiment140(include_neutral: bool = False) -> tuple[list[str], list[int]]:
    """Load the sentiment140 dataset and make it suitable for use.

    Args:
        include_neutral: Whether to include neutral sentiment

    Returns:
        Text and label data

    Raises:
        FileNotFoundError: If the dataset is not found
    """
    # Check if the dataset exists
    if not SENTIMENT140_PATH.exists():
        msg = (
            f"Sentiment140 dataset not found at: '{SENTIMENT140_PATH}'\n"
            "Please download the dataset from:\n"
            f"{SENTIMENT140_URL}"
        )
        raise FileNotFoundError(msg)

    # Load the dataset
    data = pd.read_csv(
        SENTIMENT140_PATH,
        encoding="ISO-8859-1",
        names=[
            "target",  # 0 = negative, 2 = neutral, 4 = positive
            "id",  # The id of the tweet
            "date",  # The date of the tweet
            "flag",  # The query, NO_QUERY if not present
            "user",  # The user that tweeted
            "text",  # The text of the tweet
        ],
    )

    # Ignore rows with neutral sentiment
    if not include_neutral:
        data = data[data["target"] != 2]

    # Map sentiment values
    data["sentiment"] = data["target"].map(
        {
            0: 0,  # Negative
            4: 1,  # Positive
            2: 2,  # Neutral
        },
    )

    # Return as lists
    return data["text"].tolist(), data["sentiment"].tolist()


def load_amazonreviews(merge: bool = True) -> tuple[list[str], list[int]]:
    """Load the amazonreviews dataset and make it suitable for use.

    Args:
        merge: Whether to merge the test and train datasets (otherwise ignore test)

    Returns:
        Text and label data

    Raises:
        FileNotFoundError: If the dataset is not found
    """
    # Check if the dataset exists (the test split is only required when merging)
    test_exists = AMAZONREVIEWS_PATH[0].exists() or not merge
    train_exists = AMAZONREVIEWS_PATH[1].exists()
    if not (test_exists and train_exists):
        msg = (
            f"Amazonreviews dataset not found at: '{AMAZONREVIEWS_PATH[0]}' and '{AMAZONREVIEWS_PATH[1]}'\n"
            "Please download the dataset from:\n"
            f"{AMAZONREVIEWS_URL}"
        )
        raise FileNotFoundError(msg)

    # Load the datasets
    with bz2.BZ2File(AMAZONREVIEWS_PATH[1]) as train_file:
        train_data = [line.decode("utf-8") for line in train_file]

    test_data = []
    if merge:
        with bz2.BZ2File(AMAZONREVIEWS_PATH[0]) as test_file:
            test_data = [line.decode("utf-8") for line in test_file]

    # Merge the datasets
    data = train_data + test_data

    # Split each line into its label prefix and the review text
    labels, texts = zip(*(line.split(" ", 1) for line in data))

    # Map sentiment values (__label__1 -> 0 negative, __label__2 -> 1 positive)
    sentiments = [int(label.split("__label__")[1]) - 1 for label in labels]

    # Return as lists (zip yields tuples, so convert to match the annotation)
    return list(texts), sentiments
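

# Format note (illustrative, hypothetical line): the bz2 archives follow the
# fastText convention, e.g. "__label__2 Great sound quality for the price".
# split(" ", 1) separates the "__label__2" prefix from the review text, and
# the trailing digit minus one yields the 0 (negative) / 1 (positive) label.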


def load_imdb50k() -> tuple[list[str], list[int]]:
    """Load the imdb50k dataset and make it suitable for use.

    Returns:
        Text and label data

    Raises:
        FileNotFoundError: If the dataset is not found
    """
    # Check if the dataset exists
    if not IMDB50K_PATH.exists():
        msg = (
            f"IMDB50K dataset not found at: '{IMDB50K_PATH}'\n"
            "Please download the dataset from:\n"
            f"{IMDB50K_URL}"
        )  # fmt: off
        raise FileNotFoundError(msg)

    # Load the dataset
    data = pd.read_csv(IMDB50K_PATH)

    # Map sentiment values
    data["sentiment"] = data["sentiment"].map(
        {
            "positive": 1,
            "negative": 0,
        },
    )

    # Return as lists
    return data["review"].tolist(), data["sentiment"].tolist()


def load_data(dataset: Literal["sentiment140", "amazonreviews", "imdb50k"]) -> tuple[list[str], list[int]]:
    """Load and preprocess the specified dataset.

    Args:
        dataset: Dataset to load

    Returns:
        Text and label data

    Raises:
        ValueError: If the dataset is not recognized
    """
    match dataset:
        case "sentiment140":
            return load_sentiment140(include_neutral=False)
        case "amazonreviews":
            return load_amazonreviews(merge=True)
        case "imdb50k":
            return load_imdb50k()
        case _:
            msg = f"Unknown dataset: {dataset}"
            raise ValueError(msg)
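

# Minimal smoke-test sketch; assumes the imdb50k CSV has already been
# downloaded to IMDB50K_PATH (see app.constants). Swap in "sentiment140" or
# "amazonreviews" if those files are present instead.
if __name__ == "__main__":
    texts, labels = load_data("imdb50k")
    print(f"Loaded {len(texts)} documents")
    print(f"positive: {sum(labels)}, negative: {len(labels) - sum(labels)}")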