Search-Engine / authorindexer.py
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import ujson
# Load the JSON file containing scraped results.
# scraper_results.json is expected to hold a JSON array of objects,
# each with a "cu_author" field.
with open('scraper_results.json', 'r') as doc:
    scraper_results = doc.read()

# Extract author names from the JSON data.
authors = []
scraped_records = ujson.loads(scraper_results)
for item in scraped_records:
    authors.append(item["cu_author"])
# Write the author names to a JSON file.
with open('author_names.json', 'w') as f:
    ujson.dump(authors, f)
# Download the NLTK resources this script needs; 'punkt_tab' is
# additionally required by word_tokenize on newer NLTK releases.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
# Load the JSON file containing author names.
with open('author_names.json', 'r') as f:
    author_data = f.read()

# Parse the JSON data.
authors = ujson.loads(author_data)
# Preprocess the author names: tokenize each name, drop English
# stop words, and reduce the remaining tokens to their Porter stems.
stop_words = set(stopwords.words('english'))  # a set makes membership tests O(1)
stemmer = PorterStemmer()
authors_list_first_stem = []
authors_list = []
for author in authors:
    words = word_tokenize(author)
    stems = [stemmer.stem(word) for word in words
             if word.lower() not in stop_words]
    authors_list_first_stem.append(" ".join(stems))
    authors_list.append(author)  # keep the original name alongside its stemmed form
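
# Illustration (hypothetical input, not from the real dataset): an entry
# such as "Johnson and Williams" tokenizes to ['Johnson', 'and',
# 'Williams'], loses the stop word 'and', and is stored as the stemmed
# string "johnson william".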
# Indexing: build an inverted index that maps each stemmed token to
# the list of positions where it occurs in authors_list_first_stem.
inverted_index = {}
for i, stemmed_name in enumerate(authors_list_first_stem):
    for word in stemmed_name.split():
        if word not in inverted_index:
            inverted_index[word] = [i]
        else:
            inverted_index[word].append(i)
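
# For hypothetical names ["Johnson and Williams", "Alice Johnson"], the
# resulting index would be {"johnson": [0, 1], "william": [0], "alic": [1]}
# (Porter stems "alice" to "alic" and "williams" to "william").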
# Write the preprocessed author names and the inverted index to JSON files.
with open('author_list_stemmed.json', 'w') as f:
    ujson.dump(authors_list_first_stem, f)
with open('author_indexed_dictionary.json', 'w') as f:
    ujson.dump(inverted_index, f)
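
# --- Usage sketch, not part of the pipeline above ---
# A minimal example of querying the inverted index: stem the query with
# the same tokenizer/stemmer used for the names, then intersect the
# posting lists. `search_authors` is a hypothetical helper added for
# illustration only.
def search_authors(query):
    """Return original author names whose stemmed form contains every
    non-stop-word token of the query (AND semantics)."""
    result_ids = None
    for token in word_tokenize(query):
        if token.lower() in stop_words:
            continue
        postings = set(inverted_index.get(stemmer.stem(token), []))
        result_ids = postings if result_ids is None else result_ids & postings
    return [authors_list[i] for i in sorted(result_ids or [])]

# Example: search_authors("Johnson") returns every original name whose
# stemmed form contains the token "johnson".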