# NOTE: page-scrape residue, not part of the script:
# "Spaces: Runtime error" (status banner captured along with the source).
from nltk import FreqDist | |
from nltk.corpus import brown | |
from nltk.stem.lancaster import LancasterStemmer | |
import json | |
# Maximum number of most-frequent corpus words considered by the
# (currently commented-out) Brown-corpus generator in this file.
WORD_LIMIT = 10000
# Inclusive bounds on the length of an accepted word: anything shorter than
# MIN_WORD_SIZE or longer than MAX_WORD_SIZE is discarded.
MIN_WORD_SIZE, MAX_WORD_SIZE = 4, 10
# Disabled alternative generator: builds words.json from the most frequent
# Brown-corpus words, keeping the shortest word for each Lancaster stem.
# stem = LancasterStemmer()
# frequency_list = FreqDist(i.lower() for i in brown.words())
# words = [
#     w.lower()
#     for w, _ in frequency_list.most_common()[:WORD_LIMIT]
#     if w.isalpha() and len(w) >= MIN_WORD_SIZE and len(w) <= MAX_WORD_SIZE
# ]
# stem_to_words = {}
# for word in words:
#     stemmed = stem.stem(word)
#     if stemmed not in stem_to_words:
#         stem_to_words[stemmed] = set()
#     stem_to_words[stemmed].add(word)
# final_words = []
# for stem, words in stem_to_words.items():
#     shortest = min(words, key=len)
#     final_words.append(shortest)
# with open("words.json", "w") as f:
#     f.write(json.dumps(final_words))
def normalize_answer(raw, min_size, max_size):
    """Return the cleaned single-word form of a Jeopardy answer, or None.

    The answer is lower-cased, a leading "the " or "a " is removed, and the
    result is kept only if it is purely alphabetic and its length lies in
    the inclusive range [min_size, max_size].

    Args:
        raw: The answer text as stored in the input row.
        min_size: Smallest accepted word length (inclusive).
        max_size: Largest accepted word length (inclusive).

    Returns:
        The normalized word, or None when the answer is rejected.
    """
    answer = raw.lower()
    # Strip the article BEFORE the alphabetic check. The space in "the " /
    # "a " makes str.isalpha() false, so checking first rejected every such
    # answer and left the stripping branches unreachable.
    if answer.startswith("the "):
        answer = answer[4:]
    elif answer.startswith("a "):
        answer = answer[2:]
    # Anything still containing spaces, digits or punctuation is not a
    # single plain word.
    if not answer.isalpha():
        return None
    if not min_size <= len(answer) <= max_size:
        return None
    return answer


def collect_answers(rows, min_size, max_size):
    """Return the set of accepted, normalized answers found in rows.

    Args:
        rows: Iterable of mappings, each with an "answer" string entry.
        min_size: Smallest accepted word length (inclusive).
        max_size: Largest accepted word length (inclusive).

    Returns:
        A set of unique normalized words.
    """
    answers = set()
    for row in rows:
        answer = normalize_answer(row["answer"], min_size, max_size)
        if answer is not None:
            answers.add(answer)
    return answers


def main():
    """Read jeopardy.json and write the accepted answers to words.json.

    Both files are resolved relative to the current working directory.
    """
    with open("jeopardy.json", "r", encoding="utf-8") as f:
        jeopardy = json.load(f)
    answers = collect_answers(jeopardy, MIN_WORD_SIZE, MAX_WORD_SIZE)
    # Set iteration order is arbitrary; sort so repeated runs over the same
    # input produce an identical words.json.
    with open("words.json", "w", encoding="utf-8") as f:
        json.dump(sorted(answers), f)


if __name__ == "__main__":
    main()