# Search-Engine/Indexer.py
import ujson  # ujson is used throughout for reading and writing JSON files
import nltk  # NLTK for natural language processing tasks
from nltk.corpus import stopwords  # Predefined list of stop words
from nltk.tokenize import word_tokenize  # To split text into word tokens
from nltk.stem import PorterStemmer  # Rule-based stemmer that reduces words to their stems

# Preprocessing the data before indexing
with open('scraper_results.json', 'r') as doc:
    scraper_results = doc.read()
# Initialize empty lists to store publication name, URL, author, and date
pubName = []
pubURL = []
pubCUAuthor = []
pubDate = []
# Load the scraped results using ujson
publications = ujson.loads(scraper_results)
# Number of publications scraped
array_length = len(publications)
print(array_length)
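# Assumed shape of scraper_results.json, inferred from the keys accessed in
# the loop below (field values are illustrative, not real data):
#   [{"name": "...", "pub_url": "...", "cu_author": "...", "date": "..."}, ...]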
# Separate name, URL, author, and date into different lists
for item in publications:
    pubName.append(item["name"])
    pubURL.append(item["pub_url"])
    pubCUAuthor.append(item["cu_author"])
    pubDate.append(item["date"])
with open('pub_name.json', 'w') as f:
    ujson.dump(pubName, f)
with open('pub_url.json', 'w') as f:
    ujson.dump(pubURL, f)
with open('pub_cu_author.json', 'w') as f:
    ujson.dump(pubCUAuthor, f)
with open('pub_date.json', 'w') as f:
    ujson.dump(pubDate, f)
# Re-open the publication names file in read mode
with open('pub_name.json', 'r') as f:
    publication = f.read()
# Load the JSON file
pubName = ujson.loads(publication)
# Download the NLTK resources used below
nltk.download('stopwords')
nltk.download('punkt')
# Use NLTK's predefined English stopwords
stop_words = stopwords.words('english')
stemmer = PorterStemmer()
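# Illustration of the preprocessing applied below (representative values,
# not taken from the actual data): word_tokenize("Models for Learning")
# yields ["Models", "for", "Learning"]; "for" is dropped as a stopword,
# and stemmer.stem("Learning") returns "learn".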
pub_list_first_stem = []
pub_list = []
pub_list_wo_sc = []
print(len(pubName))
for name in pubName:
    # Split the title string into word tokens
    words = word_tokenize(name)
    stem_word = ""
    for i in words:
        if i.lower() not in stop_words:
            stem_word += stemmer.stem(i) + " "
    pub_list_first_stem.append(stem_word)
    pub_list.append(name)
# Replace all of the characters below with spaces (includes the em dash,
# curly quotes, and digits)
special_characters = '''!()-—[]{};:'"\\, <>./?@#$%^&*_~0123456789+=’‘'''
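# Illustrative example: a title like "Indexing: a survey (2020)" comes out as
# "Indexing  a survey" padded with trailing spaces, since every listed
# character (punctuation and digits alike) is replaced by a single space.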
for name in pub_list:
    word_wo_sc = ""
    # Single-word titles are kept as-is
    if len(name.split()) == 1:
        pub_list_wo_sc.append(name)
    else:
        for a in name:
            if a in special_characters:
                word_wo_sc += ' '
            else:
                word_wo_sc += a
        # print(word_wo_sc)
        pub_list_wo_sc.append(word_wo_sc)
# Stemming process: drop stopwords and stem what remains
pub_list_stem_wo_sw = []
for name in pub_list_wo_sc:
    words = word_tokenize(name)
    stem_word = ""
    for a in words:
        if a.lower() not in stop_words:
            stem_word += stemmer.stem(a) + ' '
    pub_list_stem_wo_sw.append(stem_word.lower())
inverted_index = {}  # Maps each stemmed term to the publication indices containing it
# Indexing process
for a in range(len(pub_list_stem_wo_sw)):
    for b in pub_list_stem_wo_sw[a].split():
        if b not in inverted_index:
            inverted_index[b] = [a]
        elif a not in inverted_index[b]:  # Avoid duplicate postings when a term repeats in one title
            inverted_index[b].append(a)
# Print the list lengths (all four should match the number of publications)
print(len(pub_list_wo_sc))
print(len(pub_list_stem_wo_sw))
print(len(pub_list_first_stem))
print(len(pub_list))
# with open('publication_list.json', 'w') as f:
#     ujson.dump(pub_list, f)
with open('publication_list_stemmed.json', 'w') as f:
    ujson.dump(pub_list_first_stem, f)
with open('publication_indexed_dictionary.json', 'w') as f:
    ujson.dump(inverted_index, f)
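
# A minimal sketch of how the saved index could be queried at search time.
# `lookup` and the example term are hypothetical additions, not part of the
# original pipeline; a query must be lower-cased and stemmed the same way
# the titles were before the lookup.
def lookup(term):
    """Return the publication indices whose stemmed titles contain `term`."""
    return inverted_index.get(stemmer.stem(term.lower()), [])

# Example usage (illustrative):
#   indices = lookup("learning")
#   print([pubName[i] for i in indices])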