---
license: mit
language:
- ru
tags:
- natural-language-processing
- dh
- word2vec
---

The models are built on Leo Tolstoy's [collected works](https://github.com/tolstoydigital/TEI) and represent his individual semantics.

## Preparation

All texts are converted from the TEI markup, split into sentences, and lemmatized. Only the modern orthography is kept in the data.

```python
import html
import os
import re
import shutil

from bs4 import BeautifulSoup

!pip install razdel  # for sentence splitting

from razdel import sentenize
from tqdm import tqdm

!git clone https://github.com/tolstoydigital/TEI.git

relevant_dirs = ['diaries', 'letters', 'notes', 'works']

# the bibliography allows working with fiction and non-fiction separately
path = 'TEI/reference/bibllist_works.xml'
xml = open(path).read()
soup = BeautifulSoup(xml, features="xml")

# map every part of a multi-part work to the title of the whole work
group_texts = {}
for it in soup.find_all("item"):
    ref = it.find("ref")
    for related in it.find_all("relatedItem"):
        for ref_ana in related.find_all("ref"):
            group_texts[ref_ana.text] = ref.text

# (re)create the output directory
prefix_texts = 'extracted_texts'
if os.path.exists(prefix_texts):
    shutil.rmtree(prefix_texts)
os.mkdir(prefix_texts)

# extract plain text from the XML
complex_texts = {}
for rel_dir in relevant_dirs:
    path = os.path.join('TEI/texts', rel_dir)
    for file in tqdm(sorted(os.listdir(path))):
        fiction = 0
        if not file.endswith('.xml'):
            continue
        xml = open(os.path.join(path, file)).read()
        if 'Печатные варианты' in xml:  # skip files with printed variants
            continue
        nameID = file.replace('.xml', '')
        soup = BeautifulSoup(xml, features="xml")
        if soup.find("catRef", {"ana": "#fiction"}):
            fiction = 1
        s = soup.find("body")
        paragraphs = []
        # drop editorial elements before extracting the text
        for erase in s.find_all(["orig", "comments", "sic", "note"]):
            erase.decompose()
        for p in s.find_all(["p", "l"]):
            paragraphs.append(html.unescape(p.text.replace('\n', ' ').strip()))
        if not fiction:
            # non-fiction goes into one file per directory, one sentence per line
            with open(os.path.join(prefix_texts, rel_dir + '.txt'), 'a') as f:
                for par in paragraphs:
                    par = re.sub(r' ([.,;:!?)"»])', r'\1', par)
                    par = par.replace('\n', ' ')
                    par = par.strip()
                    par = re.sub(r'\s+', ' ', par)
                    par = re.sub(r'\[.+?\]', '', par)
                    for sent in sentenize(par):
                        f.write(sent.text.strip() + '\n')
        else:
            # parts of multi-part fiction works are collected under one title,
            # the rest is written to separate files
            if nameID in group_texts:
                hyper_name = group_texts[nameID]
                if hyper_name not in complex_texts:
                    complex_texts[hyper_name] = paragraphs
                else:
                    complex_texts[hyper_name].extend(paragraphs)
            else:
                with open(os.path.join(prefix_texts, nameID + '.txt'), 'w') as f:
                    f.write('\n'.join(paragraphs))

for hyper_name in complex_texts:
    with open(os.path.join(prefix_texts, hyper_name + '.txt'), 'w') as f:
        f.write('\n'.join(complex_texts[hyper_name]))

# lemmatization and POS tagging
from pymystem3 import Mystem

pos = ['S', 'V', 'A', 'ADV']  # keep only nouns, verbs, adjectives and adverbs


def tagging():
    m = Mystem()
    for fl in os.listdir(prefix_texts):
        if 'mystem' in fl:
            continue
        with open(os.path.join(prefix_texts, fl)) as f:
            text = f.read()
        lines = text.split('\n')
        ana_lines = []
        for line in lines:
            line = ' '.join(line.split()[1:])  # drop the first token of the line
            # normalize accented vowels that mark stress
            line = line.replace('ò', 'о')
            line = line.replace('è', 'е')
            line = line.replace('à', 'а')
            line = line.replace('ѝ', 'и')
            line = line.replace('ỳ', 'у')
            line = line.replace('о̀', 'о')
            ana = []
            info = m.analyze(line)
            for token in info:
                if "analysis" in token:
                    try:
                        analysis = token["analysis"][0]
                    except IndexError:  # no analysis for this token
                        continue
                    lex = analysis["lex"]
                    gr = analysis['gr']
                    const = gr.split('=')[0]
                    if ',' in const:
                        pos_tag = const.split(',')[0]
                    else:
                        pos_tag = const
                    ana.append('{}_{}'.format(lex, pos_tag))
            ln = ' '.join(ana)
            if re.search('[А-Яа-я]', ln):
                ana_lines.append(ln)
        with open('{}/mystem-{}'.format(prefix_texts, fl), 'w') as fw:
            fw.write('\n'.join(ana_lines))


def mk_input():
    inp = []
    for fl in os.listdir(prefix_texts):
        if 'mystem' not in fl:
            continue
        with open(os.path.join(prefix_texts, fl)) as f:
            text = f.read()
        lines = text.split('\n')
        for line in lines:
            # keep only lemmas of the selected parts of speech
            words = []
            for w in line.split():
                word = w.split('_')
                if word[1] in pos:
                    words.append(w)
            if len(words) > 1:
                inp.append(' '.join(words))
    with open('input.txt', 'w') as fw:
        fw.write('\n'.join(inp))


tagging()
mk_input()
```

The whole code is in the `w2v-prep.ipynb` notebook.

## Models

There are two models in the repository. Their parameters are taken from the general-language models published on the RusVectores site, so that the results are comparable.

Here is the code for building the models:

```python
import logging
import gensim

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

pth = './input.txt'
data = gensim.models.word2vec.LineSentence(pth)  # feed the corpus sentence by sentence

# comparable with web_mystem_skipgram_500_2_2015.bin
modelLNT1 = gensim.models.Word2Vec(data, vector_size=500, window=2, min_count=2, sg=1)
modelLNT1.save('skipgram_500_2.model')

# comparable with ruwikiruscorpora_upos_cbow_300_10_2021
modelLNT2 = gensim.models.Word2Vec(data, vector_size=300, window=10, min_count=2, sg=0)
modelLNT2.save('cbow_300_10.model')
```
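
If you want to compare the vectors with the RusVectores downloads directly, gensim can also export the trained vectors in the classic word2vec binary format. A minimal sketch continuing from the block above; the `.bin` file names are illustrative:

```python
# Export only the trained word vectors in word2vec binary format
# (the format used by the RusVectores .bin files). File names are illustrative.
modelLNT1.wv.save_word2vec_format('skipgram_500_2.bin', binary=True)
modelLNT2.wv.save_word2vec_format('cbow_300_10.bin', binary=True)
```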

## Usage

```python
# load the models
from gensim.models import Word2Vec

modelLNT1 = Word2Vec.load("skipgram_500_2.model")
modelLNT2 = Word2Vec.load("cbow_300_10.model")

# visualization of the most similar words
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


def tsnescatterplot(model, word, list_names):  # adapted code
    """Plot with seaborn the result of t-SNE dimensionality reduction for the vector of a query word,
    its most similar words, and an additional list of words.
    """
    arrays = np.empty((0, model.wv.vector_size), dtype='f')
    word_labels = [word]
    color_list = ['red']

    # add the vector of the query word
    arrays = np.append(arrays, model.wv[[word]], axis=0)

    # get the list of most similar words
    close_words = model.wv.most_similar([word])

    # add the vector of each of the closest words to the array
    for wrd_score in close_words:
        wrd_vector = model.wv[[wrd_score[0]]]
        word_labels.append(wrd_score[0])
        color_list.append('blue')
        arrays = np.append(arrays, wrd_vector, axis=0)

    # add the vector of each word from list_names to the array
    for wrd in list_names:
        wrd_vector = model.wv[[wrd]]
        word_labels.append(wrd)
        color_list.append('green')
        arrays = np.append(arrays, wrd_vector, axis=0)

    # reduce the dimensionality to 20 dimensions with PCA
    reduc = PCA(n_components=20).fit_transform(arrays)

    # find t-SNE coordinates for 2 dimensions
    np.set_printoptions(suppress=True)
    Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduc)

    # set everything up to plot
    df = pd.DataFrame({'x': [x for x in Y[:, 0]],
                       'y': [y for y in Y[:, 1]],
                       'words': word_labels,
                       'color': color_list})

    fig, _ = plt.subplots()
    fig.set_size_inches(9, 9)

    # basic scatter plot
    p1 = sns.regplot(data=df,
                     x="x",
                     y="y",
                     fit_reg=False,
                     marker="o",
                     scatter_kws={'s': 40,
                                  'facecolors': df['color']})

    # add the annotations one by one with a loop
    for line in range(0, df.shape[0]):
        p1.text(df["x"][line],
                df['y'][line],
                ' ' + df["words"][line].title(),
                horizontalalignment='left',
                verticalalignment='bottom', size='medium',
                color=df['color'][line],
                weight='normal').set_size(15)

    plt.xlim(Y[:, 0].min() - 50, Y[:, 0].max() + 50)
    plt.ylim(Y[:, 1].min() - 50, Y[:, 1].max() + 50)

    plt.title('t-SNE visualization for {}'.format(word.title()))


tsnescatterplot(modelLNT2, 'бог_S', [i[0] for i in modelLNT2.wv.most_similar(negative=["бог_S"])])
```

![similar words](https://huggingface.co/nevmenandr/word2vec-tolstoy/resolve/main/pics/bog.png)

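Besides the visualization, the models loaded above can be queried with the standard gensim API. Keys are lemmas with Mystem POS tags, as produced by the preparation step; the query words below are illustrative:

```python
# nearest neighbours of a lemma_POS key
print(modelLNT1.wv.most_similar('бог_S', topn=5))

# cosine similarity between two lemmas (illustrative words; keys must be in the vocabulary)
if 'война_S' in modelLNT1.wv and 'мир_S' in modelLNT1.wv:
    print(modelLNT1.wv.similarity('война_S', 'мир_S'))
```
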
## Training data

The training corpus is included in this repository as the `input.txt` file. It contains more than 7 million words. For a detailed explanation see Bonch-Osmolovskaya, A., Skorinkin, D., Pavlova, I., Kolbasov, M., & Orekhov, B. (2019). [Tolstoy semanticized: Constructing a digital edition for knowledge discovery](https://www.sciencedirect.com/science/article/abs/pii/S1570826818300635). *Journal of Web Semantics, 59*, 100483.

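Each line of `input.txt` is one sentence of `lemma_POS` tokens, so the size of the corpus is easy to check; a minimal sketch:

```python
# Count sentences (lines) and tokens (words) in the training corpus.
with open('input.txt') as f:
    lines = f.read().splitlines()

n_tokens = sum(len(line.split()) for line in lines)
print(len(lines), 'sentences,', n_tokens, 'tokens')
```
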
## Publication

Орехов Б. В. [Индивидуальная семантика Л. Н. Толстого в свете векторных моделей](https://human.spbstu.ru/article/2023.54.09/) [The individual semantics of L. N. Tolstoy in the light of vector models] // Terra Linguistica. 2023. Vol. 14, no. 4, pp. 119–129. DOI: 10.18721/JHSS.14409

```bibtex
@article{орехов2023индивидуальная,
  title={Индивидуальная семантика Л. Н. Толстого в свете векторных моделей},
  author={Орехов, Б.В.},
  journal={Terra Linguistica},
  volume={14},
  number={4},
  pages={119--129},
  doi={10.18721/JHSS.14409},
  url={https://human.spbstu.ru/userfiles/files/articles/2023/4/119-129.pdf},
  year={2023}
}
```