---
license: mit
language:
- ru
tags:
- natural-language-processing
- dh
- word2vec
---
The model is built on Leo Tolstoy's [collected works](https://github.com/tolstoydigital/TEI) and represents his individual semantics.
## Preparation
All texts are converted from TEI markup, split into sentences, and lemmatized. Only modern orthography is kept in the data.
```python
import html
import os
import re
import shutil

from bs4 import BeautifulSoup
from tqdm import tqdm

!pip install razdel  # for sentence splitting
from razdel import sentenize

!git clone https://github.com/tolstoydigital/TEI.git

relevant_dirs = ['diaries', 'letters', 'notes', 'works']

# the bibliography allows working with fiction and non-fiction separately
path = 'TEI/reference/bibllist_works.xml'
xml = open(path).read()
soup = BeautifulSoup(xml, features="xml")

# map every part of a multi-part work to the name of the whole work
group_texts = {}
for it in soup.find_all("item"):
    ref = it.find("ref")
    for related in it.find_all("relatedItem"):
        for ref_ana in related.find_all("ref"):
            group_texts[ref_ana.text] = ref.text

prefix_texts = 'extracted_texts'
if os.path.exists(prefix_texts):
    shutil.rmtree(prefix_texts)
os.mkdir(prefix_texts)

# extract plain text from the TEI XML
complex_texts = {}
for rel_dir in relevant_dirs:
    path = os.path.join('TEI/texts', rel_dir)
    for file in tqdm(sorted(os.listdir(path))):
        fiction = 0
        if not file.endswith('.xml'):
            continue
        xml = open(os.path.join(path, file)).read()
        if 'Печатные варианты' in xml:  # skip printed variants
            continue
        nameID = file.replace('.xml', '')
        soup = BeautifulSoup(xml, features="xml")
        if soup.find("catRef", {"ana": "#fiction"}):
            fiction = 1
        s = soup.find("body")
        paragraphs = []
        for erase in s.find_all(["orig", "comments", "sic", "note"]):
            erase.decompose()  # drop editorial markup
        for p in s.find_all(["p", "l"]):
            paragraphs.append(html.unescape(p.text.replace('\n', ' ').strip()))
        if not fiction:
            # non-fiction goes into one file per directory, one sentence per line
            with open(os.path.join(prefix_texts, rel_dir + '.txt'), 'a') as f:
                for par in paragraphs:
                    par = re.sub(r' ([.,;:!?)"»])', r'\1', par)
                    par = par.replace('\n', ' ').strip()
                    par = re.sub(r'\s+', ' ', par)
                    par = re.sub(r'\[.+?\]', '', par)  # remove bracketed insertions
                    for sent in sentenize(par):
                        f.write(sent.text.strip() + '\n')
        else:
            if nameID in group_texts:
                # collect the parts of a multi-part work under one name
                hyper_name = group_texts[nameID]
                if hyper_name not in complex_texts:
                    complex_texts[hyper_name] = paragraphs
                else:
                    complex_texts[hyper_name].extend(paragraphs)
            else:
                with open(os.path.join(prefix_texts, nameID + '.txt'), 'w') as f:
                    f.write('\n'.join(paragraphs))
for hyper_name in complex_texts:
    with open(os.path.join(prefix_texts, hyper_name + '.txt'), 'w') as f:
        f.write('\n'.join(complex_texts[hyper_name]))

# lemmatization and POS tagging with Mystem
from pymystem3 import Mystem

keep_pos = ['S', 'V', 'A', 'ADV']  # nouns, verbs, adjectives, adverbs

def tagging():
    m = Mystem()
    for fl in os.listdir(prefix_texts):
        if 'mystem' in fl:
            continue
        with open(os.path.join(prefix_texts, fl)) as f:
            text = f.read()
        ana_lines = []
        for line in text.split('\n'):
            line = ' '.join(line.split()[1:])  # drop the first token of the line
            # normalize accented (stressed) vowels to plain Cyrillic
            for accented, plain in [('ò', 'о'), ('è', 'е'), ('à', 'а'),
                                    ('ѝ', 'и'), ('ỳ', 'у'), ('о̀', 'о')]:
                line = line.replace(accented, plain)
            ana = []
            info = m.analyze(line)
            for token in info:
                if "analysis" in token:
                    try:
                        analysis = token["analysis"][0]
                    except IndexError:  # token without an analysis
                        continue
                    lex = analysis["lex"]
                    gr = analysis['gr']
                    tag = gr.split('=')[0].split(',')[0]  # bare POS tag
                    ana.append('{}_{}'.format(lex, tag))
            ln = ' '.join(ana)
            if re.search('[А-Яа-я]', ln):  # keep only lines with Cyrillic
                ana_lines.append(ln)
        with open('{}/mystem-{}'.format(prefix_texts, fl), 'w') as fw:
            fw.write('\n'.join(ana_lines))

def mk_input():
    # keep only content words and write the word2vec training file
    inp = []
    for fl in os.listdir(prefix_texts):
        if 'mystem' not in fl:
            continue
        with open(os.path.join(prefix_texts, fl)) as f:
            text = f.read()
        for line in text.split('\n'):
            words = [w for w in line.split() if w.split('_')[1] in keep_pos]
            if len(words) > 1:
                inp.append(' '.join(words))
    with open('input.txt', 'w') as fw:
        fw.write('\n'.join(inp))

tagging()
mk_input()
```
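The resulting `input.txt` holds one lemmatized sentence per line, with each token in `lemma_POS` form. A quick way to check the output:

```python
# print the first three lines of the training file
with open('input.txt') as f:
    for _, line in zip(range(3), f):
        print(line.strip())
```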
The complete preparation code is in the `w2v-prep.ipynb` notebook.
## Models
There are two models in the repository. Their parameters are taken from the general-language models on the rusvectores site, so that the results are directly comparable.
Here is the code for building the models:
```python
import logging
import gensim
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
pth = './input.txt'
data = gensim.models.word2vec.LineSentence(pth) # train sentence by sentence
modelLNT1 = gensim.models.Word2Vec(data, vector_size=500, window=2, min_count=2, sg=1) # comparable with web_mystem_skipgram_500_2_2015.bin
modelLNT1.save('skipgram_500_2.model') # saving
modelLNT2 = gensim.models.Word2Vec(data, vector_size=300, window=10, min_count=2, sg=0) # comparable with ruwikiruscorpora_upos_cbow_300_10_2021
modelLNT2.save('cbow_300_10.model')
```
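To make that comparison concrete, a general-language model from rusvectores can be loaded next to the Tolstoy ones. This is a minimal sketch; it assumes `web_mystem_skipgram_500_2_2015.bin` (the model named in the comment above) has already been downloaded from the rusvectores site:

```python
from gensim.models import KeyedVectors

# general-language baseline in word2vec binary format (downloaded separately)
general = KeyedVectors.load_word2vec_format('web_mystem_skipgram_500_2_2015.bin', binary=True)

# the same query in general Russian and in Tolstoy's corpus
print(general.most_similar('бог_S', topn=5))
print(modelLNT1.wv.most_similar('бог_S', topn=5))
```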
## Usage
```python
# load the models
from gensim.models import Word2Vec

modelLNT1 = Word2Vec.load("skipgram_500_2.model")
modelLNT2 = Word2Vec.load("cbow_300_10.model")

# visualization of the most similar words
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("darkgrid")
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def tsnescatterplot(model, word, list_names):  # adapted from a widely shared example
    """Plot with seaborn the t-SNE dimensionality reduction of the vectors of a
    query word, its most similar words, and an extra list of words.
    """
    arrays = np.empty((0, model.wv.vector_size), dtype='f')
    word_labels = [word]
    color_list = ['red']
    # add the vector of the query word
    arrays = np.append(arrays, model.wv[[word]], axis=0)
    # get the list of most similar words
    close_words = model.wv.most_similar([word])
    # add the vector of each of the closest words to the array
    for wrd_score in close_words:
        wrd_vector = model.wv[[wrd_score[0]]]
        word_labels.append(wrd_score[0])
        color_list.append('blue')
        arrays = np.append(arrays, wrd_vector, axis=0)
    # add the vector of each word from list_names to the array
    for wrd in list_names:
        wrd_vector = model.wv[[wrd]]
        word_labels.append(wrd)
        color_list.append('green')
        arrays = np.append(arrays, wrd_vector, axis=0)
    # reduce the dimensionality to 20 dimensions with PCA
    reduc = PCA(n_components=20).fit_transform(arrays)
    # find t-SNE coordinates for 2 dimensions
    np.set_printoptions(suppress=True)
    Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduc)
    # set everything up to plot
    df = pd.DataFrame({'x': [x for x in Y[:, 0]],
                       'y': [y for y in Y[:, 1]],
                       'words': word_labels,
                       'color': color_list})
    fig, _ = plt.subplots()
    fig.set_size_inches(9, 9)
    # basic scatter plot
    p1 = sns.regplot(data=df,
                     x="x",
                     y="y",
                     fit_reg=False,
                     marker="o",
                     scatter_kws={'s': 40,
                                  'facecolors': df['color']})
    # add the annotations one by one
    for line in range(0, df.shape[0]):
        p1.text(df["x"][line],
                df['y'][line],
                ' ' + df["words"][line].title(),
                horizontalalignment='left',
                verticalalignment='bottom',
                size='medium',
                color=df['color'][line],
                weight='normal').set_size(15)
    plt.xlim(Y[:, 0].min() - 50, Y[:, 0].max() + 50)
    plt.ylim(Y[:, 1].min() - 50, Y[:, 1].max() + 50)
    plt.title('t-SNE visualization for {}'.format(word.title()))

tsnescatterplot(modelLNT2, 'бог_S', [i[0] for i in modelLNT2.wv.most_similar(negative=["бог_S"])])
```
![](./god.png)
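Beyond the plot, nearest neighbors can be queried directly through the standard gensim API. A small sketch (the query words are illustrative and must be present in the vocabulary, in `lemma_POS` form):

```python
# nearest neighbors in Tolstoy's semantic space
print(modelLNT1.wv.most_similar('бог_S', topn=10))

# cosine similarity between two lemmas
print(modelLNT1.wv.similarity('бог_S', 'человек_S'))
```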
## Train data
The training corpus is included in this repository as the `input.txt` file. It contains more than 7 million words. For a detailed explanation see Bonch-Osmolovskaya, A., Skorinkin, D., Pavlova, I., Kolbasov, M., & Orekhov, B. (2019). [Tolstoy semanticized: Constructing a digital edition for knowledge discovery](https://www.sciencedirect.com/science/article/abs/pii/S1570826818300635). *Journal of Web Semantics, 59*, 100483.
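The reported size can be re-checked directly from the file:

```python
# count tokens in the training corpus (expected: more than 7 million)
with open('input.txt') as f:
    print(sum(len(line.split()) for line in f))
```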
## Publication
Орехов Б. В. [Индивидуальная семантика Л. Н. Толстого в свете векторных моделей](https://human.spbstu.ru/article/2023.54.09/) [The individual semantics of Leo Tolstoy in the light of vector models] // Terra Linguistica. 2023. Vol. 14. No. 4. P. 119–129. DOI: 10.18721/JHSS.14409
```
@article{орехов2023индивидуальная,
  title={Индивидуальная семантика Л. Н. Толстого в свете векторных моделей},
  author={Орехов, Б.В.},
  journal={Terra Linguistica},
  volume={14},
  number={4},
  pages={119--129},
  doi={10.18721/JHSS.14409},
  url={https://human.spbstu.ru/userfiles/files/articles/2023/4/119-129.pdf},
  year={2023}
}
```