---
license: mit
language:
- ru
tags:
- natural-language-processing
- dh
- word2vec
---

The models are built on Leo Tolstoy's [collected works](https://github.com/tolstoydigital/TEI) and represent his individual semantics.

## Preparation

All texts are converted from the TEI markup, split into sentences, and lemmatized. Only the modern orthography is kept in the data.

```python
import html
import os
import re
import shutil

from bs4 import BeautifulSoup

!pip install razdel  # for sentence splitting

from razdel import sentenize
from tqdm import tqdm

!git clone https://github.com/tolstoydigital/TEI.git

relevant_dirs = ['diaries', 'letters', 'notes', 'works']

# the bibliography allows working with fiction and non-fiction separately
path = 'TEI/reference/bibllist_works.xml'
xml = open(path).read()
soup = BeautifulSoup(xml, features="xml")

# map every part of a multi-part work to the title of the whole work
group_texts = {}
for it in soup.find_all("item"):
    ref = it.find("ref")
    for related in it.find_all("relatedItem"):
        for ref_ana in related.find_all("ref"):
            group_texts[ref_ana.text] = ref.text

# (re)create the output directory
prefix_texts = 'extracted_texts'
if os.path.exists(prefix_texts):
    shutil.rmtree(prefix_texts)
os.mkdir(prefix_texts)

# extract plain text from the XML
complex_texts = {}
for rel_dir in relevant_dirs:
    path = os.path.join('TEI/texts', rel_dir)
    for file in tqdm(sorted(os.listdir(path))):
        fiction = 0
        if not file.endswith('.xml'):
            continue
        xml = open(os.path.join(path, file)).read()
        if 'Печатные варианты' in xml:  # skip files with printed variants
            continue
        nameID = file.replace('.xml', '')
        soup = BeautifulSoup(xml, features="xml")
        if soup.find("catRef", {"ana": "#fiction"}):
            fiction = 1
        s = soup.find("body")
        paragraphs = []
        # drop editorial elements before extracting the text
        for erase in s.find_all(["orig", "comments", "sic", "note"]):
            erase.decompose()
        for p in s.find_all(["p", "l"]):
            paragraphs.append(html.unescape(p.text.replace('\n', ' ').strip()))
        if not fiction:
            # non-fiction goes into one file per directory, one sentence per line
            with open(os.path.join(prefix_texts, rel_dir + '.txt'), 'a') as f:
                for par in paragraphs:
                    par = re.sub(r' ([.,;:!?)"»])', r'\1', par)
                    par = par.replace('\n', ' ')
                    par = par.strip()
                    par = re.sub(r'\s+', ' ', par)
                    par = re.sub(r'\[.+?\]', '', par)
                    for sent in sentenize(par):
                        f.write(sent.text.strip() + '\n')
        else:
            # parts of multi-part fiction works are collected under one title,
            # the rest is written to separate files
            if nameID in group_texts:
                hyper_name = group_texts[nameID]
                if hyper_name not in complex_texts:
                    complex_texts[hyper_name] = paragraphs
                else:
                    complex_texts[hyper_name].extend(paragraphs)
            else:
                with open(os.path.join(prefix_texts, nameID + '.txt'), 'w') as f:
                    f.write('\n'.join(paragraphs))

for hyper_name in complex_texts:
    with open(os.path.join(prefix_texts, hyper_name + '.txt'), 'w') as f:
        f.write('\n'.join(complex_texts[hyper_name]))

# lemmatization and POS tagging
from pymystem3 import Mystem

pos = ['S', 'V', 'A', 'ADV']  # keep only nouns, verbs, adjectives and adverbs


def tagging():
    m = Mystem()
    for fl in os.listdir(prefix_texts):
        if 'mystem' in fl:
            continue
        with open(os.path.join(prefix_texts, fl)) as f:
            text = f.read()
        lines = text.split('\n')
        ana_lines = []
        for line in lines:
            line = ' '.join(line.split()[1:])  # drop the first token of the line
            # normalize accented vowels that mark stress
            line = line.replace('ò', 'о')
            line = line.replace('è', 'е')
            line = line.replace('à', 'а')
            line = line.replace('ѝ', 'и')
            line = line.replace('ỳ', 'у')
            line = line.replace('о̀', 'о')
            ana = []
            info = m.analyze(line)
            for token in info:
                if "analysis" in token:
                    try:
                        analysis = token["analysis"][0]
                    except IndexError:  # no analysis for this token
                        continue
                    lex = analysis["lex"]
                    gr = analysis['gr']
                    const = gr.split('=')[0]
                    if ',' in const:
                        pos_tag = const.split(',')[0]
                    else:
                        pos_tag = const
                    ana.append('{}_{}'.format(lex, pos_tag))
            ln = ' '.join(ana)
            if re.search('[А-Яа-я]', ln):
                ana_lines.append(ln)
        with open('{}/mystem-{}'.format(prefix_texts, fl), 'w') as fw:
            fw.write('\n'.join(ana_lines))


def mk_input():
    inp = []
    for fl in os.listdir(prefix_texts):
        if 'mystem' not in fl:
            continue
        with open(os.path.join(prefix_texts, fl)) as f:
            text = f.read()
        lines = text.split('\n')
        for line in lines:
            # keep only lemmas of the selected parts of speech
            words = []
            for w in line.split():
                word = w.split('_')
                if word[1] in pos:
                    words.append(w)
            if len(words) > 1:
                inp.append(' '.join(words))
    with open('input.txt', 'w') as fw:
        fw.write('\n'.join(inp))


tagging()
mk_input()
```

The whole code is in the `w2v-prep.ipynb` notebook.

## Models

There are two models in the repository. Their parameters are taken from the general-language models published on the RusVectores site, so that the results are comparable.

Here is the code for building the models:

```python
import logging
import gensim

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

pth = './input.txt'
data = gensim.models.word2vec.LineSentence(pth)  # feed the corpus sentence by sentence

# comparable with web_mystem_skipgram_500_2_2015.bin
modelLNT1 = gensim.models.Word2Vec(data, vector_size=500, window=2, min_count=2, sg=1)
modelLNT1.save('skipgram_500_2.model')

# comparable with ruwikiruscorpora_upos_cbow_300_10_2021
modelLNT2 = gensim.models.Word2Vec(data, vector_size=300, window=10, min_count=2, sg=0)
modelLNT2.save('cbow_300_10.model')
```
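
If you want to compare the vectors with the RusVectores downloads directly, gensim can also export the trained vectors in the classic word2vec binary format. A minimal sketch continuing from the block above; the `.bin` file names are illustrative:

```python
# Export only the trained word vectors in word2vec binary format
# (the format used by the RusVectores .bin files). File names are illustrative.
modelLNT1.wv.save_word2vec_format('skipgram_500_2.bin', binary=True)
modelLNT2.wv.save_word2vec_format('cbow_300_10.bin', binary=True)
```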

## Usage

```python
# load the models
from gensim.models import Word2Vec

modelLNT1 = Word2Vec.load("skipgram_500_2.model")
modelLNT2 = Word2Vec.load("cbow_300_10.model")

# visualization of the most similar words
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


def tsnescatterplot(model, word, list_names):  # adapted code
    """Plot with seaborn the result of t-SNE dimensionality reduction for the vector of a query word,
    its most similar words, and an additional list of words.
    """
    arrays = np.empty((0, model.wv.vector_size), dtype='f')
    word_labels = [word]
    color_list = ['red']

    # add the vector of the query word
    arrays = np.append(arrays, model.wv[[word]], axis=0)

    # get the list of most similar words
    close_words = model.wv.most_similar([word])

    # add the vector of each of the closest words to the array
    for wrd_score in close_words:
        wrd_vector = model.wv[[wrd_score[0]]]
        word_labels.append(wrd_score[0])
        color_list.append('blue')
        arrays = np.append(arrays, wrd_vector, axis=0)

    # add the vector of each word from list_names to the array
    for wrd in list_names:
        wrd_vector = model.wv[[wrd]]
        word_labels.append(wrd)
        color_list.append('green')
        arrays = np.append(arrays, wrd_vector, axis=0)

    # reduce the dimensionality to 20 dimensions with PCA
    reduc = PCA(n_components=20).fit_transform(arrays)

    # find t-SNE coordinates for 2 dimensions
    np.set_printoptions(suppress=True)
    Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduc)

    # set everything up to plot
    df = pd.DataFrame({'x': [x for x in Y[:, 0]],
                       'y': [y for y in Y[:, 1]],
                       'words': word_labels,
                       'color': color_list})

    fig, _ = plt.subplots()
    fig.set_size_inches(9, 9)

    # basic scatter plot
    p1 = sns.regplot(data=df,
                     x="x",
                     y="y",
                     fit_reg=False,
                     marker="o",
                     scatter_kws={'s': 40,
                                  'facecolors': df['color']})

    # add the annotations one by one with a loop
    for line in range(0, df.shape[0]):
        p1.text(df["x"][line],
                df['y'][line],
                ' ' + df["words"][line].title(),
                horizontalalignment='left',
                verticalalignment='bottom', size='medium',
                color=df['color'][line],
                weight='normal').set_size(15)

    plt.xlim(Y[:, 0].min() - 50, Y[:, 0].max() + 50)
    plt.ylim(Y[:, 1].min() - 50, Y[:, 1].max() + 50)

    plt.title('t-SNE visualization for {}'.format(word.title()))


tsnescatterplot(modelLNT2, 'бог_S', [i[0] for i in modelLNT2.wv.most_similar(negative=["бог_S"])])
```

![similar words](https://huggingface.co/nevmenandr/word2vec-tolstoy/resolve/main/pics/bog.png)

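Besides the visualization, the models loaded above can be queried with the standard gensim API. Keys are lemmas with Mystem POS tags, as produced by the preparation step; the query words below are illustrative:

```python
# nearest neighbours of a lemma_POS key
print(modelLNT1.wv.most_similar('бог_S', topn=5))

# cosine similarity between two lemmas (illustrative words; keys must be in the vocabulary)
if 'война_S' in modelLNT1.wv and 'мир_S' in modelLNT1.wv:
    print(modelLNT1.wv.similarity('война_S', 'мир_S'))
```
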
## Training data

The training corpus is included in this repository as the `input.txt` file. It contains more than 7 million words. For a detailed explanation see Bonch-Osmolovskaya, A., Skorinkin, D., Pavlova, I., Kolbasov, M., & Orekhov, B. (2019). [Tolstoy semanticized: Constructing a digital edition for knowledge discovery](https://www.sciencedirect.com/science/article/abs/pii/S1570826818300635). *Journal of Web Semantics, 59*, 100483.

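Each line of `input.txt` is one sentence of `lemma_POS` tokens, so the size of the corpus is easy to check; a minimal sketch:

```python
# Count sentences (lines) and tokens (words) in the training corpus.
with open('input.txt') as f:
    lines = f.read().splitlines()

n_tokens = sum(len(line.split()) for line in lines)
print(len(lines), 'sentences,', n_tokens, 'tokens')
```
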
## Publication

Орехов Б. В. [Индивидуальная семантика Л. Н. Толстого в свете векторных моделей](https://human.spbstu.ru/article/2023.54.09/) [The individual semantics of L. N. Tolstoy in the light of vector models] // Terra Linguistica. 2023. Vol. 14, no. 4, pp. 119–129. DOI: 10.18721/JHSS.14409

```bibtex
@article{орехов2023индивидуальная,
  title={Индивидуальная семантика Л. Н. Толстого в свете векторных моделей},
  author={Орехов, Б.В.},
  journal={Terra Linguistica},
  volume={14},
  number={4},
  pages={119--129},
  doi={10.18721/JHSS.14409},
  url={https://human.spbstu.ru/userfiles/files/articles/2023/4/119-129.pdf},
  year={2023}
}
```