Mila committed on
Commit 862ca42
1 Parent(s): 3cff715

still broken?

Files changed (2)
  1. app_context.py +0 -260
  2. word_embedding.py +0 -625
app_context.py CHANGED
@@ -1,266 +1,7 @@
1
- <<<<<<< HEAD
2
- import gradio as gr
3
- import math
4
- import spacy
5
- from datasets import load_dataset
6
- from transformers import pipeline, T5Tokenizer
7
- from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
8
- from transformers import TrainingArguments, Trainer, T5ForConditionalGeneration
9
- import torch
10
- import torch.nn.functional as F
11
- from torch.utils.data import DataLoader
12
- import numpy as np
13
- import evaluate
14
- import nltk
15
- from nltk.corpus import stopwords
16
- import subprocess
17
- import sys
18
- import random
19
- from textwrap import fill
20
-
21
- # !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
22
- subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
23
- # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
24
- model_base = "results/checkpoint-17000"
25
- nltk.download('stopwords')
26
- nlp = spacy.load("en_core_web_sm")
27
- stops = stopwords.words("english")
28
- ROMAN_CONSTANTS = (
29
- ( "", "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX" ),
30
- ( "", "X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC" ),
31
- ( "", "C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM" ),
32
- ( "", "M", "MM", "MMM", "", "", "-", "", "", "" ),
33
- ( "", "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix" ),
34
- ( "", "x", "xx", "xxx", "xl", "l", "lx", "lxx", "lxxx", "xc" ),
35
- ( "", "c", "cc", "ccc", "cd", "d", "dc", "dcc", "dccc", "cm" ),
36
- ( "", "m", "mm", "mmm", "", "", "-", "", "", "" ),
37
- )
38
-
39
- # answer = "Pizza"
40
- guesses = []
41
- return_guesses = []
42
- answer = "Moon"
43
- word1 = "Black"
44
- word2 = "White"
45
- word3 = "Sun"
46
- base_prompts = ["Sun is to Moon as ", "Black is to White as ", "Atom is to Element as",
47
- "Athens is to Greece as ", "Cat is to Dog as ", "Robin is to Bird as",
48
- "Hunger is to Ambition as "]
49
-
50
-
51
- #Mean Pooling - Take attention mask into account for correct averaging
52
- def mean_pooling(model_output, attention_mask):
53
- token_embeddings = model_output['token_embeddings'] #First element of model_output contains all token embeddings
54
- input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
55
- return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
56
-
57
-
58
- def normalize(comment, lowercase, remove_stopwords):
59
- if lowercase:
60
- comment = comment.lower()
61
- comment = nlp(comment)
62
- lemmatized = list()
63
- for word in comment:
64
- lemma = word.lemma_.strip()
65
- if lemma:
66
- if not remove_stopwords or (remove_stopwords and lemma not in stops):
67
- lemmatized.append(lemma)
68
- return " ".join(lemmatized)
69
-
70
-
71
- # def tokenize_function(examples):
72
- # return tokenizer(examples["text"])
73
-
74
-
75
- def compute_metrics(eval_pred):
76
- logits, labels = eval_pred
77
- predictions = np.argmax(logits, axis=-1)
78
- metric = evaluate.load("accuracy")
79
- return metric.compute(predictions=predictions, references=labels)
80
-
81
-
82
- def get_model():
83
- global model_base
84
- # last_checkpoint = "./results/checkpoint-22500"
85
-
86
- finetuned_model = T5ForConditionalGeneration.from_pretrained(model_base)
87
- tokenizer = T5Tokenizer.from_pretrained(model_base)
88
- # model = SentenceTransformer(model_base)
89
- gpu_available = torch.cuda.is_available()
90
- device = torch.device("cuda" if gpu_available else "cpu")
91
- finetuned_model = finetuned_model.to(device)
92
- return finetuned_model, tokenizer
93
-
94
-
95
- def cosine_scores(model, sentence):
96
- global word1
97
- global word2
98
- global word3
99
- # sentence1 = f"{word1} is to {word2} as"
100
- embeddings1 = model.encode(sentence, convert_to_tensor=True)
101
-
102
- def embeddings(model, sentences, tokenizer):
103
- global word1
104
- global word2
105
- global word3
106
- global model_base
107
- gpu_available = torch.cuda.is_available()
108
- device = torch.device("cuda" if gpu_available else "cpu")
109
- # device = torch.device('cuda:0')
110
- # embeddings = model.encode(sentences)
111
- question = "Please answer to this question: " + sentences
112
-
113
- inputs = tokenizer(question, return_tensors="pt")
114
-
115
- print(inputs)
116
- # print(inputs.device)
117
- print(model.device)
118
- print(inputs['input_ids'].device)
119
- print(inputs['attention_mask'].device)
120
-
121
- inputs['attention_mask'] = inputs['attention_mask'].to(device)
122
- inputs['input_ids'] = inputs['input_ids'].to(device)
123
-
124
- outputs = model.generate(**inputs)
125
- answer = tokenizer.decode(outputs[0])
126
- answer = answer[6:-4]
127
- # print(fill(answer, width=80))
128
-
129
- print("ANSWER IS", answer)
130
-
131
- return answer
132
-
133
-
134
- def random_word(model, tokenizer):
135
- global model_base
136
- vocab = tokenizer.get_vocab()
137
- # with open(model_base + '/vocab.txt', 'r') as file:
138
- line = ""
139
- # content = file.readlines()
140
- length = tokenizer.vocab_size
141
- # print(vocab)
142
- while line == "":
143
- rand_line = random.randrange(0, length)
144
- # print("TRYING TO FIND", rand_line, "OUT OF", length, "WITH VOCAB OF TYPE", type(vocab))
145
- for word, id in vocab.items():
146
- if id == rand_line and word[0].isalpha() and word not in stops and word not in ROMAN_CONSTANTS:
147
- # if vocab[rand_line][0].isalpha() and vocab[rand_line][:-1] not in stops and vocab[rand_line][:-1] not in ROMAN_CONSTANTS:
148
- line = word
149
- elif id == rand_line:
150
- print(f"{word} is not alpha or is a stop word")
151
- # for num, aline in enumerate(file, 1997):
152
- # if random.randrange(num) and aline.isalpha():
153
- # continue
154
- # # elif not aline.isalpha():
155
-
156
- # line = aline
157
- print(line)
158
- return line
159
-
160
-
161
- def generate_prompt(model, tokenizer):
162
- global word1
163
- global word2
164
- global word3
165
- global answer
166
- global base_prompts
167
- word1 = random_word(model, tokenizer)
168
- # word2 = random_word()
169
-
170
- word2 = embeddings(model, f"{base_prompts[random.randint(0, len(base_prompts) - 1)]}{word1} is to ___.", tokenizer)
171
- word3 = random_word(model, tokenizer)
172
- sentence = f"{word1} is to {word2} as {word3} is to ___."
173
- print(sentence)
174
- answer = embeddings(model, sentence, tokenizer)
175
- print("ANSWER IS", answer)
176
- return f"# {word1} is to {word2} as {word3} is to ___."
177
- # cosine_scores(model, sentence)
178
-
179
-
180
- def greet(name):
181
- return "Hello " + name + "!!"
182
-
183
- def check_answer(guess:str):
184
- global guesses
185
- global answer
186
- global return_guesses
187
- global word1
188
- global word2
189
- global word3
190
-
191
- model, tokenizer = get_model()
192
- output = ""
193
- protected_guess = guess
194
- sentence = f"{word1} is to {word2} as [MASK] is to {guess}."
195
-
196
- other_word = embeddings(model, sentence, tokenizer)
197
- guesses.append(guess)
198
-
199
-
200
-
201
- for guess in return_guesses:
202
- output += ("- " + guess + "<br>")
203
-
204
- # output = output[:-1]
205
- prompt = f"{word1} is to {word2} as {word3} is to ___."
206
- # print("IS", protected_guess, "EQUAL TO", answer, ":", protected_guess.lower() == answer.lower())
207
-
208
- if protected_guess.lower() == answer.lower():
209
- return_guesses.append(f"{protected_guess}: {word1} is to {word2} as {word3} is to {protected_guess}.")
210
- output += f"<span style='color:green'>- {return_guesses[-1]}</span><br>"
211
- new_prompt = generate_prompt(model, tokenizer)
212
- return new_prompt, "Correct!", output
213
- else:
214
- return_guess = f"{protected_guess}: {word1} is to {word2} as {other_word} is to {protected_guess}."
215
- return_guesses.append(return_guess)
216
- output += ("- " + return_guess + " <br>")
217
- return prompt, "Try again!", output
218
-
219
- def main():
220
- global word1
221
- global word2
222
- global word3
223
- global answer
224
- # answer = "Moon"
225
- global guesses
226
-
227
-
228
- # num_rows, data_type, value, example, embeddings = training()
229
- # sent_embeddings = embeddings()
230
- model, tokenizer = get_model()
231
- generate_prompt(model, tokenizer)
232
-
233
- prompt = f"{word1} is to {word2} as {word3} is to ____"
234
- print(prompt)
235
- print("TESTING EMBEDDINGS")
236
- with gr.Blocks() as iface:
237
- mark_question = gr.Markdown(prompt)
238
- with gr.Tab("Guess"):
239
- text_input = gr.Textbox()
240
- text_output = gr.Textbox()
241
- text_button = gr.Button("Submit")
242
- with gr.Accordion("Open for previous guesses"):
243
- text_guesses = gr.Markdown()
244
- # with gr.Tab("Testing"):
245
- # gr.Markdown(f"""The Embeddings are {sent_embeddings}.""")
246
- text_button.click(check_answer, inputs=[text_input], outputs=[mark_question, text_output, text_guesses])
247
- # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
248
- iface.launch()
249
-
250
-
251
-
252
-
253
-
254
- if __name__ == "__main__":
255
- =======
256
  import gradio as gr
257
  import math
258
  import spacy
259
  from datasets import load_dataset
260
- from sentence_transformers import SentenceTransformer
261
- from sentence_transformers import InputExample
262
- from sentence_transformers import losses
263
- from sentence_transformers import util
264
  from transformers import pipeline, T5Tokenizer
265
  from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
266
  from transformers import TrainingArguments, Trainer, T5ForConditionalGeneration
@@ -510,5 +251,4 @@ def main():
510
 
511
 
512
  if __name__ == "__main__":
513
- >>>>>>> 5058aea (Problems)
514
  main()
word_embedding.py CHANGED
@@ -1,625 +1,3 @@
1
- <<<<<<< HEAD
2
- <<<<<<< HEAD
3
- =======
4
- >>>>>>> 5058aea (Problems)
5
- from datasets import load_dataset
6
- import shutil
7
- import json
8
- from collections import defaultdict
9
- import multiprocessing
10
- import gensim
11
- from sklearn.metrics import classification_report
12
- from gensim import corpora
13
- from gensim.test.utils import common_texts
14
- from gensim.models import Word2Vec
15
- from gensim.models import KeyedVectors
16
- from gensim.models import fasttext
17
- from gensim.test.utils import datapath
18
- from wefe.datasets import load_bingliu
19
- from wefe.metrics import RNSB
20
- from wefe.query import Query
21
- from wefe.word_embedding_model import WordEmbeddingModel
22
- from wefe.utils import plot_queries_results, run_queries
23
- import pandas as pd
24
- import gensim.downloader as api
25
- import glob
26
- from sklearn.feature_extraction.text import TfidfVectorizer
27
- from sklearn.ensemble import RandomForestClassifier
28
- from wefe.metrics import WEAT
29
- from wefe.datasets import load_weat
30
- from wefe.utils import run_queries
31
- from wefe.utils import plot_queries_results
32
- import random
33
- from scipy.special import expit
34
- import math
35
- import sys
36
- import os
37
- import argparse
38
- import nltk
39
- import scipy.sparse
40
- import numpy as np
41
- import string
42
- import io
43
- from sklearn.model_selection import train_test_split
44
-
45
-
46
- '''STEPS FOR CODE:
47
- 1. Train word embeddings on Simple English Wikipedia;
48
- 2. Compare these to other pre-trained embeddings;
49
- 3. Quantify biases that exist in these word embeddings;
50
- 4. Use your word embeddings as features in a simple text classifier;
51
- '''
52
-
53
-
54
- def load_vectors(fname):
55
- fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
56
- n, d = map(int, fin.readline().split())
57
- data = {}
58
- # print("Hello", n, d)
59
- for line in fin:
60
- tokens = line.rstrip().split(' ')
61
- data[tokens[0]] = map(float, tokens[1:])
62
- # print(data)
63
-
64
- print(data)
65
- return data
66
-
67
-
68
- def train_embeddings():
69
- '''TRAIN WORD EMBEDDINGS
70
- This will be making use of the dataset from wikipedia and the first step'''
71
- dataset = load_dataset("wikipedia", "20220301.simple")
72
- cores = multiprocessing.cpu_count()
73
- # check the first example of the training portion of the dataset :
74
- # print(dataset['train'][0])
75
- dataset_size = len(dataset)
76
-
77
- ### BUILD VOCAB ###
78
- # print(type(dataset["train"][0]))
79
- vocab = set()
80
- vocab_size = 0
81
- count = 0
82
- ## Generate vocab and split sentances and words?
83
- data = []
84
- for index, page in enumerate(dataset["train"]):
85
- document = page["text"]
86
- document = document.replace("\n", ". ")
87
- # print(document)
88
- for sent in document.split("."):
89
- # print("Sentance:", sent)
90
- new_sent = []
91
- clean_sent =[s for s in sent if s.isalnum() or s.isspace()]
92
- clean_sent = "".join(clean_sent)
93
- for word in clean_sent.split(" "):
94
- if len(word) > 0:
95
- new_word = word.lower()
96
- # print("Word:", new_word)
97
- if new_word[0] not in string.punctuation:
98
- new_sent.append(new_word)
99
- if len(new_sent) > 0:
100
- data.append(new_sent)
101
- # print("New Sent:", new_sent)
102
-
103
-
104
- for index, page in enumerate(dataset["train"]):
105
- # print(page["text"])
106
- # for text in page:
107
- # print(text)
108
- text = page["text"]
109
- clean_text = [s for s in text if s.isalnum() or s.isspace()]
110
- clean_text = "".join(clean_text)
111
- clean_text = clean_text.replace("\n", " ")
112
- # text = text.replace('; ', ' ').replace(", ", " ").replace("\n", " ").replace(":", " ").replace(". ", " ").replace("! ", " ").replace("? ", " ").replace()
113
-
114
- for word in clean_text.split(" "):
115
- # print(word)
116
- if word != "\n" and word != " " and word not in vocab:
117
- vocab.add(word)
118
- vocab_size += 1
119
- # if index == 10:
120
- # break
121
- # print(f"word #{index}/{count} is {word}")
122
- count += 1
123
-
124
- # print(f"There are {vocab_size} vocab words")
125
-
126
- embeddings_model = Word2Vec(
127
- data,
128
- epochs= 10,
129
- window=10,
130
- vector_size= 50)
131
- embeddings_model.save("word2vec.model")
132
-
133
- skip_model = Word2Vec(
134
- data,
135
- epochs= 10,
136
- window=10,
137
- vector_size= 50,
138
- sg=1)
139
- skip_model.save("skip2vec.model")
140
-
141
- embeddings_model = Word2Vec.load("word2vec.model")
142
- skip_model = Word2Vec.load("skip2vec.model")
143
-
144
- # embeddings_model.train(dataset, total_examples=dataset_size, epochs=15)
145
- # print(embeddings_model['train'])
146
- # print(embeddings_model.wv["france"])
147
- return embeddings_model, skip_model
148
-
149
-
150
- def get_data():
151
- dataset = load_dataset("wikipedia", "20220301.simple")
152
- cores = multiprocessing.cpu_count()
153
- # check the first example of the training portion of the dataset :
154
- # print(dataset['train'][0])
155
- dataset_size = len(dataset)
156
-
157
- ### BUILD VOCAB ###
158
- # print(type(dataset["train"][0]))
159
- vocab = set()
160
- vocab_size = 0
161
- count = 0
162
- ## Generate vocab and split sentances and words?
163
- data = []
164
- num_sents = 0
165
- for index, page in enumerate(dataset["train"]):
166
- document = page["text"]
167
- document = document.replace("\n", ". ")
168
- # print(document)
169
- for sent in document.split("."):
170
- num_sents += 1
171
- # print("Sentance:", sent)
172
- new_sent = []
173
- clean_sent =[s for s in sent if s.isalnum() or s.isspace()]
174
- clean_sent = "".join(clean_sent)
175
- for word in clean_sent.split(" "):
176
- if len(word) > 0:
177
- new_word = word.lower()
178
- # print("Word:", new_word)
179
- if new_word[0] not in string.punctuation:
180
- new_sent.append(new_word)
181
- if len(new_sent) > 0:
182
- data.append(new_sent)
183
- # print("New Sent:", new_sent)
184
-
185
- return data, num_sents
186
-
187
-
188
- def compare_embeddings(cbow, skip, urban, fasttext):
189
- '''COMPARE EMBEDDINGS'''
190
- print("Most Similar to dog")
191
- print("cbow", cbow.wv.most_similar(positive=['dog'], negative=[], topn=2))
192
- print("skip", skip.wv.most_similar(positive=['dog'], negative=[], topn=2))
193
- print("urban", urban.most_similar(positive=['dog'], negative=[], topn=2))
194
- print("fasttext", fasttext.most_similar(positive=['dog'], negative=[], topn=2))
195
-
196
- print("\nMost Similar to Pizza - Pepperoni + Pretzel")
197
- print("cbow", cbow.wv.most_similar(positive=['pizza', 'pretzel'], negative=['pepperoni'], topn=2))
198
- print("skip", skip.wv.most_similar(positive=['pizza', 'pretzel'], negative=['pepperoni'], topn=2))
199
- print("urban", urban.most_similar(positive=['pizza', 'pretzel'], negative=['pepperoni'], topn=2))
200
- print("fasttext", fasttext.most_similar(positive=['pizza', 'pretzel'], negative=['pepperoni'], topn=2))
201
-
202
- print("\nMost Similar to witch - woman + man")
203
- print("cbow", cbow.wv.most_similar(positive=['witch', 'man'], negative=['woman'], topn=2))
204
- print("skip", skip.wv.most_similar(positive=['witch', 'man'], negative=['woman'], topn=2))
205
- print("urban", urban.most_similar(positive=['witch', 'man'], negative=['woman'], topn=2))
206
- print("fasttext", fasttext.most_similar(positive=['witch', 'man'], negative=['woman'], topn=2))
207
-
208
- print("\nMost Similar to mayor - town + country")
209
- print("cbow", cbow.wv.most_similar(positive=['mayor', 'country'], negative=['town'], topn=2))
210
- print("skip", skip.wv.most_similar(positive=['mayor', 'country'], negative=['town'], topn=2))
211
- print("urban", urban.most_similar(positive=['mayor', 'country'], negative=['town'], topn=2))
212
- print("fasttext", fasttext.most_similar(positive=['mayor', 'country'], negative=['town'], topn=2))
213
-
214
- print("\nMost Similar to death")
215
- print("cbow", cbow.wv.most_similar(positive=['death'], negative=[], topn=2))
216
- print("skip", skip.wv.most_similar(positive=['death'], negative=[], topn=2))
217
- print("urban", urban.most_similar(positive=['death'], negative=[], topn=2))
218
- print("fasttext", fasttext.most_similar(positive=['death'], negative=[], topn=2))
219
-
220
-
221
- def quantify_bias(cbow, skip, urban, fasttext):
222
- '''QUANTIFY BIASES'''
223
- '''Using WEFE, RNSB'''
224
-
225
- RNSB_words = [
226
- ['christianity'],
227
- ['catholicism'],
228
- ['islam'],
229
- ['judaism'],
230
- ['hinduism'],
231
- ['buddhism'],
232
- ['mormonism'],
233
- ['scientology'],
234
- ['taoism']]
235
-
236
- weat_wordset = load_weat()
237
-
238
- models = [WordEmbeddingModel(cbow.wv, "CBOW"),
239
- WordEmbeddingModel(skip.wv, "skip-gram"),
240
- WordEmbeddingModel(urban, "urban dictionary"),
241
- WordEmbeddingModel(fasttext, "fasttext")]
242
-
243
- # Define the 10 Queries:
244
- # print(weat_wordset["science"])
245
- religions = ['christianity',
246
- 'catholicism',
247
- 'islam',
248
- 'judaism',
249
- 'hinduism',
250
- 'buddhism',
251
- 'mormonism',
252
- 'scientology',
253
- 'taoism',
254
- 'atheism']
255
- queries = [
256
- # Flowers vs Insects wrt Pleasant (5) and Unpleasant (5)
257
- Query([religions, weat_wordset['arts']],
258
- [weat_wordset['career'], weat_wordset['family']],
259
- ['Religion', 'Art'], ['Career', 'Family']),
260
-
261
- Query([religions, weat_wordset['weapons']],
262
- [weat_wordset['male_terms'], weat_wordset['female_terms']],
263
- ['Religion', 'Weapons'], ['Male terms', 'Female terms']),
264
-
265
- ]
266
-
267
- wefe_results = run_queries(WEAT,
268
- queries,
269
- models,
270
- metric_params ={
271
- 'preprocessors': [
272
- {},
273
- {'lowercase': True }
274
- ]
275
- },
276
- warn_not_found_words = True
277
- ).T.round(2)
278
-
279
- print(wefe_results)
280
- plot_queries_results(wefe_results).show()
281
-
282
-
283
- def text_classifier(cbow):
284
- '''SIMPLE TEXT CLASSIFIER'''
285
- '''For each document, average together all embeddings for the
286
- individual words in that document to get a new, d-dimensional representation
287
- of that document (this is essentially a “continuous bag-of-words”). Note that
288
- your input feature size is only d now, instead of the size of your entire vocabulary.
289
- Compare the results of training a model using these “CBOW” input features to
290
- your original (discrete) BOW model.'''
291
- pos_train_files = glob.glob('aclImdb/train/pos/*')
292
- neg_train_files = glob.glob('aclImdb/train/neg/*')
293
- # print(pos_train_files[:5])
294
-
295
- num_files_per_class = 1000
296
- # bow_train_files = cbow
297
- all_train_files = pos_train_files[:num_files_per_class] + neg_train_files[:num_files_per_class]
298
- # vectorizer = TfidfVectorizer(input="filename", stop_words="english")
299
- # vectors = vectorizer.fit_transform(all_train_files)
300
- d = len(cbow.wv["man"])
301
- vectors = np.empty([len(all_train_files), d])
302
- count = 0
303
- vocab = set()
304
- for doc in all_train_files:
305
- temp_array = avg_embeddings(doc, cbow, vocab)
306
- if len(temp_array) > 0:
307
- vectors[count] = temp_array
308
- count += 1
309
- else:
310
- vectors = np.delete(vectors, count)
311
- # vectors = np.array(avg_embeddings(doc, cbow) for doc in all_train_files)
312
- # print(vectors)
313
- # print(vocab)
314
-
315
- # len(vectorizer.vocabulary_)
316
- vectors[0].sum()
317
- # print("Vector at 0", vectors[0])
318
-
319
- X = vectors
320
- y = [1] * num_files_per_class + [0] * num_files_per_class
321
- len(y)
322
-
323
- x_0 = X[0]
324
- w = np.zeros(X.shape[1])
325
- # x_0_dense = x_0.todense()
326
- x_0.dot(w)
327
-
328
- w,b = sgd_for_lr_with_ce(X,y)
329
- # w
330
-
331
- # sorted_vocab = sorted([(k,v) for k,v in vectorizer.vocabulary_.items()],key=lambda x:x[1])
332
- sorted_vocab = sorted(vocab)
333
- # sorted_vocab = [a for (a,b) in sorted_vocab]
334
-
335
- sorted_words_weights = sorted([x for x in zip(sorted_vocab, w)], key=lambda x:x[1])
336
- sorted_words_weights[-50:]
337
-
338
- preds = predict_y_lr(w,b,X)
339
-
340
- preds
341
-
342
- w,b = sgd_for_lr_with_ce(X, y, num_passes=10)
343
- y_pred = predict_y_lr(w,b,X)
344
- print(classification_report(y, y_pred))
345
-
346
- # compute for dev set
347
- # pos_dev_files = glob.glob('aclImdb/test/pos/*')
348
- # neg_dev_files = glob.glob('aclImdb/test/neg/*')
349
- # num_dev_files_per_class = 100
350
- # all_dev_files = pos_dev_files[:num_dev_files_per_class] + neg_dev_files[:num_dev_files_per_class]
351
- # # use the same vectorizer from before! otherwise features won't line up
352
- # # don't fit it again, just use it to transform!
353
- # X_dev = vectorizer.transform(all_dev_files)
354
- # y_dev = [1]* num_dev_files_per_class + [0]* num_dev_files_per_class
355
- # # don't need new w and b, these are from out existing model
356
- # y_dev_pred = predict_y_lr(w,b,X_dev)
357
- # print(classification_report(y_dev, y_dev_pred))
358
-
359
-
360
- def avg_embeddings(doc, model, vocab: set):
361
- words = []
362
- # remove out-of-vocabulary words
363
- with open(doc, "r") as file:
364
- for line in file:
365
- for word in line.split():
366
- words.append(word)
367
- vocab.add(word)
368
- words = [word for word in words if word in model.wv.index_to_key]
369
- if len(words) >= 1:
370
- return np.mean(model.wv[words], axis=0)
371
- else:
372
- return []
373
-
374
-
375
-
376
- def sent_vec(sent, cbow):
377
- vector_size = cbow.wv.vector_size
378
- wv_res = np.zeros(vector_size)
379
- # print(wv_res)
380
- ctr = 1
381
- for w in sent:
382
- if w in cbow.wv:
383
- ctr += 1
384
- wv_res += cbow.wv[w]
385
- wv_res = wv_res/ctr
386
- return wv_res
387
-
388
-
389
- def spacy_tokenizer(sentence):
390
- # Creating our token object, which is used to create documents with linguistic annotations.
391
- # doc = nlp(sentence)
392
-
393
-
394
-
395
- # print(doc)
396
- # print(type(doc))
397
-
398
- # Lemmatizing each token and converting each token into lowercase
399
- # mytokens = [ word.lemma_.lower().strip() for word in doc ]
400
-
401
- # print(mytokens)
402
-
403
- # Removing stop words
404
- # mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
405
-
406
- # return preprocessed list of tokens
407
- return 0
408
-
409
-
410
- def cbow_classifier(cbow, data, num_sentances):
411
- vocab_len = len(cbow.wv.index_to_key)
412
-
413
- embeddings = []
414
- embedding_dict = {}
415
- vocab = set(cbow.wv.index_to_key)
416
-
417
- # print("Data len", len(data))
418
- # print("Data at 0", data[0])
419
-
420
- X_temp = np.empty([len(data), 1])
421
- X_train_vect = np.array([np.array([cbow.wv[i] for i in ls if i in vocab])
422
- for ls in data])
423
- X_test_vect = np.array([np.array([cbow.wv[i] for i in ls if i in vocab])
424
- for ls in data])
425
-
426
- # words = [word for word in words if word in cbow.wv.index_to_key]
427
- for word in vocab:
428
- # embedding[word] = cbow.wv[word]
429
- embeddings.append(np.mean(cbow.wv[word], axis=0))
430
- embedding_dict[word] = np.mean(cbow.wv[word], axis=0)
431
-
432
- X = embeddings
433
-
434
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y)
435
-
436
- # print(embeddings)
437
- # print(vocab_len)
438
-
439
- # X_train_vect_avg = []
440
- # for v in X_train_vect:
441
- # if v.size:
442
- # X_train_vect_avg.append(v.mean(axis=0))
443
- # else:
444
- # X_train_vect_avg.append(np.zeros(100, dtype=float))
445
-
446
- # X_test_vect_avg = []
447
- # for v in X_test_vect:
448
- # if v.size:
449
- # X_test_vect_avg.append(v.mean(axis=0))
450
- # else:
451
- # X_test_vect_avg.append(np.zeros(100, dtype=float))
452
-
453
- # # for i, v in enumerate(X_train_vect_avg):
454
- # # print(len(data.iloc[i]), len(v))
455
-
456
- # x_0 = X_train_vect_avg[0]
457
- # num_files_per_class = 100
458
- # y = [1] * num_files_per_class + [0] * num_files_per_class
459
- # w = np.zeros(X_train_vect_avg.shape[1])
460
- # x_0_dense = x_0.todense()
461
- # x_0.dot(w)
462
-
463
- # w,b = sgd_for_lr_with_ce(X_train_vect_avg, y)
464
- # w
465
-
466
- # sorted_vocab = sorted([(k,v) for k,v in enumerate(embedding_dict)],key=lambda x:x[1])
467
- # sorted_vocab = [a for (a,b) in sorted_vocab]
468
-
469
- # sorted_words_weights = sorted([x for x in zip(sorted_vocab, w)], key=lambda x:x[1])
470
- # sorted_words_weights[-50:]
471
-
472
- # preds = predict_y_lr(w,b,X_train_vect_avg)
473
-
474
- # preds
475
-
476
- # w,b = sgd_for_lr_with_ce(X_train_vect_avg, y, num_passes=10)
477
- # y_pred = predict_y_lr(w,b,X_train_vect_avg)
478
- # print(classification_report(y, y_pred))
479
-
480
- # # compute for dev set
481
- # pos_dev_files = glob.glob('aclImdb/test/pos/*')
482
- # neg_dev_files = glob.glob('aclImdb/test/neg/*')
483
- # num_dev_files_per_class = 100
484
- # all_dev_files = pos_dev_files[:num_dev_files_per_class] + neg_dev_files[:num_dev_files_per_class]
485
- # # use the same vectorizer from before! otherwise features won't line up
486
- # # don't fit it again, just use it to transform!
487
- # # X_dev = vectorizer.transform(all_dev_files)
488
- # # y_dev = [1]* num_dev_files_per_class + [0]* num_dev_files_per_class
489
- # # # don't need new w and b, these are from out existing model
490
- # # y_dev_pred = predict_y_lr(w,b,X_dev)
491
- # # print(classification_report(y_dev, y_dev_pred))
492
-
493
-
494
- def sgd_for_lr_with_ce(X, y, num_passes=5, learning_rate = 0.1):
495
-
496
- num_data_points = X.shape[0]
497
-
498
- # Initialize theta -> 0
499
- num_features = X.shape[1]
500
- w = np.zeros(num_features)
501
- b = 0.0
502
-
503
- # repeat until done
504
- # how to define "done"? let's just make it num passes for now
505
- # we can also do norm of gradient and when it is < epsilon (something tiny)
506
- # we stop
507
-
508
- for current_pass in range(num_passes):
509
-
510
- # iterate through entire dataset in random order
511
- order = list(range(num_data_points))
512
- random.shuffle(order)
513
- for i in order:
514
-
515
- # compute y-hat for this value of i given y_i and x_i
516
- x_i = X[i]
517
- y_i = y[i]
518
-
519
- # need to compute based on w and b
520
- # sigmoid(w dot x + b)
521
- z = x_i.dot(w) + b
522
- y_hat_i = expit(z)
523
-
524
- # for each w (and b), modify by -lr * (y_hat_i - y_i) * x_i
525
- w = w - learning_rate * (y_hat_i - y_i) * x_i
526
- b = b - learning_rate * (y_hat_i - y_i)
527
-
528
- # return theta
529
- return w,b
530
-
531
-
532
- def predict_y_lr(w,b,X,threshold=0.5):
533
-
534
- # use our matrix operation version of the logistic regression model
535
- # X dot w + b
536
- # need to make w a column vector so the dimensions line up correctly
537
- y_hat = X.dot( w.reshape((-1,1)) ) + b
538
-
539
- # then just check if it's > threshold
540
- preds = np.where(y_hat > threshold,1,0)
541
-
542
- return preds
543
-
544
-
545
- def main():
546
- parser = argparse.ArgumentParser(
547
- prog='word_embedding',
548
- description='This program will train a word embedding model using simple wikipedia.',
549
- epilog='To skip training the model and to used the saved model "word2vec.model", use the command --skip or -s.'
550
- )
551
- parser.add_argument('-s', '--skip', action='store_true')
552
- parser.add_argument('-e', '--extra', action='store_true')
553
- parser.add_argument('-b', '--bias', action='store_true')
554
- parser.add_argument('-c', '--compare', action='store_true')
555
- parser.add_argument('-t', '--text', action='store_true')
556
-
557
- args = parser.parse_args()
558
- skip_model = None
559
- cbow_model = None
560
- ud_model = None
561
- wiki_model = None
562
- if args.compare:
563
- if args.skip:
564
- # print("Skipping")
565
- cbow_model = Word2Vec.load("word2vec.model")
566
- skip_model = Word2Vec.load("skip2vec.model")
567
- ud_model = KeyedVectors.load("urban2vec.model")
568
- wiki_model = KeyedVectors.load("wiki2vec.model")
569
- elif args.extra:
570
- # print("Extra mode")
571
- cbow_model = Word2Vec.load("word2vec.model")
572
- skip_model = Word2Vec.load("skip2vec.model")
573
- wiki_model = KeyedVectors.load_word2vec_format("wiki-news-300d-1M-subwords.vec", binary=False)
574
- ud_model = KeyedVectors.load_word2vec_format("ud_basic.vec", binary=False)
575
- wiki_model.save("wiki2vec.model")
576
- ud_model.save("urban2vec.model")
577
- else:
578
- cbow_model, skip_model = train_embeddings()
579
- wiki_model = KeyedVectors.load_word2vec_format("wiki-news-300d-1M-subwords.vec", binary=False)
580
- ud_model = KeyedVectors.load_word2vec_format("ud_basic.vec", binary=False)
581
- wiki_model.save("wiki2vec.model")
582
- ud_model.save("urban2vec.model")
583
- compare_embeddings(cbow_model, skip_model, ud_model, wiki_model)
584
- if args.bias:
585
- if args.skip:
586
- # print("Skipping")
587
- cbow_model = Word2Vec.load("word2vec.model")
588
- skip_model = Word2Vec.load("skip2vec.model")
589
- ud_model = KeyedVectors.load("urban2vec.model")
590
- wiki_model = KeyedVectors.load("wiki2vec.model")
591
- elif args.extra:
592
- # print("Extra mode")
593
- cbow_model = Word2Vec.load("word2vec.model")
594
- skip_model = Word2Vec.load("skip2vec.model")
595
- wiki_model = KeyedVectors.load_word2vec_format("wiki-news-300d-1M-subwords.vec", binary=False)
596
- ud_model = KeyedVectors.load_word2vec_format("ud_basic.vec", binary=False)
597
- wiki_model.save("wiki2vec.model")
598
- ud_model.save("urban2vec.model")
599
- else:
600
- cbow_model, skip_model = train_embeddings()
601
- wiki_model = KeyedVectors.load_word2vec_format("wiki-news-300d-1M-subwords.vec", binary=False)
602
- ud_model = KeyedVectors.load_word2vec_format("ud_basic.vec", binary=False)
603
- wiki_model.save("wiki2vec.model")
604
- ud_model.save("urban2vec.model")
605
- quantify_bias(cbow_model, skip_model, ud_model, wiki_model)
606
- if args.text:
607
- if args.skip:
608
- # print("Skipping")
609
- cbow_model = Word2Vec.load("word2vec.model")
610
- else:
611
- cbow_model, skip_model = train_embeddings()
612
-
613
- text_classifier(cbow_model)
614
- # data, sents = get_data()
615
- # cbow_classifier(cbow_model, data, sents)
616
-
617
- # print("No errors?")
618
-
619
-
620
- if __name__ == "__main__":
621
- <<<<<<< HEAD
622
- =======
623
  from datasets import load_dataset
624
  import shutil
625
  import json
@@ -1236,7 +614,4 @@ def main():
1236
 
1237
 
1238
  if __name__ == "__main__":
1239
- >>>>>>> 7d5b505 (New in-context model with working UI System)
1240
- =======
1241
- >>>>>>> 5058aea (Problems)
1242
  main()