lambdaofgod
committed on
Commit
·
42de6bd
1
Parent(s):
21d27ae
feat: Implement `extract_from_corpus` in `text_visualization.py`
Browse files
- requirements.txt +2 -0
- text_visualization.py +38 -0
requirements.txt
CHANGED
@@ -1,2 +1,4 @@
|
|
1 |
gradio==3.48.0
|
2 |
plotly==5.24.1
|
|
|
|
|
|
1 |
gradio==3.48.0
|
2 |
plotly==5.24.1
|
3 |
+
scikit-learn==1.3.0
|
4 |
+
wordcloud==1.9.2
|
text_visualization.py
CHANGED
@@ -9,3 +9,41 @@ class WordCloudExtractor:
|
|
9 |
|
10 |
def extract_from_corpus(self, texts: Iterable[str], n_words: int) -> wordcloud.WordCloud:
|
11 |
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
def extract_from_corpus(self, texts: Iterable[str], n_words: int) -> wordcloud.WordCloud:
|
11 |
pass
|
12 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
13 |
+
from wordcloud import WordCloud
|
14 |
+
import numpy as np
|
15 |
+
|
16 |
+
class TextVisualization:
|
17 |
+
@staticmethod
|
18 |
+
def extract_from_corpus(texts, max_features=100):
|
19 |
+
"""
|
20 |
+
Extract word frequencies from a corpus using TF-IDF vectorization
|
21 |
+
and generate word cloud frequencies.
|
22 |
+
|
23 |
+
Args:
|
24 |
+
texts: List of text documents
|
25 |
+
max_features: Maximum number of words to include
|
26 |
+
|
27 |
+
Returns:
|
28 |
+
Dictionary of word frequencies suitable for WordCloud
|
29 |
+
"""
|
30 |
+
# Initialize TF-IDF vectorizer
|
31 |
+
tfidf = TfidfVectorizer(
|
32 |
+
max_features=max_features,
|
33 |
+
stop_words='english',
|
34 |
+
lowercase=True
|
35 |
+
)
|
36 |
+
|
37 |
+
# Fit and transform the texts
|
38 |
+
tfidf_matrix = tfidf.fit_transform(texts)
|
39 |
+
|
40 |
+
# Get feature names (words)
|
41 |
+
feature_names = tfidf.get_feature_names_out()
|
42 |
+
|
43 |
+
# Calculate mean TF-IDF scores across documents
|
44 |
+
mean_tfidf = np.array(tfidf_matrix.mean(axis=0)).flatten()
|
45 |
+
|
46 |
+
# Create frequency dictionary
|
47 |
+
frequencies = dict(zip(feature_names, mean_tfidf))
|
48 |
+
|
49 |
+
return frequencies
|