lambdaofgod committed on
Commit
42de6bd
·
1 Parent(s): 21d27ae

feat: Implement `extract_from_corpus` in `text_visualization.py`

Browse files
Files changed (2) hide show
  1. requirements.txt +2 -0
  2. text_visualization.py +38 -0
requirements.txt CHANGED
@@ -1,2 +1,4 @@
1
  gradio==3.48.0
2
  plotly==5.24.1
 
 
 
1
  gradio==3.48.0
2
  plotly==5.24.1
3
+ scikit-learn==1.3.0
4
+ wordcloud==1.9.2
text_visualization.py CHANGED
@@ -9,3 +9,41 @@ class WordCloudExtractor:
9
 
10
  def extract_from_corpus(self, texts: Iterable[str], n_words: int) -> wordcloud.WordCloud:
11
  pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  def extract_from_corpus(self, texts: Iterable[str], n_words: int) -> wordcloud.WordCloud:
11
  pass
12
+ from sklearn.feature_extraction.text import TfidfVectorizer
13
+ from wordcloud import WordCloud
14
+ import numpy as np
15
+
16
+ class TextVisualization:
17
+ @staticmethod
18
+ def extract_from_corpus(texts, max_features=100):
19
+ """
20
+ Extract word frequencies from a corpus using TF-IDF vectorization
21
+ and generate word cloud frequencies.
22
+
23
+ Args:
24
+ texts: List of text documents
25
+ max_features: Maximum number of words to include
26
+
27
+ Returns:
28
+ Dictionary of word frequencies suitable for WordCloud
29
+ """
30
+ # Initialize TF-IDF vectorizer
31
+ tfidf = TfidfVectorizer(
32
+ max_features=max_features,
33
+ stop_words='english',
34
+ lowercase=True
35
+ )
36
+
37
+ # Fit and transform the texts
38
+ tfidf_matrix = tfidf.fit_transform(texts)
39
+
40
+ # Get feature names (words)
41
+ feature_names = tfidf.get_feature_names_out()
42
+
43
+ # Calculate mean TF-IDF scores across documents
44
+ mean_tfidf = np.array(tfidf_matrix.mean(axis=0)).flatten()
45
+
46
+ # Create frequency dictionary
47
+ frequencies = dict(zip(feature_names, mean_tfidf))
48
+
49
+ return frequencies