Zelyanoth committed
Commit 6e6661b · 1 Parent(s): c4c0b39

feat(content): update gradio-client and improve content service


Update gradio-client to version 2.0.0 for improved AI content generation
Refactor ContentService class to enhance readability and maintainability
Implement lazy initialization of Gradio client
Improve code organization and formatting
Update Linkedin_poster_dev submodule to reflect changes
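
Taken together, the dependency pin and the service refactor keep the same lazy-construction pattern but change how the Hugging Face key is passed to the client. The minimal sketch below reuses the space name, the `/poster_linkedin` endpoint, and the `token=` keyword exactly as they appear in the file diff further down, and assumes gradio-client 2.0.0 accepts that keyword (the pre-2.0 code used `hf_token=`); the class name is illustrative, not part of the commit.

```python
from gradio_client import Client


class LazyPosterClient:
    """Illustrative stand-in for ContentService's lazy Gradio client handling."""

    def __init__(self, hugging_key=None):
        # The key is only stored; no network call and no Flask app context is needed yet.
        self.hugging_key = hugging_key
        self.client = None

    def _get_client(self):
        if self.client is None:
            # gradio-client 2.0.0 passes the key via `token` (pre-2.0 code used `hf_token`).
            self.client = Client("Zelyanoth/Linkedin_poster_dev", token=self.hugging_key)
        return self.client

    def generate(self, user_id):
        # The remote client is built on the first real call, mirroring generate_post_content().
        return self._get_client().predict(code=user_id, api_name="/poster_linkedin")
```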

backend/requirements.txt CHANGED
@@ -19,7 +19,7 @@ apscheduler>=3.11.0
  pandas>=2.2.2
 
  # AI content generation
- gradio-client>=1.10.4
+ gradio-client==2.0.0
 
  # Database integration
  supabase>=2.16.0
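
Since the requirement moves from a floor (`>=1.10.4`) to an exact pin (`==2.0.0`), it can be worth verifying at startup that the installed package matches the pin. A small sketch of such a check (not part of this commit):

```python
from importlib.metadata import version

# The pin in requirements.txt is exact, so anything other than 2.0.0
# means the environment was not rebuilt after this change.
installed = version("gradio-client")
if installed != "2.0.0":
    raise RuntimeError(f"expected gradio-client 2.0.0, found {installed}")
```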
backend/services/content_service.py CHANGED
@@ -12,14 +12,14 @@ import base64
 
  class ContentService:
      """Service for AI content generation using Hugging Face models."""
-
+
      def __init__(self, hugging_key=None):
          # Store the hugging_key to be used later when needed
          # This avoids accessing current_app during initialization
          self.hugging_key = hugging_key
          # Initialize the Gradio client lazily - only when first needed
          self.client = None
-
+
      def _initialize_client(self):
          """Initialize the Gradio client, either with provided key or from app config."""
          if self.client is None:
@@ -31,14 +31,14 @@ class ContentService:
                  # We're outside of an application context
                  raise RuntimeError("Hugging Face API key not provided and not available in app config. "
                                     "Please provide the key when initializing ContentService.")
-
-             self.client = Client("Zelyanoth/Linkedin_poster_dev", hf_token=self.hugging_key)
-
+
+             self.client = Client("Zelyanoth/Linkedin_poster_dev", token=self.hugging_key)
+
      def validate_unicode_content(self, content):
          """Validate Unicode content while preserving original formatting and spaces."""
          if not content or not isinstance(content, str):
              return content
-
+
          try:
              # Test if content can be encoded as UTF-8
              content.encode('utf-8')

[The remaining hunks in this file are whitespace-only: they strip trailing whitespace from otherwise blank lines and leave the surrounding code unchanged.]
requirements.txt CHANGED
@@ -19,7 +19,7 @@ apscheduler>=3.11.0
  pandas>=2.2.2
 
  # AI content generation
- gradio-client>=1.10.4
+ gradio-client==2.0.0
 
  # Database integration
  supabase>=2.16.0