feat(content): update gradio-client and improve content service
- Update gradio-client to version 2.0.0 for improved AI content generation
- Refactor ContentService class to enhance readability and maintainability
- Implement lazy initialization of the Gradio client
- Improve code organization and formatting
- Update Linkedin_poster_dev submodule to reflect changes
- backend/requirements.txt +1 -1
- backend/services/content_service.py +116 -116
- requirements.txt +1 -1
backend/requirements.txt (changed, +1 -1)

```diff
@@ -19,7 +19,7 @@ apscheduler>=3.11.0
 pandas>=2.2.2
 
 # AI content generation
-gradio-client
+gradio-client==2.0.0
 
 # Database integration
 supabase>=2.16.0
```
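Pinning an exact release keeps both requirements files and the deployed Space client in lockstep. A minimal, hypothetical sanity check (not part of the diff) that an environment actually resolved the pin:

```python
# Hypothetical check that the pinned gradio-client release was installed.
# Note: the distribution's metadata name is "gradio_client" (underscore).
from importlib.metadata import version

assert version("gradio_client") == "2.0.0", "expected the pinned gradio-client release"
```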
backend/services/content_service.py (changed, +116 -116; aside from the updated `Client(...)` call in the second hunk, the changes below are whitespace-only edits to blank separator lines)

```diff
@@ -12,14 +12,14 @@ import base64
 
 class ContentService:
     """Service for AI content generation using Hugging Face models."""
-
+
     def __init__(self, hugging_key=None):
         # Store the hugging_key to be used later when needed
         # This avoids accessing current_app during initialization
         self.hugging_key = hugging_key
         # Initialize the Gradio client lazily - only when first needed
         self.client = None
-
+
     def _initialize_client(self):
         """Initialize the Gradio client, either with provided key or from app config."""
         if self.client is None:
@@ -31,14 +31,14 @@ class ContentService:
                 # We're outside of an application context
                 raise RuntimeError("Hugging Face API key not provided and not available in app config. "
                                    "Please provide the key when initializing ContentService.")
-
-            self.client = Client("Zelyanoth/Linkedin_poster_dev",
-
+
+            self.client = Client("Zelyanoth/Linkedin_poster_dev", token=self.hugging_key)
+
     def validate_unicode_content(self, content):
         """Validate Unicode content while preserving original formatting and spaces."""
         if not content or not isinstance(content, str):
             return content
-
+
         try:
             # Test if content can be encoded as UTF-8
             content.encode('utf-8')
```
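Condensed, the lazy-initialization pattern introduced above looks like the sketch below. The Space name and the `token=` argument mirror the call shown in the hunk; the fallback that reads the key from app config is elided:

```python
# Sketch of the lazy-init pattern from the diff; the app-config fallback
# branch is omitted for brevity.
from gradio_client import Client

class LazyClientHolder:
    def __init__(self, hugging_key=None):
        self.hugging_key = hugging_key  # no current_app access at construction time
        self.client = None              # created on first use

    def _initialize_client(self):
        if self.client is None:
            self.client = Client("Zelyanoth/Linkedin_poster_dev",
                                 token=self.hugging_key)
```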
```diff
@@ -50,12 +50,12 @@ class ContentService:
         except:
             # Ultimate fallback
             return str(content)
-
+
     def preserve_formatting(self, content):
         """Preserve spaces, line breaks, and paragraph formatting."""
         if not content:
             return content
-
+
         # Preserve all whitespace characters including spaces, tabs, and newlines
         # This ensures that paragraph breaks and indentation are maintained
         try:
@@ -65,36 +65,36 @@ class ContentService:
         except UnicodeEncodeError:
             # Fallback with error replacement but preserve whitespace
             return content.encode('utf-8', errors='replace').decode('utf-8')
-
+
     def sanitize_content_for_api(self, content):
         """Sanitize content for API calls while preserving original text, spaces, and formatting."""
         if not content:
             return content
-
+
         # First preserve formatting and spaces
         preserved = self.preserve_formatting(content)
-
+
         # Only validate Unicode, don't remove spaces or formatting
         validated = self.validate_unicode_content(preserved)
-
+
         # Only remove null bytes that might cause issues in API calls
         if '\x00' in validated:
             validated = validated.replace('\x00', '')
-
+
         # Ensure line breaks and spaces are preserved
         validated = validated.replace('\r\n', '\n').replace('\r', '\n')
-
+
         return validated
-
+
     def _is_base64_image(self, data):
         """Check if the data is a base64 encoded image string."""
        if not isinstance(data, str):
             return False
-
+
         # Check if it starts with data URL prefix
         if data.startswith('data:image/'):
             return True
-
+
         # Try to decode as base64
         try:
             # Extract base64 part if it's a data URL
```
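A standalone rendering of what `sanitize_content_for_api` guarantees, per the code above: whitespace and line structure survive, null bytes are dropped, and CR/CRLF line endings are normalized to LF:

```python
# Same transformations as the sanitize step above, shown in isolation.
raw = "Line one\r\nLine two\x00 with a null byte\r\ttab kept"
cleaned = raw.replace('\x00', '').replace('\r\n', '\n').replace('\r', '\n')
assert cleaned == "Line one\nLine two with a null byte\n\ttab kept"
```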
```diff
@@ -102,13 +102,13 @@ class ContentService:
                 base64_part = data.split(',')[1]
             else:
                 base64_part = data
-
+
             # Try to decode
             base64.b64decode(base64_part, validate=True)
             return True
         except Exception:
             return False
-
+
     def _base64_to_bytes(self, base64_string):
         """Convert a base64 encoded string to bytes."""
         try:
@@ -117,20 +117,20 @@ class ContentService:
                 base64_part = base64_string.split(',')[1]
             else:
                 base64_part = base64_string
-
+
             # Decode base64 to bytes
             return base64.b64decode(base64_part, validate=True)
         except Exception as e:
             current_app.logger.error(f"Failed to decode base64 image: {str(e)}")
             raise Exception(f"Failed to decode base64 image: {str(e)}")
-
+
     def generate_post_content(self, user_id: str) -> tuple:
         """
         Generate post content using AI.
-
+
         Args:
             user_id (str): User ID for personalization
-
+
         Returns:
             tuple: (Generated post content, Image URL or None)
         """
```
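The data-URL handling used by `_is_base64_image` and `_base64_to_bytes` above reduces to: split off the `data:image/...;base64,` prefix, then strict-decode the payload. A self-contained illustration:

```python
# Round-trip demonstration of the data-URL split + strict base64 decode.
import base64

data_url = "data:image/png;base64," + base64.b64encode(b"\x89PNG").decode()
payload = data_url.split(',')[1] if data_url.startswith('data:image/') else data_url
assert base64.b64decode(payload, validate=True) == b"\x89PNG"
```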
```diff
@@ -138,13 +138,13 @@ class ContentService:
             # Ensure the client is initialized (lazy initialization)
             if self.client is None:
                 self._initialize_client()
-
+
             # Call the Hugging Face model to generate content
             result = self.client.predict(
                 code=user_id,
                 api_name="/poster_linkedin"
             )
-
+
             # Handle the case where result might be a tuple from Gradio
             # The Gradio API returns a tuple with (content, image_data)
             if isinstance(result, tuple) and len(result) >= 2:
@@ -164,7 +164,7 @@ class ContentService:
                 except (ValueError, SyntaxError):
                     # If that fails, treat the result as a plain string
                     parsed_result = [result]
-
+
             # Extract the first element if it's a list
             if isinstance(parsed_result, list):
                 generated_content = parsed_result[0] if parsed_result and parsed_result[0] is not None else "Generated content will appear here..."
@@ -173,13 +173,13 @@ class ContentService:
             else:
                 generated_content = str(parsed_result) if parsed_result is not None else "Generated content will appear here..."
                 image_data = None
-
+
             # Validate, sanitize, and preserve formatting of the generated content
             sanitized_content = self.sanitize_content_for_api(generated_content)
-
+
             # Ensure paragraph breaks and formatting are preserved
             final_content = self.preserve_formatting(sanitized_content)
-
+
             # Handle image data - could be URL or base64
             image_bytes = None
             if image_data:
@@ -189,22 +189,22 @@ class ContentService:
                 else:
                     # It's a URL, keep as string
                     image_bytes = image_data
-
+
             return (final_content, image_bytes)
-
+
         except Exception as e:
             error_message = str(e)
             current_app.logger.error(f"Content generation failed: {error_message}")
             raise Exception(f"Content generation failed: {error_message}")
-
+
     def add_rss_source(self, rss_link: str, user_id: str) -> str:
         """
         Add an RSS source for content generation.
-
+
         Args:
             rss_link (str): RSS feed URL
             user_id (str): User ID
-
+
         Returns:
             str: Result message
         """
```
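For reference, a hypothetical call site for the refactored service. The tuple shape comes from the docstring above; the import path, app wiring, and config key name are assumptions:

```python
# Hypothetical usage sketch; module path and "HUGGING_KEY" config name are assumed.
from flask import Flask
from backend.services.content_service import ContentService  # assumed path

app = Flask(__name__)
app.config["HUGGING_KEY"] = "hf_..."  # placeholder token

service = ContentService(hugging_key=app.config["HUGGING_KEY"])
with app.app_context():
    content, image = service.generate_post_content(user_id="some-user-id")
    # `image` is bytes when the model returned base64, a URL string otherwise, or None
```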
|
@@ -212,32 +212,32 @@ class ContentService:
|
|
| 212 |
# Ensure the client is initialized (lazy initialization)
|
| 213 |
if self.client is None:
|
| 214 |
self._initialize_client()
|
| 215 |
-
|
| 216 |
# Call the Hugging Face model to add RSS source
|
| 217 |
rss_input = f"{rss_link}__thi_irrh'èçs_my_id__! {user_id}"
|
| 218 |
sanitized_rss_input = self.sanitize_content_for_api(rss_input)
|
| 219 |
-
|
| 220 |
result = self.client.predict(
|
| 221 |
rss_link=sanitized_rss_input,
|
| 222 |
api_name="/ajouter_rss"
|
| 223 |
)
|
| 224 |
-
|
| 225 |
# Sanitize and preserve formatting of the result
|
| 226 |
sanitized_result = self.sanitize_content_for_api(result)
|
| 227 |
return self.preserve_formatting(sanitized_result)
|
| 228 |
-
|
| 229 |
except Exception as e:
|
| 230 |
raise Exception(f"Failed to add RSS source: {str(e)}")
|
| 231 |
|
| 232 |
def analyze_keyword_frequency(self, keyword, user_id, date_range='monthly'):
|
| 233 |
"""
|
| 234 |
Analyze the frequency of new articles/links appearing in RSS feeds generated from keywords.
|
| 235 |
-
|
| 236 |
Args:
|
| 237 |
keyword (str): The keyword to analyze
|
| 238 |
user_id (str): User ID for filtering content
|
| 239 |
date_range (str): The date range to analyze ('daily', 'weekly', 'monthly')
|
| 240 |
-
|
| 241 |
Returns:
|
| 242 |
dict: Analysis data with article frequency over time
|
| 243 |
"""
|
|
```diff
@@ -245,14 +245,14 @@ class ContentService:
         from flask import current_app
         from datetime import datetime, timedelta
         import re
-
+
         # Attempt to access current_app, but handle gracefully if outside of app context
         try:
             # Fetch posts from the database that belong to the user
             # Check if Supabase client is initialized
             if not hasattr(current_app, 'supabase') or current_app.supabase is None:
                 raise Exception("Database connection not initialized")
-
+
             # Get all RSS sources for the user to analyze
             rss_response = (
                 current_app.supabase
@@ -261,18 +261,18 @@ class ContentService:
                 .eq("user_id", user_id)
                 .execute()
             )
-
+
             user_rss_sources = rss_response.data if rss_response.data else []
-
+
             # Analyze each RSS source for frequency of new articles/links
             keyword_data = []
-
+
             # Create a DataFrame to store articles from RSS feeds
             all_articles = []
-
+
             for rss_source in user_rss_sources:
                 rss_link = rss_source["source"]
-
+
                 # Check if the source is a keyword rather than an RSS URL
                 # If it's a keyword, generate a Google News RSS URL
                 if self._is_url(rss_link):
@@ -281,14 +281,14 @@ class ContentService:
                 else:
                     # It's a keyword, generate Google News RSS URL
                     feed_url = self._generate_google_news_rss_from_string(rss_link)
-
+
                 # Parse the RSS feed
                 feed = feedparser.parse(feed_url)
-
+
                 # Log some debug information
                 current_app.logger.info(f"Processing RSS feed: {feed_url}")
                 current_app.logger.info(f"Number of entries in feed: {len(feed.entries)}")
-
+
                 # Extract articles from the feed
                 for entry in feed.entries:
                     # Use the same date handling as in the original ai_agent.py
@@ -299,39 +299,39 @@ class ContentService:
                         'date': entry.get('published', entry.get('updated', None)),
                         'content': entry.get('summary', '') + ' ' + entry.get('title', '')
                     }
-
+
                     # Log individual article data for debugging
                     current_app.logger.info(f"Article title: {entry.title}")
                     current_app.logger.info(f"Article date: {article_data['date']}")
-
+
                     all_articles.append(article_data)
-
+
             # Create a DataFrame from the articles
             df_articles = pd.DataFrame(all_articles)
-
+
             current_app.logger.info(f"Total articles collected: {len(df_articles)}")
             if not df_articles.empty:
                 current_app.logger.info(f"DataFrame columns: {df_articles.columns.tolist()}")
                 current_app.logger.info(f"Sample of DataFrame:\n{df_articles.head()}")
-
+
             # Convert date column to datetime if it exists
             if not df_articles.empty and 'date' in df_articles.columns:
                 # Convert struct_time objects to datetime
                 df_articles['date'] = pd.to_datetime(df_articles['date'], errors='coerce', utc=True)
-
+
                 current_app.logger.info(f"DataFrame shape after date conversion: {df_articles.shape}")
                 current_app.logger.info(f"Date column after conversion:\n{df_articles['date'].head()}")
-
+
                 df_articles = df_articles.dropna(subset=['date'])  # Remove entries with invalid dates
                 df_articles = df_articles.sort_values(by='date', ascending=True)
-
+
                 current_app.logger.info(f"DataFrame shape after dropping invalid dates: {df_articles.shape}")
-
+
             # If we have articles, analyze article frequency over time
             if not df_articles.empty:
                 # Group by date ranges and count all articles (not just those containing the keyword)
                 # This will show how many new articles appear in RSS feeds over time
-
+
                 # For the date grouping, use the appropriate pandas syntax
                 # Handle timezone-aware dates properly to avoid warnings
                 if date_range == 'daily':
```
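The collection loop above relies on feedparser exposing `published`/`updated` as strings and on pandas coercing them into NaT-safe timestamps. A minimal sketch of that pipeline in isolation (feed URL is illustrative):

```python
# Isolated sketch of the entry-collection and date-coercion steps above.
import feedparser
import pandas as pd

feed = feedparser.parse("https://news.google.com/rss")  # any RSS feed works
rows = [{'title': e.get('title', ''),
         'date': e.get('published', e.get('updated', None))}
        for e in feed.entries]
df = pd.DataFrame(rows)
if not df.empty:
    df['date'] = pd.to_datetime(df['date'], errors='coerce', utc=True)
    df = df.dropna(subset=['date']).sort_values('date')  # invalid dates become NaT and drop out
```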
```diff
@@ -354,15 +354,15 @@ class ContentService:
                         'day': 1
                     }).dt.date
                     interval = 'MS'  # Month Start frequency
-
+
                 # Count all articles by date group (this is the key difference - we're counting all articles, not keyword matches)
                 article_counts = df_articles.groupby('date_group').size().reset_index(name='count')
-
+
                 # Create a complete date range for the chart
                 if not article_counts.empty:
                     start_date = article_counts['date_group'].min()
                     end_date = article_counts['date_group'].max()
-
+
                     # Use the correct frequency for the date range generation
                     if date_range == 'daily':
                         freq = 'D'
@@ -370,26 +370,26 @@ class ContentService:
                         freq = 'W-MON'  # Weekly on Monday
                     else:  # monthly
                         freq = 'MS'  # Month start frequency
-
+
                     # Create a complete date range
                     full_date_range = pd.date_range(start=start_date, end=end_date, freq=freq).to_frame(index=False, name='date_group')
                     full_date_range['date_group'] = full_date_range['date_group'].dt.date
-
+
                     # Merge with article counts
                     article_counts = full_date_range.merge(article_counts, on='date_group', how='left').fillna(0)
-
+
                     # Convert counts to integers
                     article_counts['count'] = article_counts['count'].astype(int)
-
+
                     # Format the data for the frontend chart
                     for _, row in article_counts.iterrows():
                         date_str = row['date_group'].strftime('%Y-%m-%d')
-
+
                         # Calculate values for different time ranges
                         daily_val = row['count'] if date_range == 'daily' else int(row['count'] / 7) if date_range == 'weekly' else int(row['count'] / 30)
                         weekly_val = daily_val * 7 if date_range == 'daily' else row['count'] if date_range == 'weekly' else int(row['count'] / 4)
                         monthly_val = daily_val * 30 if date_range == 'daily' else weekly_val * 4 if date_range == 'weekly' else row['count']
-
+
                         keyword_data.append({
                             'date': date_str,
                             'daily': daily_val,
```
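The gap-filling step above follows a standard pandas idiom: build the full calendar at the chosen frequency, left-merge the observed counts onto it, and zero-fill the missing buckets. Demonstrated on toy data:

```python
# Toy demonstration of the full-calendar merge and zero-fill used above.
import pandas as pd

counts = pd.DataFrame({'date_group': [pd.Timestamp('2024-01-01').date(),
                                      pd.Timestamp('2024-01-04').date()],
                       'count': [3, 1]})
full = pd.date_range('2024-01-01', '2024-01-05', freq='D').to_frame(index=False, name='date_group')
full['date_group'] = full['date_group'].dt.date
merged = full.merge(counts, on='date_group', how='left').fillna(0)
merged['count'] = merged['count'].astype(int)  # -> [3, 0, 0, 1, 0]
```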
```diff
@@ -406,7 +406,7 @@ class ContentService:
                         date = (start_date - timedelta(weeks=i)).strftime('%Y-%m-%d')
                     else:  # monthly
                         date = (start_date - timedelta(days=30*i)).strftime('%Y-%m-%d')
-
+
                     keyword_data.append({
                         'date': date,
                         'daily': 0,
@@ -423,14 +423,14 @@ class ContentService:
                     date = (start_date - timedelta(weeks=i)).strftime('%Y-%m-%d')
                 else:  # monthly
                     date = (start_date - timedelta(days=30*i)).strftime('%Y-%m-%d')
-
+
                 keyword_data.append({
                     'date': date,
                     'daily': 0,
                     'weekly': 0,
                     'monthly': 0
                 })
-
+
             return keyword_data
         except RuntimeError:
             # We're outside of application context
@@ -445,16 +445,16 @@ class ContentService:
                 date = (start_date - timedelta(weeks=i)).strftime('%Y-%m-%d')
             else:  # monthly
                 date = (start_date - timedelta(days=30*i)).strftime('%Y-%m-%d')
-
+
             keyword_data.append({
                 'date': date,
                 'daily': 0,
                 'weekly': 0,
                 'monthly': 0
             })
-
+
             return keyword_data
-
+
         except Exception as e:
             import logging
             logging.error(f"Keyword frequency analysis failed: {str(e)}")
```
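All three fallback branches above emit the same zero-filled series by stepping back from a start date at the selected cadence. The idea, condensed (the bucket count is an assumption here, since the loop bounds are outside the shown hunks):

```python
# Condensed sketch of the zero-filled fallback series; 7 buckets assumed.
from datetime import datetime, timedelta

start_date = datetime.now()
keyword_data = []
for i in range(6, -1, -1):  # oldest bucket first
    date = (start_date - timedelta(days=30 * i)).strftime('%Y-%m-%d')  # monthly cadence
    keyword_data.append({'date': date, 'daily': 0, 'weekly': 0, 'monthly': 0})
```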
```diff
@@ -464,11 +464,11 @@ class ContentService:
         """
         Analyze the frequency pattern of links generated from RSS feeds for a specific keyword over time.
         Determines if the keyword follows a daily, weekly, monthly, or rare pattern based on recency and frequency.
-
+
         Args:
             keyword (str): The keyword to analyze
             user_id (str): User ID for filtering content
-
+
         Returns:
             dict: Analysis data with frequency pattern classification
         """
@@ -476,17 +476,17 @@ class ContentService:
         from flask import current_app
         from datetime import datetime, timedelta
         import re
-
+
         # Create a DataFrame to store articles from RSS feeds
         all_articles = []
-
+
         # Attempt to access current_app, but handle gracefully if outside of app context
         try:
             # Fetch posts from the database that belong to the user
             # Check if Supabase client is initialized
             if not hasattr(current_app, 'supabase') or current_app.supabase is None:
                 raise Exception("Database connection not initialized")
-
+
             # Get all RSS sources for the user to analyze
             rss_response = (
                 current_app.supabase
@@ -495,15 +495,15 @@ class ContentService:
                 .eq("user_id", user_id)
                 .execute()
             )
-
+
             user_rss_sources = rss_response.data if rss_response.data else []
-
+
             # Analyze each RSS source
-
-
+
+
             # Check if the source matches the keyword or if it's any source
             # We'll analyze any source that contains the keyword or is related to it
-
+
             # Check if the source is a keyword rather than an RSS URL
             # If it's a keyword, generate a Google News RSS URL
             if self._is_url(keyword):
@@ -512,14 +512,14 @@ class ContentService:
             else:
                 # It's a keyword, generate Google News RSS URL
                 feed_url = self._generate_google_news_rss_from_string(keyword)
-
+
             # Parse the RSS feed
             feed = feedparser.parse(feed_url)
-
+
             # Log some debug information
             current_app.logger.info(f"Processing RSS feed: {feed_url}")
             current_app.logger.info(f"Number of entries in feed: {len(feed.entries)}")
-
+
             # Extract ALL articles from the feed (without filtering by keyword again)
             for entry in feed.entries:
                 # Use the same date handling as in the original ai_agent.py
@@ -530,37 +530,37 @@ class ContentService:
                     'date': entry.get('published', entry.get('updated', None)),
                     'content': entry.get('summary', '') + ' ' + entry.get('title', '')
                 }
-
+
                 # Log individual article data for debugging
                 current_app.logger.info(f"Article title: {entry.title}")
                 current_app.logger.info(f"Article date: {article_data['date']}")
-
+
                 all_articles.append(article_data)
-
+
             # Create a DataFrame from the articles
             df_articles = pd.DataFrame(all_articles)
-
+
             current_app.logger.info(f"Total articles collected for keyword '{keyword}': {len(df_articles)}")
             if not df_articles.empty:
                 current_app.logger.info(f"DataFrame columns: {df_articles.columns.tolist()}")
                 current_app.logger.info(f"Sample of DataFrame:\n{df_articles.head()}")
-
+
             # Convert date column to datetime if it exists
             if not df_articles.empty and 'date' in df_articles.columns:
                 # Convert struct_time objects to datetime
                 df_articles['date'] = pd.to_datetime(df_articles['date'], errors='coerce', utc=True)
-
+
                 current_app.logger.info(f"DataFrame shape after date conversion: {df_articles.shape}")
                 current_app.logger.info(f"Date column after conversion:\n{df_articles['date'].head()}")
-
+
                 df_articles = df_articles.dropna(subset=['date'])  # Remove entries with invalid dates
                 df_articles = df_articles.sort_values(by='date', ascending=False)  # Sort by date descending to get most recent first
-
+
             current_app.logger.info(f"DataFrame shape after dropping invalid dates: {df_articles.shape}")
-
+
             # Analyze frequency pattern
             frequency_pattern = self._determine_frequency_pattern(df_articles)
-
+
             # Prepare recent articles to return with the response
             recent_articles = []
             if not df_articles.empty:
```
```diff
@@ -572,13 +572,13 @@ class ContentService:
                 if pd.notna(row['date']):
                     # Convert to string in a readable format
                     formatted_date = row['date'].strftime('%Y-%m-%d %H:%M:%S') if hasattr(row['date'], 'strftime') else str(row['date'])
-
+
                     recent_articles.append({
                         'title': row['title'],
                         'link': row['link'],
                         'date': formatted_date
                     })
-
+
             # Return comprehensive analysis
             return {
                 'keyword': keyword,
@@ -591,7 +591,7 @@ class ContentService:
                     'end': df_articles['date'].min().strftime('%Y-%m-%d') if not df_articles.empty else None  # Earliest date last
                 }
             }
-
+
         except RuntimeError:
             # We're outside of application context
             # Return default analysis for testing purposes
@@ -609,7 +609,7 @@ class ContentService:
                     'end': None
                 }
             }
-
+
         except Exception as e:
             import logging
             logging.error(f"Keyword frequency pattern analysis failed: {str(e)}")
@@ -618,10 +618,10 @@ class ContentService:
    def _determine_frequency_pattern(self, df_articles):
         """
         Determine the frequency pattern based on the recency and frequency of articles.
-
+
         Args:
             df_articles: DataFrame with articles data including dates
-
+
         Returns:
             dict: Pattern classification and details
         """
```
```diff
@@ -633,23 +633,23 @@ class ContentService:
                 'confidence': 1.0
             }
         }
-
+
         # Calculate time since the latest article
         latest_date = df_articles['date'].max()
         current_time = pd.Timestamp.now(tz=latest_date.tz) if latest_date.tz else pd.Timestamp.now()
         time_since_latest = (current_time - latest_date).days
-
+
         # Calculate article frequency
         total_articles = len(df_articles)
-
+
         # Group articles by date to get daily counts
         df_articles['date_only'] = df_articles['date'].dt.date
         daily_counts = df_articles.groupby('date_only').size()
-
+
         # Calculate metrics
         avg_daily_frequency = daily_counts.mean() if len(daily_counts) > 0 else 0
         recent_activity = daily_counts.tail(7).sum()  # articles in last 7 days
-
+
         # Determine pattern based on multiple factors
         if total_articles == 0:
             return {
@@ -659,7 +659,7 @@ class ContentService:
                 'confidence': 1.0
             }
         }
-
+
         # Check if pattern is truly persistent by considering recency
         if time_since_latest > 30:
             # If no activity in the last month, it's likely not a daily/weekly pattern anymore
@@ -671,7 +671,7 @@ class ContentService:
                 'confidence': 0.9
             }
         }
-
+
         # If there are many recent articles per day, it's likely daily
         if recent_activity > 7 and time_since_latest <= 1:
             return {
@@ -681,7 +681,7 @@ class ContentService:
                 'confidence': 0.9
             }
         }
-
+
         # If there are few articles per day but regular weekly activity
         if 3 <= recent_activity <= 7 and time_since_latest <= 7:
             return {
@@ -691,7 +691,7 @@ class ContentService:
                 'confidence': 0.8
             }
         }
-
+
         # If there are very few articles but they are somewhat spread over time
         if recent_activity < 3 and total_articles > 0 and time_since_latest <= 30:
             return {
@@ -701,7 +701,7 @@ class ContentService:
                 'confidence': 0.7
             }
         }
-
+
         # Default to rare if no clear pattern
         return {
             'pattern': 'rare',
```
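Reading across the hunks above, `_determine_frequency_pattern` reduces to a threshold cascade on recency and recent volume. A compact restatement, with the caveat that the pattern labels between the returns are not shown in the diff; the 'daily'/'weekly'/'monthly'/'rare' names below are inferred from the comments and confidence values:

```python
# Inferred restatement of the classification cascade; labels are assumptions.
def classify(recent_activity: int, time_since_latest: int, total_articles: int) -> str:
    if total_articles == 0:
        return 'rare'
    if time_since_latest > 30:                                # stale feed
        return 'rare'
    if recent_activity > 7 and time_since_latest <= 1:        # many articles, very fresh
        return 'daily'
    if 3 <= recent_activity <= 7 and time_since_latest <= 7:  # steady weekly trickle
        return 'weekly'
    if recent_activity < 3 and time_since_latest <= 30:       # sparse but alive
        return 'monthly'
    return 'rare'
```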
```diff
@@ -723,12 +723,12 @@ class ContentService:
     def _generate_google_news_rss_from_string(self, query, language="en", country="US"):
         """
         Generate a Google News RSS link from a raw search string.
-
+
         Args:
             query (str): Raw Google News search query.
             language (str): Language code, e.g. "en".
             country (str): Country code, e.g. "US".
-
+
         Returns:
             str: URL of the Google News RSS feed.
         """
```
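The method body is outside the diff, so here is a plausible sketch of a builder matching the docstring. It uses the public news.google.com/rss/search URL scheme; treating that as this method's actual implementation is an assumption:

```python
# Plausible sketch only; the real method body is not shown in the diff.
from urllib.parse import quote_plus

def google_news_rss(query: str, language: str = "en", country: str = "US") -> str:
    return (f"https://news.google.com/rss/search?q={quote_plus(query)}"
            f"&hl={language}-{country}&gl={country}&ceid={country}:{language}")

# google_news_rss("open source AI") ->
# https://news.google.com/rss/search?q=open+source+AI&hl=en-US&gl=US&ceid=US:en
```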
requirements.txt (changed, +1 -1)

```diff
@@ -19,7 +19,7 @@ apscheduler>=3.11.0
 pandas>=2.2.2
 
 # AI content generation
-gradio-client
+gradio-client==2.0.0
 
 # Database integration
 supabase>=2.16.0
```