|
""" |
|
Keyword analysis controller: time-range helpers, heating-score calculation,
thematic grouping, and summary persistence.
|
""" |
|
from datetime import datetime, timedelta |
|
import logging |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
def get_time_range(filter_type: str) -> tuple[datetime, datetime]:
    """
    Calculate time range for analysis periods.

    The range always ends "now" and starts at midnight of the first day of
    the period: 1 day for 'today', 7 for 'week', 30 for 'month'. An unknown
    filter_type falls back to the 'today' behavior.

    Args:
        filter_type: One of 'today', 'week', or 'month'

    Returns:
        tuple: (start_date, end_date) as datetime objects
    """
    # Whole days to step back before snapping to midnight; 'today' steps 0.
    days_back = {"today": 0, "week": 6, "month": 29}

    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back.get(filter_type, 0))
    # Snap to midnight so the period covers full calendar days.
    start_date = start_date.replace(hour=0, minute=0, second=0, microsecond=0)

    return start_date, end_date
|
|
|
def get_previous_time_range(
    filter_type: str,
    current_start: datetime
) -> tuple[datetime, datetime]:
    """
    Calculate previous time range for comparison.

    The previous period ends one second before the current period starts
    and spans the same number of whole days as the current period
    (1/7/30 for today/week/month). Unknown filter types fall back to a
    single day, mirroring get_time_range.

    Args:
        filter_type: One of 'today', 'week', or 'month'
        current_start: Start date of current period

    Returns:
        tuple: (previous_start, previous_end) as datetime objects
    """
    # Whole days to step back from previous_end before snapping to midnight.
    days_back = {"today": 0, "week": 6, "month": 29}

    # End the previous period just before the current one begins.
    previous_end = current_start - timedelta(seconds=1)
    previous_start = previous_end - timedelta(days=days_back.get(filter_type, 0))
    previous_start = previous_start.replace(hour=0, minute=0, second=0, microsecond=0)

    return previous_start, previous_end
|
|
|
def _collect_period_keywords(
    collection,
    period_start: datetime,
    period_end: datetime,
    keyword_categories: dict,
    overwrite_categories: bool
) -> tuple[dict, int]:
    """
    Aggregate stored keyword frequencies for one date window (inclusive).

    Documents are keyed by a "YYYY-MM-DD" string _id, so the window is
    expressed as a lexicographic string range.

    Args:
        collection: MongoDB collection containing keyword documents
        period_start: First day of the window
        period_end: Last day of the window
        keyword_categories: Mapping updated in place (keyword -> category)
        overwrite_categories: When True the latest seen category wins;
            when False an existing entry is preserved

    Returns:
        tuple: (keyword -> summed frequency, number of documents scanned)
    """
    totals = {}
    docs_found = 0
    cursor = collection.find({
        "_id": {
            "$gte": period_start.strftime("%Y-%m-%d"),
            "$lte": period_end.strftime("%Y-%m-%d")
        }
    })

    for doc in cursor:
        docs_found += 1
        for kw in doc.get('keywords', []):
            keyword = kw.get('keyword', '')
            totals[keyword] = totals.get(keyword, 0) + kw.get('frequency', 0)
            if overwrite_categories or keyword not in keyword_categories:
                keyword_categories[keyword] = kw.get('category', 'other_categories')

    return totals, docs_found


def calculate_heating_scores_from_database(
    filter_type: str,
    keywords_collection
) -> tuple[dict, dict, dict, dict]:
    """
    Calculate heating scores by comparing stored keywords from database.

    Heating score is the percent change in frequency versus the previous
    period, rounded to one decimal; brand-new keywords get +200.0 and
    vanished keywords get -200.0. When 'today' has no data, falls back to
    the most recent day that does.

    Args:
        filter_type: Analysis period ('today', 'week', 'month')
        keywords_collection: MongoDB collection containing keywords

    Returns:
        tuple: (heating_scores, current_keywords, previous_keywords, keyword_categories)
    """
    heating_scores = {}
    keyword_categories = {}

    current_start, current_end = get_time_range(filter_type)
    prev_start, prev_end = get_previous_time_range(filter_type, current_start)

    logger.info("Fetching stored keywords for heating calculation")
    logger.info("Current: %s to %s",
                current_start.strftime('%Y-%m-%d'), current_end.strftime('%Y-%m-%d'))
    logger.info("Previous: %s to %s",
                prev_start.strftime('%Y-%m-%d'), prev_end.strftime('%Y-%m-%d'))

    # Current window: latest category seen for a keyword wins.
    current_keywords, current_docs_found = _collect_period_keywords(
        keywords_collection, current_start, current_end,
        keyword_categories, overwrite_categories=True)

    if filter_type == "today" and current_docs_found == 0:
        logger.warning("No keywords found for today. Looking for most recent available data...")

        most_recent_doc = keywords_collection.find_one(
            sort=[("_id", -1)],
            projection={"_id": 1}
        )

        if not most_recent_doc:
            logger.error("No keywords found in database at all")
            return {}, {}, {}, {}

        most_recent_date_str = most_recent_doc['_id']
        most_recent_date = datetime.strptime(most_recent_date_str, "%Y-%m-%d")

        logger.info("Using most recent available date: %s", most_recent_date_str)

        # Re-anchor both periods on the most recent day that has data.
        current_start = most_recent_date.replace(hour=0, minute=0, second=0, microsecond=0)
        current_end = most_recent_date.replace(
            hour=23, minute=59, second=59, microsecond=999999)
        prev_start, prev_end = get_previous_time_range(filter_type, current_start)

        logger.info("Adjusted current: %s to %s",
                    current_start.strftime('%Y-%m-%d'), current_end.strftime('%Y-%m-%d'))
        logger.info("Adjusted previous: %s to %s",
                    prev_start.strftime('%Y-%m-%d'), prev_end.strftime('%Y-%m-%d'))

        current_keywords, current_docs_found = _collect_period_keywords(
            keywords_collection, current_start, current_end,
            keyword_categories, overwrite_categories=True)

    # Previous window: keep the category recorded from the current window.
    previous_keywords, previous_docs_found = _collect_period_keywords(
        keywords_collection, prev_start, prev_end,
        keyword_categories, overwrite_categories=False)

    # `or 1` keeps the totals usable as divisors / log values on empty data.
    total_current = sum(current_keywords.values()) or 1
    total_previous = sum(previous_keywords.values()) or 1

    logger.info("Current period: %d database records, %d terms, total freq: %d",
                current_docs_found, len(current_keywords), total_current)
    logger.info("Previous period: %d database records, %d terms, total freq: %d",
                previous_docs_found, len(previous_keywords), total_previous)

    if not previous_keywords:
        logger.info("No previous data - assigning neutral heating scores")
        for term in current_keywords:
            heating_scores[term] = 0.0
    else:
        logger.info("Calculating heating scores")

        for term, current_freq in current_keywords.items():
            previous_freq = previous_keywords.get(term, 0)

            if previous_freq > 0:
                # Percent change versus the previous period.
                freq_change = ((current_freq - previous_freq) / previous_freq) * 100
                heating_scores[term] = round(freq_change, 1)
            else:
                # Brand-new keyword: capped score instead of division by zero.
                heating_scores[term] = 200.0

        # Keywords that disappeared get the maximum cooling score.
        for term in previous_keywords:
            if term not in current_keywords:
                heating_scores[term] = -200.0

    return heating_scores, current_keywords, previous_keywords, keyword_categories
|
|
|
def calculate_heating_scores_for_period(
    current_terms_data: list,
    previous_terms_data: list
) -> dict:
    """
    Calculate heating scores between two periods.

    Scores are the percent change in frequency (one decimal), +200.0 for
    keywords only present now, and -200.0 for keywords that vanished.

    Args:
        current_terms_data: List of current period keyword data
        previous_terms_data: List of previous period keyword data

    Returns:
        dict: Heating scores for each keyword
    """
    # Index frequencies by keyword for both periods.
    current = {entry['keyword']: entry['frequency'] for entry in current_terms_data}
    previous = {entry['keyword']: entry['frequency'] for entry in previous_terms_data}

    scores = {}
    for kw, now_count in current.items():
        before = previous.get(kw, 0)
        scores[kw] = (
            round((now_count - before) / before * 100, 1) if before > 0 else 200.0
        )

    # Anything seen before but not now cools to the floor.
    for kw in previous.keys() - current.keys():
        scores[kw] = -200.0

    return scores
|
|
|
def create_thematic_analysis(terms_with_data: list) -> list:
    """
    Create thematic groupings from keyword data.

    Terms are grouped by their 'category'; each theme accumulates the
    member keywords and their total frequency, and its heating_score is the
    mean 'popularity' of its own members. (Single pass — replaces the old
    per-theme rescan, which was quadratic and could mix in popularity from
    a same-named keyword belonging to a different category.)

    Args:
        terms_with_data: List of keyword data with categories

    Returns:
        list: List of theme objects
    """
    themes = {}
    # Running popularity sum per category for the average at the end.
    popularity_totals = {}

    for term in terms_with_data:
        category = term.get('category', 'other_categories')
        keyword = term.get('keyword', '')
        frequency = term.get('frequency', 0)

        if category not in themes:
            themes[category] = {
                'theme_name': category,
                'description': f"Keywords related to {category}",
                'terms': [],
                'total_frequency': 0,
                'heating_score': 0.0
            }
            popularity_totals[category] = 0.0

        themes[category]['terms'].append(keyword)
        themes[category]['total_frequency'] += frequency
        popularity_totals[category] += term.get('popularity', 0.0)

    # Average the popularity of each theme's own members.
    for category, theme in themes.items():
        if theme['terms']:
            theme['heating_score'] = round(
                popularity_totals[category] / len(theme['terms']), 1)

    return list(themes.values())
|
|
|
def analyze_keywords_from_database(
    filter_type: str,
    keywords_collection
) -> dict:
    """
    Analyze keywords using existing database data (no re-extraction).

    Builds per-keyword records from the stored frequencies and heating
    scores, sorts them by frequency then absolute heating, and groups them
    into themes.

    Args:
        filter_type: Analysis period ('today', 'week', 'month')
        keywords_collection: MongoDB collection containing keywords

    Returns:
        dict: Analysis results with keywords and themes
    """
    logger.info("Starting database analysis for %s", filter_type)

    heating_scores, current_keywords, previous_keywords, keyword_categories = (
        calculate_heating_scores_from_database(filter_type, keywords_collection)
    )

    # One record per current-period keyword.
    terms_data = [
        {
            'keyword': keyword,
            'category': keyword_categories.get(keyword, 'other_categories'),
            'frequency': frequency,
            'popularity': heating_scores.get(keyword, 0.0),
            'new': keyword not in previous_keywords,
            'variations': [],
            # Newly-appeared keywords are weighted as more important.
            'importanceScore': 0.5 if keyword in previous_keywords else 0.8,
            'context': f"Found {frequency} times in {filter_type} period",
        }
        for keyword, frequency in current_keywords.items()
    ]

    # Most frequent first; heating magnitude breaks ties.
    terms_data.sort(
        key=lambda entry: (entry['frequency'], abs(entry['popularity'])),
        reverse=True,
    )

    themes_data = create_thematic_analysis(terms_data)

    results = {
        'terms': terms_data,
        'themes': themes_data
    }

    logger.info("Analysis completed: %d terms, %d themes", len(terms_data), len(themes_data))
    return results
|
|
|
def save_summary_to_database(
    filter_type: str,
    results_data: dict,
    summary_collection
) -> None:
    """
    Save analysis results to Summary collection.

    Normalizes up to 50 top terms (display label, camelCase field names)
    and the theme groupings, then replaces the existing summary document
    keyed by the filter type. Nothing is written when either list is
    empty; the skip is logged.

    Args:
        filter_type: Analysis period ('today', 'week', 'month')
        results_data: Analysis results to save
        summary_collection: MongoDB collection to save results

    Raises:
        Exception: Re-raises any database error after logging it.
    """
    try:
        # Cap stored keywords at the top 50 terms.
        processed_keywords = []
        for term in results_data.get('terms', [])[:50]:
            processed_term = term.copy()

            # Human-readable display label.
            processed_term['label'] = processed_term.get('keyword', '').title()

            # Rename legacy snake_case fields to the stored camelCase schema.
            if 'heating_score' in processed_term:
                processed_term['popularity'] = processed_term.pop('heating_score')
            if 'sentiment_score' in processed_term:
                processed_term['sentimentScore'] = processed_term.pop('sentiment_score')
            if 'sentiment_label' in processed_term:
                processed_term['sentimentLabel'] = processed_term.pop('sentiment_label')

            processed_keywords.append(processed_term)

        processed_categories = [
            {
                "category": theme.get('theme_name', ''),
                "description": theme.get('description', ''),
                "keywords": theme.get('terms', []),
                "frequency": theme.get('total_frequency', 0),
                "popularity": theme.get('heating_score', 0.0)
            }
            for theme in results_data.get('themes', [])
        ]

        # Only persist a complete summary (was a redundant double check).
        if processed_keywords and processed_categories:
            document = {
                "_id": filter_type,
                "keywords": processed_keywords,
                "categories": processed_categories
            }

            # Delete-then-insert replaces any existing summary for this period.
            summary_collection.delete_one({"_id": filter_type})
            result = summary_collection.insert_one(document)
            logger.info("Saved %s summary to database (ID: %s)",
                        filter_type, result.inserted_id)
        else:
            logger.warning("Skipping %s summary save: no keywords or categories",
                           filter_type)

    except Exception as e:
        logger.error("Error saving summary to database: %s", str(e))
        raise
|
|
|
def cleanup_old_keywords(keywords_collection, latest_date: datetime) -> int:
    """
    Remove keywords older than 60 days from the latest date.

    Deletes every document whose "YYYY-MM-DD" _id falls before the cutoff,
    then logs the remaining data range for visibility.

    Args:
        keywords_collection: MongoDB collection containing keywords
        latest_date: Latest article date to use as reference

    Returns:
        int: Number of deleted records
    """
    logger.info("Cleaning up records to maintain a 60-day window from the latest article")

    # Everything strictly before this date is discarded.
    threshold = latest_date - timedelta(days=60)

    delete_result = keywords_collection.delete_many(
        {"_id": {"$lt": threshold.strftime("%Y-%m-%d")}}
    )

    logger.info("Cutoff Date: %s, based on latest article on %s",
                threshold.strftime('%Y-%m-%d'), latest_date.strftime('%Y-%m-%d'))
    logger.info("Deleted %d old records", delete_result.deleted_count)

    # Report the span of data that survived the cleanup.
    oldest_doc = keywords_collection.find_one(
        sort=[("_id", 1)],
        projection={"_id": 1}
    )
    newest_doc = keywords_collection.find_one(
        sort=[("_id", -1)],
        projection={"_id": 1}
    )
    if oldest_doc and newest_doc:
        logger.info("Current data range: %s to %s", oldest_doc['_id'], newest_doc['_id'])

    return delete_result.deleted_count
|