File size: 9,028 Bytes
9d2a0e1
 
 
 
 
 
 
 
 
 
 
 
 
72f4cb5
9d2a0e1
1757667
 
 
9d2a0e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
"""
Script to update category collection in MongoDB with sites from Article_China DynamoDB.
Reads records from Article_China based on a delta parameter for lastModifiedDate,
extracts unique site-category pairs from specified categories, and updates
MongoDB category collection with aggregated data.
"""

import datetime
import logging
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

from botocore.exceptions import ClientError

from models.database import category_collection # pylint: disable=import-error

from ..utils import get_client_connection


# Configure logging.
# NOTE: basicConfig is called at module level, so it runs when this module is
# imported, not only when the update is executed.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-wide logger shared by every function in this file.
logger = logging.getLogger('category_update')

# DynamoDB table name
ARTICLE_CHINA_TABLE = 'Article_China'
# Passed as the 'Limit' parameter of every scan request issued by
# get_articles_by_delta; that function keeps paginating until the scan is
# exhausted, so this bounds a single request, not the total result size.
SCAN_LIMIT = 50  # Limit for scan operations as requested

# Target categories with unknown site lists.
# get_articles_by_delta ORs these together in the scan filter, so only
# articles whose `category` equals one of these values are retrieved.
TARGET_CATEGORIES = [
    "Dragon Street China Markets",
    "Beijing Briefs",
    "Knowledge Hub"
]

@dataclass
class CategoryUpdater:
    """Object-style entry point for the Article_China -> MongoDB category update.

    The workflow itself is implemented by the module-level functions of this
    file; this class only stores a default lookback period and forwards to the
    module-level ``collect`` function, which:
    1. Scans recent articles from DynamoDB's Article_China table
    2. Extracts unique site-category pairs
    3. Groups sites by their categories
    4. Updates MongoDB with the latest category-site relationships

    Both incremental updates (via delta days) and full refreshes are supported.

    Attributes:
        delta (int): Default lookback period in days for incremental updates (-1 for full refresh)

    Typical usage example:
        >>> updater = CategoryUpdater()
        >>> updater.collect()  # Uses updater.delta (1 day by default)
        >>> updater.collect(delta=7)  # Weekly refresh
        >>> updater.collect(delta=-1)  # Full rebuild
    """
    # Default lookback window in days; -1 requests a full refresh.
    delta: int = 1

    def collect(self, delta: Optional[int] = None) -> None:
        """Run the category update through the module-level ``collect`` function.

        Args:
            delta: Number of days to look back for this run. When omitted
                (``None``) the instance's ``delta`` attribute is used.
                ``-1`` requests a full refresh.

        Raises:
            Exception: Whatever the module-level ``collect`` function raises.
        """
        # Compare against None explicitly so that an explicit delta=0 is honoured.
        lookback_days = self.delta if delta is None else delta
        # Inside a method body the bare name `collect` resolves to the
        # module-level function, not to this method, so this does not recurse.
        collect(lookback_days)

def get_articles_by_delta(delta: int) -> List[Dict]:
    """
    Scan Article_China for articles in the target categories.

    The scan is always filtered to TARGET_CATEGORIES. Unless ``delta`` is -1,
    it is additionally restricted to records whose ``LastModifiedDate`` is on
    or after "now minus ``delta`` days" (local time, formatted as
    ``YYYY-MM-DDTHH:MM:SS``). Result pages are followed via
    ``LastEvaluatedKey`` until the scan is exhausted.

    Args:
        delta: Number of days to look back. If -1, get all records.

    Returns:
        List of article records matching the criteria, as returned in the
        ``Items`` of each scan response.

    Raises:
        ClientError: If a scan request fails; the error is logged and re-raised.
    """
    client = get_client_connection()
    collected = []

    try:
        # One placeholder (:category0, :category1, ...) per target category.
        expression_values = {
            f':category{index}': {'S': name}
            for index, name in enumerate(TARGET_CATEGORIES)
        }
        any_category = ' OR '.join(
            f"category = {placeholder}" for placeholder in expression_values
        )
        category_filter = f"({any_category})"

        if delta == -1:
            logger.info("Retrieving all articles from Article_China for target categories")
            # Full refresh: the category filter is the only condition.
            filter_expression = category_filter
        else:
            # Incremental run: also require a recent enough modification date.
            lookback = datetime.timedelta(days=delta)
            cutoff_date = (datetime.datetime.now() - lookback).strftime('%Y-%m-%dT%H:%M:%S')
            logger.info("Retrieving articles modified after %s for target categories", cutoff_date)

            expression_values[':cutoff_date'] = {'S': cutoff_date}
            filter_expression = f"LastModifiedDate >= :cutoff_date AND {category_filter}"

        scan_params = {
            'TableName': ARTICLE_CHINA_TABLE,
            'FilterExpression': filter_expression,
            'ExpressionAttributeValues': expression_values,
            'Limit': SCAN_LIMIT
        }

        # Fetch page after page; a response without LastEvaluatedKey is the last one.
        while True:
            page = client.scan(**scan_params)
            collected.extend(page.get('Items', []))
            if 'LastEvaluatedKey' not in page:
                break
            logger.debug("Continuing scan, found %s articles so far", len(collected))
            scan_params['ExclusiveStartKey'] = page['LastEvaluatedKey']

        logger.info("Retrieved %s articles total", len(collected))
        return collected

    except ClientError as e:
        logger.error("Error scanning Article_China table: %s", e)
        raise


def extract_unique_site_categories(articles: List[Dict]) -> List[Tuple[str, str]]:
    """
    Extract unique site-category pairs from articles.

    Each article is read as a mapping whose ``site`` and ``category`` entries
    are themselves mappings carrying the string value under the ``'S'`` key.
    Articles where either value is missing or empty are skipped.

    Args:
        articles: List of article records

    Returns:
        Sorted list of unique (site, category) tuples. The result is sorted so
        that repeated runs over the same data yield the same order.

    Raises:
        Exception: Any error raised while reading the records is logged and
            re-raised.
    """
    site_category_pairs = set()

    try:
        for article in articles:
            site = article.get('site', {}).get('S')
            category = article.get('category', {}).get('S')

            # Skip records that lack either half of the pair.
            if site and category:
                site_category_pairs.add((site, category))

        # Set iteration order for strings can differ between interpreter
        # runs, so sort to give callers a stable, reproducible order.
        result = sorted(site_category_pairs)
        logger.info("Extracted %s unique site-category pairs", len(result))
        return result

    except Exception as e:
        logger.error("Error extracting site-category pairs: %s", e)
        raise


def group_sites_by_category(site_category_pairs: List[Tuple[str, str]]) -> Dict[str, List[str]]:
    """
    Group sites by category.

    Duplicate sites within a category are collapsed.

    Args:
        site_category_pairs: List of (site, category) tuples

    Returns:
        Dictionary mapping each category to a sorted list of its unique sites.
        The lists are sorted so that repeated runs over the same data produce
        identical output.

    Raises:
        Exception: Any error raised while grouping is logged and re-raised.
    """
    category_sites = defaultdict(set)

    try:
        for site, category in site_category_pairs:
            category_sites[category].add(site)

        # Convert sets to lists for JSON serialization; sort them because set
        # iteration order for strings can differ between interpreter runs.
        result = {category: sorted(sites) for category, sites in category_sites.items()}
        logger.info("Grouped sites into %s categories", len(result))
        return result

    except Exception as e:
        logger.error("Error grouping sites by category: %s", e)
        raise


def update_mongodb_categories(category_sites: Dict[str, List[str]]) -> None:
    """
    Write the category-sites mapping into the MongoDB category collection.

    category_collection is imported from models.database. Each category is
    upserted into a document whose ``_id`` is the category name; the sites are
    merged into the document's ``site`` array with ``$addToSet``.

    Args:
        category_sites: Dictionary mapping categories to lists of sites

    Raises:
        Exception: Any failure while updating a category is logged (once for
            the category and once for the overall update) and re-raised.
    """
    try:
        # Nothing to write: log it and leave MongoDB untouched.
        if not category_sites:
            logger.info("No category-sites mappings to add to MongoDB")
            return

        logger.info("Updating %s categories in MongoDB", len(category_sites))

        for category_name, site_list in category_sites.items():
            try:
                # $addToSet with $each appends only the sites not yet stored.
                update_doc = {
                    "$set": {"category": category_name},
                    "$addToSet": {"site": {"$each": site_list}}
                }
                outcome = category_collection.update_one(
                    {"_id": category_name},
                    update_doc,
                    upsert=True
                )

                # upserted_id is only set when the upsert inserted a new document.
                message = (
                    "Created new category '%s' with %s sites"
                    if outcome.upserted_id
                    else "Updated category '%s' with %s sites"
                )
                logger.info(message, category_name, len(site_list))

            except Exception as e:
                logger.error("Error updating category '%s' in MongoDB: %s", category_name, e)
                raise

    except Exception as e:
        logger.error("Error updating MongoDB categories: %s", e)
        raise


def collect(delta: int = 1) -> None:
    """
    Run the full category update: scan Article_China, derive the unique
    site-category pairs, group them by category and write them to MongoDB.

    Args:
        delta: Number of days to look back for modified articles.
               If -1, get all articles.

    Raises:
        Exception: Any failure in one of the steps is logged and re-raised.
    """
    try:
        logger.info("Starting category update with delta = %s", delta)

        # Steps 1 and 2: fetch the matching articles and reduce them to unique pairs.
        site_category_pairs = extract_unique_site_categories(
            get_articles_by_delta(delta)
        )

        if site_category_pairs:
            # Steps 3 and 4: group the sites per category and persist the mapping.
            update_mongodb_categories(group_sites_by_category(site_category_pairs))
            logger.info("Category update completed successfully")
        else:
            logger.info("No site-category pairs found in articles, nothing to update")

    except Exception as e:
        logger.error("Category update failed: %s", e)
        raise