File size: 35,550 Bytes
aefa421
2deac8b
7a41f2a
2deac8b
7a41f2a
960d86e
7a41f2a
 
 
 
 
 
 
 
 
7a28077
aefa421
7a41f2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a28077
 
 
 
 
7a41f2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a28077
7a41f2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a28077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a41f2a
7a28077
7a41f2a
 
 
 
 
 
 
 
 
7a28077
7a41f2a
 
 
 
 
 
 
 
 
 
7a28077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a41f2a
 
7a28077
7a41f2a
7a28077
 
7a41f2a
 
 
 
 
 
 
7a28077
 
 
 
7a41f2a
7a28077
 
 
 
 
 
7a41f2a
7a28077
 
 
 
 
 
 
 
 
 
 
 
7a41f2a
 
 
 
 
 
7a28077
7a41f2a
 
 
 
7a28077
 
 
7a41f2a
7a28077
 
 
 
 
 
 
 
 
 
7a41f2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
##FINAL FILE

# This deploy.py file contains the complete code for the Instagram Reels Analysis Gradio App.

# --- Imports ---
import gradio as gr
import time
import random
import matplotlib.pyplot as plt
import pandas as pd
import torch
import emoji
import re
import numpy as np
import io # Import io for handling image bytes


from instagrapi import Client
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    RobertaForSequenceClassification,
    AlbertForSequenceClassification
)
from datasets import Dataset, Features, Value
from collections import Counter
from sklearn.metrics import accuracy_score, f1_score

# --- Configuration ---
# Tuning knobs shared by the tokenizers, the fine-tuning Trainer and the
# sentiment decision logic defined later in this file.
CONFIG = {
    "max_length": 128,  # max token length passed to every tokenizer call
    "batch_size": 16,  # per-device train and eval batch size for fine-tuning
    "learning_rate": 2e-5,  # forwarded to TrainingArguments
    "num_train_epochs": 3,  # forwarded to TrainingArguments
    "few_shot_examples": 5,  # per class; not referenced in the code visible in this section
    "confidence_threshold": 0.7,  # score above which a single English model's verdict is accepted
    "neutral_reanalysis_threshold": 0.33  # neutral share above which analyze_reels re-checks neutrals
}

# --- Global Variables for State Management ---
# Module-level state shared between the Gradio callbacks below. The functions
# that rebind these names declare them ``global`` themselves; a ``global``
# statement at module scope has no effect, so none is needed here.
cl = None  # instagrapi Client set by the login callback; None until then
explore_reels_list = []  # reels stored by the last successful fetch
sentiment_analyzer_instance = None  # ReelSentimentAnalyzer, created on first analysis
content_classifier_pipeline = None  # classifier callable used by classify_reel_content


# --- Sentiment Analysis Class ---
class ReelSentimentAnalyzer:
    def __init__(self):
        """Select the torch device (CUDA when available, else CPU) and load all models."""
        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"
        self._initialize_models()

    def _initialize_models(self):
        """Load every tokenizer/model onto ``self.device`` and build the lookup tables.

        Sets the emotion, sentiment and Hindi tokenizer/model attributes, the
        ``hindi_label2id`` shortcut, the emotion -> sentiment map and the set of
        keywords that force a neutral verdict.
        """
        emotion_checkpoint = "finiteautomata/bertweet-base-emotion-analysis"
        sentiment_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"
        indic_checkpoint = "ai4bharat/indic-bert"

        print("\nInitializing Sentiment Analysis Models...")

        # --- English: emotion classifier ---
        print("Loading English Emotion Model...")
        self.emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_checkpoint)
        emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_checkpoint)
        self.emotion_model = emotion_model.to(self.device)

        # --- English: sentiment classifier ---
        print("Loading English Sentiment Model...")
        self.sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)
        sentiment_model = RobertaForSequenceClassification.from_pretrained(
            sentiment_checkpoint,
            ignore_mismatched_sizes=True,
        )
        self.sentiment_model = sentiment_model.to(self.device)

        # --- Hindi / Hinglish: three-class head, meant to be fine-tuned ---
        print("Loading Indic-BERT Model for Hindi/Hinglish...")
        self.hindi_tokenizer = AutoTokenizer.from_pretrained(indic_checkpoint)
        hindi_model = AlbertForSequenceClassification.from_pretrained(
            indic_checkpoint,
            num_labels=3,
            id2label={0: "negative", 1: "neutral", 2: "positive"},
            label2id={"negative": 0, "neutral": 1, "positive": 2},
        )
        self.hindi_model = hindi_model.to(self.device)
        # Shortcut used when converting string labels during training.
        self.hindi_label2id = self.hindi_model.config.label2id
        print("Models Initialized.")

        # Emotion label -> coarse sentiment used by the English ensemble.
        self.emotion_map = {
            "joy": "positive",
            "love": "positive",
            "happy": "positive",
            "anger": "negative",
            "sadness": "negative",
            "fear": "negative",
            "surprise": "neutral",
            "neutral": "neutral",
            "disgust": "negative",
            "shame": "negative",
        }

        # Promotional vocabulary (English and Hindi) that forces a neutral label.
        self.neutral_keywords = {
            "ad", "sponsored", "promo", "sale", "discount",
            "offer", "giveaway", "buy", "shop", "link in bio",
            "विज्ञापन", "प्रचार", "ऑफर", "डिस्काउंट", "बिक्री", "लिंक बायो में",
        }

    def train_hindi_model(self, train_data, eval_data=None):
        """
        Fine-tune the Hindi/English model on labeled data.

        The model is trained in place (``self.hindi_model``); afterwards the
        model and tokenizer are written to ``./fine_tuned_hindi_sentiment``.
        Labels that are not keys of ``self.hindi_label2id`` are mapped to
        "neutral" with a printed warning.

        Args:
            train_data: List of dicts [{"text": "...", "label": "positive/negative/neutral"}]
            eval_data: Optional evaluation data in the same format; when given,
                evaluation runs every epoch and accuracy / weighted F1 are computed.
        """
        print("\nStarting Hindi model training...")
        # Convert to dataset
        train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))

        # Map string labels to integer IDs
        def map_labels_to_ids(examples):
            """Replace the batch's string labels with their integer ids."""
            # Ensure label exists and is in expected range
            labels = []
            for label_str in examples["label"]:
                 if label_str in self.hindi_label2id:
                     labels.append(self.hindi_label2id[label_str])
                 else:
                     # Handle unexpected labels, maybe map to neutral or skip
                     print(f"Warning: Unexpected label '{label_str}'. Mapping to neutral.")
                     labels.append(self.hindi_label2id["neutral"]) # Map unknown to neutral
            examples["label"] = labels
            return examples


        train_dataset = train_dataset.map(map_labels_to_ids, batched=True)

        # Explicitly set the label column to integer type
        train_dataset = train_dataset.cast_column("label", Value("int64"))


        def tokenize_function(examples):
            """Tokenize the batch's texts, padded/truncated to CONFIG["max_length"]."""
            return self.hindi_tokenizer(
                examples["text"],
                padding="max_length",
                truncation=True,
                max_length=CONFIG["max_length"]
            )

        tokenized_train = train_dataset.map(tokenize_function, batched=True)

        # Training arguments - using eval_strategy instead of evaluation_strategy
        training_args = TrainingArguments(
            output_dir="./results",
            eval_strategy="epoch" if eval_data else "no",
            per_device_train_batch_size=CONFIG["batch_size"],
            per_device_eval_batch_size=CONFIG["batch_size"],
            learning_rate=CONFIG["learning_rate"],
            num_train_epochs=CONFIG["num_train_epochs"],
            weight_decay=0.01,
            save_strategy="no", # Don't save checkpoints during training
            logging_dir='./logs',
            logging_steps=10,
            report_to="none" # Don't report to external services
        )

        # Compute metrics function
        def compute_metrics(p):
            """Return accuracy and weighted F1 for a (predictions, labels) pair."""
            predictions, labels = p
            # Pick the highest-scoring class along axis 1.
            predictions = np.argmax(predictions, axis=1)
            return {
                "accuracy": accuracy_score(labels, predictions),
                "f1": f1_score(labels, predictions, average="weighted")
            }

        # Trainer
        # The evaluation set goes through the same label-mapping, casting and
        # tokenization steps as the training set.
        eval_dataset_processed = None
        if eval_data:
            eval_dataset = Dataset.from_pandas(pd.DataFrame(eval_data))
            eval_dataset = eval_dataset.map(map_labels_to_ids, batched=True)
            eval_dataset_processed = eval_dataset.cast_column("label", Value("int64")).map(tokenize_function, batched=True)


        trainer = Trainer(
            model=self.hindi_model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=eval_dataset_processed,
            compute_metrics=compute_metrics if eval_data else None,
        )

        # Train
        trainer.train()

        # Save the fine-tuned model
        print("Saving fine-tuned Hindi model...")
        self.hindi_model.save_pretrained("./fine_tuned_hindi_sentiment")
        self.hindi_tokenizer.save_pretrained("./fine_tuned_hindi_sentiment")
        print("Hindi model training complete.")

    def preprocess_text(self, text):
        """Enhanced text cleaning with multilingual support"""
        if not text:
            return ""

        # Convert emojis to text
        text = emoji.demojize(text, delimiters=(" ", " "))

        # Remove URLs and mentions
        text = re.sub(r"http\S+|@\w+", "", text)

        # Expand common abbreviations (can be extended)
        abbrevs = {
            r"\bomg\b": "oh my god",
            r"\btbh\b": "to be honest",
            r"\bky\b": "kyun",  # Hindi 'why'
            r"\bkb\b": "kab",   # Hindi 'when'
            r"\bkya\b": "kya",  # Hindi 'what'
            r"\bkahan\b": "kahan", # Hindi 'where'
            r"\bkaisa\b": "kaisa" # Hindi 'how'
        }
        for pattern, replacement in abbrevs.items():
            text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

        # Remove extra whitespace
        text = re.sub(r"\s+", " ", text).strip()

        return text

    def detect_language(self, text):
        """Improved language detection"""
        if re.search(r"[\u0900-\u097F]", text):  # Devanagari script (Hindi, Marathi etc.)
            return "hi"
        # Simple check for common Hindi/Hinglish words (can be expanded)
        hinglish_keywords = ["hai", "kyun", "nahi", "kya", "acha", "bas", "yaar", "main"]
        if any(re.search(rf"\b{kw}\b", text.lower()) for kw in hinglish_keywords):
            return "hi-latin"
        # Fallback to English if no strong Hindi/Hinglish indicators
        return "en"


    def analyze_content(self, text):
        """Main analysis function with improved confidence handling"""
        processed = self.preprocess_text(text)

        if not processed:
            return "neutral", 0.5, {"reason": "empty_text"}

        lang = self.detect_language(processed)

        # Check for neutral keywords first with higher confidence
        if any(re.search(rf"\b{re.escape(kw)}\b", processed.lower()) for kw in self.neutral_keywords):
            return "neutral", 0.9, {"reason": "neutral_keyword"}

        try:
            if lang in ("hi", "hi-latin"):
                # Use Hindi model for Hindi/Hinglish
                return self._analyze_hindi_content(processed)
            else:
                # Use ensemble for English
                return self._analyze_english_content(processed)
        except Exception as e:
            print(f"Analysis error for text '{processed[:50]}...': {e}")
            return "neutral", 0.5, {"error": str(e), "original_text": text[:50]}

    def _analyze_hindi_content(self, text):
        """Classify Hindi / Hinglish text with ``self.hindi_model``.

        Returns:
            Tuple ``(label, confidence, details)``: the label comes from the
            model's ``id2label`` table and the confidence is the softmax
            probability of the winning class.
        """
        encoded = self.hindi_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=CONFIG["max_length"],
        )
        encoded = encoded.to(self.device)

        # Inference only - no gradients needed.
        with torch.no_grad():
            logits = self.hindi_model(**encoded).logits

        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        best = torch.argmax(probabilities).item()
        score = probabilities[0][best].item()

        details = {"model": "fine-tuned-indic-bert", "lang": "hi"}
        return self.hindi_model.config.id2label[best], score, details

    def _analyze_english_content(self, text):
        """Classify English text by combining the emotion and sentiment models.

        Decision order:
          1. sentiment score above CONFIG["confidence_threshold"] -> sentiment label;
          2. emotion score above the threshold and a non-neutral mapped
             emotion -> mapped emotion;
          3. both models agree on a non-neutral label -> that label, averaged score;
          4. the non-neutral verdict with the higher score, if above 0.4
             (score scaled by 0.9);
          5. otherwise neutral with confidence 0.6.

        Returns:
            Tuple ``(label, confidence, details)``; ``details`` carries both
            models' raw outputs and the rule ("reason") that decided.
        """

        def top_prediction(tokenizer, model):
            """Return (class index, softmax probability) of the model's best class for ``text``."""
            encoded = tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=CONFIG["max_length"],
            ).to(self.device)
            with torch.no_grad():
                logits = model(**encoded).logits
            probs = torch.nn.functional.softmax(logits, dim=-1)
            idx = torch.argmax(probs).item()
            return idx, probs[0][idx].item()

        # Emotion model: label text comes from the model config.
        emotion_idx, emotion_score = top_prediction(self.emotion_tokenizer, self.emotion_model)
        emotion_label = self.emotion_model.config.id2label[emotion_idx]

        # Sentiment model: indices are translated with a fixed table
        # (0 negative, 1 neutral, 2 positive); unknown indices become neutral.
        sentiment_idx, sentiment_score = top_prediction(self.sentiment_tokenizer, self.sentiment_model)
        index_to_sentiment = {0: 'negative', 1: 'neutral', 2: 'positive'}
        sentiment_label = index_to_sentiment.get(sentiment_idx, 'neutral')

        # Emotions without a mapping count as neutral.
        mapped_emotion = self.emotion_map.get(emotion_label, "neutral")

        threshold = CONFIG["confidence_threshold"]
        sentiment_is_polar = sentiment_label != "neutral"
        emotion_is_polar = mapped_emotion != "neutral"

        if sentiment_score > threshold:
            final_label, final_confidence = sentiment_label, sentiment_score
            reason = "high_sentiment_confidence"
        elif emotion_score > threshold and emotion_is_polar:
            final_label, final_confidence = mapped_emotion, emotion_score
            reason = "high_emotion_confidence"
        elif sentiment_label == mapped_emotion and sentiment_is_polar:
            # Both models point the same non-neutral way: average their scores.
            final_label = sentiment_label
            final_confidence = (sentiment_score + emotion_score) / 2
            reason = "emotion_sentiment_agreement"
        elif sentiment_is_polar and sentiment_score > emotion_score and sentiment_score > 0.4:
            # Sentiment is somewhat confident; damp the reported confidence.
            final_label = sentiment_label
            final_confidence = sentiment_score * 0.9
            reason = "sentiment_slightly_higher"
        elif emotion_is_polar and emotion_score > sentiment_score and emotion_score > 0.4:
            # Emotion is somewhat confident; damp the reported confidence.
            final_label = mapped_emotion
            final_confidence = emotion_score * 0.9
            reason = "emotion_slightly_higher"
        else:
            # No strong signal: baseline neutral.
            final_label, final_confidence = "neutral", 0.6
            reason = "fallback_to_neutral"

        details = {
            "emotion_label": emotion_label,
            "emotion_score": emotion_score,
            "sentiment_label": sentiment_label,
            "sentiment_score": sentiment_score,
            "mapped_emotion": mapped_emotion,
            "model": "ensemble",
            "lang": "en",
            "reason": reason,
        }
        return final_label, final_confidence, details

    def analyze_reels(self, reels, max_to_analyze=100):
        """Batch analysis with improved neutral handling"""
        print(f"\n--- Starting Sentiment Analysis ({max_to_analyze} reels) ---")
        results = Counter()
        detailed_results = []

        for i, reel in enumerate(reels[:max_to_analyze], 1):
            caption = getattr(reel, 'caption_text', '') or getattr(reel, 'caption', '') or ''
            print(f"Analyzing sentiment for reel {i}/{max_to_analyze} (ID: {reel.id})...")
            label, confidence, details = self.analyze_content(caption)
            results[label] += 1
            detailed_results.append({
                "reel_id": reel.id, # Add reel ID
                "text": caption,
                "label": label,
                "confidence": confidence,
                "details": details
            })

        print("\nInitial Sentiment Distribution:", dict(results))

        # Post-analysis neutral reduction if a significant portion is neutral
        total_analyzed = sum(results.values())
        if total_analyzed > 0 and results["neutral"] / total_analyzed > CONFIG["neutral_reanalysis_threshold"]:
            print(f"High neutral count ({results['neutral']}). Attempting to re-analyze...")
            self._reduce_neutrals(results, detailed_results)
            print("Sentiment distribution after re-analysis:", dict(results))

        print("Sentiment Analysis Complete.")
        return results, detailed_results

    def _reduce_neutrals(self, results, detailed_results):
        """Apply additional techniques to reduce neutral classifications"""
        neutrals_to_recheck = [item for item in detailed_results if item["label"] == "neutral" and item["confidence"] < 0.8]

        print(f"Re-checking {len(neutrals_to_recheck)} neutral reels...")

        for item in neutrals_to_recheck:
            original_text = item["text"]
            processed_text = self.preprocess_text(original_text)
            text_lower = processed_text.lower()

            # Try keyword analysis for strong positive/negative signals
            pos_keywords_strong = {"amazing", "love", "best", "fantastic", "awesome", "superb", "great",
                                   "अद्भुत", "शानदार", "बहुत अच्छा", "मज़ेदार"}
            neg_keywords_strong = {"hate", "worst", "bad", "terrible", "awful", "disappointed", "horrible", "cringe",
                                   "खराब", "बेकार", "बहुत बुरा", "घटिया"}

            is_strong_pos = any(re.search(rf"\b{re.escape(kw)}\b", text_lower) for kw in pos_keywords_strong)
            is_strong_neg = any(re.search(rf"\b{re.escape(kw)}\b", text_lower) for kw in neg_keywords_strong)

            if is_strong_pos and not is_strong_neg:
                # Reclassify as positive if strong positive keywords found and no strong negative ones
                results["neutral"] -= 1
                results["positive"] += 1
                item.update({
                    "label": "positive",
                    "confidence": min(0.95, item["confidence"] + 0.3), # Increase confidence
                    "reanalyzed": True,
                    "reanalysis_reason": "strong_pos_keywords"
                })
                # print(f"  Reclassified reel {item['reel_id']} to Positive (Keywords)")
            elif is_strong_neg and not is_strong_pos:
                # Reclassify as negative if strong negative keywords found and no strong positive ones
                results["neutral"] -= 1
                results["negative"] += 1
                item.update({
                    "label": "negative",
                    "confidence": min(0.95, item["confidence"] + 0.3), # Increase confidence
                    "reanalyzed": True,
                    "reanalysis_reason": "strong_neg_keywords"
                })
                # print(f"  Reclassified reel {item['reel_id']} to Negative (Keywords)")
            # Add other potential re-analysis rules here if needed
            # e.g., checking for question marks (might indicate neutral query),
            # or checking length (very short captions often neutral)
            # For now, we stick to keyword-based re-analysis for simplicity


def plot_sentiment_pie(results, title="Reels Sentiment Analysis"):
    """
    Creates a pie chart from sentiment analysis results and returns the matplotlib figure.

    Slices with a zero count are omitted. Positive and negative slices are
    slightly exploded; neutral is not.

    Args:
        results: Counter object or dict with 'positive', 'neutral', 'negative' keys
        title: Chart title

    Returns:
        Matplotlib Figure object, or None if there is nothing to plot.
    """
    # (label, count, colour, explode offset) per sentiment, in display order.
    slices = (
        ('Positive', results.get('positive', 0), '#4CAF50', 0.05),
        ('Neutral', results.get('neutral', 0), '#FFC107', 0),
        ('Negative', results.get('negative', 0), '#F44336', 0.05),
    )
    visible = [entry for entry in slices if entry[1] > 0]

    if sum(entry[1] for entry in slices) == 0 or not visible:
        return None

    labels = [entry[0] for entry in visible]
    sizes = [entry[1] for entry in visible]
    colors = [entry[2] for entry in visible]
    explode = [entry[3] for entry in visible]

    fig, ax = plt.subplots(figsize=(8, 6))

    ax.pie(sizes, explode=explode, labels=labels, colors=colors,
           autopct='%1.1f%%', shadow=True, startangle=140,
           textprops={'fontsize': 12, 'color': 'black'})

    ax.axis('equal')
    # Use the Axes/Figure objects directly instead of pyplot's "current
    # figure" state, so the title and layout always apply to this figure.
    ax.set_title(title, fontsize=16, pad=20)
    fig.tight_layout()

    return fig

# --- Content Analysis Logic ---
# Content categories
# Candidate labels handed to the classifier pipeline in classify_reel_content.
content_categories = [
    "news", "meme", "sports", "science", "music", "movie",
    "gym", "comedy", "food", "technology", "travel", "fashion", "art", "business"
]

# Whole-word keywords per category. classify_reel_content walks this dict in
# insertion order and returns the first category with a matching keyword, so a
# keyword listed under several categories (e.g. "artist") resolves to the
# earliest one.
category_keywords = {
    "news": {"news", "update", "breaking", "reported", "headlines"},
    "meme": {"meme", "funny", "lol", "haha", "relatable"},
    "sports": {"sports", "cricket", "football", "match", "game", "team", "score"},
    "science": {"science", "research", "discovery", "experiment", "facts", "theory"},
    "music": {"music", "song", "album", "release", "artist", "beats"},
    "movie": {"movie", "film", "bollywood", "trailer", "series", "actor"},
    "gym": {"gym", "workout", "fitness", "exercise", "training", "bodybuilding"},
    "comedy": {"comedy", "joke", "humor", "standup", "skit", "laugh"},
    "food": {"food", "recipe", "cooking", "eat", "delicious", "restaurant", "kitchen"},
    "technology": {"tech", "phone", "computer", "ai", "gadget", "software", "innovation"},
    "travel": {"travel", "trip", "vacation", "explore", "destination", "adventure"},
    "fashion": {"fashion", "style", "ootd", "outfit", "trends", "clothing"},
    "art": {"art", "artist", "painting", "drawing", "creative", "design"},
    "business": {"business", "startup", "marketing", "money", "finance", "entrepreneur"}
}

def preprocess_text_cat(text):
    """Prepare a caption for categorisation.

    Removes URLs, @mentions and #hashtags, lowercases the rest and collapses
    whitespace. Falsy input yields an empty string.
    """
    if text:
        stripped = re.sub(r"http\S+|@\w+|#\w+", "", text)
        return re.sub(r"\s+", " ", stripped.lower()).strip()
    return ""

def classify_reel_content(text):
    """Assign a content category to a caption.

    Order of checks: captions with fewer than two words after cleaning are
    "other"; then the first category in ``category_keywords`` with a
    whole-word keyword hit wins; finally the module-level
    ``content_classifier_pipeline`` is asked, and its top label is accepted
    only when its score exceeds 0.5.

    Returns:
        Tuple ``(category, info)`` where ``info`` is a dict with a "reason"
        key (and "score" when the model was consulted).
    """
    cleaned = preprocess_text_cat(text)

    word_count = len(cleaned.split()) if cleaned else 0
    if word_count < 2:
        return "other", {"reason": "short_text"}

    # Cheap keyword pass before spending time on the model.
    for category, keywords in category_keywords.items():
        for keyword in keywords:
            if re.search(rf"\b{re.escape(keyword)}\b", cleaned):
                return category, {"reason": "keyword_match"}

    # Only the first 256 characters are sent to the model.
    snippet = cleaned[:256]

    if content_classifier_pipeline is None:
        # Expected to be initialised before this function is called.
        print("Content classifier pipeline not initialized in classify_reel_content.")
        return "other", {"reason": "classifier_not_initialized"}

    try:
        prediction = content_classifier_pipeline(snippet, content_categories, multi_label=False)
        best_label = prediction['labels'][0]
        best_score = prediction['scores'][0]

        if best_score <= 0.5:
            return "other", {"reason": "low_model_confidence", "score": best_score}
        return best_label, {"reason": "model_prediction", "score": best_score}
    except Exception as e:
        print(f"Error during zero-shot classification for text '{snippet}...': {e}")
        return "other", {"reason": "classification_error"}


def plot_category_distribution(counter, title="Reels Content Distribution"):
    """
    Generate pie chart from category counts and returns the matplotlib figure.

    Categories below 2% of the total, and the "other" category itself, are
    merged into a single "Other" slice.

    Args:
        counter: Counter object with category counts.
        title: Chart title.

    Returns:
        Matplotlib Figure object, or None if no data.
    """
    total = sum(counter.values())
    if total == 0:
        return None

    threshold = total * 0.02
    labels = []
    sizes = []
    other_count = 0

    for category, count in counter.most_common():
        if category != "other" and count >= threshold:
            labels.append(category.replace('_', ' ').title())
            sizes.append(count)
        else:
            # Explicit "other" plus every category too small to show.
            other_count += count

    if other_count > 0:
        labels.append("Other")
        sizes.append(other_count)

    if not sizes:
        return None

    fig, ax = plt.subplots(figsize=(10, 8))
    colors = plt.cm.viridis(np.linspace(0, 1, len(sizes)))

    ax.pie(
        sizes,
        labels=labels,
        autopct='%1.1f%%',
        startangle=140,
        colors=colors,
        wedgeprops={'edgecolor': 'white', 'linewidth': 1},
        textprops={'fontsize': 11, 'color': 'black'}
    )

    # Use the Axes/Figure objects directly instead of pyplot's "current
    # figure" state, so the title and layout always apply to this figure.
    ax.set_title(title, pad=20, fontsize=15)
    ax.axis('equal')
    fig.tight_layout()

    return fig


# --- Gradio-Compatible Functions ---
# Instagram account used by login_gradio_auto.
# NOTE(review): the username is hard-coded below; only the password is read
# from Colab secrets (key 'password') inside login_gradio_auto.
USERNAME = "jattman1993" # Replace with your preset username or fetch from secrets if needed

def login_gradio_auto():
    """Gradio-compatible function for automatic login.

    Reads the password from the ``userdata`` secret store, logs in as
    ``USERNAME`` and stores the client in the module-level ``cl``.

    Returns:
        Tuple ``(status message, gr.update(...))``; the update makes the OTP
        input visible only when two-factor authentication is requested.
    """
    global cl
    try:
        # Fetch password securely from Colab secrets.
        # NOTE(review): `userdata` is not imported in the visible part of this
        # file; if it is not defined elsewhere, the NameError lands in the
        # handler below and is reported as a secret-access error.
        password = userdata.get('password')
    except Exception as e:
        return f"Error accessing password secret: {e}", gr.update(visible=False) # Hide OTP input on error

    if not password:
        return "Error: Instagram password not found in Colab secrets. Please add it to Colab secrets with the key 'password'.", gr.update(visible=False) # Hide OTP input

    client = Client()

    try:
        client.login(USERNAME, password)
    except Exception as e:
        error_message = str(e)
        if "Two factor challenged" in error_message or "challenge_required" in error_message:
            # Keep the client: submit_otp_gradio refuses to run when `cl` is
            # None, so dropping it here would make the OTP step unreachable.
            # `cl` is therefore non-None but not yet authenticated until the
            # OTP step succeeds.
            cl = client
            return "Login failed: Two-factor authentication required. Please enter the code below.", gr.update(visible=True)
        # For other errors, drop the client, hide OTP input and show the error.
        cl = None
        return f"Error during login: {error_message}", gr.update(visible=False)

    # Login succeeded: publish the client and keep the OTP input hidden.
    cl = client
    return f"Successfully logged in as {USERNAME}", gr.update(visible=False)

# Function to handle OTP submission (if 2FA was required)
def submit_otp_gradio(otp_code):
    """Gradio-compatible function to submit OTP.

    Returns:
        Tuple ``(status message, "", gr.update(...))``: the empty string
        clears the OTP textbox, and the update keeps the OTP input visible
        only when the submission failed.
    """
    global cl
    if cl is None:
        return "Error: Not logged in or client not initialized.", "", gr.update(visible=False)

    try:
        # NOTE(review): confirm that the installed instagrapi Client exposes
        # `two_factor_login`; an AttributeError would surface below as an
        # "OTP submission failed" message.
        cl.two_factor_login(otp_code)
        status = f"OTP successful. Successfully logged in as {USERNAME}."
        keep_visible = False
    except Exception as e:
        status = f"OTP submission failed: {e}. Please try again."
        keep_visible = True

    return status, "", gr.update(visible=keep_visible)


def fetch_reels_gradio():
    """Fetch explore reels with the logged-in client and cache them globally.

    Stores at most the first 100 reels returned by ``cl.explore_reels()`` in
    the module-level ``explore_reels_list``. On any failure path (no client,
    nothing returned, exception) the cache is reset to an empty list.

    Returns:
        A status string suitable for display in the UI.
    """
    global cl
    global explore_reels_list

    # Guard clause: without a client there is nothing to fetch.
    if cl is None:
        explore_reels_list = []
        return "Error: Not logged in. Please log in first."

    try:
        # Cap the batch at 100 reels to keep the later analysis bounded.
        explore_reels_list = cl.explore_reels()[:100]
        if not explore_reels_list:
            # Normalise any falsy result to a real empty list.
            explore_reels_list = []
            return "Fetched 0 explore reels."
        return f"Successfully fetched {len(explore_reels_list)} explore reels."
    except Exception as fetch_error:
        explore_reels_list = []
        return f"Error fetching explore reels: {fetch_error}"


def analyze_reels_gradio(max_to_analyze):
    """Analyze the previously fetched reels and build the two result plots.

    Runs sentiment analysis and zero-shot content categorization over the
    first ``max_to_analyze`` entries of the module-level
    ``explore_reels_list``. The sentiment analyzer and the classifier
    pipeline are created lazily on first use and cached in module globals.
    The two analyses are independent: a failure in one is reported in the
    status text and does not prevent the other from running.

    Args:
        max_to_analyze: Requested number of reels to analyze (the slider
            value). Accepts ints or floats; it is truncated to an int and
            clamped to the range [0, number of fetched reels].

    Returns:
        A 3-tuple ``(status_message, sentiment_plot, content_plot)``. Each
        plot is whatever the corresponding plotting helper returns, or
        ``None`` when that analysis could not be completed.
    """
    global explore_reels_list
    global sentiment_analyzer_instance
    global content_classifier_pipeline

    if not explore_reels_list:
        # Return None for plots if no reels
        return "Error: No reels fetched yet. Please fetch reels first.", None, None

    # The slider may deliver a float; slice bounds must be ints. Reject values
    # that cannot be converted at all (None, NaN, infinity, non-numeric text).
    try:
        requested_count = int(max_to_analyze)
    except (TypeError, ValueError, OverflowError):
        return f"Error: Invalid number of reels to analyze: {max_to_analyze!r}", None, None

    # Clamp to [0, len]: a negative bound would otherwise silently slice
    # "everything except the last k" instead of analyzing nothing.
    num_reels_to_process = max(0, min(requested_count, len(explore_reels_list)))
    reels_to_analyze = explore_reels_list[:num_reels_to_process]

    if not reels_to_analyze:
        return "Error: No reels available to analyze.", None, None

    # Initialize sentiment analyzer if not already done
    if sentiment_analyzer_instance is None:
        try:
            sentiment_analyzer_instance = ReelSentimentAnalyzer()
        except Exception as e:
            return f"Error initializing Sentiment Analyzer: {e}", None, None

    # Initialize content classifier pipeline if not already done
    if content_classifier_pipeline is None:
        try:
            print("Initializing Content Classifier Pipeline...")
            content_classifier_pipeline = pipeline(
                "zero-shot-classification",
                model="facebook/bart-large-mnli",
                device=0 if torch.cuda.is_available() else -1,  # Use GPU if available
            )
            print("Content Classifier Pipeline Initialized.")
        except Exception as e:
            return f"Error initializing Content Classifier: {e}", None, None

    analysis_status_messages = []
    sentiment_plot_figure = None
    content_plot_figure = None

    # Perform Sentiment Analysis
    try:
        analysis_status_messages.append(f"Starting Sentiment Analysis for {len(reels_to_analyze)} reels...")
        # Only the aggregate result is plotted; the detailed per-reel result
        # is not used here.
        sentiment_results, _ = sentiment_analyzer_instance.analyze_reels(
            reels_to_analyze,
            max_to_analyze=len(reels_to_analyze),  # Pass the actual number being processed
        )
        sentiment_plot_figure = plot_sentiment_pie(
            sentiment_results,
            title=f"Sentiment of {len(reels_to_analyze)} Instagram Reels",
        )
        analysis_status_messages.append("Sentiment Analysis Complete.")
    except Exception as e:
        # The plot variable is still None here because assignment happens
        # only after the plotting helper returns successfully.
        analysis_status_messages.append(f"Error during Sentiment Analysis: {e}")

    # Perform Content Categorization
    try:
        analysis_status_messages.append(f"Starting Content Categorization for {len(reels_to_analyze)} reels...")
        category_counts = Counter()
        print(f"\n⏳ Analyzing content for {len(reels_to_analyze)} reels...")
        for reel in reels_to_analyze:
            # Prefer `caption_text`, fall back to `caption`, then to "".
            caption = getattr(reel, 'caption_text', '') or getattr(reel, 'caption', '') or ''
            # Only the category label is tallied; the details are discarded.
            category, _ = classify_reel_content(caption)
            category_counts[category] += 1

        print("\n✅ Content Analysis complete!")
        print("\n📊 Category Counts:")
        for category, count in category_counts.most_common():
            print(f"- {category.replace('_', ' ').title()}: {count}")

        content_plot_figure = plot_category_distribution(category_counts)
        analysis_status_messages.append("Content Categorization Complete.")
    except Exception as e:
        analysis_status_messages.append(f"Error during Content Analysis: {e}")

    final_status_message = "\n".join(analysis_status_messages)
    return final_status_message, sentiment_plot_figure, content_plot_figure


# --- Gradio Blocks Interface ---
# Builds the `demo` UI: a login step (with an optional OTP row), a fetch step,
# and an analysis step whose results are shown as two plots. Components are
# laid out in the order they are created below, so statement order matters.
with gr.Blocks() as demo:
    gr.Markdown("# Instagram Reels Analysis")

    # Login Section
    with gr.Row():
        connect_button = gr.Button("Connect Instagram")
    # Shows the status string returned by the login and OTP handlers.
    login_status_output = gr.Label(label="Login Status")

    # OTP Input (initially hidden)
    # The whole row is toggled by the `gr.update(visible=...)` values that the
    # login and OTP handlers return for `otp_row`.
    with gr.Row(visible=False) as otp_row:
        otp_input = gr.Textbox(label="Enter OTP Code")
        otp_submit_button = gr.Button("Submit OTP")


    # Fetch Reels Section
    with gr.Row():
        fetch_button = gr.Button("Fetch Reels")
    fetch_status_output = gr.Label(label="Fetch Status")

    # Analysis Section
    with gr.Row():
        # Upper bound of 100 matches the cap applied when reels are fetched.
        max_reels_input = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Number of Reels to Analyze")
        analyze_button = gr.Button("Analyze Reels")

    analyze_status_output = gr.Label(label="Analysis Status")

    # Results Section
    with gr.Row():
        # Sentiment Analysis Outputs
        with gr.Column():
            gr.Markdown("## Sentiment Analysis")
            sentiment_plot_output = gr.Plot(label="Sentiment Distribution")

        # Content Analysis Outputs
        with gr.Column():
            gr.Markdown("## Content Analysis")
            content_plot_output = gr.Plot(label="Content Distribution")


    # Link buttons to functions
    # Each handler's return tuple must line up positionally with `outputs`.

    # Login: returns (status text, visibility update for the OTP row).
    connect_button.click(
        fn=login_gradio_auto,
        inputs=None, # No direct inputs, username is preset
        outputs=[login_status_output, otp_row]
    )

    # OTP: returns (status text, new textbox value, visibility update for the OTP row).
    otp_submit_button.click(
        fn=submit_otp_gradio,
        inputs=otp_input,
        outputs=[login_status_output, otp_input, otp_row]
    )

    # Fetch: returns a single status string.
    fetch_button.click(
        fn=fetch_reels_gradio,
        inputs=None, # No direct inputs needed for fetching
        outputs=fetch_status_output
    )

    # Analyze: returns (status text, sentiment plot, content plot).
    analyze_button.click(
        fn=analyze_reels_gradio,
        inputs=max_reels_input, # Input is the slider value
        outputs=[analyze_status_output, sentiment_plot_output, content_plot_output] # Outputs are status and the two plots
    )

# --- Launch the Gradio app ---
if __name__ == "__main__":
    # This branch runs only when the file is executed directly as a script,
    # not when it is imported. It is currently a deliberate no-op: the launch
    # call below is commented out, so running the script builds `demo` but
    # does not start a server.
    #
    # NOTE(review): the original comments say the launch was removed because
    # `gradio deploy` / Hugging Face Spaces discovers and launches `demo` on
    # its own -- confirm this for the deployment target actually in use;
    # if the host simply executes this file, nothing will be served.

    # To serve the app when running this file standalone, uncomment:
    # demo.launch()

    # For sharing from Colab, `demo.launch(share=True)` can be called outside
    # this block instead.
    pass # Placeholder so the (otherwise empty) __main__ block stays valid


# Note: When using `gradio deploy` on Hugging Face Spaces, the `demo` object is
# automatically discovered and launched. You don't need `demo.launch()` here
# for that specific deployment method.

# For running directly in Colab to test before deploying:
# demo.launch(share=True)