Spaces:

yalrashed
/

ScriptLLM

Sleeping

App Files Community

yalrashed committed on Dec 5, 2024

Commit

b7ab371

verified ·

1 Parent(s): 2e86f0d

Update src/processing/gemini_processor.py

Browse files

Files changed (1) hide show

src/processing/gemini_processor.py +16 -31

src/processing/gemini_processor.py CHANGED Viewed

@@ -1,43 +1,34 @@
 import os
 import re
 from pathlib import Path
-from typing import List
 import google.generativeai as genai
 from PyPDF2 import PdfReader
 from tqdm import tqdm
 class GeminiProcessor:
     def __init__(self):
         self.api_key = os.getenv("GOOGLE_API_KEY")
         if not self.api_key:
             raise ValueError("GOOGLE_API_KEY not found")
-        # Configure Gemini
         genai.configure(api_key=self.api_key)
         self.model = genai.GenerativeModel('gemini-pro')
     def preprocess_text(self, text: str) -> str:
         """Enhanced preprocessing for screenplay text"""
-        # Remove HTML and script tags
         text = re.sub(r'<[^>]+>', '', text)
-        # Fix standalone scene headings
         text = re.sub(r'\n(INT\.|EXT\.|INT\/EXT\.)\s*\n', '', text)
-        # Remove line numbers and (CONT'D)
         text = re.sub(r'\d+\.$', '', text, flags=re.MULTILINE)
         text = re.sub(r'\(CONT\'D\)\d*', '', text)
-        # Fix spacing around punctuation
         text = re.sub(r'\s+([.,!?])', r'\1', text)
-        # Clean up multiple spaces and line breaks
         text = re.sub(r' +', ' ', text)
         text = re.sub(r'\n{3,}', '\n\n', text)
-        # Remove repetitive content
         lines = text.split('\n')
         cleaned_lines = []
         prev_line = None
@@ -50,22 +41,23 @@ class GeminiProcessor:
             cleaned_lines.append(line)
             prev_line = line
         return '\n'.join(cleaned_lines)
     def split_into_scenes(self, text: str) -> list:
         """Split screenplay into scenes while preserving headers and content"""
-        # Match scene headers and capture all content until the next header
         scene_pattern = r'((?:INT\.|EXT\.|INT\/EXT\.)[^\n]+\n(?:(?!(?:INT\.|EXT\.|INT\/EXT\.))[^\n]+\n)*)'
         scenes = re.findall(scene_pattern, text, re.MULTILINE)
-        # Clean and validate scenes
         valid_scenes = []
         for scene in scenes:
             scene = scene.strip()
             if scene:
                 valid_scenes.append(scene)
         return valid_scenes
     def clean_scene(self, scene: str) -> str:
@@ -80,48 +72,41 @@ class GeminiProcessor:
             response = self.model.generate_content(prompt)
             if response.text:
                 cleaned = response.text
-                # Basic validation
                 if abs(len(scene.split()) - len(cleaned.split())) <= 3:
                     return cleaned.strip()
             return scene
         except Exception as e:
-            print(f"Error cleaning scene: {str(e)}")
             return scene
     def process_screenplay(self, pdf_path: str, output_path: str) -> bool:
         """Process entire screenplay"""
         try:
-            # Read PDF
             with open(pdf_path, 'rb') as file:
                 pdf = PdfReader(file)
                 text = '\n'.join(page.extract_text() for page in pdf.pages)
-            #print("Extracted Text:")
-            #print(text)  # This will show you what text was actually extracted from the PDF
-            # Initial preprocessing
             text = self.preprocess_text(text)
-            # Split into scenes
             scenes = self.split_into_scenes(text)
-            print(f"Found {len(scenes)} scenes")
-            # Process each scene
             cleaned_scenes = []
-            for scene in tqdm(scenes, desc="Processing scenes"):
                 cleaned = self.clean_scene(scene)
                 if cleaned:
                     cleaned = self.preprocess_text(cleaned)
                     cleaned_scenes.append(cleaned)
-            # Save result
             Path(output_path).parent.mkdir(parents=True, exist_ok=True)
             with open(output_path, 'w', encoding='utf-8') as f:
                 f.write('\n\n'.join(cleaned_scenes))
             return True
         except Exception as e:
-            print(f"Error processing screenplay: {str(e)}")
-            return False

 import os
 import re
 from pathlib import Path
 import google.generativeai as genai
 from PyPDF2 import PdfReader
 from tqdm import tqdm
+import logging
+logger = logging.getLogger(__name__)
 class GeminiProcessor:
     def __init__(self):
         self.api_key = os.getenv("GOOGLE_API_KEY")
         if not self.api_key:
             raise ValueError("GOOGLE_API_KEY not found")
         genai.configure(api_key=self.api_key)
         self.model = genai.GenerativeModel('gemini-pro')
     def preprocess_text(self, text: str) -> str:
         """Enhanced preprocessing for screenplay text"""
+        logger.debug("Starting text preprocessing")
         text = re.sub(r'<[^>]+>', '', text)
         text = re.sub(r'\n(INT\.|EXT\.|INT\/EXT\.)\s*\n', '', text)
         text = re.sub(r'\d+\.$', '', text, flags=re.MULTILINE)
         text = re.sub(r'\(CONT\'D\)\d*', '', text)
         text = re.sub(r'\s+([.,!?])', r'\1', text)
         text = re.sub(r' +', ' ', text)
         text = re.sub(r'\n{3,}', '\n\n', text)
         lines = text.split('\n')
         cleaned_lines = []
         prev_line = None
             cleaned_lines.append(line)
             prev_line = line
+        logger.debug("Text preprocessing complete")
         return '\n'.join(cleaned_lines)
     def split_into_scenes(self, text: str) -> list:
         """Split screenplay into scenes while preserving headers and content"""
+        logger.debug("Splitting into scenes")
         scene_pattern = r'((?:INT\.|EXT\.|INT\/EXT\.)[^\n]+\n(?:(?!(?:INT\.|EXT\.|INT\/EXT\.))[^\n]+\n)*)'
         scenes = re.findall(scene_pattern, text, re.MULTILINE)
         valid_scenes = []
         for scene in scenes:
             scene = scene.strip()
             if scene:
                 valid_scenes.append(scene)
+        logger.info(f"Found {len(valid_scenes)} scenes")
         return valid_scenes
     def clean_scene(self, scene: str) -> str:
             response = self.model.generate_content(prompt)
             if response.text:
                 cleaned = response.text
                 if abs(len(scene.split()) - len(cleaned.split())) <= 3:
                     return cleaned.strip()
             return scene
         except Exception as e:
+            logger.error(f"Error cleaning scene: {str(e)}")
             return scene
     def process_screenplay(self, pdf_path: str, output_path: str) -> bool:
         """Process entire screenplay"""
         try:
+            logger.info(f"Processing screenplay: {pdf_path}")
             with open(pdf_path, 'rb') as file:
                 pdf = PdfReader(file)
                 text = '\n'.join(page.extract_text() for page in pdf.pages)
             text = self.preprocess_text(text)
             scenes = self.split_into_scenes(text)
+            logger.info(f"Processing {len(scenes)} scenes")
             cleaned_scenes = []
+            for i, scene in enumerate(scenes, 1):
+                logger.debug(f"Processing scene {i}/{len(scenes)}")
                 cleaned = self.clean_scene(scene)
                 if cleaned:
                     cleaned = self.preprocess_text(cleaned)
                     cleaned_scenes.append(cleaned)
             Path(output_path).parent.mkdir(parents=True, exist_ok=True)
             with open(output_path, 'w', encoding='utf-8') as f:
                 f.write('\n\n'.join(cleaned_scenes))
+            logger.info("Screenplay processing complete")
             return True
         except Exception as e:
+            logger.error(f"Error processing screenplay: {str(e)}")
+            return False