yalrashed committed on
Commit
b7ab371
·
verified ·
1 Parent(s): 2e86f0d

Update src/processing/gemini_processor.py

Browse files
Files changed (1) hide show
  1. src/processing/gemini_processor.py +16 -31
src/processing/gemini_processor.py CHANGED
@@ -1,43 +1,34 @@
1
  import os
2
  import re
3
  from pathlib import Path
4
- from typing import List
5
  import google.generativeai as genai
6
  from PyPDF2 import PdfReader
7
  from tqdm import tqdm
 
8
 
 
9
 
10
  class GeminiProcessor:
11
-
12
  def __init__(self):
13
  self.api_key = os.getenv("GOOGLE_API_KEY")
14
  if not self.api_key:
15
  raise ValueError("GOOGLE_API_KEY not found")
16
 
17
- # Configure Gemini
18
  genai.configure(api_key=self.api_key)
19
  self.model = genai.GenerativeModel('gemini-pro')
20
 
21
  def preprocess_text(self, text: str) -> str:
22
  """Enhanced preprocessing for screenplay text"""
23
- # Remove HTML and script tags
 
24
  text = re.sub(r'<[^>]+>', '', text)
25
-
26
- # Fix standalone scene headings
27
  text = re.sub(r'\n(INT\.|EXT\.|INT\/EXT\.)\s*\n', '', text)
28
-
29
- # Remove line numbers and (CONT'D)
30
  text = re.sub(r'\d+\.$', '', text, flags=re.MULTILINE)
31
  text = re.sub(r'\(CONT\'D\)\d*', '', text)
32
-
33
- # Fix spacing around punctuation
34
  text = re.sub(r'\s+([.,!?])', r'\1', text)
35
-
36
- # Clean up multiple spaces and line breaks
37
  text = re.sub(r' +', ' ', text)
38
  text = re.sub(r'\n{3,}', '\n\n', text)
39
 
40
- # Remove repetitive content
41
  lines = text.split('\n')
42
  cleaned_lines = []
43
  prev_line = None
@@ -50,22 +41,23 @@ class GeminiProcessor:
50
  cleaned_lines.append(line)
51
  prev_line = line
52
 
 
53
  return '\n'.join(cleaned_lines)
54
 
55
  def split_into_scenes(self, text: str) -> list:
56
  """Split screenplay into scenes while preserving headers and content"""
57
- # Match scene headers and capture all content until the next header
 
58
  scene_pattern = r'((?:INT\.|EXT\.|INT\/EXT\.)[^\n]+\n(?:(?!(?:INT\.|EXT\.|INT\/EXT\.))[^\n]+\n)*)'
59
-
60
  scenes = re.findall(scene_pattern, text, re.MULTILINE)
61
 
62
- # Clean and validate scenes
63
  valid_scenes = []
64
  for scene in scenes:
65
  scene = scene.strip()
66
  if scene:
67
  valid_scenes.append(scene)
68
 
 
69
  return valid_scenes
70
 
71
  def clean_scene(self, scene: str) -> str:
@@ -80,48 +72,41 @@ class GeminiProcessor:
80
  response = self.model.generate_content(prompt)
81
  if response.text:
82
  cleaned = response.text
83
- # Basic validation
84
  if abs(len(scene.split()) - len(cleaned.split())) <= 3:
85
  return cleaned.strip()
86
  return scene
87
 
88
  except Exception as e:
89
- print(f"Error cleaning scene: {str(e)}")
90
  return scene
91
 
92
  def process_screenplay(self, pdf_path: str, output_path: str) -> bool:
93
  """Process entire screenplay"""
94
  try:
95
- # Read PDF
96
  with open(pdf_path, 'rb') as file:
97
  pdf = PdfReader(file)
98
  text = '\n'.join(page.extract_text() for page in pdf.pages)
99
 
100
- #print("Extracted Text:")
101
- #print(text) # This will show you what text was actually extracted from the PDF
102
-
103
- # Initial preprocessing
104
  text = self.preprocess_text(text)
105
-
106
- # Split into scenes
107
  scenes = self.split_into_scenes(text)
108
- print(f"Found {len(scenes)} scenes")
109
 
110
- # Process each scene
111
  cleaned_scenes = []
112
- for scene in tqdm(scenes, desc="Processing scenes"):
 
113
  cleaned = self.clean_scene(scene)
114
  if cleaned:
115
  cleaned = self.preprocess_text(cleaned)
116
  cleaned_scenes.append(cleaned)
117
 
118
- # Save result
119
  Path(output_path).parent.mkdir(parents=True, exist_ok=True)
120
  with open(output_path, 'w', encoding='utf-8') as f:
121
  f.write('\n\n'.join(cleaned_scenes))
122
 
 
123
  return True
124
 
125
  except Exception as e:
126
- print(f"Error processing screenplay: {str(e)}")
127
- return False
 
1
  import os
2
  import re
3
  from pathlib import Path
 
4
  import google.generativeai as genai
5
  from PyPDF2 import PdfReader
6
  from tqdm import tqdm
7
+ import logging
8
 
9
+ logger = logging.getLogger(__name__)
10
 
11
  class GeminiProcessor:
 
12
  def __init__(self):
13
  self.api_key = os.getenv("GOOGLE_API_KEY")
14
  if not self.api_key:
15
  raise ValueError("GOOGLE_API_KEY not found")
16
 
 
17
  genai.configure(api_key=self.api_key)
18
  self.model = genai.GenerativeModel('gemini-pro')
19
 
20
  def preprocess_text(self, text: str) -> str:
21
  """Enhanced preprocessing for screenplay text"""
22
+ logger.debug("Starting text preprocessing")
23
+
24
  text = re.sub(r'<[^>]+>', '', text)
 
 
25
  text = re.sub(r'\n(INT\.|EXT\.|INT\/EXT\.)\s*\n', '', text)
 
 
26
  text = re.sub(r'\d+\.$', '', text, flags=re.MULTILINE)
27
  text = re.sub(r'\(CONT\'D\)\d*', '', text)
 
 
28
  text = re.sub(r'\s+([.,!?])', r'\1', text)
 
 
29
  text = re.sub(r' +', ' ', text)
30
  text = re.sub(r'\n{3,}', '\n\n', text)
31
 
 
32
  lines = text.split('\n')
33
  cleaned_lines = []
34
  prev_line = None
 
41
  cleaned_lines.append(line)
42
  prev_line = line
43
 
44
+ logger.debug("Text preprocessing complete")
45
  return '\n'.join(cleaned_lines)
46
 
47
  def split_into_scenes(self, text: str) -> list:
48
  """Split screenplay into scenes while preserving headers and content"""
49
+ logger.debug("Splitting into scenes")
50
+
51
  scene_pattern = r'((?:INT\.|EXT\.|INT\/EXT\.)[^\n]+\n(?:(?!(?:INT\.|EXT\.|INT\/EXT\.))[^\n]+\n)*)'
 
52
  scenes = re.findall(scene_pattern, text, re.MULTILINE)
53
 
 
54
  valid_scenes = []
55
  for scene in scenes:
56
  scene = scene.strip()
57
  if scene:
58
  valid_scenes.append(scene)
59
 
60
+ logger.info(f"Found {len(valid_scenes)} scenes")
61
  return valid_scenes
62
 
63
  def clean_scene(self, scene: str) -> str:
 
72
  response = self.model.generate_content(prompt)
73
  if response.text:
74
  cleaned = response.text
 
75
  if abs(len(scene.split()) - len(cleaned.split())) <= 3:
76
  return cleaned.strip()
77
  return scene
78
 
79
  except Exception as e:
80
+ logger.error(f"Error cleaning scene: {str(e)}")
81
  return scene
82
 
83
  def process_screenplay(self, pdf_path: str, output_path: str) -> bool:
84
  """Process entire screenplay"""
85
  try:
86
+ logger.info(f"Processing screenplay: {pdf_path}")
87
  with open(pdf_path, 'rb') as file:
88
  pdf = PdfReader(file)
89
  text = '\n'.join(page.extract_text() for page in pdf.pages)
90
 
 
 
 
 
91
  text = self.preprocess_text(text)
 
 
92
  scenes = self.split_into_scenes(text)
93
+ logger.info(f"Processing {len(scenes)} scenes")
94
 
 
95
  cleaned_scenes = []
96
+ for i, scene in enumerate(scenes, 1):
97
+ logger.debug(f"Processing scene {i}/{len(scenes)}")
98
  cleaned = self.clean_scene(scene)
99
  if cleaned:
100
  cleaned = self.preprocess_text(cleaned)
101
  cleaned_scenes.append(cleaned)
102
 
 
103
  Path(output_path).parent.mkdir(parents=True, exist_ok=True)
104
  with open(output_path, 'w', encoding='utf-8') as f:
105
  f.write('\n\n'.join(cleaned_scenes))
106
 
107
+ logger.info("Screenplay processing complete")
108
  return True
109
 
110
  except Exception as e:
111
+ logger.error(f"Error processing screenplay: {str(e)}")
112
+ return False