gperdrizet committed on
Commit
f80cf2d
·
verified ·
1 Parent(s): 5af784b

Added helper function to clean whitespaces and newlines in text

Browse files
Files changed (2) hide show
  1. functions/gradio.py +2 -1
  2. functions/helper.py +33 -0
functions/gradio.py CHANGED
@@ -6,6 +6,7 @@ Functions for handling Gradio UI interactions and processing user inputs.
6
 
7
  import logging
8
  from pathlib import Path
 
9
  from functions.linkedin_resume import extract_text_from_linkedin_pdf
10
  from functions.github import get_github_repositories
11
  # from functions.job_call import summarize_job_call
@@ -55,7 +56,7 @@ def process_inputs(
55
  logger = logging.getLogger(f'{__name__}.process_inputs')
56
  logger.info("LinkedIn PDF: %s", linkedin_pdf_path)
57
  logger.info("GitHub URL: %s", github_url)
58
- logger.info("Job post: %s", job_post_text[:100])
59
  logger.info("User instructions: %s", user_instructions[:100] if user_instructions else "None")
60
  result = ""
61
 
 
6
 
7
  import logging
8
  from pathlib import Path
9
+ from functions.helper import clean_text_whitespace
10
  from functions.linkedin_resume import extract_text_from_linkedin_pdf
11
  from functions.github import get_github_repositories
12
  # from functions.job_call import summarize_job_call
 
56
  logger = logging.getLogger(f'{__name__}.process_inputs')
57
  logger.info("LinkedIn PDF: %s", linkedin_pdf_path)
58
  logger.info("GitHub URL: %s", github_url)
59
+ logger.info("Job post: %s", clean_text_whitespace(job_post_text[:100]).replace("\n", " "))
60
  logger.info("User instructions: %s", user_instructions[:100] if user_instructions else "None")
61
  result = ""
62
 
functions/helper.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ helper.py
3
+
4
+ Utility functions for text processing and data cleaning.
5
+ """
6
+
7
+ import re
8
+
9
+
10
def clean_text_whitespace(text: str) -> str:
    """
    Clean up text by normalizing whitespace and newlines.

    Runs of horizontal whitespace (spaces, tabs, etc.) collapse to a single
    space, CRLF/CR line endings become "\\n", whitespace directly adjacent to
    a line break is removed, and runs of line breaks (including lines that
    contain only whitespace) collapse to a single newline. Leading and
    trailing whitespace is stripped from the result.

    Args:
        text (str): Input text string to clean

    Returns:
        str: Cleaned text with normalized whitespace and newlines. Empty or
            non-string input is returned unchanged.
    """
    if not text or not isinstance(text, str):
        return text

    # Normalize Windows (\r\n) and bare carriage-return line endings first;
    # otherwise "\r" would be turned into a space by the next step and keep
    # consecutive line breaks from being recognized as such.
    text = re.sub(r'\r\n?', '\n', text)

    # Replace runs of whitespace other than newlines with a single space
    text = re.sub(r'[^\S\n]+', ' ', text)

    # Drop the (at most one) space left on either side of a newline so that
    # whitespace-only lines count as blank lines below.
    text = re.sub(r' ?\n ?', '\n', text)

    # Replace multiple consecutive newlines with a single newline
    text = re.sub(r'\n{2,}', '\n', text)

    # Strip leading and trailing whitespace
    return text.strip()