"""FastAPI service that extracts text from an uploaded PDF resume and maps it
to a structured JSON profile using an OpenAI chat model."""
import json
import os
from datetime import datetime
from uuid import uuid4

import aiofiles
import PyPDF2
import uvicorn
from fastapi import FastAPI, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from langchain.schema import HumanMessage
from langchain_openai import ChatOpenAI
# Directory where uploaded resumes are stored until parsing completes.
UPLOAD_FOLDER = "uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

app = FastAPI()

# Enable CORS (you can restrict origins later)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # NOTE(review): wide open — restrict in production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Shared chat-model client used by parse_resume_text.
# temperature=0 keeps extraction output deterministic.
llm = ChatOpenAI(
    model_name="gpt-4o-mini",  # Use a valid model name like "gpt-4o" or "gpt-4-turbo"
    temperature=0,
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)
# Helper functions
def extract_date(date_str):
    """Parse a "Month YYYY" string into ``{"year": int, "month": int}``.

    A falsy value or anything containing "present" (any case) maps to the
    current year/month.  Unparseable input yields
    ``{"year": None, "month": None}``; a recognized year with an unknown
    month yields ``{"year": <int>, "month": None}``.
    """
    if not date_str or "present" in str(date_str).lower():
        # Requires `from datetime import datetime` at module level —
        # the original file used the name without importing it.
        now = datetime.now()
        return {"year": now.year, "month": now.month}
    try:
        # str() tolerates non-string input (the old bare except hid the
        # AttributeError that .split() raised on non-strings).
        parts = str(date_str).split()
        return {"year": int(parts[1]), "month": convert_month(parts[0])}
    except (AttributeError, IndexError, ValueError):
        # Narrowed from a bare except: these are the only failures the
        # split/index/int pipeline above can produce.
        return {"year": None, "month": None}
def convert_month(month_str):
    """Return the 1-based month number for a month name or abbreviation.

    Matching is case-insensitive and uses only the first three letters;
    unknown names return None.
    """
    abbreviations = ("jan", "feb", "mar", "apr", "may", "jun",
                     "jul", "aug", "sep", "oct", "nov", "dec")
    key = month_str.strip().lower()[:3]
    for number, abbr in enumerate(abbreviations, start=1):
        if abbr == key:
            return number
    return None
def calculate_duration(start, end):
    """Return the span between two date strings in whole months, or None.

    Both endpoints are parsed with extract_date.  Returns None when either
    date is incomplete (missing year *or* month) or when the span would be
    negative.
    """
    s = extract_date(start)
    e = extract_date(end)
    # The original checked only the years; extract_date can return a valid
    # year with month=None (unrecognized month name), which made the
    # subtraction below raise TypeError.
    if None in (s["year"], s["month"], e["year"], e["month"]):
        return None
    months = (e["year"] - s["year"]) * 12 + (e["month"] - s["month"])
    return months if months >= 0 else None
def parse_resume_text(text: str) -> dict:
    """Run the LLM over raw resume text and map the reply to the API schema.

    Returns ``{"profile": {...}, "cv_text": text, "cv_language": "en"}``.
    Raises json.JSONDecodeError if the model reply is not valid JSON even
    after code-fence stripping.
    """
    prompt = f"""
Extract structured information from this resume text and return the result as a JSON object with the following keys:
- basics: {{first_name, last_name, gender, emails, phone_numbers, address, total_experience_in_years, profession, summary, skills, has_driving_license}}
- educations
- professional_experiences
- trainings_and_certifications
- languages
- awards
- references
Resume:
{text}
"""
    # .invoke is the supported call path; bare llm([...]) is deprecated.
    result = llm.invoke([HumanMessage(content=prompt)])
    extracted = _parse_llm_json(result.content)

    basics = extracted.get("basics", {})

    profile = {
        "basics": {
            "first_name": basics.get("first_name"),
            "last_name": basics.get("last_name"),
            "gender": basics.get("gender", "male"),  # default or infer
            "emails": basics.get("emails", []),
            "urls": [],  # Populate if available
            "phone_numbers": basics.get("phone_numbers", []),
            "date_of_birth": {"year": None, "month": None, "day": None},
            "address": basics.get("address"),
            "total_experience_in_years": basics.get("total_experience_in_years", 0),
            "profession": basics.get("profession"),
            "summary": basics.get("summary"),
            "skills": basics.get("skills", []),
            "has_driving_license": basics.get("has_driving_license", False),
        },
        "languages": extracted.get("languages", []),
        "educations": [_map_education(e) for e in extracted.get("educations", [])],
        "trainings_and_certifications": extracted.get("trainings_and_certifications", []),
        "professional_experiences": [
            _map_experience(p) for p in extracted.get("professional_experiences", [])
        ],
        "awards": extracted.get("awards", []),
        "references": extracted.get("references", []),
    }
    return {"profile": profile, "cv_text": text, "cv_language": "en"}


def _parse_llm_json(raw: str) -> dict:
    """json.loads with tolerance for a ```json ... ``` fenced reply."""
    cleaned = raw.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").strip()
        if cleaned.lower().startswith("json"):
            cleaned = cleaned[4:]
    return json.loads(cleaned)


def _graduation_year(education: dict):
    """Last whitespace token of 'graduation_date' as an int; None if absent or garbled."""
    try:
        return int(str(education["graduation_date"]).split()[-1])
    except (KeyError, IndexError, ValueError):
        # Original inline int(...) crashed on values like "May 2020 (expected)".
        return None


def _map_education(e: dict) -> dict:
    """Convert one LLM education entry to the response schema."""
    return {
        "start_year": None,
        "is_current": False,
        "end_year": _graduation_year(e),
        "issuing_organization": e.get("institution"),
        "description": f"{e.get('degree')}, {e.get('country', '')}".strip(),
    }


def _map_experience(p: dict) -> dict:
    """Convert one LLM experience entry to the response schema."""
    # Tolerate an explicit null end_date (original .lower() raised on None).
    end_raw = p.get("end_date") or ""
    return {
        "start_date": extract_date(p.get("start_date")),
        "is_current": end_raw.lower() == "present",
        "end_date": extract_date(p.get("end_date")),
        "duration_in_months": calculate_duration(p.get("start_date"), p.get("end_date")),
        "company": p.get("company"),
        "location": "Hyderabad",  # default or parse if available
        "title": p.get("job_title"),
        "description": " ".join(p.get("responsibilities", [])),
    }
# Save uploaded file asynchronously
async def save_file(file: UploadFile) -> str:
    """Persist an uploaded file under UPLOAD_FOLDER and return its path.

    A uuid4 prefix keeps concurrent uploads with the same filename from
    colliding.
    """
    unique_name = f"{uuid4()}_{file.filename}"
    destination = os.path.join(UPLOAD_FOLDER, unique_name)
    payload = await file.read()
    async with aiofiles.open(destination, 'wb') as handle:
        await handle.write(payload)
    return destination
# Extract text from PDF using PyPDF2
def extract_text_from_pdf(pdf_path: str) -> str:
    """Concatenate the extractable text of every page in the PDF.

    Pages yielding no text are skipped.  On any failure an
    "Error extracting text: ..." string is returned instead of raising —
    that is the original contract; callers treat the result as plain text.
    """
    try:
        with open(pdf_path, "rb") as handle:
            reader = PyPDF2.PdfReader(handle)
            page_texts = [page.extract_text() for page in reader.pages]
        joined = "".join(t + "\n" for t in page_texts if t)
        return joined.strip()
    except Exception as e:
        return f"Error extracting text: {str(e)}"
# NOTE(review): no route decorator survived in this file, so this handler
# was never registered with FastAPI — restored here; confirm the path
# matches what the frontend calls.
@app.post("/parse-resume")
async def parse_resume(file: UploadFile = File(...)):
    """Accept a PDF upload, extract its text, and return the structured profile.

    The saved temp file is always removed (even when parsing fails — the
    original leaked it on error).  Any exception is logged and returned as
    a 500 JSON response.
    """
    path = None
    try:
        print("π Saving file...")
        path = await save_file(file)
        print(f"β File saved at {path}")
        print("π Extracting text...")
        text = extract_text_from_pdf(path)
        print("β Text extracted.")
        json_result = parse_resume_text(text)
        print("β JSON Created.")
        return json_result
    except Exception as e:
        import traceback
        print("β Exception occurred:\n", traceback.format_exc())
        return JSONResponse(status_code=500, content={"error": str(e)})
    finally:
        # Clean up the temp copy regardless of success or failure.
        if path and os.path.exists(path):
            os.remove(path)
            print("π§Ή File removed.")
# NOTE(review): route decorator restored — this health-check handler was
# never registered with FastAPI; confirm "/" is the intended path.
@app.get("/")
async def root():
    """Health-check endpoint confirming the service is up."""
    return {"message": "Resume PDF Text Extractor is running π―"}
if __name__ == "__main__":
    # Dev entry point: bind all interfaces; reload restarts on code edits.
    uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)