Spaces:
Sleeping
Sleeping
import os | |
from uuid import uuid4 | |
import uvicorn | |
from fastapi import FastAPI, UploadFile, File | |
from fastapi.responses import JSONResponse | |
from fastapi.middleware.cors import CORSMiddleware | |
import aiofiles | |
import PyPDF2 | |
from langchain_openai import ChatOpenAI | |
from langchain.schema import HumanMessage | |
import json | |
from fastapi.responses import FileResponse | |
from docx import Document | |
# Directory where uploaded resumes are staged; created at import time if absent.
UPLOAD_FOLDER = "uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

app = FastAPI()

# Enable CORS (you can restrict origins later).
# NOTE(review): wildcard origins are combined with allow_credentials=True here;
# the FastAPI CORS documentation advises against that pairing -- confirm whether
# credentialed cross-origin requests are actually required.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Shared chat-model client; temperature=0 is passed so replies are as
# repeatable as the model allows. The API key is read from the environment.
llm = ChatOpenAI(
    model_name="gpt-4o-mini",  # Use a valid model name like "gpt-4o" or "gpt-4-turbo"
    temperature=0,
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)
def parse_resume_text(text: str) -> str:
    """Ask the LLM to convert raw resume text into structured JSON.

    Args:
        text: Plain text extracted from the resume.

    Returns:
        The model's answer, validated with ``json.loads`` and re-serialised
        as an indented JSON *string* (not a dict).

    Raises:
        json.JSONDecodeError: If the reply does not contain valid JSON.
    """
    prompt = f"""
Extract structured information from this resume text and return the result in strict JSON format with the following keys:
- basics: {{first_name, last_name, gender, emails, phone_numbers, address, total_experience_in_years, profession, summary, skills, has_driving_license}}
- educations
- professional_experiences
- trainings_and_certifications
- languages
- awards
- references
- cv_text: {text}
- cv_language: "en"
Resume:
{text}
Return ONLY valid JSON, no text, no explanation.
"""
    # NOTE(review): the resume text is interpolated into the prompt twice
    # (``cv_text`` and ``Resume:``); left as-is to keep the prompt unchanged.
    result = llm.invoke([HumanMessage(content=prompt)])

    raw_string = str(result.content).strip()
    # Models often wrap the JSON in a Markdown fence (``` or ```json, with LF
    # or CRLF) or add stray prose despite the instructions. Keeping only the
    # outermost {...} span tolerates all of those shapes.
    first_brace = raw_string.find("{")
    last_brace = raw_string.rfind("}")
    if 0 <= first_brace < last_brace:
        raw_string = raw_string[first_brace:last_brace + 1]

    final_data = json.loads(raw_string)
    return json.dumps(final_data, indent=2)
# Save uploaded file asynchronously
async def save_file(file: UploadFile) -> str:
    """Persist an uploaded file under ``UPLOAD_FOLDER`` and return its path.

    Args:
        file: The incoming upload.

    Returns:
        Path of the stored file, named ``<uuid4>_<client file name>`` so the
        original extension is preserved.
    """
    # The file name is supplied by the client: keep only its last path
    # component (treating "\\" as a separator too) so it cannot point outside
    # UPLOAD_FOLDER, and fall back to a fixed name when none was sent.
    client_name = (file.filename or "").replace("\\", "/")
    safe_name = os.path.basename(client_name) or "upload"
    file_path = os.path.join(UPLOAD_FOLDER, f"{uuid4()}_{safe_name}")

    chunk_size = 1024 * 1024
    async with aiofiles.open(file_path, "wb") as out_file:
        # Copy in bounded chunks instead of loading the whole upload in memory.
        while True:
            chunk = await file.read(chunk_size)
            if not chunk:
                break
            await out_file.write(chunk)
    return file_path
# Extract text from DOCX
def extract_text_from_docx(docx_path: str) -> str:
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Any failure is reported through the return value: instead of raising,
    the function returns a string starting with
    ``"Error extracting text from DOCX: "``.
    """
    try:
        paragraphs = Document(docx_path).paragraphs
        lines = [paragraph.text for paragraph in paragraphs]
    except Exception as exc:
        return f"Error extracting text from DOCX: {exc}"
    return "\n".join(lines).strip()
# Extract text from PDF using PyPDF2
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, pages separated by newlines.

    Pages for which PyPDF2 yields no text are skipped. Any failure is
    reported through the return value: instead of raising, the function
    returns a string starting with ``"Error extracting text: "``.
    """
    try:
        with open(pdf_path, "rb") as handle:
            reader = PyPDF2.PdfReader(handle)
            extracted = (page.extract_text() for page in reader.pages)
            chunks = [chunk for chunk in extracted if chunk]
    except Exception as exc:
        return f"Error extracting text: {exc}"
    return "\n".join(chunks).strip()
# NOTE(review): no route decorator (e.g. ``@app.post(...)``) is visible above
# this handler in the reviewed source -- confirm it is registered on ``app``.
async def parse_resume(file: UploadFile = File(...)):
    """Parse an uploaded PDF/DOCX resume and return the result as a JSON file.

    Args:
        file: The uploaded resume.

    Returns:
        * 200 with an ``application/json`` attachment named
          ``cleaned_resume.json`` on success.
        * 400 when the file extension is not ``.pdf``, ``.docx`` or ``.doc``.
        * 422 when no usable text could be extracted from the file.
        * 500 with the exception message for any other failure.
    """
    # Function-scope import: ``Response`` is not among the module-level imports.
    from fastapi.responses import Response

    # Reject unsupported types before anything is written to disk.
    ext = os.path.splitext(file.filename or "")[-1].lower()
    if ext not in (".pdf", ".docx", ".doc"):
        return JSONResponse(status_code=400, content={"error": "Unsupported file type"})

    path = None
    try:
        print("π Saving file...")
        path = await save_file(file)
        print(f"β File saved at {path}")
        print("π Extracting text...")
        # NOTE(review): the extractors and parse_resume_text are synchronous
        # and are called directly from this coroutine -- consider off-loading
        # them to a worker thread if request concurrency matters.
        if ext == ".pdf":
            text = extract_text_from_pdf(path)
        else:
            text = extract_text_from_docx(path)

        # The extractors signal failure by returning an "Error extracting
        # text..." string; do not forward that to the LLM as resume content.
        if text.startswith("Error extracting text"):
            return JSONResponse(status_code=422, content={"error": text})
        if not text:
            return JSONResponse(
                status_code=422,
                content={"error": "No text could be extracted from the file"},
            )
        print("β Text extracted.")

        json_result = parse_resume_text(text)

        # Send the JSON straight from memory. Writing it to one shared file
        # under UPLOAD_FOLDER would let concurrent requests overwrite each
        # other's output and would leave the file behind.
        filename = "cleaned_resume.json"
        return Response(
            content=json_result,
            media_type="application/json",
            headers={"Content-Disposition": f'attachment; filename="{filename}"'},
        )
    except Exception as e:
        import traceback
        print("β Exception occurred:\n", traceback.format_exc())
        return JSONResponse(status_code=500, content={"error": str(e)})
    finally:
        # Always drop the staged upload, including on error paths.
        if path is not None and os.path.exists(path):
            os.remove(path)
            print("π§Ή File removed.")
# NOTE(review): no route decorator is visible above this handler in the
# reviewed source -- confirm it is registered on ``app``.
async def root():
    """Return a static status message showing the service is running."""
    status = {"message": "Resume PDF Text Extractor is running π―"}
    return status
if __name__ == "__main__":
    # Start uvicorn on all interfaces, port 7860, with auto-reload enabled.
    # NOTE(review): the "main:app" import string presumes this module is
    # saved as main.py -- confirm.
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=7860,
        reload=True,
    )