Spaces:

jzou19950715
/

Newco_information_extraction_agent

Running

App Files Files Community

Newco_information_extraction_agent / app.py

jzou19950715

Update app.py

c5fc623 verified 5 months ago

raw

history blame

9.42 kB


	import json
	import logging
	from datetime import datetime
	from typing import Dict, List, Optional, Any
	import gradio as gr
	from openai import AsyncOpenAI # Changed to AsyncOpenAI

	# Configure logging: set the root logging level to INFO at import time and
	# create the module-level logger used by the error handlers below.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# System prompts
	# CONVERSATION_PROMPT is sent as the "system" message ahead of the chat
	# history on every turn (see ProfileBuilder.process_message).
	CONVERSATION_PROMPT = """You are LOSS DOG, a professional profile builder. Your goal is to have natural conversations
	with users to gather information about their professional background across 9 categories:

	1. Work History & Experience
	2. Salary & Compensation
	3. Skills & Certifications
	4. Education & Learning
	5. Personal Branding & Online Presence
	6. Achievements & Awards
	7. Social Proof & Networking
	8. Project Contributions & Leadership
	9. Work Performance & Impact Metrics

	Be friendly and conversational. Ask follow-up questions naturally. When appropriate, guide users to share more details
	but respect their boundaries. Once you believe you have gathered sufficient information (or if the user indicates they
	have nothing more to share), let them know they can click 'Generate Profile' to proceed.
	"""
	# System prompt for the extraction phase (see ProfileBuilder.generate_profile).
	# The structure template and the worked example below are kept as strictly
	# valid JSON with matching field shapes: the model is told to follow the
	# template exactly, so a template that is itself invalid or that disagrees
	# with the example invites unparseable or inconsistent output.
	EXTRACTION_PROMPT = """You are a data extraction specialist. Your task is to:
	1. Read through the provided conversation
	2. Identify relevant information across 9 categories:
	- Work History & Experience (jobs, roles, companies)
	- Salary & Compensation (if shared)
	- Skills & Certifications
	- Education & Learning
	- Personal Branding & Online Presence
	- Achievements & Awards
	- Social Proof & Networking
	- Project Contributions & Leadership
	- Work Performance & Impact Metrics

	3. Clean and structure the information:
	- Deduplicate repeated information
	- Resolve any inconsistencies
	- Make reasonable inferences when dates or details are partial
	- Standardize formatting (dates, company names, titles)

	4. Output a VALID JSON object with this exact structure:
	{
	"work_history_experience": {
	"positions": [
	{
	"title": "cleaned job title",
	"company": "cleaned company name",
	"duration": "standardized duration",
	"description": "cleaned description",
	"confidence": 0.95,
	"inferred": false
	}
	]
	},
	"skills_certifications": {
	"technical_skills": ["skill1", "skill2"],
	"certifications": [
	{
	"name": "certification name",
	"issuer": "issuing organization",
	"date": "YYYY-MM-DD",
	"confidence": 0.9
	}
	]
	}
	}

	IMPORTANT:
	- Return ONLY valid JSON (no comments, no markdown code fences)
	- Add one top-level key for each remaining category, named and structured like the two shown above
	- Always include confidence scores (0.0-1.0)
	- Mark any inferred information
	- Use consistent date formats (YYYY-MM-DD)
	- Clean and standardize all text fields
	- Return empty arrays [] for missing sections rather than null

	Example conversation snippet:
	User: "I worked at Google for a few years"
	Assistant: "That's interesting! What was your role there?"
	User: "I was a senior engineer, mostly doing ML stuff"

	Should extract to:
	{
	"work_history_experience": {
	"positions": [
	{
	"title": "Senior ML Engineer",
	"company": "Google",
	"duration": "multiple years",
	"description": "Machine learning engineering",
	"confidence": 0.85,
	"inferred": true
	}
	]
	}
	}"""
	class ProfileBuilder:
	    """Chat-driven professional profile builder backed by the OpenAI chat API.

	    The instance accumulates the chat transcript in ``conversation_history``
	    (a list of ``{"role": ..., "content": ...}`` dicts) and owns a lazily
	    created ``AsyncOpenAI`` client in ``client``.
	    """

	    def __init__(self):
	        self.conversation_history = []
	        self.client = None
	        # Key the current client was built with, so process_message can tell
	        # when the user has supplied a different one.
	        self._api_key = None

	    def _initialize_client(self, api_key: str) -> None:
	        """Create the AsyncOpenAI client for ``api_key``.

	        Surrounding whitespace (common when a key is pasted) is ignored.

	        Raises:
	            ValueError: if the key is missing or does not start with ``sk-``.
	        """
	        key = (api_key or "").strip()
	        if not key.startswith("sk-"):
	            raise ValueError("Invalid API key format")
	        self.client = AsyncOpenAI(api_key=key)
	        self._api_key = key

	    @staticmethod
	    def _parse_profile_json(raw: Optional[str]) -> Any:
	        """Decode the model's reply as JSON, tolerating a markdown code fence.

	        Raises:
	            ValueError: (``json.JSONDecodeError``) if the text is not JSON.
	        """
	        text = (raw or "").strip()
	        if text.startswith("```"):
	            # Drop the opening fence line ("```" or "```json") ...
	            text = text.split("\n", 1)[1] if "\n" in text else ""
	            # ... and everything from the closing fence onwards, if present.
	            text = text.rsplit("```", 1)[0]
	        return json.loads(text)

	    async def process_message(self, message: str, api_key: str) -> Dict[str, Any]:
	        """Send one user message to the model and record both turns.

	        Args:
	            message: The user's chat message.
	            api_key: OpenAI API key; used to build the client on first use
	                and to rebuild it when a different non-empty key is given.

	        Returns:
	            ``{"response": text}`` on success or ``{"error": text}`` on any
	            failure. Exceptions are logged and never propagated.
	        """
	        try:
	            key = (api_key or "").strip()
	            # Rebuild the client when the key changed; otherwise a mistyped
	            # first key could never be corrected without restarting the app.
	            if self.client is None or (key and key != getattr(self, "_api_key", None)):
	                self._initialize_client(key)

	            user_turn = {"role": "user", "content": message}
	            self.conversation_history.append(user_turn)

	            try:
	                completion = await self.client.chat.completions.create(
	                    model="gpt-4o-mini",
	                    messages=[
	                        {"role": "system", "content": CONVERSATION_PROMPT},
	                        *self.conversation_history,
	                    ],
	                    temperature=0.7,
	                )
	                # ``content`` may be None; keep the transcript string-only.
	                ai_message = completion.choices[0].message.content or ""
	            except Exception:
	                # Roll back the unanswered user turn (matched by identity) so
	                # a retry does not send it twice.
	                self.conversation_history[:] = [
	                    turn for turn in self.conversation_history if turn is not user_turn
	                ]
	                raise

	            self.conversation_history.append({"role": "assistant", "content": ai_message})
	            return {"response": ai_message}

	        except Exception as e:
	            logger.exception("Error processing message: %s", e)
	            return {"error": str(e)}

	    async def generate_profile(self) -> Dict[str, Any]:
	        """Turn the recorded conversation into a structured profile.

	        The transcript is sent to the model with ``EXTRACTION_PROMPT``, the
	        JSON reply is wrapped with metadata and written to a timestamped
	        ``profile_*.json`` file in the current working directory.

	        Returns:
	            ``{"profile": dict, "filename": str}`` on success or
	            ``{"error": text}`` on any failure. Exceptions are logged and
	            never propagated.
	        """
	        try:
	            if not self.client:
	                raise ValueError("OpenAI client not initialized")
	            if not self.conversation_history:
	                # Nothing to extract; avoid a pointless (billed) API call.
	                raise ValueError("No conversation to extract a profile from")

	            # Convert conversation history to text
	            conversation_text = "\n".join(
	                f"{msg['role']}: {msg['content']}"
	                for msg in self.conversation_history
	            )

	            completion = await self.client.chat.completions.create(
	                model="gpt-4o-mini",
	                messages=[
	                    {"role": "system", "content": EXTRACTION_PROMPT},
	                    {"role": "user", "content": f"Extract profile information from this conversation:\n\n{conversation_text}"},
	                ],
	                temperature=0.3,
	                # JSON mode: ask the API for a single JSON object as the reply.
	                response_format={"type": "json_object"},
	            )

	            profile_data = self._parse_profile_json(completion.choices[0].message.content)

	            # One clock reading for both the metadata and the file name.
	            now = datetime.now()
	            profile = {
	                "profile_data": profile_data,
	                "metadata": {
	                    "generated_at": now.isoformat(),
	                    "conversation_length": len(self.conversation_history),
	                },
	            }

	            # Microseconds in the name keep profiles generated within the
	            # same second from overwriting each other.
	            timestamp = now.strftime("%Y%m%d_%H%M%S_%f")
	            filename = f"profile_{timestamp}.json"
	            with open(filename, "w", encoding="utf-8") as f:
	                json.dump(profile, f, indent=2, ensure_ascii=False)

	            return {"profile": profile, "filename": filename}

	        except Exception as e:
	            logger.exception("Error generating profile: %s", e)
	            return {"error": str(e)}

	def create_gradio_interface():
	    """Create the Gradio interface.

	    Lays out the API-key box, the chat panel and the profile panel, and wires
	    the event handlers. Each browser session gets its own ``ProfileBuilder``
	    (held in a ``gr.State``): a single shared builder would mix every
	    visitor's conversation into one transcript and reuse the first visitor's
	    API key for everyone.

	    Returns:
	        The configured ``gr.Blocks`` app (not yet launched).
	    """
	    with gr.Blocks(theme=gr.themes.Soft()) as demo:
	        gr.Markdown("# 🐕 LOSS DOG - Professional Profile Builder")

	        # Per-session builder; created lazily on the first event of a session.
	        builder_state = gr.State(None)

	        with gr.Row():
	            with gr.Column(scale=2):
	                api_key = gr.Textbox(
	                    label="OpenAI API Key",
	                    type="password",
	                    placeholder="Enter your OpenAI API key"
	                )

	                chatbot = gr.Chatbot(label="Conversation")

	                with gr.Row():
	                    msg = gr.Textbox(
	                        label="Message",
	                        placeholder="Chat with LOSS DOG..."
	                    )
	                    send = gr.Button("Send")

	            with gr.Column(scale=1):
	                generate_btn = gr.Button("Generate Profile")
	                profile_output = gr.JSON(label="Generated Profile")
	                download_btn = gr.File(label="Download Profile")

	        # Event handlers
	        async def on_message(message: str, history: List[List[str]], key: str, builder):
	            """Relay one chat message; returns (history, profile panel value, builder)."""
	            history = history or []  # the chat value may be None before the first turn
	            if builder is None:
	                builder = ProfileBuilder()

	            if not message or not message.strip():
	                return history, None, builder

	            result = await builder.process_message(message, key)

	            if "error" in result:
	                # Surface the error in the JSON panel and keep the chat as it was.
	                return history, {"error": result["error"]}, builder

	            return history + [[message, result["response"]]], None, builder

	        async def on_generate(builder):
	            """Build the profile; returns (profile or error dict, file path or None)."""
	            if builder is None:
	                # No chat has happened in this session; a fresh builder
	                # reports the missing client as an error dict.
	                builder = ProfileBuilder()
	            result = await builder.generate_profile()
	            if "error" in result:
	                return {"error": result["error"]}, None
	            return result["profile"], result["filename"]

	        # Bind events: Enter in the textbox and the Send button behave the
	        # same, and both clear the textbox afterwards.
	        for trigger in (msg.submit, send.click):
	            trigger(
	                on_message,
	                inputs=[msg, chatbot, api_key, builder_state],
	                outputs=[chatbot, profile_output, builder_state]
	            ).then(lambda: "", None, msg)

	        generate_btn.click(
	            on_generate,
	            inputs=[builder_state],
	            outputs=[profile_output, download_btn]
	        )

	    return demo

	if __name__ == "__main__":
	    # Script entry point: build the UI, enable Gradio's queue (added for
	    # async handler support), then serve on all interfaces at port 7860.
	    demo = create_gradio_interface()
	    demo.queue()
	    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
	    demo.launch(**launch_options)