jzou19950715 committed on
Commit 7f04463 · verified · 1 Parent(s): bc642fc

Update app.py

Files changed (1)
  1. app.py +40 -146
app.py CHANGED
@@ -3,13 +3,12 @@ import logging
from datetime import datetime
from typing import Dict, List, Optional, Any
import gradio as gr
- from openai import AsyncOpenAI # Changed to AsyncOpenAI
+ from openai import AsyncOpenAI

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

- # System prompts remain the same as before
CONVERSATION_PROMPT = """You are LOSS DOG, a professional profile builder. Your goal is to have natural conversations
with users to gather information about their professional background across 9 categories:

@@ -28,16 +27,9 @@ but respect their boundaries. Once you believe you have gathered sufficient info
have nothing more to share), let them know they can click 'Generate Profile' to proceed.
"""

- EXTRACTION_PROMPT = """You are a professional information extraction system. Your task is to methodically analyze conversations and organize information into 9 specific categories. Process each category thoroughly and output in structured JSON format with no additional text.
+ EXTRACTION_PROMPT = """You are a professional information extraction system. Your task is to extract information from the potentially unstructured conversation and return ONLY a valid JSON object. Do not include any explanatory text before or after the JSON.

- ANALYTICAL PROCESS:
- 1. Read entire conversation history
- 2. Extract explicit and implicit information
- 3. Make reasonable inferences when appropriate
- 4. Structure data according to defined schema
- 5. Include confidence scores for all extracted information
-
- OUTPUT SCHEMA:
+ Return the data in this exact structure:
{
"work_history_experience": {
"positions": [
@@ -48,78 +40,50 @@ OUTPUT SCHEMA:
"location": string,
"employment_type": string,
"adaptability": {
- "career_shifts": string[],
- "upskilling": string[]
+ "career_shifts": [],
+ "upskilling": []
},
- "promotions": string[],
- "confidence": float
+ "promotions": [],
+ "confidence": number
}
]
},
"salary_compensation": {
"history": [
{
- "base_salary": number | null,
- "bonus_structure": string | null,
+ "base_salary": number,
+ "bonus_structure": string,
"stock_options": {
"type": string,
"details": string
},
- "commission": string | null,
+ "commission": null,
"benefits": {
"health": string,
"pto": string,
"retirement": string,
- "other": string[]
+ "other": []
},
- "confidence": float
+ "confidence": number
}
]
},
"skills_certifications": {
- "hard_skills": string[],
- "soft_skills": string[],
- "programming_languages": string[],
- "spoken_languages": string[],
- "certifications": [
- {
- "name": string,
- "issuer": string,
- "date": string,
- "confidence": float
- }
- ],
- "licenses": [
- {
- "type": string,
- "issuer": string,
- "valid_until": string,
- "confidence": float
- }
- ]
+ "hard_skills": [],
+ "soft_skills": [],
+ "certifications": [],
+ "licenses": []
},
"education_learning": {
- "formal_education": [
- {
- "degree": string,
- "institution": string,
- "gpa": number | null,
- "research": string[],
- "period": {
- "start": string,
- "end": string | null
- },
- "confidence": float
- }
- ],
+ "formal_education": [],
"online_courses": [],
"executive_education": []
},
"personal_branding": {
"portfolio": {
- "github": string | null,
- "behance": string | null,
- "other": string[]
+ "github": null,
+ "behance": null,
+ "other": []
},
"blog_posts": [],
"blockchain_projects": {
@@ -127,7 +91,6 @@ OUTPUT SCHEMA:
"defi": [],
"dapps": []
},
- "public_speaking": [],
"social_media": {
"platforms": [],
"influence_metrics": {}
@@ -145,14 +108,7 @@ OUTPUT SCHEMA:
"social_proof_networking": {
"mentors": [],
"references": [],
- "memberships": [
- {
- "organization": string,
- "type": string,
- "period": string,
- "confidence": float
- }
- ],
+ "memberships": [],
"conference_engagement": []
},
"project_contributions": {
@@ -162,8 +118,8 @@
"patents": [],
"impact": {
"description": string,
- "metrics": string[],
- "confidence": float
+ "metrics": [],
+ "confidence": number
}
},
"work_performance_metrics": {
@@ -175,40 +131,7 @@
}
}

- EXTRACTION GUIDELINES:
-
- 1. Process systematically:
- - Analyze conversation thoroughly
- - Look for both direct statements and implied information
- - Cross-reference information across different parts of conversation
- - Make reasonable inferences when appropriate
-
- 2. For each piece of information:
- - Clean and standardize the data
- - Assign confidence scores (0.0-1.0)
- - Mark inferred information
- - Include source context where relevant
-
- 3. Quality requirements:
- - Use consistent date formats (YYYY-MM-DD)
- - Standardize company names and titles
- - Use empty arrays [] for missing information
- - Never use null for array fields
- - Include confidence scores for all extracted data
-
- 4. Handle missing information:
- - Use empty arrays [] rather than null
- - Mark inferred information clearly
- - Include partial information when complete data isn't available
- - Note uncertainty in confidence scores
-
- Remember to:
- - Process each category thoroughly
- - Cross-reference information for consistency
- - Make reasonable inferences when appropriate
- - Maintain consistent formatting
- - Include all required fields even if empty
- """
+ IMPORTANT: Return ONLY the JSON. Do not add any explanation text."""

class ProfileBuilder:
def __init__(self):
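The rewritten EXTRACTION_PROMPT now relies entirely on the model honouring the "Return ONLY the JSON" instruction. A minimal sketch of how the same extraction call could additionally request JSON mode, assuming an openai client version that supports response_format; the extract_profile helper is hypothetical and not part of this commit, and it assumes the module-level EXTRACTION_PROMPT defined above:

```python
import json
from openai import AsyncOpenAI

async def extract_profile(client: AsyncOpenAI, conversation_text: str) -> dict:
    # Sketch only: same model and temperature as the commit, plus JSON mode,
    # which constrains the response to a syntactically valid JSON object.
    completion = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": EXTRACTION_PROMPT},
            {"role": "user", "content": conversation_text},
        ],
        temperature=0.3,
        response_format={"type": "json_object"},
    )
    return json.loads(completion.choices[0].message.content)
```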
@@ -216,21 +139,17 @@ class ProfileBuilder:
self.client = None

def _initialize_client(self, api_key: str) -> None:
- """Initialize AsyncOpenAI client with API key."""
if not api_key.startswith("sk-"):
raise ValueError("Invalid API key format")
self.client = AsyncOpenAI(api_key=api_key)

async def process_message(self, message: str, api_key: str) -> Dict[str, Any]:
- """Process a user message through conversation phase."""
try:
if not self.client:
self._initialize_client(api_key)

- # Add message to history
self.conversation_history.append({"role": "user", "content": message})

- # Get AI response - properly awaited
completion = await self.client.chat.completions.create(
model="gpt-4o-mini",
messages=[
@@ -250,18 +169,15 @@
return {"error": str(e)}

async def generate_profile(self) -> Dict[str, Any]:
- """Process conversation history into structured profile."""
try:
if not self.client:
raise ValueError("OpenAI client not initialized")

- # Convert conversation history to text
conversation_text = "\n".join(
f"{msg['role']}: {msg['content']}"
for msg in self.conversation_history
)

- # Extract structured information - properly awaited
completion = await self.client.chat.completions.create(
model="gpt-4o-mini",
messages=[
@@ -271,21 +187,12 @@
temperature=0.3
)

- # Get the raw output text from the model
- raw_output = completion.choices[0].message.content
- logger.info(f"Raw extraction output: {raw_output}")
-
- # Attempt to parse the JSON
- try:
- profile_data = json.loads(raw_output)
- except json.JSONDecodeError as decode_error:
- logger.error("Failed to decode JSON. The output may not be valid JSON.")
- profile_data = None # Indicate failure to parse
+ # Clean and parse the JSON response
+ response_text = completion.choices[0].message.content.strip()
+ profile_data = json.loads(response_text)

- # Build the profile output including metadata and raw output
profile = {
"profile_data": profile_data,
- "raw_output": raw_output,
"metadata": {
"generated_at": datetime.now().isoformat(),
"conversation_length": len(self.conversation_history)
@@ -298,17 +205,16 @@
with open(filename, 'w', encoding='utf-8') as f:
json.dump(profile, f, indent=2)

- return {
- "profile": profile,
- "filename": filename
- }
+ return profile, filename

+ except json.JSONDecodeError as e:
+ logger.error(f"JSON parsing error: {str(e)}\nRaw output: {response_text}")
+ return {"error": "Failed to parse profile data"}, None
except Exception as e:
logger.error(f"Error generating profile: {str(e)}")
- return {"error": str(e)}
+ return {"error": str(e)}, None

def create_gradio_interface():
- """Create the Gradio interface."""
builder = ProfileBuilder()

with gr.Blocks(theme=gr.themes.Soft()) as demo:
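With this hunk, generate_profile returns a two-element tuple: (profile, filename) on success and ({"error": ...}, None) when parsing or the API call fails (its Dict[str, Any] annotation no longer reflects that tuple). A short sketch of consuming that contract outside Gradio; save_profile is a hypothetical wrapper assumed to live in the same module as ProfileBuilder and logger:

```python
async def save_profile(builder: ProfileBuilder) -> None:
    # Unpack the (profile, filename) tuple and branch on the error key.
    profile, filename = await builder.generate_profile()
    if "error" in profile:
        logger.error("Extraction failed: %s", profile["error"])
        return
    logger.info("Profile saved to %s (%d conversation turns)",
                filename, profile["metadata"]["conversation_length"])
```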
@@ -333,13 +239,9 @@ def create_gradio_interface():

with gr.Column(scale=1):
generate_btn = gr.Button("Generate Profile")
- # JSON output for structured profile
- profile_output = gr.JSON(label="Generated Profile (Parsed JSON)")
- # Markdown output to always show the raw AI output
- raw_output_markdown = gr.Markdown(label="Raw Output from AI")
+ profile_output = gr.JSON(label="Generated Profile")
download_btn = gr.File(label="Download Profile")

- # Event handlers
async def on_message(message: str, history: List[List[str]], key: str):
if not message.strip():
return history, None
@@ -353,16 +255,11 @@
return history, None

async def on_generate():
- result = await builder.generate_profile()
- if "error" in result:
- error_text = f"Error generating profile: {result['error']}"
- return {"error": error_text}, None, error_text
- profile = result["profile"]
- # Prepare the raw output as markdown. Wrapping in triple backticks for code formatting.
- raw_markdown = f"```json\n{profile.get('raw_output', '')}\n```"
- return profile, result["filename"], raw_markdown
+ profile, filename = await builder.generate_profile()
+ if "error" in profile:
+ return profile, None
+ return profile["profile_data"], filename

- # Bind events
msg.submit(
on_message,
inputs=[msg, chatbot, api_key],
@@ -377,15 +274,12 @@

generate_btn.click(
on_generate,
- outputs=[profile_output, download_btn, raw_output_markdown]
+ outputs=[profile_output, download_btn]
)

return demo

if __name__ == "__main__":
demo = create_gradio_interface()
- demo.queue() # Add queue for async support
- demo.launch(
- server_name="0.0.0.0",
- server_port=7860
- )
+ demo.queue()
+ demo.launch(server_name="0.0.0.0", server_port=7860)
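A minimal way to exercise the updated flow end to end without the Gradio UI, assuming the file above is saved as app.py and OPENAI_API_KEY holds a key beginning with "sk-" (both assumptions; this driver is not part of the commit and makes real API calls):

```python
import asyncio
import os

from app import ProfileBuilder  # assumes the code above is importable as app

async def main() -> None:
    builder = ProfileBuilder()
    # One conversational turn, then extraction into the structured profile.
    reply = await builder.process_message(
        "I spent four years as a data engineer at a fintech startup.",
        os.environ["OPENAI_API_KEY"],
    )
    print(reply)
    profile, filename = await builder.generate_profile()
    print(filename or profile.get("error"))

asyncio.run(main())
```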
 