Spaces:

jzou19950715
/

Newco_information_extraction_agent

Sleeping

App Files Community

jzou19950715 committed on Feb 5

Commit

d5eccc9

verified ·

1 Parent(s): c5fc623

Update app.py

Browse files

Files changed (1) hide show

app.py +163 -62

app.py CHANGED Viewed

@@ -29,85 +29,186 @@ Be friendly and conversational. Ask follow-up questions naturally. When appropri
 but respect their boundaries. Once you believe you have gathered sufficient information (or if the user indicates they
 have nothing more to share), let them know they can click 'Generate Profile' to proceed.
 """
-EXTRACTION_PROMPT = """You are a data extraction specialist. Your task is to:
-1. Read through the provided conversation
-2. Identify relevant information across 9 categories:
-   - Work History & Experience (jobs, roles, companies)
-   - Salary & Compensation (if shared)
-   - Skills & Certifications
-   - Education & Learning
-   - Personal Branding & Online Presence
-   - Achievements & Awards
-   - Social Proof & Networking
-   - Project Contributions & Leadership
-   - Work Performance & Impact Metrics
-3. Clean and structure the information:
-   - Deduplicate repeated information
-   - Resolve any inconsistencies
-   - Make reasonable inferences when dates or details are partial
-   - Standardize formatting (dates, company names, titles)
-4. Output a VALID JSON object with this exact structure:
 {
     "work_history_experience": {
         "positions": [
             {
-                "title": "cleaned job title",
-                "company": "cleaned company name",
-                "duration": "standardized duration",
-                "description": "cleaned description",
-                "confidence": 0.95,
-                "inferred": false
             }
         ]
     },
     "skills_certifications": {
-        "technical_skills": ["skill1", "skill2"],
         "certifications": [
             {
-                "name": "certification name",
-                "issuer": "issuing organization",
-                "date": "YYYY-MM",
-                "confidence": 0.9
             }
         ]
     }
-    // ... other categories following similar structure
 }
-IMPORTANT:
-- Return ONLY valid JSON
-- Always include confidence scores (0.0-1.0)
-- Mark any inferred information
-- Use consistent date formats (YYYY-MM-DD)
-- Clean and standardize all text fields
-- Return empty arrays [] for missing sections rather than null
-Example conversation snippet:
-User: "I worked at Google for a few years"
-Assistant: "That's interesting! What was your role there?"
-User: "I was a senior engineer, mostly doing ML stuff"
-Should extract to:
-{
-    "work_history_experience": {
-        "positions": [
-            {
-                "title": "Senior ML Engineer",
-                "company": "Google",
-                "duration": {
-                    "start": null,
-                    "end": null,
-                    "description": "multiple years",
-                    "inferred": true
-                },
-                "description": "Machine learning engineering",
-                "confidence": 0.85
-            }
-        ]
-    }
-}"""
 class ProfileBuilder:
     def __init__(self):
         self.conversation_history = []

 but respect their boundaries. Once you believe you have gathered sufficient information (or if the user indicates they
 have nothing more to share), let them know they can click 'Generate Profile' to proceed.
 """
+EXTRACTION_PROMPT = """You are a professional information extraction system. Your task is to methodically analyze conversations and organize information into 9 specific categories. Process each category thoroughly and output in structured JSON format.
+ANALYTICAL PROCESS:
+1. Read entire conversation history
+2. Extract explicit and implicit information
+3. Make reasonable inferences when appropriate
+4. Structure data according to defined schema
+5. Include confidence scores for all extracted information
+OUTPUT SCHEMA:
 {
     "work_history_experience": {
         "positions": [
             {
+                "title": string,
+                "company": string,
+                "industry": string,
+                "location": string,
+                "employment_type": string,
+                "adaptability": {
+                    "career_shifts": string[],
+                    "upskilling": string[]
+                },
+                "promotions": string[],
+                "confidence": float
+            }
+        ]
+    },
+    "salary_compensation": {
+        "history": [
+            {
+                "base_salary": number | null,
+                "bonus_structure": string | null,
+                "stock_options": {
+                    "type": string,
+                    "details": string
+                },
+                "commission": string | null,
+                "benefits": {
+                    "health": string,
+                    "pto": string,
+                    "retirement": string,
+                    "other": string[]
+                },
+                "confidence": float
             }
         ]
     },
     "skills_certifications": {
+        "hard_skills": string[],
+        "soft_skills": string[],
+        "programming_languages": string[],
+        "spoken_languages": string[],
         "certifications": [
             {
+                "name": string,
+                "issuer": string,
+                "date": string,
+                "confidence": float
+            }
+        ],
+        "licenses": [
+            {
+                "type": string,
+                "issuer": string,
+                "valid_until": string,
+                "confidence": float
             }
         ]
+    },
+    "education_learning": {
+        "formal_education": [
+            {
+                "degree": string,
+                "institution": string,
+                "gpa": number | null,
+                "research": string[],
+                "period": {
+                    "start": string,
+                    "end": string | null
+                },
+                "confidence": float
+            }
+        ],
+        "online_courses": [],
+        "executive_education": []
+    },
+    "personal_branding": {
+        "portfolio": {
+            "github": string | null,
+            "behance": string | null,
+            "other": string[]
+        },
+        "blog_posts": [],
+        "blockchain_projects": {
+            "nfts": [],
+            "defi": [],
+            "dapps": []
+        },
+        "public_speaking": [],
+        "social_media": {
+            "platforms": [],
+            "influence_metrics": {}
+        }
+    },
+    "achievements_awards": {
+        "industry_awards": [],
+        "hackathons": [],
+        "peer_endorsements": [],
+        "creative_projects": {
+            "ai_art": [],
+            "other": []
+        }
+    },
+    "social_proof_networking": {
+        "mentors": [],
+        "references": [],
+        "memberships": [
+            {
+                "organization": string,
+                "type": string,
+                "period": string,
+                "confidence": float
+            }
+        ],
+        "conference_engagement": []
+    },
+    "project_contributions": {
+        "major_projects": [],
+        "open_source": [],
+        "team_leadership": [],
+        "patents": [],
+        "impact": {
+            "description": string,
+            "metrics": string[],
+            "confidence": float
+        }
+    },
+    "work_performance_metrics": {
+        "kpis": [],
+        "revenue_impact": [],
+        "efficiency_gains": [],
+        "career_growth": [],
+        "leadership_influence": []
     }
 }
+EXTRACTION GUIDELINES:
+1. Process systematically:
+   - Analyze conversation thoroughly
+   - Look for both direct statements and implied information
+   - Cross-reference information across different parts of conversation
+   - Make reasonable inferences when appropriate
+2. For each piece of information:
+   - Clean and standardize the data
+   - Assign confidence scores (0.0-1.0)
+   - Mark inferred information
+   - Include source context where relevant
+3. Quality requirements:
+   - Use consistent date formats (YYYY-MM-DD)
+   - Standardize company names and titles
+   - Use empty arrays [] for missing information
+   - Never use null for array fields
+   - Include confidence scores for all extracted data
+4. Handle missing information:
+   - Use empty arrays [] rather than null
+   - Mark inferred information clearly
+   - Include partial information when complete data isn't available
+   - Note uncertainty in confidence scores
+Remember to:
+- Process each category thoroughly
+- Cross-reference information for consistency
+- Make reasonable inferences when appropriate
+- Maintain consistent formatting
+- Include all required fields even if empty"""
 class ProfileBuilder:
     def __init__(self):
         self.conversation_history = []