jzou19950715 committed on
Commit
d5eccc9
·
verified ·
1 Parent(s): c5fc623

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +163 -62
app.py CHANGED
@@ -29,85 +29,186 @@ Be friendly and conversational. Ask follow-up questions naturally. When appropri
29
  but respect their boundaries. Once you believe you have gathered sufficient information (or if the user indicates they
30
  have nothing more to share), let them know they can click 'Generate Profile' to proceed.
31
  """
32
- EXTRACTION_PROMPT = """You are a data extraction specialist. Your task is to:
33
- 1. Read through the provided conversation
34
- 2. Identify relevant information across 9 categories:
35
- - Work History & Experience (jobs, roles, companies)
36
- - Salary & Compensation (if shared)
37
- - Skills & Certifications
38
- - Education & Learning
39
- - Personal Branding & Online Presence
40
- - Achievements & Awards
41
- - Social Proof & Networking
42
- - Project Contributions & Leadership
43
- - Work Performance & Impact Metrics
44
-
45
- 3. Clean and structure the information:
46
- - Deduplicate repeated information
47
- - Resolve any inconsistencies
48
- - Make reasonable inferences when dates or details are partial
49
- - Standardize formatting (dates, company names, titles)
50
-
51
- 4. Output a VALID JSON object with this exact structure:
52
  {
53
  "work_history_experience": {
54
  "positions": [
55
  {
56
- "title": "cleaned job title",
57
- "company": "cleaned company name",
58
- "duration": "standardized duration",
59
- "description": "cleaned description",
60
- "confidence": 0.95,
61
- "inferred": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  }
63
  ]
64
  },
65
  "skills_certifications": {
66
- "technical_skills": ["skill1", "skill2"],
 
 
 
67
  "certifications": [
68
  {
69
- "name": "certification name",
70
- "issuer": "issuing organization",
71
- "date": "YYYY-MM",
72
- "confidence": 0.9
 
 
 
 
 
 
 
 
73
  }
74
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  }
76
- // ... other categories following similar structure
77
  }
78
 
79
- IMPORTANT:
80
- - Return ONLY valid JSON
81
- - Always include confidence scores (0.0-1.0)
82
- - Mark any inferred information
83
- - Use consistent date formats (YYYY-MM-DD)
84
- - Clean and standardize all text fields
85
- - Return empty arrays [] for missing sections rather than null
86
 
87
- Example conversation snippet:
88
- User: "I worked at Google for a few years"
89
- Assistant: "That's interesting! What was your role there?"
90
- User: "I was a senior engineer, mostly doing ML stuff"
 
91
 
92
- Should extract to:
93
- {
94
- "work_history_experience": {
95
- "positions": [
96
- {
97
- "title": "Senior ML Engineer",
98
- "company": "Google",
99
- "duration": {
100
- "start": null,
101
- "end": null,
102
- "description": "multiple years",
103
- "inferred": true
104
- },
105
- "description": "Machine learning engineering",
106
- "confidence": 0.85
107
- }
108
- ]
109
- }
110
- }"""
 
 
 
 
 
 
111
  class ProfileBuilder:
112
  def __init__(self):
113
  self.conversation_history = []
 
29
  but respect their boundaries. Once you believe you have gathered sufficient information (or if the user indicates they
30
  have nothing more to share), let them know they can click 'Generate Profile' to proceed.
31
  """
32
+ EXTRACTION_PROMPT = """You are a professional information extraction system. Your task is to methodically analyze conversations and organize information into 9 specific categories. Process each category thoroughly and output in structured JSON format.
33
+
34
+ ANALYTICAL PROCESS:
35
+ 1. Read entire conversation history
36
+ 2. Extract explicit and implicit information
37
+ 3. Make reasonable inferences when appropriate
38
+ 4. Structure data according to defined schema
39
+ 5. Include confidence scores for all extracted information
40
+
41
+ OUTPUT SCHEMA:
 
 
 
 
 
 
 
 
 
 
42
  {
43
  "work_history_experience": {
44
  "positions": [
45
  {
46
+ "title": string,
47
+ "company": string,
48
+ "industry": string,
49
+ "location": string,
50
+ "employment_type": string,
51
+ "adaptability": {
52
+ "career_shifts": string[],
53
+ "upskilling": string[]
54
+ },
55
+ "promotions": string[],
56
+ "confidence": float
57
+ }
58
+ ]
59
+ },
60
+ "salary_compensation": {
61
+ "history": [
62
+ {
63
+ "base_salary": number | null,
64
+ "bonus_structure": string | null,
65
+ "stock_options": {
66
+ "type": string,
67
+ "details": string
68
+ },
69
+ "commission": string | null,
70
+ "benefits": {
71
+ "health": string,
72
+ "pto": string,
73
+ "retirement": string,
74
+ "other": string[]
75
+ },
76
+ "confidence": float
77
  }
78
  ]
79
  },
80
  "skills_certifications": {
81
+ "hard_skills": string[],
82
+ "soft_skills": string[],
83
+ "programming_languages": string[],
84
+ "spoken_languages": string[],
85
  "certifications": [
86
  {
87
+ "name": string,
88
+ "issuer": string,
89
+ "date": string,
90
+ "confidence": float
91
+ }
92
+ ],
93
+ "licenses": [
94
+ {
95
+ "type": string,
96
+ "issuer": string,
97
+ "valid_until": string,
98
+ "confidence": float
99
  }
100
  ]
101
+ },
102
+ "education_learning": {
103
+ "formal_education": [
104
+ {
105
+ "degree": string,
106
+ "institution": string,
107
+ "gpa": number | null,
108
+ "research": string[],
109
+ "period": {
110
+ "start": string,
111
+ "end": string | null
112
+ },
113
+ "confidence": float
114
+ }
115
+ ],
116
+ "online_courses": [],
117
+ "executive_education": []
118
+ },
119
+ "personal_branding": {
120
+ "portfolio": {
121
+ "github": string | null,
122
+ "behance": string | null,
123
+ "other": string[]
124
+ },
125
+ "blog_posts": [],
126
+ "blockchain_projects": {
127
+ "nfts": [],
128
+ "defi": [],
129
+ "dapps": []
130
+ },
131
+ "public_speaking": [],
132
+ "social_media": {
133
+ "platforms": [],
134
+ "influence_metrics": {}
135
+ }
136
+ },
137
+ "achievements_awards": {
138
+ "industry_awards": [],
139
+ "hackathons": [],
140
+ "peer_endorsements": [],
141
+ "creative_projects": {
142
+ "ai_art": [],
143
+ "other": []
144
+ }
145
+ },
146
+ "social_proof_networking": {
147
+ "mentors": [],
148
+ "references": [],
149
+ "memberships": [
150
+ {
151
+ "organization": string,
152
+ "type": string,
153
+ "period": string,
154
+ "confidence": float
155
+ }
156
+ ],
157
+ "conference_engagement": []
158
+ },
159
+ "project_contributions": {
160
+ "major_projects": [],
161
+ "open_source": [],
162
+ "team_leadership": [],
163
+ "patents": [],
164
+ "impact": {
165
+ "description": string,
166
+ "metrics": string[],
167
+ "confidence": float
168
+ }
169
+ },
170
+ "work_performance_metrics": {
171
+ "kpis": [],
172
+ "revenue_impact": [],
173
+ "efficiency_gains": [],
174
+ "career_growth": [],
175
+ "leadership_influence": []
176
  }
 
177
  }
178
 
179
+ EXTRACTION GUIDELINES:
 
 
 
 
 
 
180
 
181
+ 1. Process systematically:
182
+ - Analyze conversation thoroughly
183
+ - Look for both direct statements and implied information
184
+ - Cross-reference information across different parts of conversation
185
+ - Make reasonable inferences when appropriate
186
 
187
+ 2. For each piece of information:
188
+ - Clean and standardize the data
189
+ - Assign confidence scores (0.0-1.0)
190
+ - Mark inferred information
191
+ - Include source context where relevant
192
+
193
+ 3. Quality requirements:
194
+ - Use consistent date formats (YYYY-MM-DD)
195
+ - Standardize company names and titles
196
+ - Use empty arrays [] for missing information
197
+ - Never use null for array fields
198
+ - Include confidence scores for all extracted data
199
+
200
+ 4. Handle missing information:
201
+ - Use empty arrays [] rather than null
202
+ - Mark inferred information clearly
203
+ - Include partial information when complete data isn't available
204
+ - Note uncertainty in confidence scores
205
+
206
+ Remember to:
207
+ - Process each category thoroughly
208
+ - Cross-reference information for consistency
209
+ - Make reasonable inferences when appropriate
210
+ - Maintain consistent formatting
211
+ - Include all required fields even if empty"""
212
  class ProfileBuilder:
213
  def __init__(self):
214
  self.conversation_history = []