YoussefMorad1 committed
Commit 25510ce · 1 Parent(s): 2021cd6

Add deduplication and filtering for knowledge predictions
skills_extraction/skills_extraction.py CHANGED
@@ -1,8 +1,11 @@
+import re
 import string
 import numpy as np
 from fastapi import FastAPI
 from pydantic import BaseModel
 from transformers import pipeline, AutoTokenizer
+from semantic_similarity.semantic_similarity import model as embedding_model
+from sentence_transformers import util
 
 # Initialize FastAPI
 app = FastAPI()
@@ -76,6 +79,34 @@ def chunk_text(text, tokenizer, max_length=500, overlap=100):
     return chunks
 
 
+def deduplicate_by_similarity(items, embeddings, threshold=0.7):
+    keep = []
+    used = set()
+    sim_matrix = util.cos_sim(embeddings, embeddings)
+
+    for i in range(len(items)):
+        if i in used:
+            continue
+        keep.append(items[i])
+        for j in range(i + 1, len(items)):
+            if sim_matrix[i][j] > threshold:
+                used.add(j)
+    return keep
+
+
+def filter_knowledge(results):
+    # to_remove = ['-', '/', '(', ')', 'and', 'or', 'the', 'a', 'an']
+    filtered_results = []
+    for result in results:
+        result["name"] = result["name"].strip()
+        result["name"] = re.sub(r'[^\w\s]', '', result["name"])
+        result["name"] = re.sub(r'\s+', ' ', result["name"])
+        if len(result["name"].split()) > 3 or len(result["name"]) <= 2 or result['confidence'] < 0.95:
+            continue
+        filtered_results.append(result)
+    return filtered_results
+
+
 @app.post("/predict_knowledge")
 def predict_knowledge(input_data: TextInput):
     # Clean non-printable chars
@@ -85,7 +116,18 @@ def predict_knowledge(input_data: TextInput):
     for chunk in chunks:
         preds = knowledge_nlp(chunk)
         all_preds.extend(convert_from_numpy(preds))
-    return {"knowledge_predictions": merge_BI_and_get_results(all_preds)}
+    result = merge_BI_and_get_results(all_preds)
+    if not result:
+        return {"knowledge_predictions": []}
+
+    result = filter_knowledge(result)
+
+    knowledge_names = [r["name"] for r in result]
+    embeddings_tensor = embedding_model.encode(knowledge_names, convert_to_tensor=True)
+    embeddings = embeddings_tensor.cpu().numpy()
+    deduped_results = deduplicate_by_similarity(result, embeddings)
+
+    return {"knowledge_predictions": deduped_results}
 
 
 @app.post("/predict_skills")
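
Below is a minimal standalone sketch (not part of the commit) of how the new filtering and deduplication steps behave on a few example predictions. It reproduces the two helpers added above with explanatory comments; the SentenceTransformer model name "all-MiniLM-L6-v2" and the sample data are assumptions, since the service actually imports its embedding model from semantic_similarity.semantic_similarity.

# Standalone sketch of filter_knowledge + deduplicate_by_similarity from this commit.
# Assumption: "all-MiniLM-L6-v2" stands in for the repo's embedding model.
import re
from sentence_transformers import SentenceTransformer, util

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model name


def filter_knowledge(results):
    # Normalize names (strip, drop punctuation, collapse whitespace) and discard
    # entries longer than 3 words, 2 characters or shorter, or below 0.95 confidence.
    filtered_results = []
    for result in results:
        result["name"] = result["name"].strip()
        result["name"] = re.sub(r'[^\w\s]', '', result["name"])
        result["name"] = re.sub(r'\s+', ' ', result["name"])
        if len(result["name"].split()) > 3 or len(result["name"]) <= 2 or result['confidence'] < 0.95:
            continue
        filtered_results.append(result)
    return filtered_results


def deduplicate_by_similarity(items, embeddings, threshold=0.7):
    # Greedy pass: keep the first item, then drop any later item whose embedding
    # is more similar than `threshold` to an item already kept.
    keep = []
    used = set()
    sim_matrix = util.cos_sim(embeddings, embeddings)

    for i in range(len(items)):
        if i in used:
            continue
        keep.append(items[i])
        for j in range(i + 1, len(items)):
            if sim_matrix[i][j] > threshold:
                used.add(j)
    return keep


# Illustrative input only (the real dicts come from merge_BI_and_get_results).
predictions = [
    {"name": "Python", "confidence": 0.99},
    {"name": "Python programming", "confidence": 0.98},
    {"name": "SQL", "confidence": 0.97},
    {"name": "a", "confidence": 0.99},                 # dropped: name too short
    {"name": "Machine Learning", "confidence": 0.60},  # dropped: low confidence
]

filtered = filter_knowledge(predictions)
names = [p["name"] for p in filtered]
embeddings = embedding_model.encode(names, convert_to_tensor=True).cpu().numpy()
print(deduplicate_by_similarity(filtered, embeddings))
# Likely keeps the "Python" and "SQL" entries, assuming the two Python variants
# score above the 0.7 cosine-similarity threshold.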