YoussefMorad1 committed
Commit 25510ce · 1 Parent(s): 2021cd6

Add deduplication and filtering for knowledge predictions
skills_extraction/skills_extraction.py CHANGED
@@ -1,8 +1,11 @@
+import re
 import string
 import numpy as np
 from fastapi import FastAPI
 from pydantic import BaseModel
 from transformers import pipeline, AutoTokenizer
+from semantic_similarity.semantic_similarity import model as embedding_model
+from sentence_transformers import util
 
 # Initialize FastAPI
 app = FastAPI()
@@ -76,6 +79,34 @@ def chunk_text(text, tokenizer, max_length=500, overlap=100):
     return chunks
 
 
+def deduplicate_by_similarity(items, embeddings, threshold=0.7):
+    keep = []
+    used = set()
+    sim_matrix = util.cos_sim(embeddings, embeddings)
+
+    for i in range(len(items)):
+        if i in used:
+            continue
+        keep.append(items[i])
+        for j in range(i + 1, len(items)):
+            if sim_matrix[i][j] > threshold:
+                used.add(j)
+    return keep
+
+
+def filter_knowledge(results):
+    # to_remove = ['-', '/', '(', ')', 'and', 'or', 'the', 'a', 'an']
+    filtered_results = []
+    for result in results:
+        result["name"] = result["name"].strip()
+        result["name"] = re.sub(r'[^\w\s]', '', result["name"])
+        result["name"] = re.sub(r'\s+', ' ', result["name"])
+        if len(result["name"].split()) > 3 or len(result["name"]) <= 2 or result['confidence'] < 0.95:
+            continue
+        filtered_results.append(result)
+    return filtered_results
+
+
 @app.post("/predict_knowledge")
 def predict_knowledge(input_data: TextInput):
     # Clean non-printable chars
@@ -85,7 +116,18 @@ def predict_knowledge(input_data: TextInput):
     for chunk in chunks:
         preds = knowledge_nlp(chunk)
         all_preds.extend(convert_from_numpy(preds))
-    return {"knowledge_predictions": merge_BI_and_get_results(all_preds)}
+    result = merge_BI_and_get_results(all_preds)
+    if not result:
+        return {"knowledge_predictions": []}
+
+    result = filter_knowledge(result)
+
+    knowledge_names = [r["name"] for r in result]
+    embeddings_tensor = embedding_model.encode(knowledge_names, convert_to_tensor=True)
+    embeddings = embeddings_tensor.cpu().numpy()
+    deduped_results = deduplicate_by_similarity(result, embeddings)
+
+    return {"knowledge_predictions": deduped_results}
 
 
 @app.post("/predict_skills")
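
Below is a minimal standalone sketch (not part of the commit) of how the new filtering and deduplication steps behave on a few example predictions. It reproduces the two helpers added above with explanatory comments; the SentenceTransformer model name "all-MiniLM-L6-v2" and the sample data are assumptions, since the service actually imports its embedding model from semantic_similarity.semantic_similarity.

# Standalone sketch of filter_knowledge + deduplicate_by_similarity from this commit.
# Assumption: "all-MiniLM-L6-v2" stands in for the repo's embedding model.
import re
from sentence_transformers import SentenceTransformer, util

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model name


def filter_knowledge(results):
    # Normalize names (strip, drop punctuation, collapse whitespace) and discard
    # entries longer than 3 words, 2 characters or shorter, or below 0.95 confidence.
    filtered_results = []
    for result in results:
        result["name"] = result["name"].strip()
        result["name"] = re.sub(r'[^\w\s]', '', result["name"])
        result["name"] = re.sub(r'\s+', ' ', result["name"])
        if len(result["name"].split()) > 3 or len(result["name"]) <= 2 or result['confidence'] < 0.95:
            continue
        filtered_results.append(result)
    return filtered_results


def deduplicate_by_similarity(items, embeddings, threshold=0.7):
    # Greedy pass: keep the first item, then drop any later item whose embedding
    # is more similar than `threshold` to an item already kept.
    keep = []
    used = set()
    sim_matrix = util.cos_sim(embeddings, embeddings)

    for i in range(len(items)):
        if i in used:
            continue
        keep.append(items[i])
        for j in range(i + 1, len(items)):
            if sim_matrix[i][j] > threshold:
                used.add(j)
    return keep


# Illustrative input only (the real dicts come from merge_BI_and_get_results).
predictions = [
    {"name": "Python", "confidence": 0.99},
    {"name": "Python programming", "confidence": 0.98},
    {"name": "SQL", "confidence": 0.97},
    {"name": "a", "confidence": 0.99},                 # dropped: name too short
    {"name": "Machine Learning", "confidence": 0.60},  # dropped: low confidence
]

filtered = filter_knowledge(predictions)
names = [p["name"] for p in filtered]
embeddings = embedding_model.encode(names, convert_to_tensor=True).cpu().numpy()
print(deduplicate_by_similarity(filtered, embeddings))
# Likely keeps the "Python" and "SQL" entries, assuming the two Python variants
# score above the 0.7 cosine-similarity threshold.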