Zamanonymize3

Build error

App Files Files Community

jfrery-zama committed on Mar 6, 2024

Commit

2b591f4

1 Parent(s): b5afc24

add probability along with detected words

Browse files

Files changed (4) hide show

README copy.md +0 -55
README.md +50 -7
app.py +15 -7
fhe_anonymizer.py +34 -22

README copy.md DELETED Viewed

@@ -1,55 +0,0 @@
----
-title: Encrypted Anonymization Using Fully Homomorphic Encryption
-emoji: 🕵️‍♂️ 🔒
-colorFrom: blue
-colorTo: purple
-sdk: gradio
-sdk_version: 3.40.0
-app_file: app.py
-pinned: true
-tags:
-  - FHE
-  - PPML
-  - privacy
-  - privacy preserving machine learning
-  - data anonymization
-  - homomorphic encryption
-  - security
-python_version: 3.10.11
----
-# Data Anonymization using FHE
-## Run the application locally
-### Install the dependencies
-First, create a virtual env and activate it:
-```bash
-python3 -m venv .venv
-source .venv/bin/activate
-```
-Then, install the required packages:
-```python
-pip3 install pip --upgrade
-pip3 install -U pip wheel setuptools --ignore-installed
-pip3 install -r requirements.txt --ignore-installed
-```
-The above steps should only be done once.
-## Run the app
-In a terminal, run:
-```bash
-source .venv/bin/activate
-python3 anonymize_app.py
-```
-## Interact with the application
-Open the given URL link (search for a line like `Running on local URL: http://127.0.0.1:8888/`).

README.md CHANGED Viewed

@@ -1,12 +1,55 @@
 ---
-title: Encrypted Anonymization
-emoji: 🐠
-colorFrom: purple
-colorTo: red
 sdk: gradio
-sdk_version: 4.20.0
 app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Encrypted Anonymization Using Fully Homomorphic Encryption
+emoji: 🕵️‍♂️ 🔒
+colorFrom: blue
+colorTo: purple
 sdk: gradio
+sdk_version: 3.40.0
 app_file: app.py
+pinned: true
+tags:
+  - FHE
+  - PPML
+  - privacy
+  - privacy preserving machine learning
+  - data anonymization
+  - homomorphic encryption
+  - security
+python_version: 3.8.16
 ---
+# Data Anonymization using FHE
+## Run the application locally
+### Install the dependencies
+First, create a virtual env and activate it:
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+```
+Then, install the required packages:
+```bash
+pip3 install pip --upgrade
+pip3 install -U pip wheel setuptools --ignore-installed
+pip3 install -r requirements.txt --ignore-installed
+```
+The above steps should only be done once.
+## Run the app
+In a terminal, run:
+```bash
+source .venv/bin/activate
+python3 app.py
+```
+## Interact with the application
+Open the URL printed in the terminal (look for a line like `Running on local URL: http://127.0.0.1:8888/`).

app.py CHANGED Viewed

@@ -10,12 +10,15 @@ anonymizer = FHEAnonymizer()
 def deidentify_text(input_text):
-    anonymized_text, identified_words = anonymizer(input_text)
-    # Convert the list of identified words into a DataFrame
-    if identified_words:  # Ensure there are identified words to process
-        identified_df = pd.DataFrame(identified_words, columns=["Identified Words"])
     else:
-        identified_df = pd.DataFrame(columns=["Identified Words"])
     return anonymized_text, identified_df
@@ -76,7 +79,12 @@ with demo:
     )
     with gr.Row():
-        input_text = gr.Textbox(value=default_demo_text, lines=13, placeholder="Input text here...", label="Input")
         anonymized_text_output = gr.Textbox(label="Anonymized Text", lines=13)
@@ -92,4 +100,4 @@ with demo:
 # Launch the app
-demo.launch(share=False)

 def deidentify_text(input_text):
+    anonymized_text, identified_words_with_prob = anonymizer(input_text)
+    # Convert the list of identified words and probabilities into a DataFrame
+    if identified_words_with_prob:
+        identified_df = pd.DataFrame(
+            identified_words_with_prob, columns=["Identified Words", "Probability"]
+        )
     else:
+        identified_df = pd.DataFrame(columns=["Identified Words", "Probability"])
     return anonymized_text, identified_df
     )
     with gr.Row():
+        input_text = gr.Textbox(
+            value=default_demo_text,
+            lines=13,
+            placeholder="Input text here...",
+            label="Input",
+        )
         anonymized_text_output = gr.Textbox(label="Anonymized Text", lines=13)
 # Launch the app
+demo.launch(share=False)

fhe_anonymizer.py CHANGED Viewed

@@ -6,10 +6,13 @@ from concrete.ml.common.serialization.loaders import load
 base_dir = Path(__file__).parent
 class FHEAnonymizer:
     def __init__(self, punctuation_list=".,!?:;"):
-        self.embeddings_model = gensim.models.FastText.load(str(base_dir / "embedded_model.model"))
         self.punctuation_list = punctuation_list
         with open(base_dir / "cml_xgboost.model", "r") as model_file:
             self.fhe_ner_detection = load(file=model_file)
@@ -28,17 +31,19 @@ class FHEAnonymizer:
     def __call__(self, text: str):
         text = self.preprocess_sentences(text)
-        identified_words = []
         new_text = []
         for word in text.split():
             # Prediction for each word
             x = self.embeddings_model.wv[word][None]
-            prediction = self.fhe_ner_detection.predict(x)
             # prediction = self.fhe_inference(x).argmax(1)[0]
             if prediction == 1:
-                identified_words.append(word)
                 new_text.append("<REMOVED>")
             else:
                 new_text.append(word)
@@ -46,29 +51,36 @@ class FHEAnonymizer:
         # Joining the modified text
         modified_text = " ".join(new_text)
-        return modified_text, identified_words
     def preprocess_sentences(self, sentence, verbose=False):
         """Preprocess the sentence."""
-        sentence = re.sub(r'\n+', ' ', sentence)
-        if verbose: print(sentence)
-        sentence = re.sub(' +', ' ', sentence)
-        if verbose: print(sentence)
         sentence = re.sub(r"'s\b", " s", sentence)
-        if verbose: print(sentence)
-        sentence = re.sub(r'\s([,.!?;:])', r'\1', sentence)
-        if verbose: print(sentence)
-        pattern = r'(?<!\w)[{}]|[{}](?!\w)'.format(re.escape(self.punctuation_list), re.escape(self.punctuation_list))
-        sentence = re.sub(pattern, '', sentence)
-        if verbose: print(sentence)
-        sentence = re.sub(r'\s([,.!?;:])', r'\1', sentence)
-        if verbose: print(sentence)
         return sentence

 base_dir = Path(__file__).parent
 class FHEAnonymizer:
     def __init__(self, punctuation_list=".,!?:;"):
+        self.embeddings_model = gensim.models.FastText.load(
+            str(base_dir / "embedded_model.model")
+        )
         self.punctuation_list = punctuation_list
         with open(base_dir / "cml_xgboost.model", "r") as model_file:
             self.fhe_ner_detection = load(file=model_file)
     def __call__(self, text: str):
         text = self.preprocess_sentences(text)
+        identified_words_with_prob = []  # tuples of (word, probability)
         new_text = []
         for word in text.split():
             # Prediction for each word
             x = self.embeddings_model.wv[word][None]
+            prediction_proba = self.fhe_ner_detection.predict_proba(x)
             # prediction = self.fhe_inference(x).argmax(1)[0]
+            # print(word, prediction)
+            probability = prediction_proba[0][1]
+            prediction = probability >= 0.5
             if prediction == 1:
+                identified_words_with_prob.append((word, probability))
                 new_text.append("<REMOVED>")
             else:
                 new_text.append(word)
         # Joining the modified text
         modified_text = " ".join(new_text)
+        return modified_text, identified_words_with_prob
     def preprocess_sentences(self, sentence, verbose=False):
         """Preprocess the sentence."""
+        sentence = re.sub(r"\n+", " ", sentence)
+        if verbose:
+            print(sentence)
+        sentence = re.sub(" +", " ", sentence)
+        if verbose:
+            print(sentence)
         sentence = re.sub(r"'s\b", " s", sentence)
+        if verbose:
+            print(sentence)
+        sentence = re.sub(r"\s([,.!?;:])", r"\1", sentence)
+        if verbose:
+            print(sentence)
+        pattern = r"(?<!\w)[{}]|[{}](?!\w)".format(
+            re.escape(self.punctuation_list), re.escape(self.punctuation_list)
+        )
+        sentence = re.sub(pattern, "", sentence)
+        if verbose:
+            print(sentence)
+        sentence = re.sub(r"\s([,.!?;:])", r"\1", sentence)
+        if verbose:
+            print(sentence)
         return sentence