chore: update Marketing v2
Browse files- app.py +90 -59
- files/anonymized_document.txt +5 -5
- files/mapping_clear_to_anonymized.pkl +2 -2
- files/mapping_clear_to_encrypted.pkl +2 -2
- files/mapping_doc_embedding_path.pkl +3 -0
- files/original_document.txt +2 -2
- files/original_document_uuid_mapping.json +6 -8
- utils_demo.py +4 -1
app.py
CHANGED
|
@@ -35,6 +35,7 @@ ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)
|
|
| 35 |
MAPPING_ANONYMIZED_SENTENCES = read_pickle(MAPPING_ANONYMIZED_SENTENCES_PATH)
|
| 36 |
MAPPING_ENCRYPTED_SENTENCES = read_pickle(MAPPING_ENCRYPTED_SENTENCES_PATH)
|
| 37 |
ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
|
|
|
|
| 38 |
print(ORIGINAL_DOCUMENT)
|
| 39 |
|
| 40 |
# 4. Data Processing and Operations (No specific operations shown here, assuming it's part of anonymizer or client usage)
|
|
@@ -54,7 +55,7 @@ def select_static_anonymized_sentences_fn(selected_sentences: List):
|
|
| 54 |
|
| 55 |
anonymized_selected_sentence = [sentence for _, sentence in anonymized_selected_sentence]
|
| 56 |
|
| 57 |
-
return
|
| 58 |
|
| 59 |
|
| 60 |
def key_gen_fn() -> Dict:
|
|
@@ -92,23 +93,48 @@ def key_gen_fn() -> Dict:
|
|
| 92 |
print("Keys have been generated ✅")
|
| 93 |
return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
|
| 94 |
|
| 95 |
-
def select_static_encrypted_sentences_fn(selected_sentences: List):
|
| 96 |
|
| 97 |
-
|
| 98 |
|
| 99 |
-
|
| 100 |
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
-
|
|
|
|
| 104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
def encrypt_query_fn(query):
|
| 107 |
|
| 108 |
print(f"\n------------ Step 2: Query encryption: {query=}")
|
| 109 |
|
| 110 |
if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
|
| 111 |
-
return {output_encrypted_box: gr.update(value="Error ❌: Please generate the key first!")}
|
| 112 |
|
| 113 |
if is_user_query_valid(query):
|
| 114 |
return {
|
|
@@ -156,8 +182,8 @@ def encrypt_query_fn(query):
|
|
| 156 |
encrypted_quant_tokens_hex = [token.hex()[500:580] for token in encrypted_tokens]
|
| 157 |
|
| 158 |
return {
|
| 159 |
-
output_encrypted_box: gr.update(value=" ".join(encrypted_quant_tokens_hex), lines=
|
| 160 |
-
|
| 161 |
identified_words_output_df: gr.update(visible=False, value=None),
|
| 162 |
}
|
| 163 |
|
|
@@ -176,14 +202,14 @@ def send_input_fn(query) -> Dict:
|
|
| 176 |
"Error Encountered While Sending Data to the Server: "
|
| 177 |
f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
|
| 178 |
)
|
| 179 |
-
return {
|
| 180 |
|
| 181 |
if not encrypted_input_path.is_file():
|
| 182 |
error_message = (
|
| 183 |
"Error Encountered While Sending Data to the Server: The data has not been encrypted "
|
| 184 |
f"correctly on the client side - {encrypted_input_path.is_file()=}"
|
| 185 |
)
|
| 186 |
-
return {
|
| 187 |
|
| 188 |
# Define the data and files to post
|
| 189 |
data = {"user_id": USER_ID, "input": query}
|
|
@@ -218,14 +244,14 @@ def run_fhe_in_server_fn() -> Dict:
|
|
| 218 |
"Error Encountered While Sending Data to the Server: "
|
| 219 |
f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
|
| 220 |
)
|
| 221 |
-
return {
|
| 222 |
|
| 223 |
if not encrypted_input_path.is_file():
|
| 224 |
error_message = (
|
| 225 |
"Error Encountered While Sending Data to the Server: The data has not been encrypted "
|
| 226 |
f"correctly on the client side - {encrypted_input_path.is_file()=}"
|
| 227 |
)
|
| 228 |
-
return {
|
| 229 |
|
| 230 |
data = {
|
| 231 |
"user_id": USER_ID,
|
|
@@ -239,7 +265,7 @@ def run_fhe_in_server_fn() -> Dict:
|
|
| 239 |
) as response:
|
| 240 |
if not response.ok:
|
| 241 |
return {
|
| 242 |
-
|
| 243 |
value=(
|
| 244 |
"⚠️ An error occurred on the Server Side. "
|
| 245 |
"Please check connectivity and data transmission."
|
|
@@ -260,14 +286,14 @@ def get_output_fn() -> Dict:
|
|
| 260 |
"Error Encountered While Sending Data to the Server: "
|
| 261 |
"The key has not been generated correctly"
|
| 262 |
)
|
| 263 |
-
return {
|
| 264 |
|
| 265 |
if not (KEYS_DIR / f"{USER_ID}/encrypted_input").is_file():
|
| 266 |
error_message = (
|
| 267 |
"Error Encountered While Sending Data to the Server: "
|
| 268 |
"The data has not been encrypted correctly on the client side"
|
| 269 |
)
|
| 270 |
-
return {
|
| 271 |
|
| 272 |
data = {
|
| 273 |
"user_id": USER_ID,
|
|
@@ -372,7 +398,7 @@ def decrypt_fn(text) -> Dict:
|
|
| 372 |
return anonymized_text, identified_df
|
| 373 |
|
| 374 |
|
| 375 |
-
def anonymization_with_fn(query):
|
| 376 |
|
| 377 |
encrypt_query_fn(query)
|
| 378 |
|
|
@@ -385,8 +411,9 @@ def anonymization_with_fn(query):
|
|
| 385 |
anonymized_text, identified_df = decrypt_fn(query)
|
| 386 |
|
| 387 |
return {
|
| 388 |
-
|
| 389 |
-
|
|
|
|
| 390 |
}
|
| 391 |
|
| 392 |
|
|
@@ -402,10 +429,9 @@ def query_chatgpt_fn(anonymized_query, anonymized_document):
|
|
| 402 |
error_message = "Error ❌: Please encrypt your query first!"
|
| 403 |
return {chatgpt_response_anonymized: gr.update(value=error_message)}
|
| 404 |
|
| 405 |
-
|
| 406 |
|
| 407 |
# Prepare prompt
|
| 408 |
-
initial_prompt = prompt + "\n"
|
| 409 |
query = (
|
| 410 |
"Document content:\n```\n"
|
| 411 |
+ anonymized_document
|
|
@@ -414,12 +440,12 @@ def query_chatgpt_fn(anonymized_query, anonymized_document):
|
|
| 414 |
+ anonymized_query
|
| 415 |
+ "\n```"
|
| 416 |
)
|
| 417 |
-
print(f'
|
| 418 |
|
| 419 |
completion = client.chat.completions.create(
|
| 420 |
model="gpt-4-1106-preview", # Replace with "gpt-4" if available
|
| 421 |
messages=[
|
| 422 |
-
{"role": "system", "content":
|
| 423 |
{"role": "user", "content": query},
|
| 424 |
],
|
| 425 |
)
|
|
@@ -472,26 +498,31 @@ with demo:
|
|
| 472 |
"""
|
| 473 |
)
|
| 474 |
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
"""
|
| 494 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
|
| 496 |
########################## Key Gen Part ##########################
|
| 497 |
|
|
@@ -535,16 +566,10 @@ with demo:
|
|
| 535 |
encrypt_doc_btn = gr.Button("Encrypt the document")
|
| 536 |
|
| 537 |
with gr.Column(scale=5):
|
| 538 |
-
|
| 539 |
-
label="Encrypted document:",
|
| 540 |
-
show_label=True, value=ANONYMIZED_DOCUMENT, interactive=False, lines=11
|
| 541 |
)
|
| 542 |
|
| 543 |
-
original_sentences_box.change(
|
| 544 |
-
fn=select_static_anonymized_sentences_fn,
|
| 545 |
-
inputs=[original_sentences_box],
|
| 546 |
-
outputs=[anonymized_doc_box],
|
| 547 |
-
)
|
| 548 |
|
| 549 |
########################## User Query Part ##########################
|
| 550 |
|
|
@@ -577,7 +602,7 @@ with demo:
|
|
| 577 |
|
| 578 |
with gr.Column(scale=1, min_width=6):
|
| 579 |
gr.HTML("<div style='height: 77px;'></div>")
|
| 580 |
-
|
| 581 |
# gr.HTML("<div style='height: 50px;'></div>")
|
| 582 |
|
| 583 |
with gr.Column(scale=5):
|
|
@@ -602,34 +627,40 @@ with demo:
|
|
| 602 |
with gr.Row():
|
| 603 |
with gr.Column(scale=5):
|
| 604 |
|
| 605 |
-
|
| 606 |
-
label="Decrypted and anonymized document", lines=
|
| 607 |
)
|
| 608 |
|
| 609 |
with gr.Column(scale=5):
|
| 610 |
|
| 611 |
anonymized_query_output = gr.Textbox(
|
| 612 |
-
label="Decrypted and anonymized prompt", lines=
|
| 613 |
)
|
| 614 |
|
| 615 |
|
| 616 |
identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
|
| 617 |
|
| 618 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 619 |
fn=encrypt_query_fn,
|
| 620 |
inputs=[query_box],
|
| 621 |
outputs=[
|
| 622 |
query_box,
|
| 623 |
output_encrypted_box,
|
| 624 |
-
|
| 625 |
identified_words_output_df,
|
| 626 |
],
|
| 627 |
)
|
| 628 |
|
| 629 |
run_fhe_btn.click(
|
| 630 |
anonymization_with_fn,
|
| 631 |
-
inputs=[query_box],
|
| 632 |
-
outputs=[
|
| 633 |
)
|
| 634 |
|
| 635 |
########################## ChatGpt Part ##########################
|
|
@@ -651,7 +682,7 @@ with demo:
|
|
| 651 |
|
| 652 |
chatgpt_button.click(
|
| 653 |
query_chatgpt_fn,
|
| 654 |
-
inputs=[
|
| 655 |
outputs=[chatgpt_response_anonymized, chatgpt_response_deanonymized],
|
| 656 |
)
|
| 657 |
|
|
|
|
| 35 |
MAPPING_ANONYMIZED_SENTENCES = read_pickle(MAPPING_ANONYMIZED_SENTENCES_PATH)
|
| 36 |
MAPPING_ENCRYPTED_SENTENCES = read_pickle(MAPPING_ENCRYPTED_SENTENCES_PATH)
|
| 37 |
ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
|
| 38 |
+
MAPPING_DOC_EMBEDDING = read_pickle(MAPPING_DOC_EMBEDDING_PATH)
|
| 39 |
print(ORIGINAL_DOCUMENT)
|
| 40 |
|
| 41 |
# 4. Data Processing and Operations (No specific operations shown here, assuming it's part of anonymizer or client usage)
|
|
|
|
| 55 |
|
| 56 |
anonymized_selected_sentence = [sentence for _, sentence in anonymized_selected_sentence]
|
| 57 |
|
| 58 |
+
return "\n\n".join(anonymized_selected_sentence)
|
| 59 |
|
| 60 |
|
| 61 |
def key_gen_fn() -> Dict:
|
|
|
|
| 93 |
print("Keys have been generated ✅")
|
| 94 |
return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
|
| 95 |
|
|
|
|
| 96 |
|
| 97 |
+
def encrypt_doc_fn(doc):
|
| 98 |
|
| 99 |
+
print(f"\n------------ Step 2.1: Doc encryption: {doc=}")
|
| 100 |
|
| 101 |
+
if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
|
| 102 |
+
return {encrypted_doc_box: gr.update(value="Error ❌: Please generate the key first!", lines=10)}
|
| 103 |
+
|
| 104 |
+
# Retrieve the client API
|
| 105 |
+
client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
|
| 106 |
+
client.load()
|
| 107 |
|
| 108 |
+
encrypted_tokens = []
|
| 109 |
+
tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", ' '.join(doc))
|
| 110 |
|
| 111 |
+
for token in tokens:
|
| 112 |
+
if token.strip() and re.match(r"\w+", token):
|
| 113 |
+
emb_x = MAPPING_DOC_EMBEDDING[token]
|
| 114 |
+
assert emb_x.shape == (1, 1024)
|
| 115 |
+
encrypted_x = client.quantize_encrypt_serialize(emb_x)
|
| 116 |
+
assert isinstance(encrypted_x, bytes)
|
| 117 |
+
encrypted_tokens.append(encrypted_x)
|
| 118 |
+
|
| 119 |
+
print("Doc encrypted ✅ on Client Side")
|
| 120 |
+
|
| 121 |
+
# No need to save it
|
| 122 |
+
# write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_doc", b"".join(encrypted_tokens))
|
| 123 |
+
|
| 124 |
+
encrypted_quant_tokens_hex = [token.hex()[500:510] for token in encrypted_tokens]
|
| 125 |
+
|
| 126 |
+
return {
|
| 127 |
+
encrypted_doc_box: gr.update(value=" ".join(encrypted_quant_tokens_hex), lines=10),
|
| 128 |
+
anonymized_doc_output: gr.update(visible=True, value=None),
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
|
| 132 |
def encrypt_query_fn(query):
|
| 133 |
|
| 134 |
print(f"\n------------ Step 2: Query encryption: {query=}")
|
| 135 |
|
| 136 |
if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
|
| 137 |
+
return {output_encrypted_box: gr.update(value="Error ❌: Please generate the key first!", lines=8)}
|
| 138 |
|
| 139 |
if is_user_query_valid(query):
|
| 140 |
return {
|
|
|
|
| 182 |
encrypted_quant_tokens_hex = [token.hex()[500:580] for token in encrypted_tokens]
|
| 183 |
|
| 184 |
return {
|
| 185 |
+
output_encrypted_box: gr.update(value=" ".join(encrypted_quant_tokens_hex), lines=8),
|
| 186 |
+
anonymized_query_output: gr.update(visible=True, value=None),
|
| 187 |
identified_words_output_df: gr.update(visible=False, value=None),
|
| 188 |
}
|
| 189 |
|
|
|
|
| 202 |
"Error Encountered While Sending Data to the Server: "
|
| 203 |
f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
|
| 204 |
)
|
| 205 |
+
return {anonymized_query_output: gr.update(value=error_message)}
|
| 206 |
|
| 207 |
if not encrypted_input_path.is_file():
|
| 208 |
error_message = (
|
| 209 |
"Error Encountered While Sending Data to the Server: The data has not been encrypted "
|
| 210 |
f"correctly on the client side - {encrypted_input_path.is_file()=}"
|
| 211 |
)
|
| 212 |
+
return {anonymized_query_output: gr.update(value=error_message)}
|
| 213 |
|
| 214 |
# Define the data and files to post
|
| 215 |
data = {"user_id": USER_ID, "input": query}
|
|
|
|
| 244 |
"Error Encountered While Sending Data to the Server: "
|
| 245 |
f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
|
| 246 |
)
|
| 247 |
+
return {anonymized_query_output: gr.update(value=error_message)}
|
| 248 |
|
| 249 |
if not encrypted_input_path.is_file():
|
| 250 |
error_message = (
|
| 251 |
"Error Encountered While Sending Data to the Server: The data has not been encrypted "
|
| 252 |
f"correctly on the client side - {encrypted_input_path.is_file()=}"
|
| 253 |
)
|
| 254 |
+
return {anonymized_query_output: gr.update(value=error_message)}
|
| 255 |
|
| 256 |
data = {
|
| 257 |
"user_id": USER_ID,
|
|
|
|
| 265 |
) as response:
|
| 266 |
if not response.ok:
|
| 267 |
return {
|
| 268 |
+
anonymized_query_output: gr.update(
|
| 269 |
value=(
|
| 270 |
"⚠️ An error occurred on the Server Side. "
|
| 271 |
"Please check connectivity and data transmission."
|
|
|
|
| 286 |
"Error Encountered While Sending Data to the Server: "
|
| 287 |
"The key has not been generated correctly"
|
| 288 |
)
|
| 289 |
+
return {anonymized_query_output: gr.update(value=error_message)}
|
| 290 |
|
| 291 |
if not (KEYS_DIR / f"{USER_ID}/encrypted_input").is_file():
|
| 292 |
error_message = (
|
| 293 |
"Error Encountered While Sending Data to the Server: "
|
| 294 |
"The data has not been encrypted correctly on the client side"
|
| 295 |
)
|
| 296 |
+
return {anonymized_query_output: gr.update(value=error_message)}
|
| 297 |
|
| 298 |
data = {
|
| 299 |
"user_id": USER_ID,
|
|
|
|
| 398 |
return anonymized_text, identified_df
|
| 399 |
|
| 400 |
|
| 401 |
+
def anonymization_with_fn(selected_sentences, query):
|
| 402 |
|
| 403 |
encrypt_query_fn(query)
|
| 404 |
|
|
|
|
| 411 |
anonymized_text, identified_df = decrypt_fn(query)
|
| 412 |
|
| 413 |
return {
|
| 414 |
+
anonymized_doc_output: gr.update(value=select_static_anonymized_sentences_fn(selected_sentences)),
|
| 415 |
+
anonymized_query_output: gr.update(value=anonymized_text),
|
| 416 |
+
identified_words_output_df: gr.update(value=identified_df, visible=False),
|
| 417 |
}
|
| 418 |
|
| 419 |
|
|
|
|
| 429 |
error_message = "Error ❌: Please encrypt your query first!"
|
| 430 |
return {chatgpt_response_anonymized: gr.update(value=error_message)}
|
| 431 |
|
| 432 |
+
context_prompt = read_txt(PROMPT_PATH)
|
| 433 |
|
| 434 |
# Prepare prompt
|
|
|
|
| 435 |
query = (
|
| 436 |
"Document content:\n```\n"
|
| 437 |
+ anonymized_document
|
|
|
|
| 440 |
+ anonymized_query
|
| 441 |
+ "\n```"
|
| 442 |
)
|
| 443 |
+
print(f'Prompt of CHATGPT:\n{query}')
|
| 444 |
|
| 445 |
completion = client.chat.completions.create(
|
| 446 |
model="gpt-4-1106-preview", # Replace with "gpt-4" if available
|
| 447 |
messages=[
|
| 448 |
+
{"role": "system", "content": context_prompt},
|
| 449 |
{"role": "user", "content": query},
|
| 450 |
],
|
| 451 |
)
|
|
|
|
| 498 |
"""
|
| 499 |
)
|
| 500 |
|
| 501 |
+
gr.Markdown(
|
| 502 |
+
"""
|
| 503 |
+
<p align="center" style="font-size: 16px;">
|
| 504 |
+
Anonymization is the process of removing personally identifiable information (PII) from
|
| 505 |
+
a document in order to protect individual privacy.</p>
|
| 506 |
+
|
| 507 |
+
<p align="center" style="font-size: 16px;">
|
| 508 |
+
Encrypted anonymization uses Fully Homomorphic Encryption (FHE) to anonymize personally
|
| 509 |
+
identifiable information (PII) within encrypted documents, enabling computations to be
|
| 510 |
+
performed on the encrypted data.</p>
|
| 511 |
+
|
| 512 |
+
<p align="center" style="font-size: 16px;">
|
| 513 |
+
In the example above, we're showing how encrypted anonymization can be leveraged to use LLM
|
| 514 |
+
services such as ChatGPT in a privacy-preserving manner.</p>
|
| 515 |
+
"""
|
| 516 |
+
)
|
| 517 |
+
|
| 518 |
+
gr.Markdown(
|
| 519 |
"""
|
| 520 |
+
<p align="center">
|
| 521 |
+
<img width="75%" height="30%" src="https://raw.githubusercontent.com/kcelia/Img/main/fhe_anonymization_banner.png">
|
| 522 |
+
</p>
|
| 523 |
+
"""
|
| 524 |
+
)
|
| 525 |
+
|
| 526 |
|
| 527 |
########################## Key Gen Part ##########################
|
| 528 |
|
|
|
|
| 566 |
encrypt_doc_btn = gr.Button("Encrypt the document")
|
| 567 |
|
| 568 |
with gr.Column(scale=5):
|
| 569 |
+
encrypted_doc_box = gr.Textbox(
|
| 570 |
+
label="Encrypted document:", show_label=True, interactive=False, lines=10
|
|
|
|
| 571 |
)
|
| 572 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 573 |
|
| 574 |
########################## User Query Part ##########################
|
| 575 |
|
|
|
|
| 602 |
|
| 603 |
with gr.Column(scale=1, min_width=6):
|
| 604 |
gr.HTML("<div style='height: 77px;'></div>")
|
| 605 |
+
encrypt_query_btn = gr.Button("Encrypt the prompt")
|
| 606 |
# gr.HTML("<div style='height: 50px;'></div>")
|
| 607 |
|
| 608 |
with gr.Column(scale=5):
|
|
|
|
| 627 |
with gr.Row():
|
| 628 |
with gr.Column(scale=5):
|
| 629 |
|
| 630 |
+
anonymized_doc_output = gr.Textbox(
|
| 631 |
+
label="Decrypted and anonymized document", lines=10, interactive=True
|
| 632 |
)
|
| 633 |
|
| 634 |
with gr.Column(scale=5):
|
| 635 |
|
| 636 |
anonymized_query_output = gr.Textbox(
|
| 637 |
+
label="Decrypted and anonymized prompt", lines=10, interactive=True
|
| 638 |
)
|
| 639 |
|
| 640 |
|
| 641 |
identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
|
| 642 |
|
| 643 |
+
encrypt_doc_btn.click(
|
| 644 |
+
fn=encrypt_doc_fn,
|
| 645 |
+
inputs=[original_sentences_box],
|
| 646 |
+
outputs=[encrypted_doc_box, anonymized_doc_output],
|
| 647 |
+
)
|
| 648 |
+
|
| 649 |
+
encrypt_query_btn.click(
|
| 650 |
fn=encrypt_query_fn,
|
| 651 |
inputs=[query_box],
|
| 652 |
outputs=[
|
| 653 |
query_box,
|
| 654 |
output_encrypted_box,
|
| 655 |
+
anonymized_query_output,
|
| 656 |
identified_words_output_df,
|
| 657 |
],
|
| 658 |
)
|
| 659 |
|
| 660 |
run_fhe_btn.click(
|
| 661 |
anonymization_with_fn,
|
| 662 |
+
inputs=[original_sentences_box, query_box],
|
| 663 |
+
outputs=[anonymized_doc_output, anonymized_query_output, identified_words_output_df],
|
| 664 |
)
|
| 665 |
|
| 666 |
########################## ChatGpt Part ##########################
|
|
|
|
| 682 |
|
| 683 |
chatgpt_button.click(
|
| 684 |
query_chatgpt_fn,
|
| 685 |
+
inputs=[anonymized_query_output, anonymized_doc_output],
|
| 686 |
outputs=[chatgpt_response_anonymized, chatgpt_response_deanonymized],
|
| 687 |
)
|
| 688 |
|
files/anonymized_document.txt
CHANGED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
-
Members:
|
| 2 |
|
| 3 |
-
Date:
|
| 4 |
|
| 5 |
-
Scope:
|
| 6 |
|
| 7 |
-
Amount: Bob agrees to pay
|
| 8 |
|
| 9 |
Deadline: The logo design must be completed and delivered to Bob within 14 days of the contract signing date.
|
| 10 |
|
| 11 |
-
Payment terms:
|
|
|
|
| 1 |
+
Members: a5989a5c and 20f545cf
|
| 2 |
|
| 3 |
+
Date: 7bbd0258 28ebebcd, 87a7f982
|
| 4 |
|
| 5 |
+
Scope: 20f545cf agrees to provide graphic design services to a5989a5c for the creation of a company logo.
|
| 6 |
|
| 7 |
+
Amount: Bob agrees to pay 20f545cf 500 upon completion and delivery of the logo.
|
| 8 |
|
| 9 |
Deadline: The logo design must be completed and delivered to Bob within 14 days of the contract signing date.
|
| 10 |
|
| 11 |
+
Payment terms: 20f545cf's international bank account N: 43a4c5f3
|
files/mapping_clear_to_anonymized.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aed1a1360ae82291357e5de8369d63d5514d90114743d1845b32642df9086902
|
| 3 |
+
size 906
|
files/mapping_clear_to_encrypted.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:45e4ba890f0b8c8d239534f9c6c1d0878f5419b62af6b32d9d7e758a0490ea8a
|
| 3 |
+
size 916
|
files/mapping_doc_embedding_path.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:faa0f74bc4358424e29118dc9714512f092d83756a77d596dd9ce56c9555b444
|
| 3 |
+
size 211319
|
files/original_document.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
Members: David
|
| 2 |
|
| 3 |
Date: February 06, 2000
|
| 4 |
|
|
@@ -8,4 +8,4 @@ Amount: Bob agrees to pay Kate $500 upon completion and delivery of the logo.
|
|
| 8 |
|
| 9 |
Deadline: The logo design must be completed and delivered to Bob within 14 days of the contract signing date.
|
| 10 |
|
| 11 |
-
Payment terms: Kate
|
|
|
|
| 1 |
+
Members: David and Kate
|
| 2 |
|
| 3 |
Date: February 06, 2000
|
| 4 |
|
|
|
|
| 8 |
|
| 9 |
Deadline: The logo design must be completed and delivered to Bob within 14 days of the contract signing date.
|
| 10 |
|
| 11 |
+
Payment terms: Kate's international bank account N°: IL150120690000003111111
|
files/original_document_uuid_mapping.json
CHANGED
|
@@ -1,10 +1,8 @@
|
|
| 1 |
{
|
| 2 |
-
"06": "
|
| 3 |
-
"2000": "
|
| 4 |
-
"David": "
|
| 5 |
-
"February": "
|
| 6 |
-
"
|
| 7 |
-
"
|
| 8 |
-
"Johnson": "70fc6ec5",
|
| 9 |
-
"Kate": "2708cb61"
|
| 10 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"06": "28ebebcd",
|
| 3 |
+
"2000": "87a7f982",
|
| 4 |
+
"David": "a5989a5c",
|
| 5 |
+
"February": "7bbd0258",
|
| 6 |
+
"IL150120690000003111111": "43a4c5f3",
|
| 7 |
+
"Kate": "20f545cf"
|
|
|
|
|
|
|
| 8 |
}
|
utils_demo.py
CHANGED
|
@@ -40,6 +40,8 @@ ANONYMIZED_FILE_PATH = DATA_PATH / "anonymized_document.txt"
|
|
| 40 |
MAPPING_UUID_PATH = DATA_PATH / "original_document_uuid_mapping.json"
|
| 41 |
MAPPING_ANONYMIZED_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_anonymized.pkl"
|
| 42 |
MAPPING_ENCRYPTED_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_encrypted.pkl"
|
|
|
|
|
|
|
| 43 |
PROMPT_PATH = DATA_PATH / "chatgpt_prompt.txt"
|
| 44 |
|
| 45 |
|
|
@@ -57,7 +59,8 @@ EMBEDDINGS_MODEL = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
|
|
| 57 |
PUNCTUATION_LIST = list(string.punctuation)
|
| 58 |
PUNCTUATION_LIST.remove("%")
|
| 59 |
PUNCTUATION_LIST.remove("$")
|
| 60 |
-
PUNCTUATION_LIST = "".join(PUNCTUATION_LIST)
|
|
|
|
| 61 |
|
| 62 |
|
| 63 |
def clean_directory() -> None:
|
|
|
|
| 40 |
MAPPING_UUID_PATH = DATA_PATH / "original_document_uuid_mapping.json"
|
| 41 |
MAPPING_ANONYMIZED_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_anonymized.pkl"
|
| 42 |
MAPPING_ENCRYPTED_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_encrypted.pkl"
|
| 43 |
+
MAPPING_DOC_EMBEDDING_PATH = DATA_PATH / "mapping_doc_embedding_path.pkl"
|
| 44 |
+
|
| 45 |
PROMPT_PATH = DATA_PATH / "chatgpt_prompt.txt"
|
| 46 |
|
| 47 |
|
|
|
|
| 59 |
PUNCTUATION_LIST = list(string.punctuation)
|
| 60 |
PUNCTUATION_LIST.remove("%")
|
| 61 |
PUNCTUATION_LIST.remove("$")
|
| 62 |
+
PUNCTUATION_LIST = "".join(PUNCTUATION_LIST) + '°'
|
| 63 |
+
print(f'{PUNCTUATION_LIST=}')
|
| 64 |
|
| 65 |
|
| 66 |
def clean_directory() -> None:
|