Ezi Ozoani committed on
Commit
833f9be
·
2 Parent(s): abaa2f0 b309191

pushhyyyyyy Merge branch 'main' of https://huggingface.co/spaces/Ezi/dataset_PII_checker

Browse files
__pycache__/app.cpython-310.pyc DELETED
Binary file (4.76 kB)
 
__pycache__/main.cpython-310.pyc DELETED
Binary file (4.76 kB)
 
__pycache__/server.cpython-310.pyc DELETED
Binary file (1.2 kB)
 
main.py DELETED
@@ -1,152 +0,0 @@
1
- import json
2
- import os
3
- from difflib import SequenceMatcher
4
- from typing import Any, Dict, Optional, Tuple
5
-
6
- from fastapi import FastAPI, Request, Response
7
- from huggingface_hub import (DatasetCard, HfApi, ModelCard, comment_discussion,
8
- create_discussion, get_discussion_details,
9
- get_repo_discussions, login)
10
- from huggingface_hub.utils import EntryNotFoundError
11
- from tabulate import tabulate
12
-
13
- KEY = os.environ.get("WEBHOOK_SECRET")
14
- #HF_TOKEN = os.environ.get("HF_ACCESS_TOKEN")
15
-
16
- #api = HfApi(token=HF_TOKEN)
17
- #login(HF_TOKEN)
18
-
19
- #app = FastAPI()
20
-
21
-
22
-
23
-
24
def similar(a, b):
    """Return the difflib similarity ratio of two sequences."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
27
-
28
-
29
def create_metadata_key_dict(card_data, repo_type: str):
    """Pick out the metadata fields that are graded for a repo type.

    Args:
        card_data: Mapping of card metadata; only its ``get`` method is used.
        repo_type: Either ``"model"`` or ``"dataset"``.

    Returns:
        A dict mapping every expected field name (shared fields first, then
        the type-specific ones) to ``card_data.get(field)``, i.e. ``None``
        for fields the card does not provide.

    Raises:
        ValueError: If ``repo_type`` is not ``"model"`` or ``"dataset"``.
    """
    shared_keys = ["tags", "license"]
    keys_by_repo_type = {
        "model": ["library_name", "datasets", "metrics", "co2", "pipeline_tag"],
        "dataset": [
            "pretty_name",
            "size_categories",
            "task_categories",
            "task_ids",
            "source_datasets",
        ],
    }
    if repo_type not in keys_by_repo_type:
        # Previously this fell through and returned None, which only surfaced
        # later as an AttributeError; fail here with the same error that
        # parse_webhook_post uses for unsupported repo types.
        raise ValueError("Unknown hub type")
    keys = shared_keys + keys_by_repo_type[repo_type]
    return {key: card_data.get(key) for key in keys}
47
-
48
-
49
def create_metadata_breakdown_table(desired_metadata_dictionary):
    """Render the metadata fields and their values as a GitHub-style table.

    Falsy values are shown as the placeholder ``"Field Missing"``.
    """
    rows = []
    for field_name, provided in desired_metadata_dictionary.items():
        rows.append((field_name, provided or "Field Missing"))
    column_titles = ("Metadata Field", "Provided Value")
    return tabulate(rows, tablefmt="github", headers=column_titles)
57
-
58
-
59
def calculate_grade(desired_metadata_dictionary):
    """Return the fraction of metadata fields that have a truthy value.

    Args:
        desired_metadata_dictionary: Mapping of field name to provided value.

    Returns:
        A float between 0 and 1, rounded to two decimals. An empty mapping
        scores ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    if not desired_metadata_dictionary:
        return 0.0
    filled = sum(1 for value in desired_metadata_dictionary.values() if value)
    return round(filled / len(desired_metadata_dictionary), 2)
63
-
64
-
65
def create_markdown_report(
    desired_metadata_dictionary, repo_name, repo_type, score, update: bool = False
):
    """Build the markdown text of the metadata report card.

    Args:
        desired_metadata_dictionary: Mapping of field name to provided value,
            rendered via ``create_metadata_breakdown_table``.
        repo_name: Repository name shown in the introduction.
        repo_type: Repository type, used in the heading and the wording.
        score: Coverage grade as a fraction (it is compared against ``0.5``
            below); rendered as a whole-number percentage.
        update: When true, the heading is suffixed with ``(updated)``.

    Returns:
        The report as a markdown string.
    """
    # Include the separating space in the suffix so the heading has no
    # trailing blank when the report is not an update.
    heading_suffix = " (updated)" if update else ""
    table = create_metadata_breakdown_table(desired_metadata_dictionary)
    if score <= 0.5:
        verdict = (
            f"We're not angry we're just disappointed! {repo_type.title()} "
            "metadata is super important. Please try harder..."
        )
    else:
        verdict = f"Not too shabby! Make sure you also fill in a {repo_type} card too!"
    # `score` is a fraction, so format it with the `%` presentation type
    # instead of appending a literal percent sign (0.5 used to read "0.5%").
    return f"""# {repo_type.title()} metadata report card{heading_suffix}

This is an automatically produced metadata quality report card for {repo_name}. This report is meant as a POC!

## Breakdown of metadata fields for your {repo_type}

{table}

You scored a metadata coverage grade of: **{score:.0%}**
{verdict}
"""
80
-
81
-
82
def parse_webhook_post(data: Dict[str, Any]) -> Optional[Tuple[str, str]]:
    """Extract ``(repo_type, repo_name)`` from a webhook payload.

    Returns ``None`` when the event scope is not ``"repo"`` and raises
    ``ValueError`` when the repo type is neither ``"model"`` nor ``"dataset"``.
    """
    if data["event"]["scope"] != "repo":
        return None
    repo_info = data["repo"]
    name = repo_info["name"]
    kind = repo_info["type"]
    if kind in {"model", "dataset"}:
        return kind, name
    raise ValueError("Unknown hub type")
92
-
93
-
94
def load_repo_card_metadata(repo_type, repo_name):
    """Load a repo card's metadata as a dict.

    Uses ``DatasetCard`` for ``"dataset"`` and ``ModelCard`` for ``"model"``.
    Returns ``{}`` when loading raises ``EntryNotFoundError`` and ``None`` for
    any other repo type.
    """
    if repo_type == "dataset":
        card_cls = DatasetCard
    elif repo_type == "model":
        card_cls = ModelCard
    else:
        return None
    try:
        return card_cls.load(repo_name).data.to_dict()
    except EntryNotFoundError:
        return {}
105
-
106
-
107
def create_or_update_report(data):
    """Post or refresh the "Metadata Report Card" discussion for a repo.

    The webhook payload is parsed, the repo card metadata is loaded and
    graded, and a markdown report is built. If an open discussion titled
    "Metadata Report Card" exists, the report is added as a comment unless
    the thread's latest comment already matches it; otherwise a new
    discussion is created.

    Args:
        data: Webhook payload, passed to ``parse_webhook_post``.

    Returns:
        ``True`` once the report has been handled, or a 400 ``Response`` when
        ``parse_webhook_post`` returns nothing for the payload.
    """
    parsed_post = parse_webhook_post(data)
    if not parsed_post:
        return Response("Unable to parse webhook data", status_code=400)
    repo_type, repo_name = parsed_post

    card_data = load_repo_card_metadata(repo_type, repo_name)
    desired_metadata_dictionary = create_metadata_key_dict(card_data, repo_type)
    score = calculate_grade(desired_metadata_dictionary)
    report = create_markdown_report(
        desired_metadata_dictionary, repo_name, repo_type, score, update=False
    )

    for discussion in get_repo_discussions(repo_name, repo_type=repo_type):
        if discussion.title != "Metadata Report Card" or discussion.status != "open":
            continue
        # An existing open report card thread.
        discussion_details = get_discussion_details(
            repo_name, discussion.num, repo_type=repo_type
        )
        # The newest event is not necessarily a comment, so look for the most
        # recent event that actually carries `content` instead of assuming
        # events[-1] has it.
        last_comment = next(
            (
                event.content
                for event in reversed(discussion_details.events)
                if getattr(event, "content", None) is not None
            ),
            "",
        )
        updated_report = create_markdown_report(
            desired_metadata_dictionary,
            repo_name,
            repo_type,
            score,
            update=True,
        )
        # The latest comment may be either rendering of the report (initial or
        # "(updated)"); compare against both so an unchanged report is not
        # posted again after the first update.
        already_posted = any(
            similar(candidate, last_comment) > 0.999
            for candidate in (report, updated_report)
        )
        if not already_posted:
            comment_discussion(
                repo_name,
                discussion.num,
                comment=updated_report,
                repo_type=repo_type,
            )
        # An open thread exists, so never fall through to creating a second.
        return True

    create_discussion(
        repo_name,
        "Metadata Report Card",
        description=report,
        repo_type=repo_type,
    )
    return True
152
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
server.py DELETED
@@ -1,27 +0,0 @@
1
- import os
2
- from fastapi import FastAPI, Request, Response
3
- from main import create_or_update_report
4
- from tabulate import tabulate
5
-
6
- KEY = os.environ.get("WEBHOOK_SECRET")
7
-
8
- app = FastAPI()
9
-
10
@app.get("/")
def read_root():
    """Serve the static HTML landing page describing the bot."""
    landing_html = """
<h2 style="text-align:center">Metadata Review Bot</h2>
<p style="text-align:center">This is a demo app showing how to use webhooks to automate metadata review for models and datasets shared on the Hugging Face Hub.</p>
"""
    return Response(content=landing_html, media_type="text/html")
17
-
18
@app.post("/webhook")
async def webhook(request: Request):
    """Authenticate a webhook call and hand its payload to the report builder.

    Returns:
        A 401 ``Response`` when the ``X-Webhook-Secret`` header does not match
        ``KEY`` (or ``KEY`` is not configured), a 400 ``Response`` when the
        body is not valid JSON, any ``Response`` produced by
        ``create_or_update_report``, and otherwise ``"Webhook received!"``.
    """
    # Function-scope import so this handler does not depend on edits to the
    # module's import block.
    import hmac

    provided_secret = request.headers.get("X-Webhook-Secret")
    # Reject when no secret is configured: previously an unset KEY and a
    # missing header compared equal (None == None) and bypassed the check.
    # compare_digest keeps the comparison constant-time.
    if (
        not KEY
        or provided_secret is None
        or not hmac.compare_digest(provided_secret.encode(), KEY.encode())
    ):
        return Response("Invalid secret", status_code=401)

    try:
        data = await request.json()
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return Response("Invalid JSON payload", status_code=400)

    result = create_or_update_report(data)
    # A Response object is truthy, so pass error responses through explicitly
    # instead of masking them with the success message.
    if isinstance(result, Response):
        return result
    return "Webhook received!" if result else result
26
-
27
-