# Model-card markdown parser (module extracted from a Hugging Face Space).
import re | |
import streamlit as st | |
from modelcards import CardData, ModelCard | |
from markdownTagExtract import tag_checker,listToString,to_markdown | |
#from specific_extraction import extract_it | |
# from persist import persist | |
#global bytes_data | |
################################################################
####   Markdown parser logic   #################################
################################################################
def file_upload():
    """Return the markdown text currently held in Streamlit session state."""
    return st.session_state.markdown_upload
# Sets up the basics
# this is where the new model card will be read in from; the session state
# already holds a str, so no .decode("utf-8") step is needed.
model_card_md = file_upload()
# Compiled regexes for every markdown construct we extract from the card.
# All patterns are raw strings: "\s" in a plain string is an invalid escape
# sequence (SyntaxWarning on modern Python).
# Does metadata appear in any other format than this?
metadata_re = re.compile(r"^---(.*?)---", re.DOTALL)
header_re = re.compile(r"^\s*# (.*)", re.MULTILINE)
subheader_re = re.compile(r"^\s*## (.*)", re.MULTILINE)
subsubheader_re = re.compile(r"^\s*### (.*)", re.MULTILINE)
subsubsubheader_re = re.compile(r"^\s*#### (.*)", re.MULTILINE)
# We could be a lot more flexible on this re.
# We require keys to be bold-faced here.
# We don't have to require bold, as long as it's key:value
# **License:**
# Bold terms use ** or __
# Allows the mixing of ** and __ for bold but eh whatev
key_value_re = re.compile(r"^\s*([*_]{2}[^*_]+[*_]{2})([^\n]*)", re.MULTILINE)
# Hyphens or stars mark list items.
# Unordered list
list_item_re = re.compile(r"^\s*[-*+]\s+.*", re.MULTILINE)
# This is the ordered list
enum_re = re.compile(r"^\s*[0-9].*", re.MULTILINE)
table_re = re.compile(r"^\s*\|.*", re.MULTILINE)
text_item_re = re.compile(r"^\s*[A-Za-z(](.*)", re.MULTILINE)
# text_item_re = re.compile("^\s*#\s*.*", re.MULTILINE)
# Allows the mixing of -* and *- for italics but eh whatev
italicized_text_item_re = re.compile(
    r"^[_*][^_*\s].*\n?.*[^_*][_*]$", flags=re.MULTILINE
)
tag_re = re.compile(r"^\s*<.*", re.MULTILINE)
image_re = re.compile(r"!\[.*\]\(.*\)", re.MULTILINE)
# Each header regex maps to the regex one level below it.
subheader_re_dict = {
    header_re: subheader_re,
    subheader_re: subsubheader_re,
    subsubheader_re: subsubsubheader_re,
}
def get_metadata(section_text):
    """Return every metadata (--- ... ---) match in *section_text*."""
    return [match for match in metadata_re.finditer(section_text)]
def find_images(section_text):
    """Return every markdown image match in *section_text*."""
    return [match for match in image_re.finditer(section_text)]
def find_tags(section_text):
    """Return every HTML/markup tag match in *section_text*."""
    return [match for match in tag_re.finditer(section_text)]
def find_tables(section_text):
    """Return every markdown table row match in *section_text*."""
    return [match for match in table_re.finditer(section_text)]
def find_enums(section_text):
    """Return every ordered-list line match in *section_text*."""
    return [match for match in enum_re.finditer(section_text)]
# Extracts the stuff from the .md file
def find_key_values(section_text):
    """Return every bold **key:** value match in *section_text*."""
    return [match for match in key_value_re.finditer(section_text)]
def find_lists(section_text):
    """Find lists: those lines starting with '-', '*' or '+'."""
    return [match for match in list_item_re.finditer(section_text)]
def find_texts(section_text):
    """Find texts: free writing within a section (plain then italicized)."""
    matches = list(text_item_re.finditer(section_text))
    matches.extend(italicized_text_item_re.finditer(section_text))
    return matches
def find_headers(full_text):
    """Return a 4-tuple of match lists, one per header level (# .. ####)."""
    level_patterns = (header_re, subheader_re, subsubheader_re, subsubsubheader_re)
    return tuple(list(pattern.finditer(full_text)) for pattern in level_patterns)
# Strip any leading metadata block so header parsing sees only the card body.
metadata_list = get_metadata(model_card_md)
if metadata_list:
    metadata_end = metadata_list[-1].span()[-1]
    print("Metadata extracted")
    # Metadata processing can happen here.
    # For now I'm just ignoring it.
    model_card_md = model_card_md[metadata_end:]
else:
    print("No metadata found")
# Matches of all header types
headers_list = find_headers(model_card_md)
print("Headers extracted")
# One list per header level: #, ##, ###, ####
headers, subheaders, subsubheaders, subsubsubheaders = headers_list
# Matches of bulleted lists
lists_list = find_lists(model_card_md)
print("Bulleted lists extracted")
enums_list = find_enums(model_card_md)
print("Enumerated lists extracted")
key_value_list = find_key_values(model_card_md)
print("Key values extracted")
tables_list = find_tables(model_card_md)
print("Tables extracted")
tags_list = find_tags(model_card_md)
print("Markup tags extracted")
images_list = find_images(model_card_md)
print("Images extracted")
# Matches of free text within a section
texts_list = find_texts(model_card_md)
print("Free text extracted")
# List items have the attribute: value;
# This provides for special handling of those strings,
# allowing us to check if it's a list item in order to split/print ok.
# Labels attached to each span type when building the span dictionaries below.
LIST_ITEM = "List item"
KEY_VALUE = "Key: Value"
FREE_TEXT = "Free text"
ENUM_LIST_ITEM = "Enum item"
TABLE_ITEM = "Table item"
TAG_ITEM = "Markup tag"
IMAGE_ITEM = "Image"
def create_span_dict(match_list, match_type):
    """
    Creates a dictionary made out of all the spans.
    This is useful for knowing which types to fill out with what in the app.
    Also useful for checking if there are spans in the .md file that we've missed.
    """
    # Whitespace-only matches are dropped; each remaining span maps to its
    # matched text tagged with *match_type*.
    return {
        match.span(): (match.group(), match_type)
        for match in match_list
        if match.group().strip()
    }
metadata_span_dict = create_span_dict(metadata_list, "Metadata")
# Makes a little dict for each span type
header_span_dict = create_span_dict(headers, "# Header")
subheader_span_dict = create_span_dict(subheaders, "## Subheader")
subsubheader_span_dict = create_span_dict(subsubheaders, "### Subsubheader")
subsubsubheader_span_dict = create_span_dict(subsubsubheaders, "#### Subsubsubheader")
key_value_span_dict = create_span_dict(key_value_list, KEY_VALUE)
lists_span_dict = create_span_dict(lists_list, LIST_ITEM)
enums_span_dict = create_span_dict(enums_list, ENUM_LIST_ITEM)
tables_span_dict = create_span_dict(tables_list, TABLE_ITEM)
tags_span_dict = create_span_dict(tags_list, TAG_ITEM)
images_span_dict = create_span_dict(images_list, IMAGE_ITEM)
texts_span_dict = create_span_dict(texts_list, FREE_TEXT)
# We don't have to have these organized by type necessarily.
# Doing it here for clarity.
all_spans_dict = {
    "headers": header_span_dict,
    "subheaders": subheader_span_dict,
    "subsubheaders": subsubheader_span_dict,
    "subsubsubheaders": subsubsubheader_span_dict,
    LIST_ITEM: lists_span_dict,
    KEY_VALUE: key_value_span_dict,
    TABLE_ITEM: tables_span_dict,
    ENUM_LIST_ITEM: enums_span_dict,
    TAG_ITEM: tags_span_dict,
    IMAGE_ITEM: images_span_dict,
    FREE_TEXT: texts_span_dict,
}
def get_sorted_spans(spans_dict):
    """Flatten the per-type span dicts and return (sorted span keys, merged dict)."""
    combined = {}
    for per_type_spans in spans_dict.values():
        for span, payload in per_type_spans.items():
            combined[span] = payload
    return sorted(combined), combined
sorted_spans, merged_spans = get_sorted_spans(all_spans_dict)
# Sanity/Parse check. Have we captured all spans in the .md file?
# Guard against an empty span list (e.g. an empty upload), which would
# otherwise raise IndexError on sorted_spans[0].
if not sorted_spans:
    print("FYI, no spans were found in the file.")
elif sorted_spans[0][0] != 0:
    print("FYI, our spans don't start at the start of the file.")
    print("We did not catch this start:")
    print(model_card_md[: sorted_spans[0][0]])
# Report any non-whitespace text that falls between two captured spans.
for idx in range(len(sorted_spans) - 1):
    last_span_end = sorted_spans[idx][1]
    new_span_start = sorted_spans[idx + 1][0]
    if new_span_start > last_span_end + 1:
        start_nonparse = sorted_spans[idx]
        end_nonparse = sorted_spans[idx + 1]
        text = model_card_md[start_nonparse[1] : end_nonparse[0]]
        if text.strip():
            print("Found an unparsed span in the file:")
            print(start_nonparse)
            print(" ---> ")
            print(end_nonparse)
            print(text)
# print(header_span_dict)
def section_map_to_help_text(text_retrieved):
    """Return the help text for a section heading, or None when unknown.

    Bug fix: the original looked the value up with ``presit_states(key)``,
    which *calls* the dict and raises TypeError; subscripting was intended.
    """
    presit_states = {
        "## Model Details": "Give an overview of your model, the relevant research paper, who trained it, etc.",
        "## How to Get Started with the Model": "Give an overview of how to get started with the model",
        "## Limitations and Biases": "Provide an overview of the possible Limitations and Risks that may be associated with this model",
        "## Uses": "Detail the potential uses, intended use and out-of-scope uses for this model",
        "## Training": "Provide an overview of the Training Data and Training Procedure for this model",
        "## Evaluation Results": "Detail the Evaluation Results for this model",
        "## Environmental Impact": "Provide an estimate for the carbon emissions: Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here.",
        "## Citation Information": "How to best cite the model authors",
        "## Glossary": "If relevant, include terms and calculations in this section that can help readers understand the model or model card.",
        "## More Information": "Any additional information",
        "## Model Card Authors": "This section provides another layer of transparency and accountability. Whose views is this model card representing? How many voices were included in its construction? Etc.",
        # NOTE(review): this key lacks the "## " prefix the others have — confirm intended.
        "Model Card Contact": "Mediums to use, in order to contact the model creators",
        "## Technical Specifications": " Additional technical information",
        "## Model Examination": " Examining the model",
    }
    # Returns None for headings not in the table (same as the original's
    # implicit fall-through).
    return presit_states.get(text_retrieved)
def section_map_to_persist(text_retrieved):
    """Return the session-state key whose section heading equals *text_retrieved*.

    Returns None when no heading matches.
    """
    presit_states = {
        "Model_details_text": "## Model Details",
        "Model_how_to": "## How to Get Started with the Model",
        "Model_Limits_n_Risks": "## Limitations and Biases",
        "Model_uses": "## Uses",
        "Model_training": "## Training",
        "Model_Eval": "## Evaluation Results",
        "Model_carbon": "## Environmental Impact",
        "Model_cite": "## Citation Information",
        "Glossary": "## Glossary",
        "More_info": "## More Information",
        "Model_card_authors": "## Model Card Authors",
        "Model_card_contact": "## Model Card Contact",
        "Technical_specs": "## Technical specifications",
        "Model_examin": "## Model Examination",
    }
    # Reverse lookup: heading -> session-state key.
    for state_key, heading in presit_states.items():
        if heading == text_retrieved:
            return state_key
    return None
def main():
    """Extract the model-details section and print it (simple smoke run)."""
    # st.write('here')
    details = extract_it("Model_details_text")
    print(details)
def extract_headers():
    """Index the header spans by level.

    For every header span, record (its position in sorted_spans, the position
    of the previous header of the same level, or None). Returns one dict per
    header level.
    """
    headers = {}
    subheaders = {}
    subsubheaders = {}
    subsubsubheaders = {}
    # Dispatch: span-type label -> (level index, destination dict).
    buckets = {
        "# Header": (0, headers),
        "## Subheader": (1, subheaders),
        "### Subsubheader": (2, subsubheaders),
        "#### Subsubsubheader": (3, subsubsubheaders),
    }
    # previous[k] holds the sorted_spans position of the last level-k header.
    previous = [None, None, None, None]
    for position, span in enumerate(sorted_spans):
        span_type = merged_spans[span][1]
        if span_type in buckets:
            level, destination = buckets[span_type]
            destination[span] = (position, previous[level])
            previous[level] = position
    return headers, subheaders, subsubheaders, subsubsubheaders
def stringify():
    """Group the parsed spans into per-section strings, keyed by header text.

    Returns a 4-tuple of dicts (one per header level). Each dict maps a
    normalized header name to the concatenated text of the spans that lie
    between the previous header of that level and this one (slicing
    sorted_spans by the indices recorded in extract_headers()).
    """
    headers, subheaders, subsubheaders, subsubsubheaders = extract_headers()
    headers_strings = {}
    subheaders_strings = {}
    subsubheaders_strings = {}
    subsubsubheaders_strings = {}
    # `first` marks whether a header of the current level has been seen yet;
    # it is reset before each level's loop.
    first = None
    for i in headers:
        if headers[i][1] == None:
            # No earlier header of this level: nothing to slice yet.
            continue
        # Spans between the previous same-level header and this one.
        sub_spans = sorted_spans[headers[i][1] : headers[i][0]]
        lines = []
        for x in sub_spans:
            lines.append(merged_spans[x][0])
        try:
            name = lines[0]
        except:
            # Section had no content spans; fall back to a default name.
            name = "Model Details"
        lines = "".join(lines)
        # print(merged_spans[i][0] + "-------------------")
        # print(lines)
        # Normalize the section name: strip the "# " prefix, whitespace,
        # and any {{ }} template markers.
        # NOTE(review): the duplicated .replace(" ", "") looks like an
        # extraction artifact — confirm against the original source.
        headers_strings[
            name.replace("\n# ", "")
            .replace(" ", "")
            .replace(" ", "")
            .replace("\n", "")
            .replace("{{", "")
            .replace("}}", "")
        ] = lines
        first = i
    first = None
    for i in subheaders:
        if subheaders[i][1] == None:
            continue
        sub_spans = sorted_spans[subheaders[i][1] : subheaders[i][0]]
        lines = []
        for x in sub_spans:
            # Stop collecting at the next subheader (before any was seen)
            # or at any top-level header.
            if merged_spans[x][1] == "## Subheader" and first == None:
                break
            elif merged_spans[x][1] == "# Header":
                break
            else:
                lines.append(merged_spans[x][0])
        try:
            name = lines[0]
        except:
            name = "Model Details"
        lines = "".join(lines)
        # print(merged_spans[i][0] + "-------------------")
        # print(lines)
        subheaders_strings[
            name.replace("\n# ", "").replace(" ", "").replace(" ", "")
        ] = lines
        first = i
    first = None
    for i in subsubheaders:
        if subsubheaders[i][1] == None:
            continue
        sub_spans = sorted_spans[subsubheaders[i][1] : subsubheaders[i][0]]
        lines = []
        for x in sub_spans:
            # Stop at a subheader, or at the first subsubheader before any
            # subsubheader has been processed.
            if merged_spans[x][1] == "## Subheader" or (
                merged_spans[x][1] == "### Subsubheader" and first == None
            ):
                break
            else:
                lines.append(merged_spans[x][0])
        lines = "".join(lines)
        # Keyed by the header's own text (unlike the two loops above,
        # which key by the first content span).
        subsubheaders_strings[
            merged_spans[i][0].replace("\n", "").replace("### ", "").replace(" ", "")
        ] = lines
        first = i
    for i in subsubsubheaders:
        if subsubsubheaders[i][1] == None:
            continue
        sub_spans = sorted_spans[subsubsubheaders[i][1] : subsubsubheaders[i][0]]
        lines = []
        for x in sub_spans:
            # Stop at any higher-level header.
            if (
                merged_spans[x][1] == "## Subheader"
                or merged_spans[x][1] == "### Subsubheader"
            ):
                break
            else:
                lines.append(merged_spans[x][0])
        lines = "".join(lines)
        subsubsubheaders_strings[
            merged_spans[i][0].replace("#### ", "").replace("**", "").replace("\n", "")
        ] = lines
    return (
        headers_strings,
        subheaders_strings,
        subsubheaders_strings,
        subsubsubheaders_strings,
    )
def extract_it(text_to_retrieve):
    """Pull the text for one model-card section out of the parsed markdown.

    *text_to_retrieve* is a session-state key such as "Model_details_text";
    the matching accumulated section string (plus a trailing space) is
    returned. Raises KeyError for an unknown key.
    """
    print("Span\t\tType\t\tText")
    print("-------------------------------------")
    # NOTE(review): these four locals are never read below — presumably
    # leftovers from an earlier version.
    found_subheader = False
    current_subheader = " "
    page_state = " "
    help_text = " "
    #st.write("in cs- body here")
    # Section strings grouped by header level.
    (
        headers_strings,
        subheaders_strings,
        subsubheaders_strings,
        subsubsubheaders_strings,
    ) = stringify()
    h_keys = list(headers_strings.keys())
    sh_keys = list(subheaders_strings.keys())
    ssh_keys = list(subsubheaders_strings.keys())
    sssh_keys = list(subsubsubheaders_strings.keys())
    # Keywords matched case-insensitively as substrings of section names.
    needed = [
        "model details",
        "howto",
        "limitations",
        "uses",
        "training",
        "evaluation",
        "environmental",
        "citation",
        "glossary",
        "more information",
        "authors",
        "contact",
    ]  # not sure what keyword should be used for citation, howto, and contact
    # info_strings = {
    #     "details": "## Model Details",
    #     "howto": "## How to Get Started with the Model",
    #     "limitations": "## Limitations and Biases",
    #     "uses": "## Uses",
    #     "training": "## Training",
    #     "evaluation": "## Evaluation Results",
    #     "environmental": "## Environmental Impact",
    #     "citation": "## Citation Information",
    #     "glossary": "## Glossary",
    #     "more information": "## More Information",
    #     "authors": "## Model Card Authors",
    #     "contact": "## Model Card Contact",
    # }
    # Accumulator for each keyword's collected section text.
    info_strings = {
        "model details": "",
        "howto": "",
        "limitations": "",
        "uses": "",
        "training": "",
        "evaluation": "",
        "environmental": "",
        "citation": "",
        "glossary": "",
        "more information": "",
        "authors": "",
        "contact": "",
    }
    # Concatenate every section (at any header level) whose normalized name
    # contains the keyword into that keyword's accumulator.
    for x in needed:
        for l in h_keys:
            if x in l.lower():
                info_strings[x] = info_strings[x] + headers_strings[l]
        for i in sh_keys:
            if x in i.lower():
                info_strings[x] = info_strings[x] + subheaders_strings[i]
        for z in ssh_keys:
            try:
                if x in z.lower():
                    info_strings[x] = info_strings[x] + subsubheaders_strings[z]
            except:
                continue
        for y in sssh_keys:
            try:
                if x in y.lower():
                    info_strings[x] = info_strings[x] + subsubsubheaders_strings[y]
            except:
                continue
    # Map the accumulated text onto the session-state keys used by the app.
    # The last two entries fall back to literal headings (no keyword exists
    # for them in `needed`).
    extracted_info = {
        "Model_details_text": info_strings["model details"],
        "Model_how_to": info_strings["howto"],
        "Model_Limits_n_Risks": info_strings["limitations"],
        "Model_uses": info_strings["uses"],
        "Model_training": info_strings["training"],
        "Model_Eval": info_strings["evaluation"],
        "Model_carbon": info_strings["environmental"],
        "Model_cite": info_strings["citation"],
        "Glossary": info_strings["glossary"],
        "More_info": info_strings["more information"],
        "Model_card_authors": info_strings["authors"],
        "Model_card_contact": info_strings["contact"],
        "Technical_specs": "## Technical specifications",
        "Model_examin": "## Model Examination",
    }
    #text_to_retrieve = "Model_details_text"
    new_t = extracted_info[text_to_retrieve] + " "
    return(new_t)
# Script entry point: run the smoke-test extraction when executed directly.
if __name__ == "__main__":
    main()