updated dictionary readability
Browse files- app.py +3 -0
- lsj_dict.json +2 -2
- lsj_dict.py +40 -7
app.py
CHANGED
|
@@ -227,8 +227,11 @@ elif active_tab == "Dictionary":
|
|
| 227 |
# Put text in readable format
|
| 228 |
text = format_text(data)
|
| 229 |
|
|
|
|
| 230 |
st.markdown(format_text(data), unsafe_allow_html = True)
|
| 231 |
|
|
|
|
|
|
|
| 232 |
st.markdown("""
|
| 233 |
<style>
|
| 234 |
.tab {
|
|
|
|
| 227 |
# Put text in readable format
|
| 228 |
text = format_text(data)
|
| 229 |
|
| 230 |
+
|
| 231 |
st.markdown(format_text(data), unsafe_allow_html = True)
|
| 232 |
|
| 233 |
+
|
| 234 |
+
|
| 235 |
st.markdown("""
|
| 236 |
<style>
|
| 237 |
.tab {
|
lsj_dict.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f5f2d966dbeab082d776f146a1b5e42685e91363cbf7ce2df3835574687e2d37
|
| 3 |
+
size 138789469
|
lsj_dict.py
CHANGED
|
@@ -3,6 +3,7 @@ from collections import defaultdict
|
|
| 3 |
from autocomplete import load_compressed_word_list
|
| 4 |
import json
|
| 5 |
import streamlit as st
|
|
|
|
| 6 |
|
| 7 |
def read_xml(file):
|
| 8 |
"""
|
|
@@ -40,7 +41,10 @@ def extract_entry_info(entry):
|
|
| 40 |
definitions[lemma]['definitions'] = {'tr': definition}
|
| 41 |
|
| 42 |
|
| 43 |
-
text = get_descendants_text(entry)
|
|
|
|
|
|
|
|
|
|
| 44 |
cleaned_text = prettify_text(text)
|
| 45 |
|
| 46 |
definitions[lemma]['definitions']['text'] = cleaned_text
|
|
@@ -48,7 +52,24 @@ def extract_entry_info(entry):
|
|
| 48 |
|
| 49 |
return {'lemma': lemma, 'orthographies': orthographies, 'definitions': definitions[lemma]['definitions']}
|
| 50 |
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
def get_descendants_text(element):
|
| 54 |
"""
|
|
@@ -124,7 +145,11 @@ def format_text(data):
|
|
| 124 |
text = data['definitions']['text']
|
| 125 |
|
| 126 |
# Change <tr> tags to bold
|
| 127 |
-
text = text.replace("<tr>", "<span class='tr'> ").replace("</tr>", "</span>")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
formatted_text = []
|
| 130 |
|
|
@@ -146,11 +171,16 @@ def format_text(data):
|
|
| 146 |
"u", "v", "w", "x", "y", "z"
|
| 147 |
]
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
|
|
|
|
|
|
|
|
|
| 152 |
if level:
|
| 153 |
-
if level
|
|
|
|
|
|
|
| 154 |
formatted_text.append(f"<div class='list-class secondary-class'><span class='secondary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
|
| 155 |
elif level in tertiary_indicators:
|
| 156 |
formatted_text.append(f"<div class='list-class tertiary-class'> <span class='tertiary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
|
|
@@ -163,6 +193,9 @@ def format_text(data):
|
|
| 163 |
|
| 164 |
|
| 165 |
|
|
|
|
|
|
|
|
|
|
| 166 |
def main():
|
| 167 |
# xml_info = read_xml("LSJ_GreekUnicode/grc.lsj.perseus-eng2.xml")
|
| 168 |
|
|
|
|
| 3 |
from autocomplete import load_compressed_word_list
|
| 4 |
import json
|
| 5 |
import streamlit as st
|
| 6 |
+
import re
|
| 7 |
|
| 8 |
def read_xml(file):
|
| 9 |
"""
|
|
|
|
| 41 |
definitions[lemma]['definitions'] = {'tr': definition}
|
| 42 |
|
| 43 |
|
| 44 |
+
# text = get_descendants_text(entry)
|
| 45 |
+
|
| 46 |
+
text = get_all_text(entry)
|
| 47 |
+
|
| 48 |
cleaned_text = prettify_text(text)
|
| 49 |
|
| 50 |
definitions[lemma]['definitions']['text'] = cleaned_text
|
|
|
|
| 52 |
|
| 53 |
return {'lemma': lemma, 'orthographies': orthographies, 'definitions': definitions[lemma]['definitions']}
|
| 54 |
|
| 55 |
+
|
| 56 |
+
def get_all_text(element):
    """Recursively collect text from an element and all its descendants.

    Walks ``element`` depth-first and concatenates its own text, the text of
    every descendant, and each child's tail text into a single string.

    Two child tags get special treatment:

    * ``sense`` -- a ``[SENSE_SEPARATOR][level=<n>]`` marker followed by two
      newlines is emitted before the sense's own content, where ``<n>`` is
      the value of the child's ``n`` attribute.
    * ``tr`` directly under a ``sense`` -- only the child's stripped direct
      text is emitted, wrapped in literal ``<tr>...</tr>`` tags and followed
      by a newline; the ``tr`` element itself is not recursed into.

    A single space is appended after every recursive result and after every
    tail, so the returned string may contain runs of consecutive spaces.

    Args:
        element: An element exposing ``.text``, ``.tail``, ``.tag``,
            ``.get()`` and iteration over its children (for example an
            ``xml.etree.ElementTree.Element``).

    Returns:
        The concatenated text with sense markers and ``<tr>`` wrappers.
    """
    # Gather fragments in a list and join once at the end; repeated ``+=`` on
    # a growing string inside a recursive walk is quadratic in the worst case.
    parts = [element.text or ""]

    for child in element:
        if child.tag == 'sense':
            # NOTE(review): a <sense> without an ``n`` attribute produces the
            # literal marker ``[level=None]`` -- confirm the consumer of this
            # marker handles that value as intended.
            level = child.get('n')
            parts.append(f"[SENSE_SEPARATOR][level={level}]\n\n")
        elif child.tag == 'tr' and element.tag == 'sense':
            if child.text is not None:
                parts.append(f"<tr>{child.text.strip()}</tr>\n")
            # Skip further recursion for this child since its text has
            # already been handled; only its tail still needs to be kept.
            parts.append((child.tail or "") + " ")
            continue

        parts.append(get_all_text(child) + " ")
        parts.append((child.tail or "") + " ")

    return "".join(parts)
|
| 72 |
+
|
| 73 |
|
| 74 |
def get_descendants_text(element):
|
| 75 |
"""
|
|
|
|
| 145 |
text = data['definitions']['text']
|
| 146 |
|
| 147 |
# Change <tr> tags to bold
|
| 148 |
+
text = text.replace("<tr>", "<span class='tr'> ").replace("</tr>", "</span>")
|
| 149 |
+
|
| 150 |
+
text = re.sub(r"\s+,\s+", ", ", text)
|
| 151 |
+
|
| 152 |
+
# .replace(",", ", ").replace(";", "; ").replace(":", ": ").replace("(", " (").replace(")", ") ").replace("[", " [").replace("]", "] ").replace(" ,", ", ").replace(" ; ", "; ").replace(" : ", ": ").replace(" ." , ". ")
|
| 153 |
|
| 154 |
formatted_text = []
|
| 155 |
|
|
|
|
| 171 |
"u", "v", "w", "x", "y", "z"
|
| 172 |
]
|
| 173 |
|
| 174 |
+
header = text.split("\n")[0]
|
| 175 |
+
formatted_text.append(header)
|
| 176 |
+
|
| 177 |
+
for text_part in text.split("[SENSE_SEPARATOR]")[1:]:
|
| 178 |
+
level = text_part.split("level=")[1].split("]")[0]
|
| 179 |
+
text_part = text_part.replace(f"[level={level}]", "")
|
| 180 |
if level:
|
| 181 |
+
if level == "A":
|
| 182 |
+
formatted_text.append(f"<div class='list-class primary-class'> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
|
| 183 |
+
elif level in secondary_indicators:
|
| 184 |
formatted_text.append(f"<div class='list-class secondary-class'><span class='secondary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
|
| 185 |
elif level in tertiary_indicators:
|
| 186 |
formatted_text.append(f"<div class='list-class tertiary-class'> <span class='tertiary-indicator'>{level}.</span> {text_part.replace('[SENSE_SEPARATOR]', '')} </div>")
|
|
|
|
| 193 |
|
| 194 |
|
| 195 |
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
|
| 199 |
def main():
|
| 200 |
# xml_info = read_xml("LSJ_GreekUnicode/grc.lsj.perseus-eng2.xml")
|
| 201 |
|