omwdataset

Runtime error

App Files Files Community

omkarenator committed on Oct 2, 2024

Commit

87a6313

1 Parent(s): 5171d34

unify data viewer, DV, DV2, DVS

Browse files

Files changed (4) hide show

curated.py +4 -0
data_viewer.py +157 -0
main.py +5 -2
web.py +2 -156

curated.py CHANGED Viewed

@@ -5,6 +5,7 @@ from fh_plotly import plotly2fasthtml
 import pandas as pd
 import json
 from data_viewer import view_data, gen_random_id
 from rich import print
 import uuid
 import plotly.express as px
@@ -485,6 +486,9 @@ wiki_examples = Div(
     ),
 )
 def get_freelaw_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
     doc_id = max(0, min(int(doc_id), 9))

 import pandas as pd
 import json
 from data_viewer import view_data, gen_random_id
+from data_viewer import DV, DV2, DVS
 from rich import print
 import uuid
 import plotly.express as px
     ),
 )
+wiki_examples = DV("data/curated_samples/wiki.json", 0, "Wikipedia")
 def get_freelaw_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
     doc_id = max(0, min(int(doc_id), 9))

data_viewer.py CHANGED Viewed

@@ -3,6 +3,7 @@ from fasthtml.components import *
 import json
 import string
 import random
 def gen_random_id() -> str:
@@ -79,3 +80,159 @@ def view_data(
         style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
     )
     return Div(form, data_display, style="margin-top: 10px;", id=target)

 import json
 import string
 import random
+import jsonlines
 def gen_random_id() -> str:
         style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
     )
     return Div(form, data_display, style="margin-top: 10px;", id=target)
+def DVS(
+    left,
+    header,
+):
+    col1 = Div(
+        Pre(
+            json.dumps(left, indent=4, ensure_ascii=False),
+            style="white-space: pre-wrap; word-break: break-all;",
+        ),
+        style="float: left; overflow-x: auto;",
+    )
+    data_display = Div(
+        col1,
+        style="overflow: auto; clear: both; height: 200px; border: 1px solid #ccc; padding: 20px;",
+    )
+    return Div(H3(header), data_display, style="margin-top: 10px;")
+def DV(
+    left_file,
+    doc_id,
+    header,
+    target: str = None,
+):
+    if target is None:
+        target = "".join(random.choices(string.ascii_lowercase, k=8))
+    if left_file.endswith("jsonl"):
+        left = [x for x in jsonlines.open(left_file)]
+    else:
+        left = json.load(open(left_file, encoding="utf-8"))
+    max_doc_id = len(left) - 1
+    slider = Input(
+        type="range",
+        name=f"doc_id_{target}",
+        min="0",
+        max=str(max_doc_id),
+        value=str(doc_id),
+        hx_get=f"/update/{target}",
+        hx_target=f"#{target}",
+        hx_trigger="change",
+        hx_swap="innerHTML",
+        hx_vals=json.dumps({"left_file": f"{left_file}", "header": f"{header}"}),
+    )
+    form = Div(
+        H3(header),
+        Label(
+            "Data sample: ", slider, f"{doc_id} of {max_doc_id}", cls="plotly_slider"
+        ),
+        cls="plotly_input_container",
+        style="padding: 20px;",
+    )
+    col1 = Div(
+        Pre(
+            json.dumps(left[doc_id], indent=4, ensure_ascii=False),
+            style="white-space: pre-wrap; word-break: break-all;",
+        ),
+        style="float: left; overflow-x: auto;",
+    )
+    data_display = Div(
+        col1,
+        style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
+    )
+    return Div(form, data_display, style="margin-top: 10px;", id=target)
+def DV2(
+    left_file,
+    right_file,
+    doc_id,
+    target: str = None,
+):
+    if target is None:
+        target = "".join(random.choices(string.ascii_lowercase, k=8))
+    left = json.load(open(left_file, encoding="utf-8"))
+    right = json.load(open(right_file, encoding="utf-8"))
+    max_doc_id = len(left) - 1
+    slider = Input(
+        type="range",
+        name=f"doc_id_{target}",
+        min="0",
+        max=str(max_doc_id),
+        value=str(doc_id),
+        hx_get=f"/update/{target}",
+        hx_target=f"#{target}",
+        hx_trigger="change",
+        hx_swap="innerHTML",
+        hx_vals=json.dumps(
+            {"left_file": f"{left_file}", "right_file": f"{right_file}"}
+        ),
+    )
+    form = Div(
+        Label(
+            "Data sample: ", slider, f"{doc_id} of {max_doc_id}", cls="plotly_slider"
+        ),
+        cls="plotly_input_container",
+        style="padding: 20px;",
+    )
+    col1 = Div(
+        H3("Raw format", style="margin-top: 0px;"),
+        Pre(
+            json.dumps(left[doc_id], indent=4, ensure_ascii=False),
+            style="white-space: pre-wrap; word-break: break-all;",
+        ),
+        style="width: 48%; float: left; overflow-x: auto;",
+    )
+    col2 = Div(
+        H3("Extracted format", style="margin-top: 0px;"),
+        Pre(
+            json.dumps(right[doc_id], indent=4, ensure_ascii=False),
+            style="white-space: pre-wrap; word-break: break-all;",
+        ),
+        style="width: 48%; float: right; overflow-x: auto;",
+    )
+    data_display = Div(
+        col1,
+        col2,
+        style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
+    )
+    return Div(form, data_display, style="margin-top: 10px;", id=target)
+def update(target: str, request):
+    params = request.query_params
+    doc_id = int(params.get(f"doc_id_{target}", 3))
+    left_file = params.get("left_file")
+    right_file = params.get("right_file")
+    if left_file and right_file:
+        return (
+            DV2(
+                left_file,
+                right_file,
+                doc_id,
+                target,
+            ),
+        )
+    else:
+        return DV(
+            left_file,
+            doc_id,
+            params.get("header"),
+            target,
+        )

main.py CHANGED Viewed

@@ -21,6 +21,7 @@ import web
 import common
 import results
 from pybtex.database import parse_file
 app, rt = fast_app(
@@ -541,11 +542,13 @@ def intro():
 #rt("/overview")(overview.overview)
 rt("/curated")(curated.curated)
-rt("/curated/{target}")(curated.update)
 rt("/webdata")(web.web_data)
-rt("/webdata/{target}")(web.update)
 rt("/common")(common.common_steps)

 import common
 import results
 from pybtex.database import parse_file
+import data_viewer
 app, rt = fast_app(
 #rt("/overview")(overview.overview)
+rt("/update/{target}")(data_viewer.update)
 rt("/curated")(curated.curated)
 rt("/webdata")(web.web_data)
 rt("/common")(common.common_steps)

web.py CHANGED Viewed

@@ -7,9 +7,11 @@ from rich import print
 import jsonlines
 from data.url_blocklist import urls_high_matches, urls_false_positives
 from data.non_web_urls import non_web_urls
 from fasthtml.components import D_code
 import pandas as pd
 data_filtering_table_data = pd.DataFrame(
         {
             "Dataset": [
@@ -176,162 +178,6 @@ table_html_qf_filter_data = qf_filtering_table_data.to_html(index=False, border=
 table_div_qf_filter_data = Div(NotStr(table_html_qf_filter_data), style="margin: 40px;")
-def DVS(
-    left,
-    header,
-):
-    col1 = Div(
-        Pre(
-            json.dumps(left, indent=4, ensure_ascii=False),
-            style="white-space: pre-wrap; word-break: break-all;",
-        ),
-        style="float: left; overflow-x: auto;",
-    )
-    data_display = Div(
-        col1,
-        style="overflow: auto; clear: both; height: 200px; border: 1px solid #ccc; padding: 20px;",
-    )
-    return Div(H3(header), data_display, style="margin-top: 10px;")
-def DV(
-    left_file,
-    doc_id,
-    header,
-    target: str = None,
-):
-    if target is None:
-        target = "".join(random.choices(string.ascii_lowercase, k=8))
-    if left_file.endswith("jsonl"):
-        left = [x for x in jsonlines.open(left_file)]
-    else:
-        left = json.load(open(left_file, encoding="utf-8"))
-    max_doc_id = len(left) - 1
-    slider = Input(
-        type="range",
-        name=f"doc_id_{target}",
-        min="0",
-        max=str(max_doc_id),
-        value=str(doc_id),
-        hx_get=f"/webdata/{target}",
-        hx_target=f"#{target}",
-        hx_trigger="change",
-        hx_swap="innerHTML",
-        hx_vals=json.dumps({"left_file": f"{left_file}", "header": f"{header}"}),
-    )
-    form = Div(
-        H3(header),
-        Label(
-            "Data sample: ", slider, f"{doc_id} of {max_doc_id}", cls="plotly_slider"
-        ),
-        cls="plotly_input_container",
-        style="padding: 20px;",
-    )
-    col1 = Div(
-        Pre(
-            json.dumps(left[doc_id], indent=4, ensure_ascii=False),
-            style="white-space: pre-wrap; word-break: break-all;",
-        ),
-        style="float: left; overflow-x: auto;",
-    )
-    data_display = Div(
-        col1,
-        style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
-    )
-    return Div(form, data_display, style="margin-top: 10px;", id=target)
-def DV2(
-    left_file,
-    right_file,
-    doc_id,
-    target: str = None,
-):
-    if target is None:
-        target = "".join(random.choices(string.ascii_lowercase, k=8))
-    left = json.load(open(left_file, encoding="utf-8"))
-    right = json.load(open(right_file, encoding="utf-8"))
-    max_doc_id = len(left) - 1
-    slider = Input(
-        type="range",
-        name=f"doc_id_{target}",
-        min="0",
-        max=str(max_doc_id),
-        value=str(doc_id),
-        hx_get=f"/webdata/{target}",
-        hx_target=f"#{target}",
-        hx_trigger="change",
-        hx_swap="innerHTML",
-        hx_vals=json.dumps(
-            {"left_file": f"{left_file}", "right_file": f"{right_file}"}
-        ),
-    )
-    form = Div(
-        Label(
-            "Data sample: ", slider, f"{doc_id} of {max_doc_id}", cls="plotly_slider"
-        ),
-        cls="plotly_input_container",
-        style="padding: 20px;",
-    )
-    col1 = Div(
-        H3("Raw format", style="margin-top: 0px;"),
-        Pre(
-            json.dumps(left[doc_id], indent=4, ensure_ascii=False),
-            style="white-space: pre-wrap; word-break: break-all;",
-        ),
-        style="width: 48%; float: left; overflow-x: auto;",
-    )
-    col2 = Div(
-        H3("Extracted format", style="margin-top: 0px;"),
-        Pre(
-            json.dumps(right[doc_id], indent=4, ensure_ascii=False),
-            style="white-space: pre-wrap; word-break: break-all;",
-        ),
-        style="width: 48%; float: right; overflow-x: auto;",
-    )
-    data_display = Div(
-        col1,
-        col2,
-        style="overflow: auto; clear: both; height: 600px; border: 1px solid #ccc; padding: 20px;",
-    )
-    return Div(form, data_display, style="margin-top: 10px;", id=target)
-def update(target: str, request):
-    params = request.query_params
-    print(params)
-    doc_id = int(params.get(f"doc_id_{target}", 3))
-    left_file = params.get("left_file")
-    right_file = params.get("right_file")
-    if left_file and right_file:
-        return (
-            DV2(
-                left_file,
-                right_file,
-                doc_id,
-                target,
-            ),
-        )
-    else:
-        return DV(
-            left_file,
-            doc_id,
-            params.get("header"),
-            target,
-        )
 dolma311 = """
 words = text.split()
 word_count = len(words)

 import jsonlines
 from data.url_blocklist import urls_high_matches, urls_false_positives
 from data.non_web_urls import non_web_urls
+from data_viewer import DV, DV2, DVS
 from fasthtml.components import D_code
 import pandas as pd
 data_filtering_table_data = pd.DataFrame(
         {
             "Dataset": [
 table_div_qf_filter_data = Div(NotStr(table_html_qf_filter_data), style="margin: 40px;")
 dolma311 = """
 words = text.split()
 word_count = len(words)