Spaces:
Running
Running
mylibrar
committed on
Commit
·
3dd0859
1
Parent(s):
cb27b88
Add comments to each topic graph
Browse files- data/topic_charts.json +40 -20
- results.py +3 -3
data/topic_charts.json
CHANGED
|
@@ -149,7 +149,8 @@
|
|
| 149 |
],
|
| 150 |
"pctdistance": 1.2,
|
| 151 |
"labeldistance": 1.5
|
| 152 |
-
}
|
|
|
|
| 153 |
}
|
| 154 |
],
|
| 155 |
[
|
|
@@ -304,7 +305,8 @@
|
|
| 304 |
"subplots_adjust": {
|
| 305 |
"left": 0.37,
|
| 306 |
"right": 0.98
|
| 307 |
-
}
|
|
|
|
| 308 |
}
|
| 309 |
],
|
| 310 |
[
|
|
@@ -459,7 +461,8 @@
|
|
| 459 |
"subplots_adjust": {
|
| 460 |
"left": 0.37,
|
| 461 |
"right": 0.98
|
| 462 |
-
}
|
|
|
|
| 463 |
}
|
| 464 |
],
|
| 465 |
[
|
|
@@ -614,7 +617,8 @@
|
|
| 614 |
"subplots_adjust": {
|
| 615 |
"left": 0.37,
|
| 616 |
"right": 0.98
|
| 617 |
-
}
|
|
|
|
| 618 |
}
|
| 619 |
],
|
| 620 |
[
|
|
@@ -769,7 +773,8 @@
|
|
| 769 |
"subplots_adjust": {
|
| 770 |
"left": 0.37,
|
| 771 |
"right": 0.98
|
| 772 |
-
}
|
|
|
|
| 773 |
}
|
| 774 |
],
|
| 775 |
[
|
|
@@ -924,7 +929,8 @@
|
|
| 924 |
"subplots_adjust": {
|
| 925 |
"left": 0.37,
|
| 926 |
"right": 0.98
|
| 927 |
-
}
|
|
|
|
| 928 |
}
|
| 929 |
],
|
| 930 |
[
|
|
@@ -1079,7 +1085,8 @@
|
|
| 1079 |
"subplots_adjust": {
|
| 1080 |
"left": 0.37,
|
| 1081 |
"right": 0.98
|
| 1082 |
-
}
|
|
|
|
| 1083 |
}
|
| 1084 |
],
|
| 1085 |
[
|
|
@@ -1234,7 +1241,8 @@
|
|
| 1234 |
"subplots_adjust": {
|
| 1235 |
"left": 0.37,
|
| 1236 |
"right": 0.98
|
| 1237 |
-
}
|
|
|
|
| 1238 |
}
|
| 1239 |
],
|
| 1240 |
[
|
|
@@ -1389,7 +1397,8 @@
|
|
| 1389 |
"subplots_adjust": {
|
| 1390 |
"left": 0.37,
|
| 1391 |
"right": 0.98
|
| 1392 |
-
}
|
|
|
|
| 1393 |
}
|
| 1394 |
],
|
| 1395 |
[
|
|
@@ -1544,7 +1553,8 @@
|
|
| 1544 |
"subplots_adjust": {
|
| 1545 |
"left": 0.37,
|
| 1546 |
"right": 0.98
|
| 1547 |
-
}
|
|
|
|
| 1548 |
}
|
| 1549 |
],
|
| 1550 |
[
|
|
@@ -1699,7 +1709,8 @@
|
|
| 1699 |
"subplots_adjust": {
|
| 1700 |
"left": 0.37,
|
| 1701 |
"right": 0.98
|
| 1702 |
-
}
|
|
|
|
| 1703 |
}
|
| 1704 |
],
|
| 1705 |
[
|
|
@@ -1854,7 +1865,8 @@
|
|
| 1854 |
"subplots_adjust": {
|
| 1855 |
"left": 0.37,
|
| 1856 |
"right": 0.98
|
| 1857 |
-
}
|
|
|
|
| 1858 |
}
|
| 1859 |
],
|
| 1860 |
[
|
|
@@ -2009,7 +2021,8 @@
|
|
| 2009 |
"subplots_adjust": {
|
| 2010 |
"left": 0.37,
|
| 2011 |
"right": 0.98
|
| 2012 |
-
}
|
|
|
|
| 2013 |
}
|
| 2014 |
],
|
| 2015 |
[
|
|
@@ -2164,7 +2177,8 @@
|
|
| 2164 |
"subplots_adjust": {
|
| 2165 |
"left": 0.37,
|
| 2166 |
"right": 0.98
|
| 2167 |
-
}
|
|
|
|
| 2168 |
}
|
| 2169 |
],
|
| 2170 |
[
|
|
@@ -2319,7 +2333,8 @@
|
|
| 2319 |
"subplots_adjust": {
|
| 2320 |
"left": 0.37,
|
| 2321 |
"right": 0.98
|
| 2322 |
-
}
|
|
|
|
| 2323 |
}
|
| 2324 |
],
|
| 2325 |
[
|
|
@@ -2474,7 +2489,8 @@
|
|
| 2474 |
"subplots_adjust": {
|
| 2475 |
"left": 0.37,
|
| 2476 |
"right": 0.98
|
| 2477 |
-
}
|
|
|
|
| 2478 |
}
|
| 2479 |
],
|
| 2480 |
[
|
|
@@ -2629,7 +2645,8 @@
|
|
| 2629 |
"subplots_adjust": {
|
| 2630 |
"left": 0.37,
|
| 2631 |
"right": 0.98
|
| 2632 |
-
}
|
|
|
|
| 2633 |
}
|
| 2634 |
],
|
| 2635 |
[
|
|
@@ -2784,7 +2801,8 @@
|
|
| 2784 |
"subplots_adjust": {
|
| 2785 |
"left": 0.37,
|
| 2786 |
"right": 0.98
|
| 2787 |
-
}
|
|
|
|
| 2788 |
}
|
| 2789 |
],
|
| 2790 |
[
|
|
@@ -2939,7 +2957,8 @@
|
|
| 2939 |
"subplots_adjust": {
|
| 2940 |
"left": 0.37,
|
| 2941 |
"right": 0.98
|
| 2942 |
-
}
|
|
|
|
| 2943 |
}
|
| 2944 |
],
|
| 2945 |
[
|
|
@@ -3094,7 +3113,8 @@
|
|
| 3094 |
"subplots_adjust": {
|
| 3095 |
"left": 0.37,
|
| 3096 |
"right": 0.98
|
| 3097 |
-
}
|
|
|
|
| 3098 |
}
|
| 3099 |
],
|
| 3100 |
[
|
|
|
|
| 149 |
],
|
| 150 |
"pctdistance": 1.2,
|
| 151 |
"labeldistance": 1.5
|
| 152 |
+
},
|
| 153 |
+
"comment": "As shown in the graph above, over 20% of the documents are related to Business & Economics & Finance, which makes it the largest topic group in the dataset. On the contrary, the group of Culture & Cultural geography contains the smallest number of documents among all topics."
|
| 154 |
}
|
| 155 |
],
|
| 156 |
[
|
|
|
|
| 305 |
"subplots_adjust": {
|
| 306 |
"left": 0.37,
|
| 307 |
"right": 0.98
|
| 308 |
+
},
|
| 309 |
+
"comment": "On average, documents related to Shopping & Commodity have a larger fraction of words corrected in lines."
|
| 310 |
}
|
| 311 |
],
|
| 312 |
[
|
|
|
|
| 461 |
"subplots_adjust": {
|
| 462 |
"left": 0.37,
|
| 463 |
"right": 0.98
|
| 464 |
+
},
|
| 465 |
+
"comment": "Compared with other topics, Personal Development & Human Resources & Career in average contain more lines ending with ellipsis."
|
| 466 |
}
|
| 467 |
],
|
| 468 |
[
|
|
|
|
| 617 |
"subplots_adjust": {
|
| 618 |
"left": 0.37,
|
| 619 |
"right": 0.98
|
| 620 |
+
},
|
| 621 |
+
"comment": "Shopping & Commodity related documents have higher percentage of lines starting with bullet point."
|
| 622 |
}
|
| 623 |
],
|
| 624 |
[
|
|
|
|
| 773 |
"subplots_adjust": {
|
| 774 |
"left": 0.37,
|
| 775 |
"right": 0.98
|
| 776 |
+
},
|
| 777 |
+
"comment": "Personal Development & Human Resources & Career in average has more lines with toxic words."
|
| 778 |
}
|
| 779 |
],
|
| 780 |
[
|
|
|
|
| 929 |
"subplots_adjust": {
|
| 930 |
"left": 0.37,
|
| 931 |
"right": 0.98
|
| 932 |
+
},
|
| 933 |
+
"comment": "Daily Life & Home & Lifestyle in average has more toxic words."
|
| 934 |
}
|
| 935 |
],
|
| 936 |
[
|
|
|
|
| 1085 |
"subplots_adjust": {
|
| 1086 |
"left": 0.37,
|
| 1087 |
"right": 0.98
|
| 1088 |
+
},
|
| 1089 |
+
"comment": "Documents in the topic of Personal Development & Human Resources & Career in average contain more words than other topics."
|
| 1090 |
}
|
| 1091 |
],
|
| 1092 |
[
|
|
|
|
| 1241 |
"subplots_adjust": {
|
| 1242 |
"left": 0.37,
|
| 1243 |
"right": 0.98
|
| 1244 |
+
},
|
| 1245 |
+
"comment": "There is no significant variance in the average word length for different topic groups. However, Education related data contain longer words than others in general."
|
| 1246 |
}
|
| 1247 |
],
|
| 1248 |
[
|
|
|
|
| 1397 |
"subplots_adjust": {
|
| 1398 |
"left": 0.37,
|
| 1399 |
"right": 0.98
|
| 1400 |
+
},
|
| 1401 |
+
"comment": "Documents in the topic of Personal Development & Human Resources & Career usually contain more sentences."
|
| 1402 |
}
|
| 1403 |
],
|
| 1404 |
[
|
|
|
|
| 1553 |
"subplots_adjust": {
|
| 1554 |
"left": 0.37,
|
| 1555 |
"right": 0.98
|
| 1556 |
+
},
|
| 1557 |
+
"comment": "Documents related to Daily Life & Home & Lifestyle usually have higher percentage of symbols."
|
| 1558 |
}
|
| 1559 |
],
|
| 1560 |
[
|
|
|
|
| 1709 |
"subplots_adjust": {
|
| 1710 |
"left": 0.37,
|
| 1711 |
"right": 0.98
|
| 1712 |
+
},
|
| 1713 |
+
"comment": "The fraction of words with alpha character seems to be relatively consistent across different topics."
|
| 1714 |
}
|
| 1715 |
],
|
| 1716 |
[
|
|
|
|
| 1865 |
"subplots_adjust": {
|
| 1866 |
"left": 0.37,
|
| 1867 |
"right": 0.98
|
| 1868 |
+
},
|
| 1869 |
+
"comment": "Culture & Cultural geography contains more stop words in average."
|
| 1870 |
}
|
| 1871 |
],
|
| 1872 |
[
|
|
|
|
| 2021 |
"subplots_adjust": {
|
| 2022 |
"left": 0.37,
|
| 2023 |
"right": 0.98
|
| 2024 |
+
},
|
| 2025 |
+
"comment": "Natural Science & Formal Science & Technology has a significantly higher rate in percentage of documents that contain curly bracket. It might be related to the coding data."
|
| 2026 |
}
|
| 2027 |
],
|
| 2028 |
[
|
|
|
|
| 2177 |
"subplots_adjust": {
|
| 2178 |
"left": 0.37,
|
| 2179 |
"right": 0.98
|
| 2180 |
+
},
|
| 2181 |
+
"comment": "Sports related documents have a higher number of duplication count."
|
| 2182 |
}
|
| 2183 |
],
|
| 2184 |
[
|
|
|
|
| 2333 |
"subplots_adjust": {
|
| 2334 |
"left": 0.37,
|
| 2335 |
"right": 0.98
|
| 2336 |
+
},
|
| 2337 |
+
"comment": "On average, Culture & Cultural geography related documents are duplicated across a higher number of common crawl dumps. Duplication of Shopping & Commodity appears in fewer dumps than others."
|
| 2338 |
}
|
| 2339 |
],
|
| 2340 |
[
|
|
|
|
| 2489 |
"subplots_adjust": {
|
| 2490 |
"left": 0.37,
|
| 2491 |
"right": 0.98
|
| 2492 |
+
},
|
| 2493 |
+
"comment": "In average, Culture & Cultural geography related documents are duplicated across more years than other topics."
|
| 2494 |
}
|
| 2495 |
],
|
| 2496 |
[
|
|
|
|
| 2645 |
"subplots_adjust": {
|
| 2646 |
"left": 0.37,
|
| 2647 |
"right": 0.98
|
| 2648 |
+
},
|
| 2649 |
+
"comment": "In average, Culture & Cultural geography related documents are duplicated across a wider span of years."
|
| 2650 |
}
|
| 2651 |
],
|
| 2652 |
[
|
|
|
|
| 2801 |
"subplots_adjust": {
|
| 2802 |
"left": 0.37,
|
| 2803 |
"right": 0.98
|
| 2804 |
+
},
|
| 2805 |
+
"comment": "Average language scores of different topic groups are mostly consistent. No significant differences are observed."
|
| 2806 |
}
|
| 2807 |
],
|
| 2808 |
[
|
|
|
|
| 2957 |
"subplots_adjust": {
|
| 2958 |
"left": 0.37,
|
| 2959 |
"right": 0.98
|
| 2960 |
+
},
|
| 2961 |
+
"comment": "In average, Shopping & Commodity has a larger fraction of duplicate lines than others."
|
| 2962 |
}
|
| 2963 |
],
|
| 2964 |
[
|
|
|
|
| 3113 |
"subplots_adjust": {
|
| 3114 |
"left": 0.37,
|
| 3115 |
"right": 0.98
|
| 3116 |
+
},
|
| 3117 |
+
"comment": "Shopping & Commodity usually has a larger fraction of characters in duplicate lines than others."
|
| 3118 |
}
|
| 3119 |
],
|
| 3120 |
[
|
results.py
CHANGED
|
@@ -990,7 +990,7 @@ for title, data in topic_charts:
|
|
| 990 |
for rgb in data["kwargs"]["color"]
|
| 991 |
]
|
| 992 |
)))
|
| 993 |
-
|
| 994 |
topic_graphs.append(go.Figure(go.Pie(
|
| 995 |
values=data["kwargs"]['x'],
|
| 996 |
labels=data["kwargs"]["labels"],
|
|
@@ -1014,8 +1014,8 @@ cluster_div = Div(
|
|
| 1014 |
)),
|
| 1015 |
H3("Results Analysis"),
|
| 1016 |
*(
|
| 1017 |
-
Section(H4(title), plotly2fasthtml(topic_graphs[i]))
|
| 1018 |
-
for i, (title,
|
| 1019 |
)
|
| 1020 |
)
|
| 1021 |
)
|
|
|
|
| 990 |
for rgb in data["kwargs"]["color"]
|
| 991 |
]
|
| 992 |
)))
|
| 993 |
+
elif data["type"] == "pie":
|
| 994 |
topic_graphs.append(go.Figure(go.Pie(
|
| 995 |
values=data["kwargs"]['x'],
|
| 996 |
labels=data["kwargs"]["labels"],
|
|
|
|
| 1014 |
)),
|
| 1015 |
H3("Results Analysis"),
|
| 1016 |
*(
|
| 1017 |
+
Section(H4(title), plotly2fasthtml(topic_graphs[i]), P(data.get("comment", '')))
|
| 1018 |
+
for i, (title, data) in enumerate(topic_charts)
|
| 1019 |
)
|
| 1020 |
)
|
| 1021 |
)
|