konbraphat51 committed on
Commit
eb24142
·
1 Parent(s): 52374ba
Files changed (3) hide show
  1. .gitmodules +3 -0
  2. Data +1 -0
  3. app.py +127 -0
.gitmodules ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [submodule "Data"]
2
+ path = Data
3
+ url = https://github.com/konbraphat51/kato_db_dataset
Data ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 885c1394e5753d003c44eb16db0a09adc4838f31
app.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import pathlib
4
+ import whoosh
5
+ import whoosh.index
6
+ import whoosh.query
7
+ import os
8
+ from datetime import date as Date
9
+ import re
10
+
11
# Directory named "Data" located next to this file.
DATA_FOLDER = pathlib.Path(__file__).parent / "Data"
# Folder scanned by Searcher.get_content for "<index>-<number>.csv" files.
RAW_FOLDER = DATA_FOLDER / "Transcription_raw"
# Folder scanned by Searcher.make_total_ix; entries whose names start with
# "sub" are opened as whoosh indexes.
INDEX_FOLDER = DATA_FOLDER / "Transcription_index"
14
+
15
class Searcher:
    """Search the transcription indexes and load per-video transcriptions.

    Instance attributes (both set in ``__init__``):
        ix: a ``MultiIndexSearcher`` over every entry of ``INDEX_FOLDER``
            whose name starts with ``"sub"``.
        df_video_links: ``video_links.csv`` from ``DATA_FOLDER`` as a
            DataFrame; ``search`` reads its ``"title"`` and ``"date"``
            columns.
    """

    def __init__(self):
        self.ix = self.make_total_ix()
        self.df_video_links = self.get_video_links()

    def make_total_ix(self):
        """Open every ``sub*`` index under ``INDEX_FOLDER`` and combine them.

        Returns:
            A ``MultiIndexSearcher`` wrapping the opened indexes.
        """
        sub_indexes = [
            whoosh.index.open_dir(INDEX_FOLDER / name)
            for name in os.listdir(INDEX_FOLDER)
            if name.startswith("sub")
        ]
        return MultiIndexSearcher(sub_indexes)

    def search(self, date_start, date_end, **kwargs):
        """Return the matching videos whose date lies within a date range.

        Args:
            date_start: Earliest ``datetime.date`` to keep (inclusive).
            date_end: Latest ``datetime.date`` to keep (inclusive).
            **kwargs: Passed through unchanged to ``self.ix.search``.

        Returns:
            A list of ``(index, date, title)`` tuples ordered by date.
            ``index`` is the integer parsed from the hit title; ``date`` and
            ``title`` come from the row of ``df_video_links`` at that
            position.
        """
        rows = []
        for hit_title in self.ix.search(**kwargs):
            # The text before the first "m" of a hit title is used as the
            # positional row number in df_video_links.
            index = int(hit_title.split("m")[0])
            row = self.df_video_links.iloc[index]
            date = row["date"]
            # Dates are parsed as "year/month/day" strings.
            date_datetime = Date(*map(int, date.split("/")))

            if date_start <= date_datetime <= date_end:
                rows.append((date_datetime, index, date, row["title"]))

        # The parsed date leads each tuple so a plain sort orders by date;
        # it is dropped again before returning.
        rows.sort()
        return [(index, date, title) for _, index, date, title in rows]

    def get_video_links(self):
        """Read ``video_links.csv`` from ``DATA_FOLDER``, first column as index."""
        return pd.read_csv(DATA_FOLDER / "video_links.csv", index_col=0)

    def get_content(self, index):
        """Load the transcription CSV for one video.

        Scans ``RAW_FOLDER`` for files named ``<index>-<number>.csv`` and
        reads the one with the numerically largest ``<number>``.

        Args:
            index: Video index that prefixes the file name.

        Returns:
            The chosen CSV as a DataFrame.

        Raises:
            FileNotFoundError: If no file for ``index`` exists.
        """
        # Escape the index and the dot, and require a full match, so only
        # exactly "<index>-<digits>.csv" is accepted.
        pattern = re.compile(r"{}-(\d+)\.csv".format(re.escape(str(index))))

        candidates = []
        for name in os.listdir(RAW_FOLDER):
            found = pattern.fullmatch(name)
            if found:
                # Compare numerically: as strings, "9" sorts after "10".
                candidates.append((int(found.group(1)), name))

        if not candidates:
            raise FileNotFoundError(
                "no transcription file for index {} in {}".format(index, RAW_FOLDER)
            )

        _, best = max(candidates)
        return pd.read_csv(RAW_FOLDER / best)
69
+
70
class MultiIndexSearcher:
    """Run one query against several indexes and pool the resulting titles."""

    def __init__(self, ixes):
        # Index objects; each must provide searcher() usable as a context manager.
        self.ixes = ixes

    def search(self, **kwargs):
        """Query every index and collect the ``"title"`` field of each hit.

        Args:
            **kwargs: Forwarded to each sub-searcher's ``search`` call, which
                is additionally given ``limit=None``.

        Returns:
            A list of titles, grouped in the order of ``self.ixes``.
        """
        collected = []
        for index in self.ixes:
            with index.searcher() as reader:
                # Consume the hits while the searcher is still open.
                collected.extend(
                    hit["title"] for hit in reader.search(**kwargs, limit=None)
                )
        return collected
83
+
84
# Module-level Searcher used by main(); constructing it opens the sub-indexes
# and reads video_links.csv (see Searcher.__init__).
searcher = Searcher()
85
+
86
def main():
    """Render the KATO DB search page.

    Reads a keyword string and a start/end date from Streamlit inputs, builds
    a whoosh query, shows the matching videos in a table, and displays the
    transcription of the video selected in the select box.

    Whitespace-separated keywords are combined with AND on the "content"
    field. An empty or whitespace-only keyword matches every document.
    """
    st.title("KATO DB")

    keyword = st.text_input(
        "検索したいキーワードを入力して、Enterを押してください\n"
        "空欄だと全文書表示します。"
    )

    date_start = st.date_input(
        "検索したい開始日付を入力してください",
        Date(2009, 1, 1)
    )

    date_end = st.date_input(
        "検索したい終了日付を入力してください",
        Date(2050, 12, 31)
    )

    # Build the query. Branch on the split result rather than on the raw
    # string so that input consisting only of spaces is treated as blank
    # instead of producing an AND over zero terms.
    keyword_list = keyword.split()
    if keyword_list:
        # AND search
        query = whoosh.query.And(
            [whoosh.query.Term("content", word) for word in keyword_list]
        )
    else:
        query = whoosh.query.Every()

    contents = searcher.search(q=query, date_start=date_start, date_end=date_end)

    st.write("該当件数:{}件".format(len(contents)))
    results = pd.DataFrame(contents, columns=["管理番号", "放送日", "動画タイトル"])
    st.dataframe(results, hide_index=True)

    selected_index = st.selectbox("管理番号を選択して書き起こしを表示", results["管理番号"])
    # Nothing is shown when the select box has no selection (returns None).
    if selected_index is not None:
        df_transcription = searcher.get_content(selected_index)
        st.dataframe(df_transcription, width=1000)
124
+
125
# Render the page when this file is executed as a script.
if __name__ == "__main__":

    main()