File size: 4,654 Bytes
eb24142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19295b2
 
eb24142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4411de
eb24142
 
 
 
8c5e233
f2f317f
8f8ff65
b065486
 
8c5e233
eb24142
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import streamlit as st
import pandas as pd
import pathlib
import whoosh
import whoosh.index
import whoosh.query
import os
from datetime import date as Date
import re

# Root of the bundled data: the "Data" directory located next to this file.
DATA_FOLDER = pathlib.Path(__file__).parent / "Data"
# Directory listed by Searcher.get_content to find raw transcription CSV files.
RAW_FOLDER = DATA_FOLDER / "Transcription_raw"
# Directory listed by Searcher.make_total_ix to find the "sub*" index directories.
INDEX_FOLDER = DATA_FOLDER / "Transcription_index"

class Searcher:
    """Search the transcription indexes and load raw transcription files.

    Attributes set by ``__init__``:
        ix: object returned by ``make_total_ix``; its ``search(**kwargs)``
            yields the stored "title" of every hit.
        df_video_links: DataFrame read from ``video_links.csv``; rows are
            looked up by position and must provide "title" and "date".
    """

    def __init__(self):
        self.ix = self.make_total_ix()
        self.df_video_links = self.get_video_links()

    def make_total_ix(self):
        """Open every directory in INDEX_FOLDER whose name starts with "sub".

        Returns:
            A ``MultiIndexSearcher`` wrapping all opened indexes.
        """
        sub_indexes = [
            whoosh.index.open_dir(INDEX_FOLDER / name)
            for name in os.listdir(INDEX_FOLDER)
            if name.startswith("sub")
        ]
        return MultiIndexSearcher(sub_indexes)

    def search(self, date_start, date_end, **kwargs):
        """Run a query and keep the hits broadcast within a date range.

        Args:
            date_start: first date (inclusive) to keep.
            date_end: last date (inclusive) to keep.
            **kwargs: forwarded unchanged to ``self.ix.search``.

        Returns:
            A list of ``(index, date, title)`` tuples ordered by date, where
            ``index`` is the row position in ``df_video_links``, ``date`` is
            the original "year/month/day" string and ``title`` is the video
            title from that row.
        """
        contents = []
        for hit_title in self.ix.search(**kwargs):
            # The text before the first "m" of an indexed title is the row
            # position of the video in df_video_links.
            index = int(hit_title.split("m")[0])
            row = self.df_video_links.iloc[index]
            date = row["date"]
            # "year/month/day" -> datetime.date, so the range check and the
            # ordering are chronological rather than textual.
            date_datetime = Date(*map(int, date.split("/")))

            if not (date_start <= date_datetime <= date_end):
                continue

            contents.append((date_datetime, index, date, row["title"]))

        # Order by the parsed date (ties fall back to the remaining fields),
        # then drop the parsed date, which was only needed as the sort key.
        contents.sort()
        return [(index, date, title) for _, index, date, title in contents]

    def get_video_links(self):
        """Read ``video_links.csv`` from DATA_FOLDER, first column as index."""
        return pd.read_csv(DATA_FOLDER / "video_links.csv", index_col=0)

    def get_content(self, index):
        """Load the raw transcription CSV for one video.

        Files in RAW_FOLDER are expected to be named "<index>-<number>.csv".
        When several files exist for the same index, the one with the largest
        number is used.

        Args:
            index: management number of the video (anything whose ``str()``
                is the number used in the file name).

        Returns:
            The selected file loaded as a DataFrame.

        Raises:
            FileNotFoundError: if RAW_FOLDER has no file for ``index``.
        """
        # Match the whole file name (escaped dot, anchored at both ends) so
        # names such as "1-2.csvx" or "1-2xcsv" are not picked up.
        pattern = re.compile(r"{}-(\d+)\.csv".format(re.escape(str(index))))

        candidates = []
        for name in os.listdir(RAW_FOLDER):
            found = pattern.fullmatch(name)
            if found:
                # Compare the trailing number numerically: as plain strings
                # "1-10.csv" would sort before "1-2.csv".
                candidates.append((int(found.group(1)), name))

        if not candidates:
            raise FileNotFoundError(
                "no transcription file found for index {} in {}".format(
                    index, RAW_FOLDER
                )
            )

        _, best = max(candidates)
        return pd.read_csv(RAW_FOLDER / best)
        
class MultiIndexSearcher:
    """Run one query against several indexes and pool the hit titles."""

    def __init__(self, ixes):
        # Kept as given; each element must provide ``searcher()`` returning a
        # context manager with a ``search`` method.
        self.ixes = ixes

    def search(self, **kwargs):
        """Search every index in turn and collect the "title" of each hit.

        Args:
            **kwargs: forwarded to each index searcher's ``search`` call,
                together with ``limit=None``.

        Returns:
            A flat list of titles, in index order and then hit order.
        """
        collected = []
        for index in self.ixes:
            with index.searcher() as opened:
                results = opened.search(limit=None, **kwargs)
                # Read the titles while the searcher is still open.
                collected.extend(hit["title"] for hit in results)

        return collected

# Shared Searcher instance used by main(). It is built when this module is
# executed; construction opens the "sub*" indexes and reads video_links.csv.
searcher = Searcher()

def main():
    """Render the KATO DB search page.

    Reads a keyword and a date range from the user, searches with the
    module-level ``searcher``, shows the matching videos in a table, and
    displays the transcription of the video selected by management number.
    """
    st.title("KATO DB")

    keyword = st.text_input(
        "検索したいキーワードを入力して、Enterを押してください\n"
        "空欄だと全文書表示します。\n"
        "検索にヒットしない場合書き起こしAIに認識されていない可能性があります。(「もこう」など)"
    )

    date_start = st.date_input(
        "検索したい開始日付を入力してください",
        Date(2009, 1, 1)
    )

    date_end = st.date_input(
        "検索したい終了日付を入力してください",
        Date(2050, 12, 31)
    )

    # Build the query: an empty keyword matches every document; otherwise
    # every whitespace-separated word must appear in "content" (AND search).
    if keyword == "":
        query = whoosh.query.Every()
    else:
        keyword_list = keyword.split()
        query = whoosh.query.And(
            [whoosh.query.Term("content", word) for word in keyword_list]
        )

    contents = searcher.search(q=query, date_start=date_start, date_end=date_end)

    st.write("該当件数:{}件".format(len(contents)))
    results = pd.DataFrame(contents, columns=["管理番号", "放送日", "動画タイトル"])
    st.dataframe(results, hide_index=True)

    selected_index = st.selectbox(
        "管理番号を選択して、Enterを押して書き起こしを表示します",
        results["管理番号"],
    )
    # The original code guards against None here, which covers the case of
    # an empty result table.
    if selected_index is not None:
        try:
            df_transcription = searcher.get_content(selected_index)
        except (IndexError, FileNotFoundError):
            # No raw transcription file exists for this management number:
            # report it on the page instead of showing a traceback.
            st.error(
                "管理番号{}の書き起こしファイルが見つかりませんでした。".format(
                    selected_index
                )
            )
        else:
            st.dataframe(df_transcription, width=1000)

    st.write("ダウンロードはこちらから:https://github.com/konbraphat51/kato_db_dataset")
    st.write("データ収集のプログラムコード・仕組みの説明はこちらから:https://github.com/konbraphat51/kato_db")
    st.write("加藤AIのプロトタイプ:https://colab.research.google.com/drive/1QsJN50wvLEJx04P4XaBtsKqx1q3532OU?usp=sharing")
    st.write("ブログ(協力者募集しています):https://qiita.com/konbraphat51/items/5b27afda442c13806c25")
    st.write("データ最終更新:2023/8/19")

# Render the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    
    main()