Kato-DB / app.py
konbraphat51
更新
b065486
import streamlit as st
import pandas as pd
import pathlib
import whoosh
import whoosh.index
import whoosh.query
import os
from datetime import date as Date
import re
# Root data folder, resolved relative to this file's location.
DATA_FOLDER = pathlib.Path(__file__).parent / "Data"
# Folder of raw transcription CSV files (read by Searcher.get_content).
RAW_FOLDER = DATA_FOLDER / "Transcription_raw"
# Folder of Whoosh index directories (opened by Searcher.make_total_ix).
INDEX_FOLDER = DATA_FOLDER / "Transcription_index"
class Searcher:
    """Search the transcription indexes and load transcription files.

    Attributes (set in ``__init__``):
        ix: ``MultiIndexSearcher`` over every ``sub*`` directory found in
            ``INDEX_FOLDER``.
        df_video_links: table loaded from ``video_links.csv``; ``search``
            reads its ``title`` and ``date`` columns.
    """

    def __init__(self):
        self.ix = self.make_total_ix()
        self.df_video_links = self.get_video_links()

    def make_total_ix(self):
        """Open every ``sub*`` index directory and wrap them in one searcher."""
        sub_indexes = [
            whoosh.index.open_dir(INDEX_FOLDER / name)
            for name in os.listdir(INDEX_FOLDER)
            if name.startswith("sub")
        ]
        return MultiIndexSearcher(sub_indexes)

    def search(self, date_start, date_end, **kwargs):
        """Return the hits whose broadcast date lies in the given range.

        Args:
            date_start: first accepted date (inclusive).
            date_end: last accepted date (inclusive).
            **kwargs: forwarded unchanged to ``self.ix.search``.

        Returns:
            A list of ``(index, date, title)`` tuples ordered by date, where
            ``index`` is the row position in ``df_video_links`` and ``date``
            is the original ``"Y/M/D"`` string from that row.
        """
        dated_rows = []
        for hit_title in self.ix.search(**kwargs):
            # The text before the first "m" of a hit title is the row
            # position of the video in df_video_links.
            index = int(hit_title.split("m")[0])
            row = self.df_video_links.iloc[index]
            date = row["date"]
            date_datetime = Date(*map(int, date.split("/")))
            if date_start <= date_datetime <= date_end:
                dated_rows.append((date_datetime, index, date, row["title"]))

        # The parsed date leads each tuple, so a plain sort orders by date;
        # it is dropped again before returning.
        dated_rows.sort()
        return [(index, date, title) for _, index, date, title in dated_rows]

    def get_video_links(self):
        """Load ``video_links.csv``, using its first column as the index."""
        return pd.read_csv(DATA_FOLDER / "video_links.csv", index_col=0)

    def get_content(self, index):
        """Load the transcription CSV for the video numbered ``index``.

        Files in ``RAW_FOLDER`` are named ``<index>-<number>.csv``; when
        several exist for the same index, the one with the largest number
        is read.

        Raises:
            FileNotFoundError: if no file for ``index`` exists.
        """
        best = self._latest_transcription_name(os.listdir(RAW_FOLDER), index)
        return pd.read_csv(RAW_FOLDER / best)

    @staticmethod
    def _latest_transcription_name(names, index):
        """Pick the ``<index>-<number>.csv`` name with the largest number."""
        # fullmatch plus an escaped dot rejects look-alikes such as
        # "1-2xcsv" or "1-2.csv.bak".
        pattern = re.compile(r"{}-(\d+)\.csv".format(re.escape(str(index))))
        candidates = []
        for name in names:
            found = pattern.fullmatch(name)
            if found:
                # Compare the suffix as a number so "1-10.csv" outranks
                # "1-9.csv" (a plain string sort would pick the latter).
                candidates.append((int(found.group(1)), name))
        if not candidates:
            raise FileNotFoundError(
                "no transcription file found for index {}".format(index)
            )
        return max(candidates)[1]
class MultiIndexSearcher:
    """Run one query against several indexes and pool the hit titles."""

    def __init__(self, ixes):
        # Index objects; each must provide searcher() usable as a context
        # manager.
        self.ixes = ixes

    def search(self, **kwargs):
        """Return the ``title`` field of every hit from every index.

        ``kwargs`` are forwarded to each sub-searcher's ``search`` together
        with ``limit=None``. Titles are returned in index order, then hit
        order.
        """
        collected = []
        for index in self.ixes:
            with index.searcher() as sub_searcher:
                # Read the stored field while the searcher is still open.
                collected.extend(
                    hit["title"]
                    for hit in sub_searcher.search(**kwargs, limit=None)
                )
        return collected
# Module-level Searcher used by main(). Constructing it opens the index
# directories and reads video_links.csv (see Searcher.__init__), so this
# work happens whenever the module is executed or imported.
searcher = Searcher()
def main():
    """Render the page: search form, result table and transcription viewer.

    Reads the keyword and the date range from the widgets, queries the
    module-level ``searcher``, lists the matching videos, and shows the
    transcription of the video selected in the select box.
    """
    st.title("KATO DB")

    keyword = st.text_input(
        "検索したいキーワードを入力して、Enterを押してください\n"
        "空欄だと全文書表示します。\n"
        "検索にヒットしない場合書き起こしAIに認識されていない可能性があります。(「もこう」など)"
    )

    date_start = st.date_input(
        "検索したい開始日付を入力してください",
        Date(2009, 1, 1)
    )

    date_end = st.date_input(
        "検索したい終了日付を入力してください",
        Date(2050, 12, 31)
    )

    # Build the query. Input made only of whitespace is treated like an
    # empty box; otherwise it would become And([]), which matches nothing
    # instead of listing every document as the prompt promises.
    keyword_list = keyword.split()
    if not keyword_list:
        query = whoosh.query.Every()
    else:
        # AND search: every whitespace-separated word must occur.
        query = whoosh.query.And(
            [whoosh.query.Term("content", word) for word in keyword_list]
        )

    contents = searcher.search(q=query, date_start=date_start, date_end=date_end)

    st.write("該当件数:{}件".format(len(contents)))

    results = pd.DataFrame(contents, columns=["管理番号", "放送日", "動画タイトル"])
    st.dataframe(results, hide_index=True)

    selected_index = st.selectbox(
        "管理番号を選択して、Enterを押して書き起こしを表示します",
        results["管理番号"]
    )

    # Guard against None: nothing is selected when there are no results.
    if selected_index is not None:
        df_transcription = searcher.get_content(selected_index)
        st.dataframe(df_transcription, width=1000)

    st.write("ダウンロードはこちらから:https://github.com/konbraphat51/kato_db_dataset")
    st.write("データ収集のプログラムコード・仕組みの説明はこちらから:https://github.com/konbraphat51/kato_db")
    st.write("加藤AIのプロトタイプ:https://colab.research.google.com/drive/1QsJN50wvLEJx04P4XaBtsKqx1q3532OU?usp=sharing")
    st.write("ブログ(協力者募集しています):https://qiita.com/konbraphat51/items/5b27afda442c13806c25")
    st.write("データ最終更新:2023/8/19")
# Build the page only when this file is run as the main script.
if __name__ == "__main__":
    main()