Spaces:
Runtime error
Runtime error
import streamlit as st | |
import pandas as pd | |
import pathlib | |
import whoosh | |
import whoosh.index | |
import whoosh.query | |
import os | |
from datetime import date as Date | |
import re | |
# Dataset root: the "Data" directory that sits next to this source file.
DATA_FOLDER = pathlib.Path(__file__).parent.joinpath("Data")
# Transcription CSV files, read by Searcher.get_content.
RAW_FOLDER = DATA_FOLDER.joinpath("Transcription_raw")
# Whoosh index directories, opened by Searcher.make_total_ix.
INDEX_FOLDER = DATA_FOLDER.joinpath("Transcription_index")
class Searcher:
    """Search the transcription indexes and load per-video transcriptions.

    Attributes:
        ix: A ``MultiIndexSearcher`` over every ``sub*`` index directory
            found in ``INDEX_FOLDER``.
        df_video_links: DataFrame read from ``Data/video_links.csv``;
            ``search`` reads its ``title`` and ``date`` columns.
    """

    def __init__(self):
        self.ix = self.make_total_ix()
        self.df_video_links = self.get_video_links()

    def make_total_ix(self):
        """Open every ``sub*`` directory in ``INDEX_FOLDER`` as one searcher."""
        sub_indexes = [
            whoosh.index.open_dir(INDEX_FOLDER / name)
            for name in os.listdir(INDEX_FOLDER)
            if name.startswith("sub")
        ]
        return MultiIndexSearcher(sub_indexes)

    def search(self, date_start, date_end, **kwargs):
        """Return the hits dated within ``[date_start, date_end]``.

        Args:
            date_start: Earliest date to keep (inclusive).
            date_end: Latest date to keep (inclusive).
            **kwargs: Forwarded unchanged to ``self.ix.search``.

        Returns:
            A list of ``(index, date, title)`` tuples ordered by date, where
            ``date`` is the original string from ``df_video_links``.
        """
        dated_rows = []
        for hit_title in self.ix.search(**kwargs):
            # The text before the first "m" of a hit title is the row
            # position of the video inside df_video_links.
            index = int(hit_title.split("m")[0])
            row = self.df_video_links.iloc[index]
            date = row["date"]
            # Dates are stored as "/"-separated integers (year/month/day).
            date_datetime = Date(*map(int, date.split("/")))
            if date_start <= date_datetime <= date_end:
                dated_rows.append((date_datetime, index, date, row["title"]))
        # The parsed date leads each tuple, so a plain sort orders by date.
        dated_rows.sort()
        return [(index, date, title) for _, index, date, title in dated_rows]

    def get_video_links(self):
        """Read ``video_links.csv`` using its first column as the index."""
        return pd.read_csv(DATA_FOLDER / "video_links.csv", index_col=0)

    def get_content(self, index):
        """Load the transcription CSV belonging to ``index``.

        Files in ``RAW_FOLDER`` are named ``<index>-<number>.csv``. When
        several exist for the same index, the one with the largest number is
        read.

        Args:
            index: Identifier that prefixes the transcription file name.

        Returns:
            The transcription as a DataFrame.

        Raises:
            FileNotFoundError: If no file for ``index`` exists.
        """
        best = self._pick_transcription_file(os.listdir(RAW_FOLDER), index)
        if best is None:
            raise FileNotFoundError(
                "no transcription file for index {} in {}".format(index, RAW_FOLDER)
            )
        return pd.read_csv(RAW_FOLDER / best)

    @staticmethod
    def _pick_transcription_file(names, index):
        """Return the ``<index>-<number>.csv`` name with the largest number.

        Returns ``None`` when no name in ``names`` matches.
        """
        # fullmatch plus an escaped dot rejects names such as "1-2.csv.bak"
        # or "1-2xcsv" that a prefix-only match would accept.
        pattern = re.compile(r"{}-(\d+)\.csv".format(re.escape(str(index))))
        best_name = None
        best_key = None
        for name in names:
            found = pattern.fullmatch(name)
            if found is None:
                continue
            # Compare numerically: as strings "1-9.csv" sorts after "1-10.csv".
            key = (int(found.group(1)), name)
            if best_key is None or key > best_key:
                best_key, best_name = key, name
        return best_name
class MultiIndexSearcher:
    """Run one query against several indexes and pool the hit titles."""

    def __init__(self, ixes):
        self.ixes = ixes

    def search(self, **kwargs):
        """Return the ``title`` field of every hit, index by index.

        Args:
            **kwargs: Passed to each index searcher's ``search`` call, which
                is always given ``limit=None`` in addition.

        Returns:
            A flat list of titles in the order the indexes were supplied.
        """
        collected = []
        for index in self.ixes:
            with index.searcher() as opened:
                # Read the stored titles while the searcher is still open.
                collected.extend(
                    hit["title"] for hit in opened.search(**kwargs, limit=None)
                )
        return collected
@st.cache_resource
def _load_searcher():
    """Build the shared ``Searcher`` once and reuse it across reruns.

    Streamlit re-executes this script on every widget interaction; caching
    avoids reopening every index and re-reading the CSV each time.
    """
    return Searcher()


searcher = _load_searcher()
def main():
    """Render the KATO DB page: search form, result table and transcription.

    The keyword box is split on whitespace and the words are AND-ed together
    against the ``content`` field. An empty or whitespace-only box matches
    every document. Results are filtered to the chosen date range, and the
    transcription of the selected entry is shown below the table.
    """
    st.title("KATO DB")

    keyword = st.text_input(
        "検索したいキーワードを入力して、Enterを押してください\n"
        "空欄だと全文書表示します。\n"
        "検索にヒットしない場合書き起こしAIに認識されていない可能性があります。(「もこう」など)"
    )
    date_start = st.date_input(
        "検索したい開始日付を入力してください",
        Date(2009, 1, 1),
    )
    date_end = st.date_input(
        "検索したい終了日付を入力してください",
        Date(2050, 12, 31),
    )

    # Build the query. str.split() also splits on full-width spaces, and a
    # box holding only whitespace yields no words, so it is treated like an
    # empty box instead of producing an empty AND query.
    keyword_list = keyword.split()
    if keyword_list:
        query = whoosh.query.And(
            [whoosh.query.Term("content", word) for word in keyword_list]
        )
    else:
        query = whoosh.query.Every()

    contents = searcher.search(q=query, date_start=date_start, date_end=date_end)
    st.write("該当件数:{}件".format(len(contents)))

    results = pd.DataFrame(contents, columns=["管理番号", "放送日", "動画タイトル"])
    st.dataframe(results, hide_index=True)

    selected_index = st.selectbox(
        "管理番号を選択して、Enterを押して書き起こしを表示します",
        results["管理番号"],
    )
    # Nothing is selected when the result table is empty.
    if selected_index is not None:
        df_transcription = searcher.get_content(selected_index)
        st.dataframe(df_transcription, width=1000)

    st.write("ダウンロードはこちらから:https://github.com/konbraphat51/kato_db_dataset")
    st.write("データ収集のプログラムコード・仕組みの説明はこちらから:https://github.com/konbraphat51/kato_db")
    st.write("加藤AIのプロトタイプ:https://colab.research.google.com/drive/1QsJN50wvLEJx04P4XaBtsKqx1q3532OU?usp=sharing")
    st.write("ブログ(協力者募集しています):https://qiita.com/konbraphat51/items/5b27afda442c13806c25")
    st.write("データ最終更新:2023/8/19")
# Render the page only when this file is executed as a script.
if __name__ == "__main__":
    main()