Spaces:
Runtime error
Runtime error
Commit
·
9da4a82
0
Parent(s):
:wrench: :wrench:
Browse files- .gitignore +138 -0
- LICENSE +21 -0
- README.md +37 -0
- app.py +123 -0
- cli_app.py +17 -0
- ingest_data.py +33 -0
- public/chat_history.png +0 -0
- public/chat_history_parameters.png +0 -0
- query_data.py +34 -0
- requirements.txt +5 -0
- telegram_chat_loader.py +47 -0
.gitignore
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
pip-wheel-metadata/
|
24 |
+
share/python-wheels/
|
25 |
+
*.egg-info/
|
26 |
+
.installed.cfg
|
27 |
+
*.egg
|
28 |
+
MANIFEST
|
29 |
+
|
30 |
+
# PyInstaller
|
31 |
+
# Usually these files are written by a python script from a template
|
32 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
33 |
+
*.manifest
|
34 |
+
*.spec
|
35 |
+
|
36 |
+
# Installer logs
|
37 |
+
pip-log.txt
|
38 |
+
pip-delete-this-directory.txt
|
39 |
+
|
40 |
+
# Unit test / coverage reports
|
41 |
+
htmlcov/
|
42 |
+
.tox/
|
43 |
+
.nox/
|
44 |
+
.coverage
|
45 |
+
.coverage.*
|
46 |
+
.cache
|
47 |
+
nosetests.xml
|
48 |
+
coverage.xml
|
49 |
+
*.cover
|
50 |
+
*.py,cover
|
51 |
+
.hypothesis/
|
52 |
+
.pytest_cache/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
target/
|
76 |
+
|
77 |
+
# Jupyter Notebook
|
78 |
+
.ipynb_checkpoints
|
79 |
+
|
80 |
+
# IPython
|
81 |
+
profile_default/
|
82 |
+
ipython_config.py
|
83 |
+
|
84 |
+
# pyenv
|
85 |
+
.python-version
|
86 |
+
|
87 |
+
# pipenv
|
88 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
89 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
90 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
91 |
+
# install all needed dependencies.
|
92 |
+
#Pipfile.lock
|
93 |
+
|
94 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
95 |
+
__pypackages__/
|
96 |
+
|
97 |
+
# Celery stuff
|
98 |
+
celerybeat-schedule
|
99 |
+
celerybeat.pid
|
100 |
+
|
101 |
+
# SageMath parsed files
|
102 |
+
*.sage.py
|
103 |
+
|
104 |
+
# Environments
|
105 |
+
.env
|
106 |
+
.venv
|
107 |
+
env/
|
108 |
+
venv/
|
109 |
+
ENV/
|
110 |
+
env.bak/
|
111 |
+
venv.bak/
|
112 |
+
|
113 |
+
# Spyder project settings
|
114 |
+
.spyderproject
|
115 |
+
.spyproject
|
116 |
+
|
117 |
+
# Rope project settings
|
118 |
+
.ropeproject
|
119 |
+
|
120 |
+
# mkdocs documentation
|
121 |
+
/site
|
122 |
+
|
123 |
+
# mypy
|
124 |
+
.mypy_cache/
|
125 |
+
.dmypy.json
|
126 |
+
dmypy.json
|
127 |
+
|
128 |
+
# Pyre type checker
|
129 |
+
.pyre/
|
130 |
+
|
131 |
+
.idea/
|
132 |
+
.DS_Store
|
133 |
+
|
134 |
+
# Chat data
|
135 |
+
data/
|
136 |
+
|
137 |
+
# Embedding data
|
138 |
+
*.pkl
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2023 Harrison Chase
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Chat with your Telegram Chat!
|
2 |
+
|
3 |
+
Understand who you are and your relationships by creating a ChatGPT like experience over your own Telegram chat with [LangChain](https://github.com/hwchase17/langchain).
|
4 |
+
|
5 |
+
Here is a very scientific peer-reviewed mathematical equation:
|
6 |
+
|
7 |
+
```
|
8 |
+
Your Telegram chats ≈ Your thoughts ≈ You
|
9 |
+
```
|
10 |
+
|
11 |
+
|
12 |
+
When have you been the happiest? What triggers you instantly? How could you have been more compassionate? When do you say yes and when do you say no? At what time are you the funniest?
|
13 |
+
|
14 |
+
|
15 |
+
Ask anything you've wanted to know about yourself and your relationship with someone.
|
16 |
+
|
17 |
+
## Ingest data
|
18 |
+
From the Telegram Desktop App, export your chat history.
|
19 |
+
|
20 |
+

|
21 |
+
|
22 |
+
Uncheck all the boxes and make sure you export it in `JSON`, you will then be able to download a `result.json` file.
|
23 |
+
|
24 |
+

|
25 |
+
|
26 |
+
You can then embed the data by running
|
27 |
+
|
28 |
+
```python ingest_data.py [result.json path]```
|
29 |
+
|
30 |
+
|
31 |
+
## Running the Application
|
32 |
+
|
33 |
+
By running `python app.py` from the command line you can easily interact with your Telegram Chat through a Gradio Web App.
|
34 |
+
|
35 |
+
|
36 |
+
|
37 |
+
*The original boilerplate is from https://github.com/hwchase17/chat-your-data*
|
app.py
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import Optional, Tuple
|
3 |
+
|
4 |
+
import gradio as gr
|
5 |
+
import pickle
|
6 |
+
from query_data import get_chain
|
7 |
+
from ingest_data import embed_chat
|
8 |
+
from threading import Lock
|
9 |
+
|
10 |
+
|
11 |
+
def click_embed(file):
    """Embed the uploaded Telegram chat export, then swap the UI.

    Builds the vectorstore from the uploaded file, loads it back from
    disk, and returns the QA chain plus visibility updates that hide the
    setup widgets and reveal the chat widgets.
    """
    embed_chat(file.name)
    # embed_chat persists the index to vectorstore.pkl; reload it here.
    with open("vectorstore.pkl", "rb") as handle:
        store = pickle.load(handle)
    qa_chain = get_chain(store)
    # Three setup widgets get hidden, four chat widgets get shown.
    hidden = [gr.update(visible=False) for _ in range(3)]
    shown = [gr.update(visible=True) for _ in range(4)]
    return (qa_chain, *hidden, *shown)
|
21 |
+
|
22 |
+
|
23 |
+
def set_openai_api_key(api_key: str):
    """Store the user-provided OpenAI key in the process environment.

    Empty input is ignored so a cleared textbox does not wipe a
    previously set key.
    """
    if not api_key:
        return
    os.environ["OPENAI_API_KEY"] = api_key
|
28 |
+
|
29 |
+
|
30 |
+
def upload_file(file_obj):
    """Identity passthrough used as a gradio upload callback."""
    return file_obj
|
32 |
+
|
33 |
+
|
34 |
+
class ChatWrapper:
    """Serializes chat requests: only one chain invocation runs at a time."""

    def __init__(self):
        # Guards chain invocation — the chain/history update is not
        # safe to run concurrently from multiple gradio callbacks.
        self.lock = Lock()

    def __call__(
        self, api_key: str, inp: str, history: Optional[Tuple[str, str]], chain
    ):
        """Execute the chat functionality.

        Returns ``(history, history)``: the updated transcript twice,
        because gradio wires the output to both the chatbot widget and
        the session state.
        """
        # `with` replaces the manual acquire/try/finally/release; the
        # former redundant `except Exception as e: raise e` is dropped —
        # exceptions propagate unchanged either way.
        with self.lock:
            history = history or []
            # If chain is None, the embeddings step has not run yet.
            if chain is None:
                history.append((inp, "Please paste your OpenAI key to use"))
                return history, history
            # Set the OpenAI key for this request.
            import openai
            openai.api_key = api_key
            # Run the chain and append the question/answer pair.
            output = chain({"question": inp, "chat_history": history})["answer"]
            history.append((inp, output))
        return history, history
|
61 |
+
|
62 |
+
|
63 |
+
# --- Gradio UI wiring ----------------------------------------------------
chat = ChatWrapper()

block = gr.Blocks(css=".gradio-container {background-color: lightgray}")

with block:
    with gr.Row():
        gr.Markdown("<h3><center>Telegram Chat Chat</center></h3>")

    # Setup widgets: visible until click_embed flips visibility.
    openai_api_key_textbox = gr.Textbox(
        placeholder="Paste your OpenAI API key (sk-...)",
        show_label=False,
        lines=1,
        type="password",
        visible=True
    )
    telegram_chat_file = gr.File(file_count="single", file_types=["json"], interactive=True, show_label=True,
                                 visible=True,
                                 label="Telegram chat history exported .json file")
    embed_button = gr.Button("Create embeddings", visible=True)

    # Chat widgets: hidden until embeddings exist.
    chatbot = gr.Chatbot(visible=False)

    with gr.Row():
        message = gr.Textbox(
            label="What's your question?",
            placeholder="Ask questions about your Telegram conversation",
            lines=1,
            visible=False
        )
        submit = gr.Button(value="Send", variant="secondary", visible=False).style(full_width=False)
    with gr.Row(visible=False) as examples_row:
        gr.Examples(
            examples=[
                "When was [your name] happy and how does it show?",
                "How could [your name] have been more compassionate?",
                "What triggers [your name] instantly?",
            ],
            inputs=message,
        )

    gr.HTML(
        "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
    )

    # state: chat transcript; agent_state: QA chain built by click_embed.
    state = gr.State()
    agent_state = gr.State()

    # click_embed returns (chain, 3 hide-updates, 4 show-updates) — the
    # output list below must stay in that exact order.
    embed_button.click(click_embed, inputs=[telegram_chat_file],
                       outputs=[agent_state, openai_api_key_textbox, telegram_chat_file, embed_button, chatbot,
                                message, submit, examples_row])

    submit.click(chat, inputs=[openai_api_key_textbox, message, state, agent_state], outputs=[chatbot, state])
    # (stray debug `print(agent_state)` removed — it only printed the
    # gr.State placeholder at build time.)
    message.submit(chat, inputs=[openai_api_key_textbox, message, state, agent_state], outputs=[chatbot, state])

    openai_api_key_textbox.change(
        set_openai_api_key,
        inputs=[openai_api_key_textbox],
    )

block.launch(debug=True)
|
cli_app.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pickle

from query_data import get_chain


if __name__ == "__main__":
    # Load the FAISS vectorstore produced by ingest_data.py.
    with open("vectorstore.pkl", "rb") as f:
        vectorstore = pickle.load(f)
    qa_chain = get_chain(vectorstore)
    chat_history = []
    print("Chat with your docs!")
    while True:
        print("Human:")
        try:
            question = input()
        except (EOFError, KeyboardInterrupt):
            # Exit cleanly on Ctrl-D / Ctrl-C instead of a traceback.
            break
        result = qa_chain({"question": question, "chat_history": chat_history})
        chat_history.append((question, result["answer"]))
        print("AI:")
        print(result["answer"])
|
ingest_data.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import pickle
|
3 |
+
|
4 |
+
from langchain.text_splitter import CharacterTextSplitter
|
5 |
+
from telegram_chat_loader import TelegramChatLoader
|
6 |
+
from langchain.vectorstores.faiss import FAISS
|
7 |
+
from langchain.embeddings import OpenAIEmbeddings
|
8 |
+
|
9 |
+
|
10 |
+
# Load Data
|
11 |
+
def embed_chat(chat_file_path):
    """Build and persist a FAISS vectorstore from a Telegram chat export.

    Loads the exported chat JSON, splits the transcript into overlapping
    chunks, embeds them with OpenAI, and pickles the resulting FAISS
    index to ``vectorstore.pkl``.
    """
    # Load the exported chat as a single concatenated Document.
    raw_documents = TelegramChatLoader(chat_file_path).load()

    # Split the transcript on message boundaries into overlapping chunks.
    splitter = CharacterTextSplitter(separator="\n\n", chunk_size=512, chunk_overlap=20)
    chunks = splitter.split_documents(raw_documents)

    # Embed the chunks and index them in FAISS.
    store = FAISS.from_documents(chunks, OpenAIEmbeddings())

    # Persist the index for app.py / cli_app.py to load later.
    with open("vectorstore.pkl", "wb") as out:
        pickle.dump(store, out)
|
26 |
+
|
27 |
+
|
28 |
+
if __name__ == '__main__':
    # CLI entry point: `python ingest_data.py result.json`
    parser = argparse.ArgumentParser()
    parser.add_argument('file_name', type=str, help='The Telegram chat exported *.json file')
    cli_args = parser.parse_args()
    embed_chat(cli_args.file_name)
|
public/chat_history.png
ADDED
![]() |
public/chat_history_parameters.png
ADDED
![]() |
query_data.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.prompts.prompt import PromptTemplate
|
2 |
+
from langchain.llms import OpenAI
|
3 |
+
from langchain.chains import ChatVectorDBChain
|
4 |
+
|
5 |
+
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
|
6 |
+
You can assume the question about the conversation containing all the messages exchanged between these people.
|
7 |
+
|
8 |
+
Chat History:
|
9 |
+
{chat_history}
|
10 |
+
Follow Up Input: {question}
|
11 |
+
Standalone question:"""
|
12 |
+
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
|
13 |
+
|
14 |
+
template = """You are an AI assistant for answering questions about this online conversation between these people.
|
15 |
+
You are given the following extracted parts of a long document and a question.
|
16 |
+
Provide a conversational answer that solely comes from this online conversation between these people and your interpretation.
|
17 |
+
Your responses should be informative, interesting, and engaging. You should respond thoroughly.
|
18 |
+
Question: {question}
|
19 |
+
=========
|
20 |
+
{context}
|
21 |
+
=========
|
22 |
+
Answer:"""
|
23 |
+
QA_PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])
|
24 |
+
|
25 |
+
|
26 |
+
def get_chain(vectorstore):
    """Build the conversational retrieval chain over *vectorstore*.

    Uses the module-level condense-question and QA prompts; temperature 0
    keeps the answers deterministic.
    """
    return ChatVectorDBChain.from_llm(
        OpenAI(temperature=0),
        vectorstore,
        qa_prompt=QA_PROMPT,
        condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    )
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
langchain
|
2 |
+
openai
|
3 |
+
unstructured
|
4 |
+
faiss-cpu
|
5 |
+
gradio
|
telegram_chat_loader.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Loader that loads Telegram chat json dump."""
|
2 |
+
import json
|
3 |
+
import pandas as pd
|
4 |
+
from pathlib import Path
|
5 |
+
from typing import List
|
6 |
+
|
7 |
+
from langchain.docstore.document import Document
|
8 |
+
from langchain.document_loaders.base import BaseLoader
|
9 |
+
|
10 |
+
|
11 |
+
def concatenate_rows(row):
    """Format one message row as ``'<sender> on <date>: <text>'`` plus a
    blank line (the blank line is the chunk separator used downstream)."""
    return "{sender} on {date}: {text}\n\n".format(
        sender=row['from'], date=row['date'], text=row['text']
    )
|
16 |
+
|
17 |
+
|
18 |
+
class TelegramChatLoader(BaseLoader):
    """Loader that loads a Telegram chat json dump into one Document."""

    def __init__(self, path: str):
        """Initialize with path to the exported ``result.json`` file."""
        self.file_path = path

    def load(self) -> List[Document]:
        """Load documents.

        Returns a one-element list: the whole chat concatenated into a
        single Document whose metadata records the source path.
        """
        p = Path(self.file_path)

        with open(p, encoding="utf8") as f:
            d = json.load(f)

        # json_normalize already returns a DataFrame — the former extra
        # pd.DataFrame(...) wrap was redundant and has been removed.
        df = pd.json_normalize(d['messages'])

        # Only keep plain text messages (no services, nor links, hashtags,
        # code, bold ... — formatted messages export `text` as a list of
        # fragments rather than a plain str).
        df = df[
            (df.type == "message") &
            (df.text.apply(lambda x: isinstance(x, str)))
        ]

        df = df[["date", "text", "from"]]

        text = df.apply(concatenate_rows, axis=1).str.cat(sep='')

        metadata = {"source": str(p)}

        return [Document(page_content=text, metadata=metadata)]
|