maxime committed on
Commit
9da4a82
·
0 Parent(s):

:wrench: :wrench:

Browse files
.gitignore ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ .idea/
132
+ .DS_Store
133
+
134
+ # Chat data
135
+ data/
136
+
137
+ # Embedding data
138
+ *.pkl
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Harrison Chase
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Chat with your Telegram Chat!
2
+
3
+ Understand who you are and your relationships by creating a ChatGPT-like experience over your own Telegram chat with [LangChain](https://github.com/hwchase17/langchain).
4
+
5
+ Here is a very scientific peer-reviewed mathematical equation:
6
+
7
+ ```
8
+ Your Telegram chats ≈ Your thoughts ≈ You
9
+ ```
10
+
11
+
12
+ When have you been the happiest? What triggers you instantly? How could you have been more compassionate? When do you say yes and when do you say no? At what time are you the funniest?
13
+
14
+
15
+ Ask anything you've wanted to know about yourself and your relationship with someone.
16
+
17
+ ## Ingest data
18
+ From the Telegram Desktop App, export your chat history.
19
+
20
+ ![](public/chat_history.png)
21
+
22
+ Uncheck all the boxes and make sure you export it in `JSON`; you will then be able to download a `result.json` file.
23
+
24
+ ![](public/chat_history_parameters.png)
25
+
26
+ You can then embed the data by running
27
+
28
+ ```python ingest_data.py [result.json path]```
29
+
30
+
31
+ ## Running the Application
32
+
33
+ By running `python app.py` from the command line you can easily interact with your Telegram Chat through a Gradio Web App.
34
+
35
+
36
+
37
+ *The original boilerplate is from https://github.com/hwchase17/chat-your-data*
app.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Optional, Tuple
3
+
4
+ import gradio as gr
5
+ import pickle
6
+ from query_data import get_chain
7
+ from ingest_data import embed_chat
8
+ from threading import Lock
9
+
10
+
11
def click_embed(file):
    """Embed the uploaded Telegram export and flip the UI into chat mode.

    Builds the vectorstore from the uploaded file, loads it back from
    ``vectorstore.pkl``, constructs the QA chain, and returns visibility
    updates that hide the setup widgets (key box, file picker, embed
    button) and reveal the chat widgets (chatbot, message, submit,
    examples row).
    """
    embed_chat(file.name)
    with open("vectorstore.pkl", "rb") as f:
        store = pickle.load(f)
    qa_chain = get_chain(store)
    hidden = [gr.update(visible=False) for _ in range(3)]
    shown = [gr.update(visible=True) for _ in range(4)]
    return (qa_chain, *hidden, *shown)
21
+
22
+
23
def set_openai_api_key(api_key: str):
    """Store a non-empty key in the OPENAI_API_KEY environment variable.

    Empty/blank values are ignored so an untouched textbox does not
    clobber an already-set key.
    """
    if not api_key:
        return
    os.environ["OPENAI_API_KEY"] = api_key
28
+
29
+
30
def upload_file(file_obj):
    """Identity handler: hand the uploaded file object straight back to Gradio."""
    return file_obj
32
+
33
+
34
class ChatWrapper:
    """Callable that serializes chat requests so only one runs the chain at a time."""

    def __init__(self):
        # One lock shared across all Gradio callbacks into this wrapper.
        self.lock = Lock()

    def __call__(
        self, api_key: str, inp: str, history: Optional[Tuple[str, str]], chain
    ):
        """Execute the chat functionality.

        Parameters:
            api_key: OpenAI API key pasted by the user.
            inp: the user's question.
            history: accumulated (question, answer) pairs, or None on first call.
            chain: the QA chain built by click_embed, or None if embeddings
                were never created.

        Returns the updated history twice — once for the chatbot display and
        once for the persistent state, matching the Gradio wiring.
        """
        # `with` replaces the original acquire/try/finally/release and the
        # redundant `except Exception as e: raise e` — behavior is unchanged.
        with self.lock:
            history = history or []
            # If chain is None, that is because no API key was provided.
            if chain is None:
                history.append((inp, "Please paste your OpenAI key to use"))
                return history, history
            # Set OpenAI key
            import openai
            openai.api_key = api_key
            # Run chain and append input.
            output = chain({"question": inp, "chat_history": history})["answer"]
            history.append((inp, output))
        return history, history
61
+
62
+
63
chat = ChatWrapper()

block = gr.Blocks(css=".gradio-container {background-color: lightgray}")

with block:
    with gr.Row():
        gr.Markdown("<h3><center>Telegram Chat Chat</center></h3>")

    # Setup widgets — visible until embeddings are created.
    openai_api_key_textbox = gr.Textbox(
        placeholder="Paste your OpenAI API key (sk-...)",
        show_label=False,
        lines=1,
        type="password",
        visible=True
    )
    telegram_chat_file = gr.File(file_count="single", file_types=["json"], interactive=True, show_label=True,
                                 visible=True,
                                 label="Telegram chat history exported .json file")
    embed_button = gr.Button("Create embeddings", visible=True)

    # Chat widgets — hidden until click_embed flips them visible.
    chatbot = gr.Chatbot(visible=False)

    with gr.Row():
        message = gr.Textbox(
            label="What's your question?",
            placeholder="Ask questions about your Telegram conversation",
            lines=1,
            visible=False
        )
        submit = gr.Button(value="Send", variant="secondary", visible=False).style(full_width=False)
    with gr.Row(visible=False) as examples_row:
        gr.Examples(
            examples=[
                "When was [your name] happy and how does it show?",
                "How could [your name] have been more compassionate?",
                "What triggers [your name] instantly?",
            ],
            inputs=message,
        )

    gr.HTML(
        "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
    )

    state = gr.State()        # chat history shared between submit paths
    agent_state = gr.State()  # QA chain produced by click_embed

    # Embedding swaps the setup widgets out and the chat widgets in.
    embed_button.click(click_embed, inputs=[telegram_chat_file],
                       outputs=[agent_state, openai_api_key_textbox, telegram_chat_file, embed_button, chatbot, message,
                                submit, examples_row])

    # Both the Send button and Enter in the textbox run the same chat call.
    # (Stray debug `print(agent_state)` removed.)
    submit.click(chat, inputs=[openai_api_key_textbox, message, state, agent_state], outputs=[chatbot, state])
    message.submit(chat, inputs=[openai_api_key_textbox, message, state, agent_state], outputs=[chatbot, state])

    openai_api_key_textbox.change(
        set_openai_api_key,
        inputs=[openai_api_key_textbox],
    )

block.launch(debug=True)
cli_app.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ from query_data import get_chain
3
+
4
+
5
if __name__ == "__main__":
    # NOTE(review): pickle.load executes arbitrary code from the file —
    # only load a vectorstore.pkl you created yourself with ingest_data.py.
    with open("vectorstore.pkl", "rb") as f:
        vectorstore = pickle.load(f)
    qa_chain = get_chain(vectorstore)
    chat_history = []
    print("Chat with your docs!")
    while True:
        print("Human:")
        try:
            question = input()
        except (EOFError, KeyboardInterrupt):
            # Exit cleanly on Ctrl-D / Ctrl-C instead of a traceback.
            break
        result = qa_chain({"question": question, "chat_history": chat_history})
        chat_history.append((question, result["answer"]))
        print("AI:")
        print(result["answer"])
ingest_data.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import pickle
3
+
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from telegram_chat_loader import TelegramChatLoader
6
+ from langchain.vectorstores.faiss import FAISS
7
+ from langchain.embeddings import OpenAIEmbeddings
8
+
9
+
10
# Load Data
def embed_chat(chat_file_path):
    """Build and persist a FAISS vectorstore from a Telegram chat export.

    Loads the JSON dump at ``chat_file_path``, splits the resulting text on
    blank lines into ~512-character chunks, embeds the chunks with OpenAI,
    and pickles the FAISS index to ``vectorstore.pkl``.
    """
    documents = TelegramChatLoader(chat_file_path).load()

    # Split text
    splitter = CharacterTextSplitter(separator="\n\n", chunk_size=512, chunk_overlap=20)
    chunks = splitter.split_documents(documents)

    # Load Data to vectorstore
    store = FAISS.from_documents(chunks, OpenAIEmbeddings())

    # Save vectorstore
    with open("vectorstore.pkl", "wb") as f:
        pickle.dump(store, f)
26
+
27
+
28
if __name__ == '__main__':
    # Single positional argument: path to the exported Telegram JSON file.
    parser = argparse.ArgumentParser()
    parser.add_argument('file_name', type=str, help='The Telegram chat exported *.json file')
    embed_chat(parser.parse_args().file_name)
public/chat_history.png ADDED
public/chat_history_parameters.png ADDED
query_data.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.prompts.prompt import PromptTemplate
2
+ from langchain.llms import OpenAI
3
+ from langchain.chains import ChatVectorDBChain
4
+
5
# Prompt for the chain's question-condensing step: rewrites a follow-up
# question, given the prior chat history, into a standalone question.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
You can assume the question about the conversation containing all the messages exchanged between these people.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

# Prompt for the final answer: retrieved chat excerpts are injected as
# {context} and the (condensed) question as {question}.
template = """You are an AI assistant for answering questions about this online conversation between these people.
You are given the following extracted parts of a long document and a question.
Provide a conversational answer that solely comes from this online conversation between these people and your interpretation.
Your responses should be informative, interesting, and engaging. You should respond thoroughly.
Question: {question}
=========
{context}
=========
Answer:"""
QA_PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])
24
+
25
+
26
def get_chain(vectorstore):
    """Build a ChatVectorDBChain over *vectorstore*.

    Uses a deterministic (temperature=0) OpenAI LLM with the module's
    condense-question and QA prompts.
    """
    return ChatVectorDBChain.from_llm(
        OpenAI(temperature=0),
        vectorstore,
        qa_prompt=QA_PROMPT,
        condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    )
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ langchain
2
+ openai
3
+ unstructured
4
+ faiss-cpu
5
+ gradio
telegram_chat_loader.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Loader that loads Telegram chat json dump."""
2
+ import json
3
+ import pandas as pd
4
+ from pathlib import Path
5
+ from typing import List
6
+
7
+ from langchain.docstore.document import Document
8
+ from langchain.document_loaders.base import BaseLoader
9
+
10
+
11
def concatenate_rows(row):
    """Format one message row as 'sender on date: text' plus a blank line."""
    return '{} on {}: {}\n\n'.format(row['from'], row['date'], row['text'])
16
+
17
+
18
class TelegramChatLoader(BaseLoader):
    """Loader that turns a Telegram chat JSON dump into a single Document."""

    def __init__(self, path: str):
        """Initialize with the path to the exported ``result.json`` file."""
        self.file_path = path

    def load(self) -> List[Document]:
        """Load documents.

        Returns a one-element list: every plain-text message concatenated as
        'sender on date: text' lines, with the source path in the metadata.
        """
        p = Path(self.file_path)

        with open(p, encoding="utf8") as f:
            d = json.load(f)

        # json_normalize already returns a DataFrame — the original wrapped
        # it in pd.DataFrame(...) again for no effect.
        df = pd.json_normalize(d['messages'])

        # Only keep plain text messages (no services, nor links, hashtags,
        # code, bold, ... — Telegram exports those as lists, not strings).
        # isinstance replaces the non-idiomatic `type(x) == str` check.
        df_filtered = df[
            (df.type == "message") &
            (df.text.apply(lambda x: isinstance(x, str)))
        ]

        df_filtered = df_filtered[["date", "text", "from"]]

        text = df_filtered.apply(concatenate_rows, axis=1).str.cat(sep='')

        metadata = {"source": str(p)}

        return [Document(page_content=text, metadata=metadata)]