setup
- 2Tutor.py +168 -0
- AmesHousing.tsv.csv +0 -0
- LICENSE +201 -0
- README.md +100 -2
- data2.jpeg +0 -0
- logs/conversation_logs.json +0 -0
- pages/1_Business_understanding.py +67 -0
- pages/2_Data_understanding.py +70 -0
- pages/3_Data_preparation.py +62 -0
- pages/4_Feature_engineering.py +78 -0
- pages/5_Modeling.py +54 -0
- pages/6_Evaluation.py +62 -0
- pages/7_Deployment.py +43 -0
- pages/8_Models.py +48 -0
- pages/ML_Algorithms/decision_trees.py +4 -0
- pages/ML_Algorithms/logistic_regression.py +4 -0
- pages/ML_Algorithms/neural_networks.py +4 -0
- pages/ML_Algorithms/random_forest.py +4 -0
- pages/ML_Algorithms/svm.py +4 -0
- pages/__pycache__/1_Business_understanding.cpython-311.pyc +0 -0
- pages/__pycache__/2_Data_understanding.cpython-311.pyc +0 -0
- pages/__pycache__/3_Algorithms.cpython-311.pyc +0 -0
- pages/__pycache__/3_Data_preparation.cpython-311.pyc +0 -0
- pages/__pycache__/4_Data_ingestion.cpython-311.pyc +0 -0
- pages/__pycache__/4_Feature_engineering.cpython-311.pyc +0 -0
- pages/__pycache__/5_Data_preparation.cpython-311.pyc +0 -0
- pages/__pycache__/5_Modeling.cpython-311.pyc +0 -0
- pages/__pycache__/6_Evaluation.cpython-311.pyc +0 -0
- pages/__pycache__/7_Deployment.cpython-311.pyc +0 -0
- pages/__pycache__/8_Models.cpython-311.pyc +0 -0
- sidebar.png +0 -0
2Tutor.py
ADDED
@@ -0,0 +1,168 @@
import streamlit as st
import importlib
import os
import json
from datetime import datetime
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables from .env file
load_dotenv()

# Set page title and layout
st.set_page_config(page_title="Data Science Tutor", layout="wide")

# Hide Streamlit's default page navigation menu
st.markdown("""
<style>
/* Hide the default "Pages" menu in the top-left sidebar */
[data-testid="stSidebarNav"] {
    display: none;
}
</style>
""", unsafe_allow_html=True)

# Sidebar with image above the CRISP-DM steps
st.sidebar.image(
    "data2.jpeg",  # Replace with your file path or URL
    use_container_width=True
)

# Sidebar navigation
st.sidebar.title("CRISP-DM Steps")
sections = {
    "Main Page": None,
    "1. Business Understanding": "1_Business_understanding",
    "2. Data Understanding": "2_Data_understanding",
    "3. Data Preparation": "3_Data_preparation",
    "4. Feature Engineering": "4_Feature_engineering",
    "5. Modeling": "5_Modeling",
    "6. Evaluation": "6_Evaluation",
    "7. Deployment & Testing": "7_Deployment",
    "8. ML, Deep Learning & Transformers": "8_Models"
}

# By default, make the first item (Main Page) selected.
selected_section = st.sidebar.radio("Select a topic:", list(sections.keys()), index=0)

# If the user selects "Main Page", just show the introduction content.
if sections[selected_section] is None:
    st.title("🎓 Welcome to the Data Science Tutor!")
    st.markdown(
        """
        <div style="color: #2FA4E7; margin-top: 1rem;">
        <h2>About This App</h2>
        <p>
        This application is designed to guide you through the CRISP-DM process
        for data science projects. Each section in the sidebar highlights a
        different step in the process, providing structured lessons, best
        practices, and hands-on examples.
        </p>
        <h3>App Sections</h3>
        <ul>
        <li><strong>1. Business Understanding</strong> – Clarify project objectives, requirements, and success criteria.</li>
        <li><strong>2. Data Understanding</strong> – Explore data sources, structures, and initial insights.</li>
        <li><strong>3. Data Preparation</strong> – Clean, integrate, and transform the data for modeling.</li>
        <li><strong>4. Feature Engineering</strong> – Engineer and select relevant features for better models.</li>
        <li><strong>5. Modeling</strong> – Develop, train, and tune predictive models.</li>
        <li><strong>6. Evaluation</strong> – Assess performance metrics and refine models.</li>
        <li><strong>7. Deployment & Testing</strong> – Deploy models into production environments and validate.</li>
        <li><strong>8. ML, Deep Learning & Transformers</strong> – Delve deeper into advanced methods and architectures.</li>
        </ul>
        </div>
        """,
        unsafe_allow_html=True
    )
else:
    # Otherwise, load the selected module from the pages folder
    module_name = f"pages.{sections[selected_section]}"
    module = importlib.import_module(module_name)
    module.run()

# OpenAI API section
st.sidebar.title("Ask AI")
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    api_key = st.sidebar.text_input("Enter your OpenAI API Key", type="password")

# Pass the collected key explicitly so the sidebar input is actually used;
# a bare OpenAI() would ignore it and raise when no environment key is set.
client = OpenAI(api_key=api_key) if api_key else None

# Sidebar toggles that steer the chat context
st.sidebar.title("Focus Areas")
focus_areas = [
    "Data Cleaning & Wrangling",
    "Feature Engineering & Selection",
    "Model Selection & Tuning",
    "Interpretability & Explainability",
    "Model Deployment & Monitoring"
]
selected_focus_areas = [area for area in focus_areas if st.sidebar.checkbox(area)]

# Main chat section
st.title("Data Science Tutor Chat")
st.image("https://miro.medium.com/v2/resize:fit:100/format:webp/1*NfE0G4nEj4xX7Z_8dSx83g.png")

# Initialize conversation in the session state
if "messages" not in st.session_state:
    st.session_state["messages"] = [
        {"role": "assistant", "content": "How can I assist you with Data Science today?"}
    ]

# Track whether the one-time context prompt has been injected
if "context_prompt_added" not in st.session_state:
    st.session_state["context_prompt_added"] = False

st.write("---")
st.subheader("Chat")

for msg in st.session_state["messages"]:
    st.chat_message(msg["role"]).write(msg["content"])

if prompt := st.chat_input("Enter your question here:"):
    # Add context to the messages if toggles are selected
    focus_context = ""
    if selected_focus_areas:
        focus_context = f"Focus on {', '.join(selected_focus_areas)} in your response."

    # Add context based on the selected section
    section_context = f"The user is currently viewing the {selected_section} section. "

    # If the context prompt hasn't been added yet, build & inject it once;
    # otherwise, just add the user's question with the section context.
    if not st.session_state["context_prompt_added"]:
        st.session_state["messages"].append({"role": "user", "content": f"{section_context}{prompt}\n{focus_context}"})
        st.session_state["context_prompt_added"] = True
    else:
        st.session_state["messages"].append({"role": "user", "content": f"{section_context}{prompt}"})

    # Display the latest user message in the chat
    st.chat_message("user").write(st.session_state["messages"][-1]["content"])

    if client is None:
        st.error("Please provide an OpenAI API key in the sidebar first.")
        st.stop()

    # Now call GPT-4 with the entire conversation
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=st.session_state["messages"]
    )
    response_text = completion.choices[0].message.content.strip()

    st.session_state["messages"].append({"role": "assistant", "content": response_text})
    st.chat_message("assistant").write(response_text)

    # Log the conversation
    log_entry = {
        "timestamp": datetime.now().isoformat(),
        "user_query": prompt,
        "assistant_response": response_text,
        "focus_areas": selected_focus_areas,
        "selected_section": selected_section
    }
    log_file_path = os.path.join("logs", "conversation_logs.json")
    os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
    if os.path.exists(log_file_path):
        with open(log_file_path, "r") as log_file:
            logs = json.load(log_file)
    else:
        logs = []
    logs.append(log_entry)
    with open(log_file_path, "w") as log_file:
        json.dump(logs, log_file, indent=4)
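For reference, the logging block at the end of this file appends entries to a JSON array in `logs/conversation_logs.json`; each entry has the shape below (the values shown are illustrative, not taken from the committed log):

```json
[
    {
        "timestamp": "2025-01-01T12:00:00.000000",
        "user_query": "How do I handle missing values?",
        "assistant_response": "Common options are imputing with the mean or median, or dropping rows.",
        "focus_areas": ["Data Cleaning & Wrangling"],
        "selected_section": "3. Data Preparation"
    }
]
```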
AmesHousing.tsv.csv
ADDED
The diff for this file is too large to render.
LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
README.md
CHANGED
@@ -1,6 +1,6 @@
 ---
 title: LLM Tutor
-emoji:
+emoji: 👨‍🎓
 colorFrom: indigo
 colorTo: indigo
 sdk: streamlit
@@ -11,4 +11,102 @@ license: mit
 short_description: Learn ML/LLM
 ---
-
# Data Science Tutor

## Overview

The **Data Science Tutor** application is designed to guide users through the CRISP-DM process for data science projects. Each section in the sidebar highlights a different step in the process, providing structured lessons, best practices, and hands-on examples. The application also includes an AI-powered chat feature to assist users with their data science queries.

## Project Structure

The project is organized into the following main components:

### 1. Main Application (`Tutor.py`)

The main application file that sets up the Streamlit interface, including the sidebar navigation, chat functionality, and dynamic content loading based on the selected section.

### 2. Pages Directory (`pages/`)

Contains individual Python scripts for each section of the CRISP-DM process. Each script includes detailed content, explanations, and quizzes related to its respective topic.

- `1_Business_understanding.py`: Covers the Business Understanding phase.
- `2_Data_understanding.py`: Covers the Data Understanding phase.
- `3_Data_preparation.py`: Covers the Data Preparation phase.
- `4_Feature_engineering.py`: Covers Feature Engineering.
- `5_Modeling.py`: Covers the Modeling phase.
- `6_Evaluation.py`: Covers the Evaluation phase.
- `7_Deployment.py`: Covers Deployment and Testing.
- `8_Models.py`: Covers ML, Deep Learning, and Transformers.

### 3. Chat Functionality

The application includes an AI-powered chat feature that allows users to ask questions related to data science. The chat model's responses are tailored based on the selected section to provide relevant and focused answers.

### 4. Focus Areas

Users can select specific focus areas from the sidebar to further refine the context of their queries. The focus areas include:
- Data Cleaning & Wrangling
- Feature Engineering & Selection
- Model Selection & Tuning
- Interpretability & Explainability
- Model Deployment & Monitoring

## Installation

To run the application locally, follow these steps:

1. Clone the repository:
   ```sh
   git clone https://github.com/your-username/LLM-Tutor.git
   cd LLM-Tutor
   ```

2. Create a virtual environment and activate it:
   ```sh
   python -m venv venv
   source venv/bin/activate  # On Windows, use `venv\Scripts\activate`
   ```

3. Install the required dependencies:
   ```sh
   pip install -r requirements.txt
   ```

4. Set up your OpenAI API key:
   - Create a `.env` file in the root directory of the project.
   - Add your OpenAI API key to the `.env` file:
     ```
     OPENAI_API_KEY=your_openai_api_key
     ```

5. Run the application:
   ```sh
   streamlit run Tutor.py
   ```

## Usage

- **Select a CRISP-DM Step**: Use the sidebar to navigate through different steps of the CRISP-DM process.
- **Ask AI**: Enter your OpenAI API key in the sidebar and ask questions related to data science.
- **Focus Areas**: Select specific focus areas to refine the context of your queries.
- **Interactive Content**: Each section includes detailed explanations, key concepts, and quizzes to test your understanding.

## License

This project is licensed under the MIT License. See the LICENSE file for more details.

## Contributing

Contributions are welcome! Please read the CONTRIBUTING file for guidelines on how to contribute to this project.

## Acknowledgements

- [Streamlit](https://streamlit.io/)
- [OpenAI](https://www.openai.com/)
- [CRISP-DM](https://www.sv-europe.com/crisp-dm-methodology/)

---

pinned: false
license: mit
short_description: Learn ML/LLM
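Note that this diff does not add the `requirements.txt` that step 3 installs from. Based on the imports used across the app, a plausible starting point (the exact package list is an assumption) is:

```
streamlit
openai
python-dotenv
pandas
numpy
scikit-learn
matplotlib
torch
transformers
joblib
```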
data2.jpeg
ADDED
Binary image file.
logs/conversation_logs.json
ADDED
The diff for this file is too large to render.
pages/1_Business_understanding.py
ADDED
@@ -0,0 +1,67 @@
import streamlit as st

def run():
    st.title("Business Understanding")

    st.write("## Overview")
    st.write("""
    The Business Understanding phase is the first step in the CRISP-DM process. It involves understanding the project objectives and requirements from a business perspective, and then converting this knowledge into a data mining problem definition and a preliminary plan.
    """)

    st.write("## Key Concepts & Explanations")
    st.markdown("""
    - **Business Goals**: Clearly define the problem (e.g., predict customer churn).
    - **Constraints**: Time, budget, available data, legal considerations.
    - **Success Metrics**: Accuracy, revenue impact, efficiency gains.
    """)

    st.write("## Introduction")
    st.write("""
    Business Understanding is crucial for the success of any data science project. It ensures that the project is aligned with the business objectives and that the results will be actionable and valuable to the organization.
    """)

    st.header("Objectives")
    st.write("""
    - **Understand the Business Objectives**: Gain a clear understanding of the business goals and how they translate into data mining goals.
    - **Assess the Situation**: Evaluate the current situation, including resources, constraints, and risks.
    - **Determine Data Mining Goals**: Define specific data mining goals that align with the business objectives.
    - **Produce a Project Plan**: Develop a detailed project plan that outlines the steps, resources, and timeline for the project.
    """)

    st.header("Key Activities")
    st.write("""
    - **Identify Business Objectives and Constraints**: Work with stakeholders to identify the key business objectives and any constraints that may impact the project.
    - **Define Success Criteria**: Establish clear criteria for success, including key performance indicators (KPIs) and metrics.
    - **Develop a Project Plan**: Create a comprehensive project plan that includes a timeline, resource allocation, and a risk management strategy.
    """)

    st.write("## Detailed Steps")
    st.write("""
    1. **Determine Business Objectives**:
        - Interview stakeholders to understand their goals and expectations.
        - Identify the key business questions that need to be answered.
    2. **Assess the Situation**:
        - Conduct a SWOT analysis (Strengths, Weaknesses, Opportunities, Threats).
        - Review existing resources, including data, tools, and expertise.
    3. **Define Data Mining Goals**:
        - Translate business objectives into specific data mining goals.
        - Ensure that the goals are measurable and achievable.
    4. **Produce a Project Plan**:
        - Outline the project phases, tasks, and deliverables.
        - Allocate resources and assign responsibilities.
        - Develop a risk management plan to address potential challenges.
    """)

    st.write("## Quiz: Conceptual Questions")
    q1 = st.radio("What is the main purpose of the Business Understanding phase?", ["Define project goals", "Collect data", "Build models"])
    if q1 == "Define project goals":
        st.success("✅ Correct!")
    else:
        st.error("❌ Incorrect. The main purpose is to define project goals.")

    st.write("## Learning Resources")
    st.markdown("""
    - 📘 [CRISP-DM Guide](https://www.sv-europe.com/crisp-dm-methodology/)
    - 📘 [Understanding Business Objectives](https://www.datasciencecentral.com/profiles/blogs/understanding-business-objectives-in-data-science)
    - 🎬 [Business Understanding in Data Science](https://towardsdatascience.com/business-understanding-in-data-science-1a1d5e8b1c3d)
    """)
pages/2_Data_understanding.py
ADDED
@@ -0,0 +1,70 @@
import streamlit as st

def run():
    st.title("Data Understanding")

    st.write("## Overview")
    st.write("""
    Data Understanding is the second phase of the CRISP-DM process. It involves collecting initial data, describing the data, exploring the data, and verifying data quality.
    """)

    st.write("## Key Concepts & Explanations")
    st.markdown("""
    - **Data Collection**: Gathering data from various sources.
    - **Data Description**: Summarizing the main characteristics of the data.
    - **Data Exploration**: Using statistical and visualization techniques to understand the data.
    - **Data Quality Verification**: Ensuring the data is accurate, complete, and reliable.
    """)

    st.write("## Introduction")
    st.write("""
    The Data Understanding phase is crucial for identifying potential issues with the data and gaining insights that will inform the subsequent phases of the CRISP-DM process.
    """)

    st.header("Objectives")
    st.write("""
    - **Collect Initial Data**: Gather data from various sources to get a comprehensive dataset.
    - **Describe the Data**: Summarize the main characteristics of the data, including its structure and content.
    - **Explore the Data**: Use statistical and visualization techniques to identify patterns, trends, and anomalies.
    - **Verify Data Quality**: Assess the quality of the data to ensure it is suitable for analysis.
    """)

    st.header("Key Activities")
    st.write("""
    - **Data Collection**: Gather data from internal and external sources.
    - **Data Description**: Generate summary statistics and visualizations to describe the data.
    - **Data Exploration**: Perform exploratory data analysis (EDA) to uncover patterns and relationships.
    - **Data Quality Verification**: Check for missing values, outliers, and inconsistencies in the data.
    """)

    st.write("## Detailed Steps")
    st.write("""
    1. **Collect Initial Data**:
        - Identify relevant data sources.
        - Extract data from various sources and consolidate it into a single dataset.
    2. **Describe the Data**:
        - Generate summary statistics (e.g., mean, median, standard deviation).
        - Create visualizations (e.g., histograms, box plots) to describe the data distribution.
    3. **Explore the Data**:
        - Perform exploratory data analysis (EDA) to identify patterns, trends, and anomalies.
        - Use visualization tools (e.g., scatter plots, heatmaps) to explore relationships between variables.
    4. **Verify Data Quality**:
        - Check for missing values and handle them appropriately.
        - Identify and address outliers and inconsistencies in the data.
        - Assess the overall quality of the data to ensure it is suitable for analysis.
    """)

    st.write("## Quiz: Conceptual Questions")
    q1 = st.radio("What is the main purpose of the Data Understanding phase?", ["Collect data", "Describe data", "Explore data", "All of the above"])
    if q1 == "All of the above":
        st.success("✅ Correct!")
    else:
        st.error("❌ Incorrect. The main purpose is to collect, describe, and explore data.")

    st.write("## Learning Resources")
    st.markdown("""
    - 📘 [CRISP-DM Guide](https://www.sv-europe.com/crisp-dm-methodology/)
    - 📘 [Data Understanding in Data Science](https://towardsdatascience.com/data-understanding-in-data-science-1a1d5e8b1c3d)
    - 🎬 [Exploratory Data Analysis (EDA)](https://www.analyticsvidhya.com/blog/2021/06/exploratory-data-analysis-eda-a-step-by-step-guide/)
    """)
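The describe/explore/verify steps this page lists map onto a few lines of pandas; a minimal sketch on a synthetic frame (the column names are illustrative, not from the repository data):

```python
import pandas as pd

# Small synthetic frame with one deliberately missing value.
df = pd.DataFrame({
    "sqft": [1200, 1500, 2100, 1800, None],
    "price": [200000, 260000, 340000, 310000, 280000],
})

print(df.describe())               # summary statistics per numeric column
print(df.isna().sum())             # missing values per column (data quality check)
print(df.corr(numeric_only=True))  # pairwise correlations between variables
```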
pages/3_Data_preparation.py
ADDED
@@ -0,0 +1,62 @@
import streamlit as st
import pandas as pd  # hoisted to the top; the original imported it mid-file

def run():
    st.title("Data Preparation")
    st.header("Introduction")
    st.write("""
    Data Preparation involves cleaning and transforming raw data into a format suitable for analysis.
    """)
    st.header("Objectives")
    st.write("""
    - Clean the data.
    - Transform the data.
    - Integrate data from multiple sources.
    """)
    st.header("Key Activities")
    st.write("""
    - Data cleaning.
    - Data transformation.
    - Data integration.
    """)

    st.title("3. Data Preparation")
    st.write("## Overview")
    st.write("Cleaning and transforming data for better model performance.")

    st.write("## Key Concepts & Explanations")
    st.markdown("""
    - :red[**Handling Missing Values**]: Fill with mean/median or drop rows.
    - **Feature Engineering**: Creating new features for better modeling.
    - **Scaling**: Normalization and standardization for consistency.
    """)

    file = st.file_uploader("Upload a dataset", type=["csv"])
    if file:
        df = pd.read_csv(file)
        option = st.radio("Choose a method to handle missing values", ["Fill with Mean", "Fill with Median", "Drop Rows"])
        if option == "Fill with Mean":
            # numeric_only avoids errors when the frame contains text columns
            df.fillna(df.mean(numeric_only=True), inplace=True)
        elif option == "Fill with Median":
            df.fillna(df.median(numeric_only=True), inplace=True)
        elif option == "Drop Rows":
            df.dropna(inplace=True)
        st.dataframe(df.head())  # show the effect of the chosen method

    st.write("## Quiz: Conceptual Questions")
    q1 = st.radio("What is feature engineering?", ["Data visualization", "Creating new features", "Data storage"])
    if q1 == "Creating new features":
        st.success("✅ Correct!")
    else:
        st.error("❌ Incorrect.")

    st.write("## Code-Based Quiz")
    code_input = st.text_area("Write a function to normalize a column", value="def normalize(col):\n    return (col - col.min()) / (col.max() - col.min())")
    if "col.max() - col.min()" in code_input:
        st.success("✅ Correct!")
    else:
        st.error("❌ Try again.")

    st.write("## Learning Resources")
    st.markdown("""
    - 📘 [Data Cleaning with Pandas](https://realpython.com/pandas-data-cleaning/)
    """)
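To make the scaling bullet on this page concrete, a minimal sketch contrasting normalization and standardization with scikit-learn (the toy columns are illustrative):

```python
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

df = pd.DataFrame({"area": [1200, 1500, 2100], "price": [200000, 260000, 340000]})

# Normalization: rescale each column to the [0, 1] range.
normalized = pd.DataFrame(MinMaxScaler().fit_transform(df), columns=df.columns)

# Standardization: rescale each column to mean 0 and standard deviation 1.
standardized = pd.DataFrame(StandardScaler().fit_transform(df), columns=df.columns)

print(normalized.round(2))
print(standardized.round(2))
```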
pages/4_Feature_engineering.py
ADDED
@@ -0,0 +1,78 @@
import streamlit as st
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

def run():
    st.title("Feature Engineering")

    st.write("## Overview")
    st.write("""
    Feature Engineering is the process of using domain knowledge to create features (input variables) that make machine learning algorithms work better. It involves transforming raw data into meaningful features that improve the performance of machine learning models.
    """)

    st.write("## Key Concepts & Explanations")
    st.markdown("""
    - **Feature Creation**: Generating new features from existing data.
    - **Feature Transformation**: Modifying features to make them more suitable for modeling.
    - **Feature Selection**: Identifying the most relevant features for the model.
    - **Feature Scaling**: Normalizing or standardizing features to ensure they are on a similar scale.
    """)

    st.write("## Introduction")
    st.write("""
    Feature Engineering is a crucial step in the data science process. It can significantly impact the performance of machine learning models by providing them with the right input variables. Effective feature engineering requires a deep understanding of the data and the problem domain.
    """)

    st.header("Objectives")
    st.write("""
    - **Create New Features**: Generate new features that capture important information from the data.
    - **Transform Existing Features**: Modify existing features to make them more suitable for modeling.
    - **Select Relevant Features**: Identify and select the most relevant features for the model.
    - **Scale Features**: Normalize or standardize features to ensure they are on a similar scale.
    """)

    st.header("Key Activities")
    st.write("""
    - **Feature Creation**: Generate new features from existing data using domain knowledge.
    - **Feature Transformation**: Apply mathematical transformations to features to improve their suitability for modeling.
    - **Feature Selection**: Use statistical techniques to identify the most relevant features for the model.
    - **Feature Scaling**: Normalize or standardize features to ensure they are on a similar scale.
    """)

    st.write("## Detailed Steps")
    st.write("""
    1. **Feature Creation**:
        - Generate new features from existing data using domain knowledge.
        - Combine multiple features to create new ones (e.g., ratios, differences).
    2. **Feature Transformation**:
        - Apply mathematical transformations (e.g., log, square root) to features.
        - Encode categorical variables using techniques like one-hot encoding or label encoding.
    3. **Feature Selection**:
        - Use statistical techniques (e.g., correlation, mutual information) to identify relevant features.
        - Apply dimensionality reduction techniques (e.g., PCA) to reduce the number of features.
    4. **Feature Scaling**:
        - Normalize features to a range (e.g., 0 to 1) using MinMaxScaler.
        - Standardize features to have a mean of 0 and a standard deviation of 1 using StandardScaler.
    """)

    st.write("## Quiz: Conceptual Questions")
    q1 = st.radio("What is the main purpose of feature engineering?", ["Improve model accuracy", "Reduce dataset size", "Make data harder to interpret"])
    if q1 == "Improve model accuracy":
        st.success("✅ Correct!")
    else:
        st.error("❌ Incorrect. The main purpose is to improve model accuracy.")

    st.write("## Code-Based Quiz")
    code_input = st.text_area("Write a function to normalize a dataset using MinMaxScaler",
                              value="from sklearn.preprocessing import MinMaxScaler\n\ndef normalize_data(df):\n    scaler = MinMaxScaler()\n    return pd.DataFrame(scaler.fit_transform(df), columns=df.columns)")
    if "MinMaxScaler" in code_input:
        st.success("✅ Correct!")
    else:
        st.error("❌ Try again.")

    st.write("## Learning Resources")
    st.markdown("""
    - 📘 [Feature Engineering for Machine Learning](https://towardsdatascience.com/feature-engineering-for-machine-learning-3a5e293a5114)
    - 📘 [Scikit-learn Feature Engineering Guide](https://scikit-learn.org/stable/modules/feature_extraction.html)
    - 🎬 [Feature Engineering Techniques](https://www.analyticsvidhya.com/blog/2021/10/a-comprehensive-guide-on-feature-engineering/)
    """)
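A minimal sketch of the creation, transformation, and encoding steps this page describes, using pandas and NumPy on a toy frame (the column names are illustrative):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "income": [40000, 85000, 62000],
    "debt": [10000, 42000, 9000],
    "city": ["Ames", "Des Moines", "Ames"],
})

# Feature creation: combine existing columns into a new ratio feature.
df["debt_to_income"] = df["debt"] / df["income"]

# Feature transformation: log1p compresses right-skewed values.
df["log_income"] = np.log1p(df["income"])

# Encoding: one-hot encode the categorical column.
df = pd.get_dummies(df, columns=["city"])

print(df.head())
```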
pages/5_Modeling.py
ADDED
@@ -0,0 +1,54 @@
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def run():
    st.title("4. Modeling")
    st.write("## Overview")
    st.write("Building and training machine learning models to make predictions.")

    st.write("## Key Concepts & Explanations")
    st.markdown("""
    - **Model Selection**: Choose the model based on the problem (e.g., classification, regression).
    - **Training Data**: The subset used to train the model.
    - **Test Data**: The subset used to evaluate the model's performance.
    """)

    file = st.file_uploader("Upload a dataset for modeling", type=["csv"])
    if file:
        df = pd.read_csv(file)
        target = st.selectbox("Select the target variable", df.columns)
        # Offer only non-target columns as candidate features
        features = st.multiselect("Select the feature columns", [col for col in df.columns if col != target])

        if target and features:
            X = df[features]
            y = df[target]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # A higher iteration cap helps the solver converge on unscaled data
            model = LogisticRegression(max_iter=1000)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)

            st.write(f"Accuracy: {accuracy * 100:.2f}%")

    st.write("## Quiz: Conceptual Questions")
    q1 = st.radio("What is overfitting?", ["Model too simple", "Model too complex", "Data too large"])
    if q1 == "Model too complex":
        st.success("✅ Correct!")
    else:
        st.error("❌ Incorrect.")

    st.write("## Code-Based Quiz")
    code_input = st.text_area("Write a function to split data into train and test sets", value="def split_data(df, target):\n    X = df.drop(columns=[target])\n    y = df[target]\n    return train_test_split(X, y, test_size=0.2, random_state=42)")
    if "train_test_split" in code_input:
        st.success("✅ Correct!")
    else:
        st.error("❌ Try again.")

    st.write("## Learning Resources")
    st.markdown("""
    - 📘 [Introduction to Machine Learning with Python](https://www.oreilly.com/library/view/introduction-to-machine/9781449369880/)
    """)
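A single train/test split, as used on this page, can give a noisy accuracy estimate; cross-validation averages over several splits. A minimal sketch on synthetic data (the dataset here is generated, not the uploaded CSV):

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Synthetic binary-classification data for illustration.
X, y = make_classification(n_samples=200, n_features=5, random_state=42)

# Five-fold cross-validation: fit and score on five different splits.
scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5)
print(f"Mean CV accuracy: {scores.mean():.3f} (+/- {scores.std():.3f})")
```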
pages/6_Evaluation.py
ADDED
@@ -0,0 +1,62 @@
import streamlit as st
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report

def run():
    # Imports were originally inside run(); they are hoisted to module level above.
    st.title("5. Evaluation")
    st.header("Introduction")
    st.write("""
    Model Evaluation is the process of assessing the performance of a machine learning model using various metrics.
    """)
    st.header("Objectives")
    st.write("""
    - Assess model performance.
    - Compare different models.
    - Select the best model.
    """)
    st.header("Key Activities")
    st.write("""
    - Model validation.
    - Performance metrics calculation.
    - Model comparison.
    """)

    st.write("## Overview")
    st.write("Assessing model performance using appropriate evaluation metrics.")

    st.write("## Key Concepts & Explanations")
    st.markdown("### Confusion Matrix")
    st.write("""
    A confusion matrix is a table used to evaluate the performance of a classification model. It shows the number of true positives, true negatives, false positives, and false negatives. This helps in understanding how well the model is performing in terms of correctly and incorrectly classified instances.
    """)

    st.markdown("### Precision, Recall, F1-Score")
    st.write("""
    - **Precision**: This metric measures the accuracy of the positive predictions. It is the ratio of true positive predictions to the total predicted positives (true positives + false positives). High precision indicates a low false positive rate.
    - **Recall**: Also known as sensitivity, this metric measures the ability of the model to identify all relevant instances. It is the ratio of true positive predictions to the total actual positives (true positives + false negatives). High recall indicates a low false negative rate.
    - **F1-Score**: This is the harmonic mean of precision and recall. It provides a single metric that balances the two, which is especially useful when you need to trade precision against recall.
    """)

    st.markdown("### ROC-AUC")
    st.write("""
    - **ROC (Receiver Operating Characteristic) Curve**: This is a graphical representation of the model's performance across different threshold values. It plots the true positive rate (recall) against the false positive rate.
    - **AUC (Area Under the Curve)**: This metric summarizes the ROC curve into a single value. It represents the likelihood that the model will rank a randomly chosen positive instance higher than a randomly chosen negative one. An AUC of 1 indicates a perfect model, while an AUC of 0.5 indicates a model with no discriminative power.
    """)
    q1 = st.radio("Which metric is used for evaluating a classification model?", ["Accuracy", "Mean Squared Error", "All of the above"])
    if q1 == "All of the above":
        st.success("✅ Correct!")
    else:
        st.error("❌ Incorrect.")

    st.write("## Code-Based Quiz")
    code_input = st.text_area("Write a function to calculate the confusion matrix", value="def confusion_mat(y_true, y_pred):\n    return confusion_matrix(y_true, y_pred)")
    if "confusion_matrix" in code_input:
        st.success("✅ Correct!")
    else:
        st.error("❌ Try again.")

    st.write("## Learning Resources")
    st.markdown("""
    - 📘 [Evaluation Metrics in Machine Learning](https://scikit-learn.org/stable/modules/model_evaluation.html)
    """)
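To make the metrics above concrete, a minimal sketch computing them with scikit-learn on small hand-made label vectors (the values are illustrative):

```python
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

y_true  = [0, 0, 1, 1, 1, 0, 1, 0]                      # ground-truth labels
y_pred  = [0, 1, 1, 1, 0, 0, 1, 0]                      # hard predictions
y_score = [0.2, 0.6, 0.8, 0.9, 0.4, 0.1, 0.7, 0.3]      # predicted P(class=1)

print(confusion_matrix(y_true, y_pred))       # [[TN FP] [FN TP]]
print(classification_report(y_true, y_pred))  # precision/recall/F1 per class
print(f"ROC-AUC: {roc_auc_score(y_true, y_score):.2f}")
```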
pages/7_Deployment.py
ADDED
@@ -0,0 +1,43 @@
import streamlit as st

def run():
    st.title("6. Deployment & Testing")
    st.header("Introduction")
    st.write("""
    Model Deployment is the process of integrating a machine learning model into a production environment where it can make predictions on new data.
    """)
    st.header("Objectives")
    st.write("""
    - Integrate the model into production.
    - Monitor model performance.
    - Update the model as needed.
    """)

    st.write("## Overview")
    st.write("Deploying the model and testing its real-world performance.")

    st.write("## Key Concepts & Explanations")
    st.markdown("""
    - **Deployment**: Making the model available for use (e.g., via an API).
    - **Testing**: Ensuring the model works in production environments.
    - **Model Monitoring**: Continuously tracking model performance in real time.
    """)

    st.write("## Quiz: Conceptual Questions")
    q1 = st.radio("Which of the following is part of deployment?", ["Model Training", "Model Versioning", "Model Testing"])
    if q1 == "Model Versioning":
        st.success("✅ Correct!")
    else:
        st.error("❌ Incorrect.")

    st.write("## Code-Based Quiz")
    code_input = st.text_area("Write code to save a model using joblib", value="import joblib\njoblib.dump(model, 'model.pkl')")
    if "joblib.dump" in code_input:
        st.success("✅ Correct!")
    else:
        st.error("❌ Try again.")

    st.write("## Learning Resources")
    st.markdown("""
    - 📘 [Machine Learning Model Deployment](https://towardsdatascience.com/deploying-machine-learning-models-using-flask-285dbddedbfa)
    """)
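The "via an API" bullet pairs naturally with the Flask write-up linked in the learning resources; below is a minimal sketch of serving the joblib-saved model over HTTP. The endpoint name, port, and feature format are assumptions for illustration, not part of this repository:

```python
import joblib
from flask import Flask, jsonify, request

app = Flask(__name__)
model = joblib.load("model.pkl")  # the file produced by joblib.dump above

@app.route("/predict", methods=["POST"])
def predict():
    # Expect a JSON body like {"features": [[5.1, 3.5, 1.4, 0.2]]}
    features = request.get_json()["features"]
    prediction = model.predict(features)
    return jsonify({"prediction": prediction.tolist()})

if __name__ == "__main__":
    app.run(port=5000)
```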
pages/8_Models.py
ADDED
@@ -0,0 +1,48 @@
import streamlit as st
from sklearn.linear_model import LogisticRegression
import torch
from transformers import pipeline


@st.cache_resource
def load_sentiment_pipeline():
    # Cache the pretrained pipeline so it is not reloaded on every rerun.
    return pipeline("sentiment-analysis")


def run():
    st.title("7. Machine Learning, Deep Learning & Transformers")
    st.write("## Overview")
    st.write("Learn about different machine learning models, deep learning models, and transformers.")

    st.write("## Key Concepts & Explanations")
    st.markdown("""
    - **Machine Learning Models**: Supervised, unsupervised, and reinforcement learning.
    - **Deep Learning**: Neural networks with many layers, used for complex tasks like image recognition.
    - **Transformers**: A powerful model architecture used in natural language processing (NLP) tasks.
    """)

    # ML example: logistic regression
    st.write("### Example: Logistic Regression")
    st.write("We'll use logistic regression to classify some sample data.")
    model = LogisticRegression()
    # (Insert a sample dataset and training procedure here)

    # Deep learning example: using a pretrained transformer
    st.write("### Example: Transformer Model")
    nlp = load_sentiment_pipeline()
    st.write(nlp("I love machine learning!"))

    st.write("## Quiz: Conceptual Questions")
    q1 = st.radio("What is a transformer model used for?", ["Text classification", "Image processing", "Time series analysis"])
    if q1 == "Text classification":
        st.success("✅ Correct!")
    else:
        st.error("❌ Incorrect.")

    st.write("## Code-Based Quiz")
    code_input = st.text_area("Write code to create a simple neural network using PyTorch", value="import torch\nimport torch.nn as nn\nclass SimpleNN(nn.Module):\n    def __init__(self):\n        super(SimpleNN, self).__init__()")
    if "super(SimpleNN" in code_input:
        st.success("✅ Correct!")
    else:
        st.error("❌ Try again.")

    st.write("## Learning Resources")
    st.markdown("""
    - 📘 [Deep Learning with PyTorch](https://pytorch.org/tutorials/)
    - 📘 [Transformers Library Documentation](https://huggingface.co/docs/transformers/)
    """)
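The code quiz above starts a SimpleNN class but stops at the constructor; here is a runnable completion under assumed layer sizes (4 inputs, 8 hidden units, and 2 output classes are arbitrary choices for illustration):

```python
import torch
import torch.nn as nn

class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(4, 8)   # input layer -> hidden layer
        self.fc2 = nn.Linear(8, 2)   # hidden layer -> output logits

    def forward(self, x):
        x = torch.relu(self.fc1(x))  # non-linearity between the layers
        return self.fc2(x)

net = SimpleNN()
print(net(torch.randn(1, 4)))  # logits for one random 4-feature sample
```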
pages/ML_Algorithms/decision_trees.py
ADDED
@@ -0,0 +1,4 @@
import streamlit as st

def run():
    st.title("TBD")
pages/ML_Algorithms/logistic_regression.py
ADDED
@@ -0,0 +1,4 @@
import streamlit as st

def run():
    st.title("TBD")
pages/ML_Algorithms/neural_networks.py
ADDED
@@ -0,0 +1,4 @@
import streamlit as st

def run():
    st.title("TBD")
pages/ML_Algorithms/random_forest.py
ADDED
@@ -0,0 +1,4 @@
import streamlit as st

def run():
    st.title("TBD")
pages/ML_Algorithms/svm.py
ADDED
@@ -0,0 +1,4 @@
import streamlit as st

def run():
    st.title("TBD")
pages/__pycache__/1_Business_understanding.cpython-311.pyc
ADDED
Binary file (4.88 kB)

pages/__pycache__/2_Data_understanding.cpython-311.pyc
ADDED
Binary file (3.75 kB)

pages/__pycache__/3_Algorithms.cpython-311.pyc
ADDED
Binary file (1.3 kB)

pages/__pycache__/3_Data_preparation.cpython-311.pyc
ADDED
Binary file (3.82 kB)

pages/__pycache__/4_Data_ingestion.cpython-311.pyc
ADDED
Binary file (1.17 kB)

pages/__pycache__/4_Feature_engineering.cpython-311.pyc
ADDED
Binary file (5.75 kB)

pages/__pycache__/5_Data_preparation.cpython-311.pyc
ADDED
Binary file (1.13 kB)

pages/__pycache__/5_Modeling.cpython-311.pyc
ADDED
Binary file (3.67 kB)

pages/__pycache__/6_Evaluation.cpython-311.pyc
ADDED
Binary file (5.02 kB)

pages/__pycache__/7_Deployment.cpython-311.pyc
ADDED
Binary file (2.77 kB)

pages/__pycache__/8_Models.cpython-311.pyc
ADDED
Binary file (3.18 kB)
sidebar.png
ADDED
Binary image file.