Spaces:
Runtime error
Runtime error
# ----------------------Importing libraries---------------------- | |
import streamlit as st | |
from streamlit_pills import pills | |
import pandas as pd | |
import openai | |
# Imports for AgGrid | |
from st_aggrid import AgGrid, GridUpdateMode, JsCode | |
from st_aggrid.grid_options_builder import GridOptionsBuilder | |
# ----------------------Importing utils.py---------------------- | |
# For Snowflake (from Tony's utils.py) | |
import io | |
from utils import ( | |
connect_to_snowflake, | |
load_data_to_snowflake, | |
load_data_to_postgres, | |
connect_to_postgres, | |
) | |
# ----------------------Page config--------------------------------------
# Browser-tab title and icon for the app.
st.set_page_config(
    page_title="GPT3 Dataset Generator",
    page_icon="π€",
)

# ----------------------Sidebar section--------------------------------
# Banner animation shown at the top of the page. An alternative banner
# ("Gifs/header.gif") was previously tried here and is disabled.
st.image("Gifs/boat_new.gif")

# Three columns for the header row; the ones actually used further down are
# the first (logo) and the last (title).
c30, c31, c32 = st.columns([0.2, 0.1, 3])
#################
# TODO: the original note here asks for a caching decorator on this loader;
# none is applied yet.
def load_data(url):
    """Read a CSV and return it as a pandas DataFrame.

    Args:
        url: Anything ``pandas.read_csv`` accepts as its first argument
            (a URL, a file path, or a file-like object).

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(url)
#df = load_data("https://github.com/plotly/datasets/raw/master/uber-rides-data1.csv") | |
#st.dataframe(df) | |
#st.button("Rerun") | |
################ | |
# Header row: logo in the first column, app title in the last one.
# The empty caption above the logo is presumably a vertical spacer.
c30.caption("")
c30.image("openai.png", width=60)

c32.title("GPT3 Dataset Generator")

# One-line description of the app, written below the header row.
st.write(
    "This app generates datasets using GPT3. It was created for the βοΈ Snowflake Snowvation Hackathon"
)
# The three top-level tabs of the app.
tabMain, tabInfo, tabTo_dos = st.tabs(["Main", "Info", "To-do's"])

with tabInfo:
    # Two empty writes add vertical space at the top of the tab.
    for _ in range(2):
        st.write("")

    # (heading, markdown blurb) pairs, rendered in order.
    info_sections = (
        (
            "π€ What is GPT-3?",
            "[GPT-3](https://en.wikipedia.org/wiki/GPT-3) is a large language generation model developed by [OpenAI](https://openai.com/) that can generate human-like text. It has a capacity of 175 billion parameters and is trained on a vast dataset of internet text. It can be used for tasks such as language translation, chatbot language generation, and content generation etc.",
        ),
        (
            "π What is Streamlit?",
            "[Streamlit](https://streamlit.io) is an open-source Python library that allows users to create interactive, web-based data visualization and machine learning applications without the need for extensive web development knowledge",
        ),
    )
    for heading, blurb in info_sections:
        st.subheader(heading)
        st.markdown(blurb)

    # Horizontal rule, then the list of external resources.
    st.write("---")
    st.subheader("π Resources")
    resources_markdown = """
    - OpenAI
    - [OpenAI Playground](https://beta.openai.com/playground)
    - [OpenAI Documentation](https://beta.openai.com/docs)
    - Streamlit
    - [Documentation](https://docs.streamlit.io/)
    - [Gallery](https://streamlit.io/gallery)
    - [Cheat sheet](https://docs.streamlit.io/library/cheatsheet)
    - [Book](https://www.amazon.com/dp/180056550X) (Getting Started with Streamlit for Data Science)
    - Deploy your apps using [Streamlit Community Cloud](https://streamlit.io/cloud) in just a few clicks
    """
    st.markdown(resources_markdown)
with tabTo_dos:
    # (expander title, markdown body) for each project-tracking list,
    # in display order.
    todo_sections = (
        (
            "To-do",
            """
            - [p2] Currently, the results are displayed even if the submit button isn't pressed.
            - [p2] There is still an issue with the index where the first element from the JSON is not being displayed.
            - [Post Hackathon] To limit the number of API calls and costs, let's cap the maximum number - of results to 5. Alternatively, we can consider removing the free API key.
            """,
        ),
        (
            "Done",
            """
            - [p2] Check if the Json file is working
            - [p2] On Github, remove any unused images and GIFs.
            - [p1] Add that for postgress - localhost is required
            - [p2] Rename the CSV and JSON as per the st-pills variable
            - [p2] Change the color of the small arrow
            - [p1] Adjust the size of the Gifs
            - Add a streamlit badge in the `ReadMe` file
            - Add the message "Please enter your API key or choose the `Free Key` option."
            - Include a `ReadMe` file
            - Add a section for the Snowflake credentials
            - Remove password from the Python file
            - Add screenshots to the `ReadMe` file
            - Include forms in the snowflake postgres section
            - Remove the hashed code in the Python file
            - Include additional information in the 'info' tab
            - p1] Fix the download issue by sorting it via session state
            - [p1] Make the dataframe from this app editable
            - Add more gifs to the app
            - Change the color scheme to Snowflake Blue
            - Include a section for Snowflake credentials
            - Change the colors of the arrows, using this tool (https://lottiefiles.com/lottie-to-gif/convert)
            - Try new prompts and implement the best ones
            - Add a config file for the color scheme
            - Include an option menu using this tool (https://github.com/victoryhb/streamlit-option-menu)
            - Display a message when the API key is not provided
            - Fix the arrow and rearrange the layout for the API key message
            - Check and improve the quality of the prompt output
            - Send the app to Tony and upload it to GitHub
            - Re-arrange the data on the sidebar
            - Change the colors of both gifs to match the overall color scheme
            - Add context about the app being part of the snowvation project
            - Add a button to convert the data to JSON format
            - Include the Snowflake logo
            - Add a submit button to block API calls unless pressed
            - Add a tab with additional information
            - Resize the columns in the st.form section
            - Add the ability to add the dataset to Snowflake
            - Create a section with pills, showcasing examples
            - Change the main emoji
            - Change the emoji in the tab (page_icon)
            - [INFO] Sort out the issue with credits
            """,
        ),
        (
            "Not needed",
            """
            - Check index issue in readcsv (not an issue as I've changed the script)
            - Add the mouse gif (doesn't fit)
            - Ask Lukas - automatically resize the columns of a DataFrame
            """,
        ),
    )

    # Each list sits in its own expanded expander, followed by a blank line.
    for section_title, section_body in todo_sections:
        with st.expander(section_title, expanded=True):
            st.write(section_body)
        st.write("")

    # Extra blank lines at the bottom of the tab.
    for _ in range(3):
        st.write("")
with tabMain: | |
# Sidebar: let the user pick between their own OpenAI key and the shared
# key stored in the app's secrets.
key_choice = st.sidebar.radio(
    "",
    ("Your Key", "Free Key (capped)"),
    horizontal=True,
)
using_free_key = key_choice == "Free Key (capped)"

if using_free_key:
    API_Key = st.secrets["API_KEY"]
elif key_choice == "Your Key":
    API_Key = st.sidebar.text_input(
        "First, enter your OpenAI API key", type="password"
    )

# Arrow animation in the sidebar; it is cleared again straight away when the
# shared key is selected.
image_arrow = st.sidebar.image("Gifs/blue_grey_arrow.gif")

if not using_free_key:
    st.write("")
    st.sidebar.caption(
        "No OpenAI API key? Get yours [here!](https://openai.com/blog/api-no-waitlist/)"
    )
else:
    image_arrow.empty()
st.write("")
c30, c31, c32 = st.columns([0.2, 0.1, 3])
st.subheader("β Build your dataset")

# (label, icon) for every example pill, in display order.
pill_choices = (
    ("Sci-fi Movies", "πΏ"),
    ("Animals", "π"),
    ("Pop Songs", "π΅"),
    ("POTUS's Twitter", "πΊπΈ"),
    ("Blank", "π»"),
)
example = pills(
    "",
    [label for label, _icon in pill_choices],
    [icon for _label, icon in pill_choices],
    label_visibility="collapsed",
)

# Session-wide submission counter; created once per session.
if "counter" not in st.session_state:
    st.session_state["counter"] = 0
def increment():
    """Add one to the ``counter`` entry of ``st.session_state``.

    Used as the ``on_click`` callback of the dataset form's submit button.
    """
    st.session_state["counter"] = st.session_state["counter"] + 1
# Default form values for each example pill: (topic, 1st, 2nd, 3rd column).
# NOTE: "Stocks" has a preset but is not among the pills offered above, so it
# is only reachable if that pill is added.
_DATASET_PRESETS = {
    "Sci-fi Movies": ("Sci-fi movies", "Title", "Year", "PG rating"),
    "Animals": ("Fastest animals on earth", "Animal", "Speed", "Weight"),
    "Stocks": ("Stocks", "Ticker", "Price", "Exchange"),
    "POTUS's Twitter": (
        "POTUS's Twitter accounts",
        "Name",
        "Twitter handle",
        "# of followers",
    ),
    "Pop Songs": ("Most famous songs of all time", "Song", "Artist", "Genre"),
    "Blank": ("", "", "", ""),
}

# Radio label -> model name sent to the OpenAI API.
_ENGINE_MODELS = {
    "Davinci": "davinci-instruct-beta-v3",
    "Curie": "curie-instruct-beta-v2",
    "Babbage": "babbage-instruct-beta",
}

# Upper bound of the row-count input; also quoted in its help text so the two
# cannot drift apart (most copies of the form used to say 50 while enforcing 20).
_MAX_ROWS = 20

# An unknown pill value falls back to the blank form instead of leaving the
# form variables undefined.
topic_default, first_default, second_default, third_default = _DATASET_PRESETS.get(
    example, _DATASET_PRESETS["Blank"]
)

# One form serves every preset; only the default values differ.
with st.form("my_form"):
    text_input = st.text_input(
        "What is the topic of your dataset?", value=topic_default
    )

    col1, col2, col3 = st.columns(3, gap="small")
    with col1:
        column_01 = st.text_input("1st column", value=first_default)
    with col2:
        column_02 = st.text_input("2nd column", value=second_default)
    with col3:
        column_03 = st.text_input("3rd column", value=third_default)

    col1, col2 = st.columns(2, gap="medium")
    with col1:
        number = st.number_input(
            "How many rows do you want?",
            value=5,
            min_value=1,
            max_value=_MAX_ROWS,
            step=5,
            help=f"The maximum number of rows is {_MAX_ROWS}.",
        )
    with col2:
        engine = st.radio(
            "GPT3 engine",
            tuple(_ENGINE_MODELS),
            horizontal=True,
            help="Davinci is the most powerful engine, but it's also the slowest. Curie is the fastest, but it's also the least powerful. Babbage is somewhere in the middle.",
        )
    # Translate the radio label into the model identifier.
    engine = _ENGINE_MODELS[engine]

    st.write("")
    # Every preset now bumps the session counter on submit. Previously three
    # presets did not, so a later rerun (e.g. submitting the database form)
    # hit the "counter == 0" stop further down and the results disappeared.
    submitted = st.form_submit_button("Build my dataset! β¨", on_click=increment)
# ----------------------API key section----------------------------------
# Request one row more than the user asked for. Presumably this compensates
# for the first parsed row not being displayed (see the To-do list) -- confirm.
number += 1

# Without an API key nothing below can run; only nag once the user has
# actually pressed the submit button.
if not API_Key:
    if submitted:
        st.info("Please enter your API key or choose the `Free Key` option.")
    st.stop()

# NOTE(review): the original also had a check for `counter >= 100` whose body
# was empty; no usage cap is enforced here.

# ----------------------API key section----------------------------------
if not submitted and st.session_state.counter == 0:
    # Nothing has been submitted in this session yet: show the hint and stop.
    c30, c31, c32 = st.columns([1, 0.01, 4])
    c30.image("Gifs/arrow_small_new.gif")
    c30.caption("")
    c32.caption("")
    c32.caption("")
    c32.info("Enter your dataset's criteria and click the button to generate it.")
    st.stop()
elif st.session_state.counter > 0:
    c30, c31, c32 = st.columns([1, 0.9, 3])

# Hand the chosen key to the OpenAI client before any request is made.
openai.api_key = API_Key
# ----------------------API call section----------------------------------
# The prompt asks for a pipe-separated, three-column table so the reply can be
# parsed as delimited text afterwards.
prompt = f"Please provide a list of the top {number} {text_input} along with the following information in a three-column spreadsheet: {column_01}, {column_02}, and {column_03}. The columns should be labeled as follows: {column_01} | {column_02} | {column_03}"

completion_settings = {
    "model": engine,
    "prompt": prompt,
    "temperature": 0.5,
    "max_tokens": 1707,
    "top_p": 1,
    "best_of": 2,
    "frequency_penalty": 0,
    "presence_penalty": 0,
}
response = openai.Completion.create(**completion_settings)

st.write("___")
st.subheader("β‘ Check the results")

with st.expander("See the API Json output"):
    # Bare expression on purpose: Streamlit "magic" renders it in the expander.
    response

# Text of the first returned choice.
first_choice = response["choices"][0]
output_code = first_choice["text"]
# ----------------------Dataframe section----------------------------------
# Parse the pipe-separated completion text into a DataFrame.
df = pd.read_csv(io.StringIO(output_code), sep="|")

# Overwrite whatever header was parsed with positional placeholders
# ("Column 1", "Column 2", ...).
placeholder_names = [
    f"Column {position}" for position in range(1, len(df.columns) + 1)
]
df.columns = placeholder_names

# Then give the first three columns the names entered in the form; any
# further columns keep their placeholder name.
df = df.rename(
    columns={
        "Column 1": column_01,
        "Column 2": column_02,
        "Column 3": column_03,
    }
)

st.write("")
# ----------------------AgGrid section----------------------------------
# Grid options: paginated, every column editable and groupable, multi-row
# selection.
grid_builder = GridOptionsBuilder.from_dataframe(df)
grid_builder.configure_pagination(enabled=True)
grid_builder.configure_default_column(editable=True, groupable=True)
grid_builder.configure_selection(selection_mode="multiple")
gridoptions = grid_builder.build()

# Render the DataFrame as an interactive grid.
grid_settings = {
    "gridOptions": gridoptions,
    "update_mode": GridUpdateMode.SELECTION_CHANGED,
    "theme": "material",
}
grid_table = AgGrid(df, **grid_settings)
# ----------------------Download section--------------------------------------
def convert_df(df):
    """Serialise ``df`` to CSV text (index column included) as UTF-8 bytes.

    Args:
        df: The DataFrame to export.

    Returns:
        ``bytes`` holding the UTF-8 encoded CSV.
    """
    return df.to_csv().encode("utf-8")


# Two narrow button columns, a thin spacer between them, and filler on the right.
c30, c31, c32, c33 = st.columns([1, 0.01, 1, 2.5])

with c30:
    csv = convert_df(df)
    st.download_button(
        label="Download CSV",
        data=csv,
        file_name=f"{example} dataset .csv",
        mime="text/csv",
    )

with c32:
    json_string = df.to_json(orient="records")
    st.download_button(
        label="Download JSON",
        data=json_string,
        file_name="data_set_sample.json",
        # Was "text/csv": the payload is JSON, so advertise it as such.
        mime="application/json",
    )
st.write("___")
st.subheader("β’ Load data to Databases")

# Ask which database the generated dataset should be loaded into.
# (Earlier experiments with a hard-coded CSV and a selectbox are disabled.)
storage_choices = ("Snowflake", "PostgreSQL")
storage_option = st.radio(
    "Select data storage option:",
    storage_choices,
    horizontal=True,
)
def reset_form_fields():
    """Placeholder for clearing the database credential fields.

    NOTE(review): this function has never reset anything. Its body only
    assigned empty strings to local names (user, password, account,
    warehouse, database, schema, table, host, port), which were discarded on
    return and never touched the form widgets. Those dead assignments are
    removed; the function is kept so the existing call site still works.

    Returns:
        None.
    """
    return None
if storage_option == "Snowflake":
    st.subheader("`Enter Snowflake Credentials`π")

    # Credentials form: fields alternate between the two columns, with the
    # table name below them.
    with st.form("my_form_db"):
        col1, col2 = st.columns(2, gap="small")
        user = col1.text_input("Username:", value="TONY")
        password = col2.text_input("Password:", type="password")
        account = col1.text_input("Account:", value="jn27194.us-east4.gcp")
        warehouse = col2.text_input("Warehouse:", value="NAH")
        database = col1.text_input("Database:", value="SNOWVATION")
        schema = col2.text_input("Schema:", value="PUBLIC")
        table = st.text_input("Table:")
        st.write("")
        submitted = st.form_submit_button("Load to Snowflake")

    # Load the data to Snowflake once every field has been filled in.
    if submitted:
        snowflake_fields = (
            user,
            password,
            account,
            warehouse,
            database,
            schema,
            table,
        )
        if all(snowflake_fields):
            conn = connect_to_snowflake(
                username=user,
                password=password,
                account=account,
                warehouse=warehouse,
                database=database,
                schema=schema,
            )
            # Only attempt the load when a connection object came back.
            if conn:
                load_data_to_snowflake(df, conn, table)
        else:
            st.warning("Please enter all Snowflake credentials")

elif storage_option == "PostgreSQL":
    st.subheader("`Enter PostgreSQL Credentials`π")
    st.error("Localhost only")

    # Credentials form for PostgreSQL.
    with st.form("my_form_db"):
        col1, col2 = st.columns(2, gap="small")
        user = col1.text_input("Username:", value="postgres")
        password = col2.text_input("Password:", type="password")
        host = col1.selectbox("Host:", ["localhost", "other"])
        if host == "other":
            # Replace the sentinel choice with a free-text host name.
            host = col1.text_input("Enter host:")
        port = col2.text_input("Port:", value="5432")
        database = col1.text_input("Database:", value="snowvation")
        table = col2.text_input("Table:")
        st.write("")
        submitted = st.form_submit_button("Load to PostgreSQL")

    # Load the data to PostgreSQL once every field has been filled in.
    if submitted:
        postgres_fields = (user, password, host, port, database, table)
        if all(postgres_fields):
            conn = connect_to_postgres(
                username=user,
                password=password,
                host=host,
                port=port,
                database=database,
            )
            # Only attempt the load when a connection object came back.
            if conn:
                load_data_to_postgres(df, conn, table)
        else:
            st.warning("Please enter all PostgreSQL credentials and table name")

# Reset form fields when storage_option changes
reset_form_fields()