Spaces:

asoria
/

auto-notebook-creator

Running

App Files Community

asoria committed on Sep 3, 2024

Commit

939f6ae

1 Parent(s): 4dc6cd8

Adding filter for numeric and categoric datasets

Browse files

Files changed (2) hide show

app.py +6 -3
utils/notebook_utils.py +12 -2

app.py CHANGED Viewed

@@ -15,8 +15,7 @@ from dotenv import load_dotenv
 import os
 # TODOS:
-# 1. Add cells by data types in EDA notebook
-# 2. Add template for RAG and embeddings
 load_dotenv()
@@ -147,7 +146,11 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
     html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
     wildcards = ["{dataset_name}", "{first_code}", "{html_code}"]
     replacements = [dataset_id, first_code, html_code]
-    cells = replace_wildcards(cells, wildcards, replacements)
     generated_text = ""
     # Show only the first 40 lines, would like to have a scroll in gr.Code https://github.com/gradio-app/gradio/issues/9192
     viewer_lines = 0

 import os
 # TODOS:
+# Add template for RAG and embeddings
 load_dotenv()
     html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
     wildcards = ["{dataset_name}", "{first_code}", "{html_code}"]
     replacements = [dataset_id, first_code, html_code]
+    has_numeric_columns = len(df.select_dtypes(include=["number"]).columns) > 0
+    has_categoric_columns = len(df.select_dtypes(include=["object"]).columns) > 0
+    cells = replace_wildcards(
+        cells, wildcards, replacements, has_numeric_columns, has_categoric_columns
+    )
     generated_text = ""
     # Show only the first 40 lines, would like to have a scroll in gr.Code https://github.com/gradio-app/gradio/issues/9192
     viewer_lines = 0

utils/notebook_utils.py CHANGED Viewed

@@ -1,4 +1,6 @@
-def replace_wildcards(templates, wildcards, replacements):
     if len(wildcards) != len(replacements):
         raise ValueError(
             "The number of wildcards must match the number of replacements."
@@ -6,6 +8,10 @@ def replace_wildcards(templates, wildcards, replacements):
     new_templates = []
     for tmp in templates:
         tmp_text = tmp["source"]
         for wildcard, replacement in zip(wildcards, replacements):
             tmp_text = tmp_text.replace(wildcard, replacement)
@@ -75,7 +81,6 @@ import seaborn as sns
 # First rows of the dataset and info
 print(df.head())
 print(df.info())
-print(df.describe())
 """,
     },
     {
@@ -107,6 +112,7 @@ print(df.describe())
 """,
     },
     {
         "cell_type": "code",
         "source": """
 # Unique values in categorical columns
@@ -118,6 +124,7 @@ df.select_dtypes(include=['object']).nunique()
         "source": "## 3. Data Visualization",
     },
     {
         "cell_type": "code",
         "source": """
 # Correlation matrix for numerical columns
@@ -129,6 +136,7 @@ plt.show()
 """,
     },
     {
         "cell_type": "code",
         "source": """
 # Distribution plots for numerical columns
@@ -142,6 +150,7 @@ for column in df.select_dtypes(include=['int64', 'float64']).columns:
 """,
     },
     {
         "cell_type": "code",
         "source": """
 # Count plots for categorical columns
@@ -155,6 +164,7 @@ for column in df.select_dtypes(include=['object']).columns:
 """,
     },
     {
         "cell_type": "code",
         "source": """
 # Box plots for detecting outliers in numerical columns

+def replace_wildcards(
+    templates, wildcards, replacements, has_numeric_columns, has_categoric_columns
+):
     if len(wildcards) != len(replacements):
         raise ValueError(
             "The number of wildcards must match the number of replacements."
     new_templates = []
     for tmp in templates:
+        if "type" in tmp and tmp["type"] == "numeric" and not has_numeric_columns:
+            continue
+        if "type" in tmp and tmp["type"] == "categoric" and not has_categoric_columns:
+            continue
         tmp_text = tmp["source"]
         for wildcard, replacement in zip(wildcards, replacements):
             tmp_text = tmp_text.replace(wildcard, replacement)
 # First rows of the dataset and info
 print(df.head())
 print(df.info())
 """,
     },
     {
 """,
     },
     {
+        "type": "categoric",
         "cell_type": "code",
         "source": """
 # Unique values in categorical columns
         "source": "## 3. Data Visualization",
     },
     {
+        "type": "numeric",
         "cell_type": "code",
         "source": """
 # Correlation matrix for numerical columns
 """,
     },
     {
+        "type": "numeric",
         "cell_type": "code",
         "source": """
 # Distribution plots for numerical columns
 """,
     },
     {
+        "type": "categoric",
         "cell_type": "code",
         "source": """
 # Count plots for categorical columns
 """,
     },
     {
+        "type": "numeric",
         "cell_type": "code",
         "source": """
 # Box plots for detecting outliers in numerical columns