Spaces:

Anupam202224
/

DataAnalysis-A

Sleeping

App Files Files Community

Anupam202224 committed on Oct 11, 2024

Commit

5058119

verified ·

1 Parent(s): c4c8dcf

Update app.py

Browse files

Files changed (1) hide show

app.py +116 -70

app.py CHANGED Viewed

@@ -8,37 +8,37 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 # Define constants
-MODEL_NAME = "meta-llama/Llama-2-7b-hf"  # Replace with a smaller model suitable for CPU
 FIGURES_DIR = "./figures"
 # Ensure the figures directory exists
 os.makedirs(FIGURES_DIR, exist_ok=True)
 # Initialize tokenizer and model
-# Note: Loading large models on CPU can be very slow and may not be feasible
 try:
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="cpu")
 except Exception as e:
     print(f"Error loading model: {e}")
     exit(1)
-# Define the base prompt
 base_prompt = """You are an expert data analyst.
-According to the features you have and the data structure given below, determine which feature should be the target.
-Then list 3 interesting questions that could be asked on this data, for instance about specific correlations with target variable.
-Then answer these questions one by one, by finding the relevant numbers.
-Meanwhile, plot some figures using matplotlib/seaborn and save them to the (already existing) folder './figures/': take care to clear each figure with plt.clf() before doing another plot.
-In your final answer: summarize these correlations and trends
-After each number derive real worlds insights, for instance: "Correlation between is_december and boredness is 1.3453, which suggest people are more bored in winter".
-Your final answer should be a long string with at least 3 numbered and detailed parts.
-Structure of the data:
-{structure_notes}
-The data file is passed to you as the variable data_file, it is a pandas dataframe, you can use it directly.
-DO NOT try to load data_file, it is already a dataframe pre-loaded in your python interpreter!
 """
 example_notes = """This data is about the Titanic wreck in 1912.
@@ -48,13 +48,8 @@ pclass: A proxy for socio-economic status (SES)
 2nd = Middle
 3rd = Lower
 age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5
-sibsp: The dataset defines family relations in this way...
-Sibling = brother, sister, stepbrother, stepsister
-Spouse = husband, wife (mistresses and fiancés were ignored)
-parch: The dataset defines family relations in this way...
-Parent = mother, father
-Child = daughter, son, stepdaughter, stepson
-Some children traveled only with a nanny, therefore parch=0 for them."""
 def get_images_in_directory(directory):
     """Retrieve all image file paths from the specified directory."""
@@ -66,25 +61,75 @@ def get_images_in_directory(directory):
                 image_files.append(os.path.join(root, file))
     return image_files
-def generate_response(prompt):
-    """Generate a response from the language model based on the prompt."""
-    inputs = tokenizer(prompt, return_tensors="pt")
     inputs = inputs.to('cpu')  # Ensure the model runs on CPU
-    # Generate response (adjust parameters as needed)
     with torch.no_grad():
         outputs = model.generate(
-            **inputs,
-            max_length=2048,
             do_sample=True,
             top_p=0.95,
             temperature=0.7,
-            eos_token_id=tokenizer.eos_token_id
         )
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return response
 def interact_with_agent(file_input, additional_notes):
     """Process the uploaded file and interact with the language model to analyze data."""
     # Clear and recreate the figures directory
@@ -92,36 +137,37 @@ def interact_with_agent(file_input, additional_notes):
         shutil.rmtree(FIGURES_DIR)
     os.makedirs(FIGURES_DIR, exist_ok=True)
-    # Load the data file into a pandas dataframe
-    try:
-        data_file = pd.read_csv(file_input.name)
-    except Exception as e:
-        yield [("Error loading CSV file.",)]
         return
-    # Create structure notes
-    data_structure_notes = f"""- Description (output of .describe()):
-{data_file.describe()}
-- Columns with dtypes:
-{data_file.dtypes}"""
-    # Construct the prompt
-    prompt = base_prompt.format(structure_notes=data_structure_notes)
-    if additional_notes and additional_notes.strip():
-        prompt += "\nAdditional notes on the data:\n" + additional_notes
-    # Initialize chat history
-    messages = [("User", prompt)]
-    yield messages + [("Assistant", "⏳ _Starting analysis..._")]
-    # Generate response from the model
-    response = generate_response(prompt)
-    messages.append(("Assistant", response))
-    # Extract and display generated images
-    image_paths = get_images_in_directory(FIGURES_DIR)
-    for image_path in image_paths:
         messages.append(("Assistant", gr.Image.update(value=image_path)))
     yield messages
@@ -129,40 +175,40 @@ def interact_with_agent(file_input, additional_notes):
 # Define the Gradio interface
 with gr.Blocks(
     theme=gr.themes.Soft(
-        primary_hue=gr.themes.colors.yellow,
-        secondary_hue=gr.themes.colors.blue,
     )
 ) as demo:
-    gr.Markdown("""# Llama-2 Data Analyst 📊🤔
-Drop a `.csv` file below, add notes to describe this data if needed, and **the model will analyze the file content and draw figures for you!**""")
     with gr.Row():
-        file_input = gr.File(label="Your file to analyze", type="file")
         text_input = gr.Textbox(
-            label="Additional notes to support the analysis",
-            placeholder="Enter any additional notes here..."
         )
-    submit = gr.Button("Run analysis!", variant="primary")
-    chatbot = gr.Chatbot(
-        label="Data Analyst Agent",
-        height=400,
-    )
     gr.Examples(
         examples=[["./example/titanic.csv", example_notes]],
         inputs=[file_input, text_input],
         cache_examples=False
     )
     # Connect the submit button to the interact_with_agent function
     submit.click(
         interact_with_agent,
         inputs=[file_input, text_input],
         outputs=[chatbot],
-        show_progress=True
     )
 # Launch the Gradio app

 import seaborn as sns
 # Define constants
+MODEL_NAME = "gpt2"  # Publicly accessible model suitable for CPU
 FIGURES_DIR = "./figures"
 # Ensure the figures directory exists
 os.makedirs(FIGURES_DIR, exist_ok=True)
 # Initialize tokenizer and model
+print("Loading model and tokenizer...")
 try:
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+    model.to('cpu')  # Ensure the model runs on CPU
+    print("Model and tokenizer loaded successfully.")
 except Exception as e:
     print(f"Error loading model: {e}")
     exit(1)
+# Define the base prompt for the model
 base_prompt = """You are an expert data analyst.
+Based on the following data description, determine an appropriate target feature.
+List 3 insightful questions regarding the data.
+Provide detailed answers to each question with relevant statistics.
+Summarize the findings with real-world insights.
+Data Description:
+{data_description}
+Additional Notes:
+{additional_notes}
+Please provide your response in a structured and detailed manner.
 """
 example_notes = """This data is about the Titanic wreck in 1912.
 2nd = Middle
 3rd = Lower
 age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5
+sibsp: Number of siblings/spouses aboard
+parch: Number of parents/children aboard"""
 def get_images_in_directory(directory):
     """Retrieve all image file paths from the specified directory."""
                 image_files.append(os.path.join(root, file))
     return image_files
+def generate_summary(prompt):
+    """Generate text from the language model for the given prompt.
+
+    The prompt is truncated so that it and the generated continuation fit
+    inside the model's context window.
+
+    Args:
+        prompt: Text passed to the tokenizer and then to the model.
+
+    Returns:
+        The decoded first output sequence (the prompt followed by the
+        sampled continuation) with special tokens removed.
+    """
+    max_new_tokens = 256
+    # `max_length` counts prompt tokens as well, so a long data description
+    # used to leave no room for generation (or overflow the position
+    # embeddings). Budget the prompt explicitly and cap only the new tokens.
+    # The 1024 fallback is an assumption for configs lacking this attribute.
+    context_limit = getattr(model.config, "max_position_embeddings", 1024)
+    prompt_budget = max(1, context_limit - max_new_tokens)
+    inputs = tokenizer(
+        prompt,
+        return_tensors="pt",
+        truncation=True,
+        max_length=prompt_budget,
+    )
+    inputs = inputs.to('cpu')  # Ensure the model runs on CPU
+    # Generate response
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,  # input_ids together with the attention mask
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            top_p=0.95,
+            temperature=0.7,
+            eos_token_id=tokenizer.eos_token_id,
+            pad_token_id=tokenizer.eos_token_id
+        )
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    return response
+def analyze_data(data_file_path):
+    """Load a CSV file, describe it, and save a few standard plots.
+
+    Args:
+        data_file_path: Path of the CSV file to read with ``pd.read_csv``.
+
+    Returns:
+        A 3-tuple ``(data_description, visualization_paths, target)``.
+        On success these are the Markdown description string, the list of
+        figure paths saved under ``FIGURES_DIR``, and the name of the column
+        chosen as target. If the CSV cannot be read, the tuple is
+        ``(None, error_message, None)``.
+    """
+    try:
+        data = pd.read_csv(data_file_path)
+    except Exception as e:
+        # Keep the same arity as the success path: callers unpack three
+        # values and read the error message from the second slot.
+        return None, f"Error loading CSV file: {e}", None
+    # Generate data description
+    # NOTE: DataFrame.to_markdown() needs the optional `tabulate` package.
+    data_description = f"- **Data Summary (.describe()):**\n{data.describe().to_markdown()}\n\n"
+    data_description += f"- **Data Types:**\n{data.dtypes.to_frame().to_markdown()}\n"
+    numeric_data = data.select_dtypes(include='number')
+    # Determine target variable (for demonstration, assume 'Survived' or first numeric column)
+    if 'Survived' in data.columns:
+        target = 'Survived'
+    elif len(numeric_data.columns) > 0:
+        target = numeric_data.columns[0]
+    else:
+        target = data.columns[0]
+    # Generate visualizations
+    visualization_paths = []
+    # Correlation heatmap: restricted to numeric columns because
+    # DataFrame.corr() raises on string columns in pandas >= 2.0; skipped
+    # when there is no numeric column to correlate.
+    if numeric_data.shape[1] > 0:
+        plt.figure(figsize=(10, 8))
+        sns.heatmap(numeric_data.corr(), annot=True, fmt=".2f", cmap='coolwarm')
+        plt.title("Correlation Heatmap")
+        heatmap_path = os.path.join(FIGURES_DIR, "correlation_heatmap.png")
+        plt.savefig(heatmap_path)
+        # close() rather than clf(): clf() leaves the figure registered with
+        # pyplot, so figures would pile up across requests.
+        plt.close()
+        visualization_paths.append(heatmap_path)
+    # Distribution of target variable
+    plt.figure(figsize=(8, 6))
+    sns.countplot(x=target, data=data)
+    plt.title(f"Distribution of {target}")
+    distribution_path = os.path.join(FIGURES_DIR, f"{target}_distribution.png")
+    plt.savefig(distribution_path)
+    plt.close()
+    visualization_paths.append(distribution_path)
+    # Pairplot (limited to first 5 numeric columns for performance)
+    pairplot_cols = numeric_data.columns[:5]
+    if len(pairplot_cols) >= 2:
+        # pairplot creates its own figure, which becomes the current one.
+        sns.pairplot(data[pairplot_cols].dropna())
+        pairplot_path = os.path.join(FIGURES_DIR, "pairplot.png")
+        plt.savefig(pairplot_path)
+        plt.close()
+        visualization_paths.append(pairplot_path)
+    return data_description, visualization_paths, target
 def interact_with_agent(file_input, additional_notes):
     """Process the uploaded file and interact with the language model to analyze data."""
     # Clear and recreate the figures directory
         shutil.rmtree(FIGURES_DIR)
     os.makedirs(FIGURES_DIR, exist_ok=True)
+    if file_input is None:
+        yield [("Error", "No file uploaded.")]
         return
+    # Analyze the data
+    data_description, visualization_paths, target = analyze_data(file_input.name)
+    if data_description is None:
+        yield [("Error", visualization_paths)]  # visualization_paths contains the error message
+        return
+    # Construct the prompt for the model
+    prompt = base_prompt.format(
+        data_description=data_description,
+        additional_notes=additional_notes if additional_notes else "None."
+    )
+    # Generate summary from the model
+    summary = generate_summary(prompt)
+    # Prepare chat messages
+    messages = [
+        ("User", "I have uploaded a CSV file for analysis."),
+        ("Assistant", "⏳ _Analyzing the data..._")
+    ]
+    # Append the summary
+    messages.append(("Assistant", summary))
+    # Append images
+    for image_path in visualization_paths:
         messages.append(("Assistant", gr.Image.update(value=image_path)))
     yield messages
 # Define the Gradio interface
 with gr.Blocks(
     theme=gr.themes.Soft(
+        primary_hue=gr.themes.colors.blue,
+        secondary_hue=gr.themes.colors.orange,
     )
 ) as demo:
+    gr.Markdown("""# 📊 Data Analyst Assistant
+    Upload a `.csv` file, add any additional notes, and **the assistant will analyze the data and generate visualizations and insights for you!**
+    **Example:** [Titanic Dataset](./example/titanic.csv)
+    """)
     with gr.Row():
+        file_input = gr.File(label="Upload CSV File", file_types=[".csv"])
         text_input = gr.Textbox(
+            label="Additional Notes",
+            placeholder="Enter any additional notes or leave blank..."
         )
+    submit = gr.Button("Run Analysis", variant="primary")
+    chatbot = gr.Chatbot(label="Data Analyst Agent")
     gr.Examples(
         examples=[["./example/titanic.csv", example_notes]],
         inputs=[file_input, text_input],
+        label="Examples",
         cache_examples=False
     )
     # Connect the submit button to the interact_with_agent function
     submit.click(
         interact_with_agent,
         inputs=[file_input, text_input],
         outputs=[chatbot],
+        api_name="run_analysis"
     )
 # Launch the Gradio app