JG1310's picture
Update app.py
e09dd64 verified
import gradio as gr
import numpy as np
import pandas as pd
from scipy import stats
from typing import List, Dict, Any, Optional, Union
def parse_numeric_input(data: str) -> List[float]:
    """
    Parse comma-separated string of numbers into a list of floats.

    Whitespace around each value is ignored and empty fields (for example the
    gap in "1,,2" or a trailing comma) are skipped.

    Args:
        data (str): Comma-separated string of numbers (e.g., "1.2,2.3,3.4,2.1")

    Returns:
        List[float]: Parsed numeric data

    Raises:
        ValueError: If any non-empty field cannot be converted to a float, or
            if the string contains no values at all.

    Example:
        >>> parse_numeric_input("85.2,90.1,78.5,92.3")
        [85.2, 90.1, 78.5, 92.3]
    """
    tokens = [token.strip() for token in data.split(',')]

    # Only the conversion itself is guarded, so the "no values" error below is
    # never routed through this handler and no message sniffing is required.
    try:
        parsed = [float(token) for token in tokens if token]
    except ValueError as err:
        raise ValueError(f"Cannot parse '{data}' as comma-separated numbers") from err

    if not parsed:
        raise ValueError("No valid numbers found in input string")
    return parsed
def welch_t_test(
    dataframe: Optional[pd.DataFrame] = None,
    group1_str: Optional[str] = None,
    group2_str: Optional[str] = None,
    alternative: str = "two-sided",
    alpha: float = 0.05,
    effect_thresholds: str = "0.2,0.5,0.8"
) -> Dict[str, Any]:
    """
    Accepts two groups of numeric data as comma-separated strings or DataFrame columns and performs Welch's t-test. This test determines whether two independent groups have significantly different means.
    This test is valid even when populations have different variances. Default to this test instead of Student's t-test if you are unsure about population variance.
    This test calculates a t-statistic using Welch's formula that accounts for unequal variances. Given an alternative hypothesis (group1 ≠ group2, group1 < group2, or group1 > group2),
    it outputs the p-value: the probability of observing this result (or more extreme) if no true difference exists. Results are considered statistically significant
    when p-value < alpha (typically 0.05). Cohen's d measures practical effect size, calculated using pooled standard deviation for consistency with other t-tests, with interpretation:
    |d| < 0.2 = negligible, 0.2-0.5 = small, 0.5-0.8 = medium, >0.8 = large (custom thresholds may be used).

    EXAMPLE USE CASES: treatment vs control groups, before/after measurements with different participants,
    comparing performance between demographic groups.

    Args:
        dataframe (Optional[pd.DataFrame]): DataFrame containing group data in first two columns.
            If provided, group1_str and group2_str will be ignored.
        group1_str (Optional[str]): Comma-separated string of numeric values for the first group.
            Example: "12.1,15.3,18.7,14.2,16.8" (reaction times for Group A)
            Only used if dataframe is None or empty.
        group2_str (Optional[str]): Comma-separated string of numeric values for the second group.
            Example: "22.4,19.8,25.1,21.3" (reaction times for Group B)
            Only used if dataframe is None or empty.
        alternative (str): Direction of the alternative hypothesis:
            - "two-sided": group1 mean ≠ group2 mean (different in either direction)
            - "less": group1 mean < group2 mean (group1 is smaller)
            - "greater": group1 mean > group2 mean (group1 is larger)
        alpha (float): Significance level for the test (probability of Type I error).
            Common values: 0.05 (5%), 0.01 (1%), 0.10 (10%)
        effect_thresholds (str): Three comma-separated values defining Cohen's d effect size boundaries.
            Format: "small_threshold,medium_threshold,large_threshold"
            Default "0.2,0.5,0.8" means: <0.2=negligible, 0.2-0.5=small, 0.5-0.8=medium, >0.8=large

    Returns:
        dict: Comprehensive test results with the following keys:
            - test_type (str): Always "Welch's t-test"
            - t_statistic (float): The calculated t-value using Welch's formula
            - p_value (float): Probability of observing this result if null hypothesis is true
            - degrees_of_freedom (float): Welch's adjusted df (usually non-integer), accounts for unequal variances
            - cohens_d (float): Standardized effect size. Positive means group1 > group2, negative means group1 < group2
            - pooled_std (float): Pooled standard deviation used in effect size calculation
            - group1_stats (dict): Descriptive statistics for group1 (mean, std, n)
            - group2_stats (dict): Descriptive statistics for group2 (mean, std, n)
            - significant (bool): True if p_value < alpha
            - effect_size (str): Categorical interpretation of Cohen's d magnitude
            - alternative_hypothesis (str): Echo of alternative parameter
            - alpha (float): Echo of significance level used
            - effect_thresholds (List[float]): Echo of effect size thresholds used
        If the inputs are invalid or the test cannot be computed (for example both groups
        have zero variance), a dict with the single key "error" and a message is returned instead.
        Numeric results are plain Python float/int/bool values.
    """
    try:
        # Parse effect size thresholds
        try:
            thresholds = [float(x.strip()) for x in effect_thresholds.split(',')]
        except (ValueError, AttributeError, TypeError):
            return {"error": "Invalid effect thresholds format. Use 'small,medium,large' (e.g., '0.2,0.5,0.8')"}
        if len(thresholds) != 3:
            return {"error": "Effect thresholds must be three comma-separated numbers (small,medium,large)"}

        # Method 1: DataFrame input (preferred for LLMs and data pipelines)
        if dataframe is not None and not dataframe.empty:
            # Use first two columns automatically
            if len(dataframe.columns) < 2:
                return {"error": f"DataFrame must have at least 2 columns. Found {len(dataframe.columns)} columns."}

            # Extract and validate data from first two columns
            try:
                # Convert to numeric, coercing errors to NaN
                col1_numeric = pd.to_numeric(dataframe.iloc[:, 0], errors='coerce')
                col2_numeric = pd.to_numeric(dataframe.iloc[:, 1], errors='coerce')

                # Remove NaN values and convert to list
                group1 = col1_numeric.dropna().tolist()
                group2 = col2_numeric.dropna().tolist()

                # Check if we lost too much data due to non-numeric values
                original_count1 = len(dataframe.iloc[:, 0].dropna())
                original_count2 = len(dataframe.iloc[:, 1].dropna())
                if len(group1) < original_count1 * 0.5:  # Lost more than 50% of data
                    return {"error": f"Column 1 contains too many non-numeric values. Only {len(group1)} out of {original_count1} values could be converted to numbers."}
                if len(group2) < original_count2 * 0.5:  # Lost more than 50% of data
                    return {"error": f"Column 2 contains too many non-numeric values. Only {len(group2)} out of {original_count2} values could be converted to numbers."}
            except Exception as e:
                return {"error": f"Error processing DataFrame columns: {str(e)}. Ensure columns contain numeric data."}

        # Method 2: String input (preferred for humans and simple use cases)
        elif group1_str and group2_str and group1_str.strip() and group2_str.strip():
            try:
                group1 = parse_numeric_input(group1_str)
                group2 = parse_numeric_input(group2_str)
            except ValueError as e:
                return {"error": f"String parsing error: {str(e)}"}
        else:
            return {"error": "Please provide either a DataFrame with data OR comma-separated strings for both groups. Do not leave inputs empty."}

        # Validate extracted data
        if len(group1) < 2:
            return {"error": f"Group 1 must have at least 2 observations. Found {len(group1)} values."}
        if len(group2) < 2:
            return {"error": f"Group 2 must have at least 2 observations. Found {len(group2)} values."}

        # Convert to numpy arrays for calculations
        data1 = np.array(group1, dtype=float)
        data2 = np.array(group2, dtype=float)

        # float() accepts "nan"/"inf" and dropna() keeps inf; either would turn every
        # statistic into NaN and make the effect size fall through to "large".
        if not (np.isfinite(data1).all() and np.isfinite(data2).all()):
            return {"error": "Both groups must contain only finite numbers (no NaN or infinite values)."}

        # Descriptive statistics (sample standard deviation, ddof=1)
        n1, n2 = len(data1), len(data2)
        mean1, mean2 = np.mean(data1), np.mean(data2)
        std1, std2 = np.std(data1, ddof=1), np.std(data2, ddof=1)
        s1_sq, s2_sq = std1 ** 2, std2 ** 2

        # Effect size denominator (pooled SD, kept for consistency with the other t-tests)
        pooled_std = np.sqrt(((n1 - 1) * s1_sq + (n2 - 1) * s2_sq) / (n1 + n2 - 2))
        if pooled_std == 0:
            # Both groups are constant: t, df and Cohen's d are all 0/0 or x/0.
            return {"error": "Cannot compute Welch's t-test: both groups have zero variance (all values within each group are identical)."}

        # Perform Welch's t-test (unequal variances)
        t_stat, p_value = stats.ttest_ind(data1, data2, equal_var=False, alternative=alternative)

        # Welch-Satterthwaite degrees of freedom
        se1_sq, se2_sq = s1_sq / n1, s2_sq / n2
        df = (se1_sq + se2_sq) ** 2 / (se1_sq ** 2 / (n1 - 1) + se2_sq ** 2 / (n2 - 1))

        cohens_d = (mean1 - mean2) / pooled_std

        # Interpretation using the supplied effect-size thresholds
        significant = bool(p_value < alpha)
        abs_d = abs(cohens_d)
        small_threshold, medium_threshold, large_threshold = thresholds
        if abs_d < small_threshold:
            effect_size_interp = "negligible"
        elif abs_d < medium_threshold:
            effect_size_interp = "small"
        elif abs_d < large_threshold:
            effect_size_interp = "medium"
        else:
            effect_size_interp = "large"

        # Convert numpy scalars to native Python types so the result matches the
        # documented types and can be JSON-serialized (np.bool_ cannot).
        return {
            "test_type": "Welch's t-test",
            "t_statistic": float(t_stat),
            "p_value": float(p_value),
            "degrees_of_freedom": float(df),
            "cohens_d": float(cohens_d),
            "pooled_std": float(pooled_std),
            "group1_stats": {"mean": float(mean1), "std": float(std1), "n": n1},
            "group2_stats": {"mean": float(mean2), "std": float(std2), "n": n2},
            "significant": significant,
            "effect_size": effect_size_interp,
            "alternative_hypothesis": alternative,
            "alpha": alpha,
            "effect_thresholds": thresholds
        }
    except Exception as e:
        return {"error": f"Unexpected error in Welch's t-test: {str(e)}"}
def student_t_test(
    dataframe: Optional[pd.DataFrame] = None,
    group1_str: Optional[str] = None,
    group2_str: Optional[str] = None,
    alternative: str = "two-sided",
    alpha: float = 0.05,
    effect_thresholds: str = "0.2,0.5,0.8"
) -> Dict[str, Any]:
    """
    Accepts two groups of numeric data as comma-separated strings or DataFrame columns and performs Student's t-test.
    This test determines whether two independent groups have significantly different means, assuming populations from which the groups were sampled have equal
    variances (if this assumption is violated, or if equal population variance cannot be verified, use Welch's t-test instead). The test calculates a t-statistic quantifying the mean
    difference as a multiple of pooled standard deviation. Given an alternative hypothesis (group1 ≠ group2, group1 < group2, or group1 > group2),
    it outputs the p-value: the probability of observing this result (or more extreme) if no true difference exists. Results are statistically significant
    when p-value < alpha (typically 0.05). Cohen's d measures practical effect size, standardized by pooled standard deviation, with interpretation:
    |d| < 0.2 = negligible, 0.2-0.5 = small, 0.5-0.8 = medium, >0.8 = large (custom thresholds may be used).

    EXAMPLE USE CASES: treatment vs control groups, before/after measurements with different participants,
    comparing performance between demographic groups.

    Args:
        dataframe (Optional[pd.DataFrame]): DataFrame containing group data in first two columns.
            If provided, group1_str and group2_str will be ignored.
        group1_str (Optional[str]): Comma-separated string of numeric values for the first group.
            Example: "85.2,90.1,78.5,92.3" (test scores for Group A)
            Only used if dataframe is None or empty.
        group2_str (Optional[str]): Comma-separated string of numeric values for the second group.
            Example: "88.1,85.7,91.2,87.4" (test scores for Group B)
            Only used if dataframe is None or empty.
        alternative (str): Direction of the alternative hypothesis:
            - "two-sided": group1 mean ≠ group2 mean (different in either direction)
            - "less": group1 mean < group2 mean (group1 is smaller)
            - "greater": group1 mean > group2 mean (group1 is larger)
        alpha (float): Significance level for the test (probability of Type I error). Reject null hypothesis if p_value below this threshold.
            Common values: 0.05 (5%), 0.01 (1%), 0.10 (10%)
        effect_thresholds (str): Three comma-separated values defining Cohen's d effect size boundaries.
            Format: "small_threshold,medium_threshold,large_threshold"
            Default "0.2,0.5,0.8" means: <0.2=negligible, 0.2-0.5=small, 0.5-0.8=medium, >0.8=large
            These are Cohen's canonical benchmarks for effect size interpretation.

    Returns:
        dict: Comprehensive test results with the following keys:
            - test_type (str): Always "Student's t-test"
            - t_statistic (float): The calculated t-value, which measures how many standard errors the difference
              between group means is away from zero (assuming the null hypothesis is true).
              Larger absolute values indicate the observed difference is less likely under the null hypothesis.
            - p_value (float): Probability of observing this result (or more extreme) if null hypothesis is true.
              Values < alpha indicate statistical significance.
            - degrees_of_freedom (int): df = n1 + n2 - 2, degrees of freedom for the pooled variance estimate, used for determining critical t-values.
            - cohens_d (float): Effect size measure. Positive means group1 > group2, negative means group1 < group2.
              Interpreted using Cohen's canonical benchmarks: negligible (<0.2), small (0.2), medium (0.5), large (0.8).
            - pooled_std (float): Combined standard deviation used in Cohen's d calculation.
            - group1_stats (dict): Descriptive statistics for group1 (mean, std, n)
            - group2_stats (dict): Descriptive statistics for group2 (mean, std, n)
            - significant (bool): True if p_value < alpha, False otherwise
            - effect_size (str): Categorical interpretation ("negligible", "small", "medium", "large") based on |cohens_d| and effect_thresholds
            - alternative_hypothesis (str): Echo of the alternative parameter used
            - alpha (float): Echo of the significance level used
            - effect_thresholds (List[float]): Echo of the thresholds used
        If the inputs are invalid or the test cannot be computed (for example both groups
        have zero variance), a dict with the single key "error" and a message is returned instead.
        Numeric results are plain Python float/int/bool values.
    """
    try:
        # Parse effect size thresholds
        try:
            thresholds = [float(x.strip()) for x in effect_thresholds.split(',')]
        except (ValueError, AttributeError, TypeError):
            return {"error": "Invalid effect thresholds format. Use 'small,medium,large' (e.g., '0.2,0.5,0.8')"}
        if len(thresholds) != 3:
            return {"error": "Effect thresholds must be three comma-separated numbers (small,medium,large)"}

        # Method 1: DataFrame input (preferred for LLMs and data pipelines)
        if dataframe is not None and not dataframe.empty:
            # Use first two columns automatically
            if len(dataframe.columns) < 2:
                return {"error": f"DataFrame must have at least 2 columns. Found {len(dataframe.columns)} columns."}

            # Extract and validate data from first two columns
            try:
                # Convert to numeric, coercing errors to NaN
                col1_numeric = pd.to_numeric(dataframe.iloc[:, 0], errors='coerce')
                col2_numeric = pd.to_numeric(dataframe.iloc[:, 1], errors='coerce')

                # Remove NaN values and convert to list
                group1 = col1_numeric.dropna().tolist()
                group2 = col2_numeric.dropna().tolist()

                # Check if we lost too much data due to non-numeric values
                original_count1 = len(dataframe.iloc[:, 0].dropna())
                original_count2 = len(dataframe.iloc[:, 1].dropna())
                if len(group1) < original_count1 * 0.5:  # Lost more than 50% of data
                    return {"error": f"Column 1 contains too many non-numeric values. Only {len(group1)} out of {original_count1} values could be converted to numbers."}
                if len(group2) < original_count2 * 0.5:  # Lost more than 50% of data
                    return {"error": f"Column 2 contains too many non-numeric values. Only {len(group2)} out of {original_count2} values could be converted to numbers."}
            except Exception as e:
                return {"error": f"Error processing DataFrame columns: {str(e)}. Ensure columns contain numeric data."}

        # Method 2: String input (preferred for humans and simple use cases)
        elif group1_str and group2_str and group1_str.strip() and group2_str.strip():
            try:
                group1 = parse_numeric_input(group1_str)
                group2 = parse_numeric_input(group2_str)
            except ValueError as e:
                return {"error": f"String parsing error: {str(e)}"}
        else:
            return {"error": "Please provide either a DataFrame with data OR comma-separated strings for both groups. Do not leave inputs empty."}

        # Validate extracted data
        if len(group1) < 2:
            return {"error": f"Group 1 must have at least 2 observations. Found {len(group1)} values."}
        if len(group2) < 2:
            return {"error": f"Group 2 must have at least 2 observations. Found {len(group2)} values."}

        # Convert to numpy arrays for calculations
        data1 = np.array(group1, dtype=float)
        data2 = np.array(group2, dtype=float)

        # float() accepts "nan"/"inf" and dropna() keeps inf; either would turn every
        # statistic into NaN and make the effect size fall through to "large".
        if not (np.isfinite(data1).all() and np.isfinite(data2).all()):
            return {"error": "Both groups must contain only finite numbers (no NaN or infinite values)."}

        # Descriptive statistics (sample standard deviation, ddof=1)
        n1, n2 = len(data1), len(data2)
        mean1, mean2 = np.mean(data1), np.mean(data2)
        std1, std2 = np.std(data1, ddof=1), np.std(data2, ddof=1)

        # Degrees of freedom (pooled)
        df = n1 + n2 - 2

        # Pooled standard deviation: denominator of Cohen's d
        pooled_std = np.sqrt(((n1 - 1) * std1 ** 2 + (n2 - 1) * std2 ** 2) / df)
        if pooled_std == 0:
            # Both groups are constant: the t statistic and Cohen's d are undefined.
            return {"error": "Cannot compute Student's t-test: both groups have zero variance (all values within each group are identical)."}

        # Perform Student's t-test (equal variances)
        t_stat, p_value = stats.ttest_ind(data1, data2, equal_var=True, alternative=alternative)

        cohens_d = (mean1 - mean2) / pooled_std

        # Interpretation using the supplied effect-size thresholds
        significant = bool(p_value < alpha)
        abs_d = abs(cohens_d)
        small_threshold, medium_threshold, large_threshold = thresholds
        if abs_d < small_threshold:
            effect_size_interp = "negligible"
        elif abs_d < medium_threshold:
            effect_size_interp = "small"
        elif abs_d < large_threshold:
            effect_size_interp = "medium"
        else:
            effect_size_interp = "large"

        # Convert numpy scalars to native Python types so the result matches the
        # documented types and can be JSON-serialized (np.bool_ cannot).
        return {
            "test_type": "Student's t-test",
            "t_statistic": float(t_stat),
            "p_value": float(p_value),
            "degrees_of_freedom": df,
            "cohens_d": float(cohens_d),
            "pooled_std": float(pooled_std),
            "group1_stats": {"mean": float(mean1), "std": float(std1), "n": n1},
            "group2_stats": {"mean": float(mean2), "std": float(std2), "n": n2},
            "significant": significant,
            "effect_size": effect_size_interp,
            "alternative_hypothesis": alternative,
            "alpha": alpha,
            "effect_thresholds": thresholds
        }
    except Exception as e:
        # Message previously said "flexible t-test", which named no function in this API.
        return {"error": f"Unexpected error in Student's t-test: {str(e)}"}
def paired_t_test(
    dataframe: Optional[pd.DataFrame] = None,
    group1_str: Optional[str] = None,
    group2_str: Optional[str] = None,
    alternative: str = "two-sided",
    alpha: float = 0.05,
    effect_thresholds: str = "0.2,0.5,0.8"
) -> Dict[str, Any]:
    """
    Accepts two groups of paired numeric data as comma-separated strings or DataFrame columns and performs a paired samples t-test.
    This test determines whether there is a significant difference between two related measurements (same subjects measured twice),
    such as before/after treatment measurements. Unlike independent samples t-tests, this test accounts for the correlation between
    paired observations, making it more powerful for detecting differences in repeated measures designs. The test calculates a t-statistic
    based on the mean of the differences between paired observations. Given an alternative hypothesis (group1 ≠ group2, group1 < group2,
    or group1 > group2), it outputs the p-value: the probability of observing this result (or more extreme) if no true difference exists.
    Results are statistically significant when p-value < alpha (typically 0.05). Cohen's d measures practical effect size, calculated
    as the mean difference divided by the standard deviation of differences, with interpretation: |d| < 0.2 = negligible, 0.2-0.5 = small,
    0.5-0.8 = medium, >0.8 = large (custom thresholds may be used).

    EXAMPLE USE CASES: before/after treatment measurements on same subjects, pre/post test scores, repeated measurements over time.

    Args:
        dataframe (Optional[pd.DataFrame]): DataFrame containing paired data in first two columns.
            Each row is one pair; rows where either value is missing or non-numeric are dropped as a whole.
            If provided, group1_str and group2_str will be ignored.
        group1_str (Optional[str]): Comma-separated string of numeric values for the first measurement.
            Example: "85.2,90.1,78.5,92.3" (pre-test scores)
            Only used if dataframe is None or empty.
        group2_str (Optional[str]): Comma-separated string of numeric values for the second measurement.
            Example: "88.1,95.7,82.2,94.4" (post-test scores)
            Only used if dataframe is None or empty.
        alternative (str): Direction of the alternative hypothesis:
            - "two-sided": group1 mean ≠ group2 mean (different in either direction)
            - "less": group1 mean < group2 mean (group1 is smaller)
            - "greater": group1 mean > group2 mean (group1 is larger)
        alpha (float): Significance level for the test (probability of Type I error). Reject null hypothesis if p_value below this threshold.
            Common values: 0.05 (5%), 0.01 (1%), 0.10 (10%)
        effect_thresholds (str): Three comma-separated values defining Cohen's d effect size boundaries.
            Format: "small_threshold,medium_threshold,large_threshold"
            Default "0.2,0.5,0.8" means: <0.2=negligible, 0.2-0.5=small, 0.5-0.8=medium, >0.8=large

    Returns:
        dict: Comprehensive test results with the following keys:
            - test_type (str): Always "Paired samples t-test"
            - t_statistic (float): The calculated t-value based on mean difference and standard error of differences
              (computed on group1 - group2, so its sign is the opposite of cohens_d)
            - p_value (float): Probability of observing this result if null hypothesis is true
            - degrees_of_freedom (int): df = n - 1, where n is the number of paired observations
            - cohens_d (float): Effect size measure. Positive means group2 > group1, negative means group1 > group2
            - pooled_std (float): Standard deviation of the differences (used in Cohen's d calculation)
            - group1_stats (dict): Descriptive statistics for group1 (mean, std, n)
            - group2_stats (dict): Descriptive statistics for group2 (mean, std, n)
            - significant (bool): True if p_value < alpha
            - effect_size (str): Categorical interpretation of Cohen's d magnitude
            - alternative_hypothesis (str): Echo of alternative parameter
            - alpha (float): Echo of significance level used
            - effect_thresholds (List[float]): Echo of effect size thresholds used
        If the inputs are invalid or the test cannot be computed (for example all paired
        differences are identical), a dict with the single key "error" and a message is returned instead.
        Numeric results are plain Python float/int/bool values.
    """
    try:
        # Parse effect size thresholds
        try:
            thresholds = [float(x.strip()) for x in effect_thresholds.split(',')]
        except (ValueError, AttributeError, TypeError):
            return {"error": "Invalid effect thresholds format. Use 'small,medium,large' (e.g., '0.2,0.5,0.8')"}
        if len(thresholds) != 3:
            return {"error": "Effect thresholds must be three comma-separated numbers (small,medium,large)"}

        # Method 1: DataFrame input (preferred for LLMs and data pipelines)
        if dataframe is not None and not dataframe.empty:
            # Use first two columns automatically
            if len(dataframe.columns) < 2:
                return {"error": f"DataFrame must have at least 2 columns. Found {len(dataframe.columns)} columns."}

            # Extract and validate data from first two columns
            try:
                # Convert to numeric, coercing errors to NaN
                col1_numeric = pd.to_numeric(dataframe.iloc[:, 0], errors='coerce')
                col2_numeric = pd.to_numeric(dataframe.iloc[:, 1], errors='coerce')

                # Check if we lost too much data due to non-numeric values
                numeric_count1 = int(col1_numeric.notna().sum())
                numeric_count2 = int(col2_numeric.notna().sum())
                original_count1 = len(dataframe.iloc[:, 0].dropna())
                original_count2 = len(dataframe.iloc[:, 1].dropna())
                if numeric_count1 < original_count1 * 0.5:  # Lost more than 50% of data
                    return {"error": f"Column 1 contains too many non-numeric values. Only {numeric_count1} out of {original_count1} values could be converted to numbers."}
                if numeric_count2 < original_count2 * 0.5:  # Lost more than 50% of data
                    return {"error": f"Column 2 contains too many non-numeric values. Only {numeric_count2} out of {original_count2} values could be converted to numbers."}

                # Keep only complete rows. Dropping NaNs from each column separately
                # would shift the remaining values and pair up measurements that
                # belong to different subjects.
                complete_rows = col1_numeric.notna() & col2_numeric.notna()
                group1 = col1_numeric[complete_rows].tolist()
                group2 = col2_numeric[complete_rows].tolist()
            except Exception as e:
                return {"error": f"Error processing DataFrame columns: {str(e)}. Ensure columns contain numeric data."}

        # Method 2: String input (preferred for humans and simple use cases)
        elif group1_str and group2_str and group1_str.strip() and group2_str.strip():
            try:
                group1 = parse_numeric_input(group1_str)
                group2 = parse_numeric_input(group2_str)
            except ValueError as e:
                return {"error": f"String parsing error: {str(e)}"}
        else:
            return {"error": "Please provide either a DataFrame with data OR comma-separated strings for both groups. Do not leave inputs empty."}

        # Validate extracted data - paired samples must have equal length
        if len(group1) != len(group2):
            return {"error": f"Paired samples must have equal length. Group1 has {len(group1)} observations, Group2 has {len(group2)} observations."}
        if len(group1) < 2:
            return {"error": f"Need at least 2 paired observations. Found {len(group1)} pairs."}

        # Convert to numpy arrays for calculations
        data1 = np.array(group1, dtype=float)
        data2 = np.array(group2, dtype=float)

        # float() accepts "nan"/"inf" and the NaN filter keeps inf; either would turn
        # every statistic into NaN and make the effect size fall through to "large".
        if not (np.isfinite(data1).all() and np.isfinite(data2).all()):
            return {"error": "Both groups must contain only finite numbers (no NaN or infinite values)."}

        # Descriptive statistics (sample standard deviation, ddof=1)
        n_pairs = len(data1)
        mean1, mean2 = np.mean(data1), np.mean(data2)
        std1, std2 = np.std(data1, ddof=1), np.std(data2, ddof=1)

        # Differences are taken as group2 - group1, matching the documented sign of cohens_d
        differences = data2 - data1
        mean_diff = np.mean(differences)
        std_diff = np.std(differences, ddof=1)
        if std_diff == 0:
            # Every pair changed by the same amount: t and Cohen's d divide by zero.
            return {"error": "Cannot compute paired t-test: the paired differences have zero variance (all differences are identical)."}

        # Perform paired t-test
        t_stat, p_value = stats.ttest_rel(data1, data2, alternative=alternative)

        # Degrees of freedom for paired t-test
        df = n_pairs - 1

        # Effect size (Cohen's d for paired samples: mean difference / std of differences)
        cohens_d = mean_diff / std_diff

        # Interpretation using the supplied effect-size thresholds
        significant = bool(p_value < alpha)
        abs_d = abs(cohens_d)
        small_threshold, medium_threshold, large_threshold = thresholds
        if abs_d < small_threshold:
            effect_size_interp = "negligible"
        elif abs_d < medium_threshold:
            effect_size_interp = "small"
        elif abs_d < large_threshold:
            effect_size_interp = "medium"
        else:
            effect_size_interp = "large"

        # Convert numpy scalars to native Python types so the result matches the
        # documented types and can be JSON-serialized (np.bool_ cannot).
        return {
            "test_type": "Paired samples t-test",
            "t_statistic": float(t_stat),
            "p_value": float(p_value),
            "degrees_of_freedom": df,
            "cohens_d": float(cohens_d),
            "pooled_std": float(std_diff),  # For paired t-test, this is std of differences
            "group1_stats": {"mean": float(mean1), "std": float(std1), "n": n_pairs},
            "group2_stats": {"mean": float(mean2), "std": float(std2), "n": n_pairs},
            "significant": significant,
            "effect_size": effect_size_interp,
            "alternative_hypothesis": alternative,
            "alpha": alpha,
            "effect_thresholds": thresholds
        }
    except Exception as e:
        return {"error": f"Unexpected error in paired t-test: {str(e)}"}
def one_sample_t_test(
    dataframe: Optional[pd.DataFrame] = None,
    group_str: Optional[str] = None,
    population_mean: float = 0.0,
    alternative: str = "two-sided",
    alpha: float = 0.05,
    effect_thresholds: str = "0.2,0.5,0.8"
) -> Dict[str, Any]:
    """
    Accepts a single group of numeric data as comma-separated string or DataFrame column and performs a one-sample t-test
    against a known or hypothesized population mean. This test determines whether the sample mean differs significantly
    from the specified population mean. The test calculates a t-statistic quantifying how many standard errors the sample
    mean is away from the hypothesized population mean. Given an alternative hypothesis (sample ≠ population, sample < population,
    or sample > population), it outputs the p-value: the probability of observing this result (or more extreme) if the true
    population mean equals the hypothesized value. Results are statistically significant when p-value < alpha (typically 0.05).
    Cohen's d measures practical effect size, calculated as the difference between sample and population means divided by the
    sample standard deviation, with interpretation: |d| < 0.2 = negligible, 0.2-0.5 = small, 0.5-0.8 = medium, >0.8 = large
    (custom thresholds may be used).

    EXAMPLE USE CASES: testing if sample mean differs from known standard, quality control against specification,
    comparing sample performance against established benchmark.

    Args:
        dataframe (Optional[pd.DataFrame]): DataFrame containing sample data in first column.
            If provided, group_str will be ignored.
        group_str (Optional[str]): Comma-separated string of numeric values for the sample.
            Example: "85.2,90.1,78.5,92.3" (test scores)
            Only used if dataframe is None or empty.
        population_mean (float): Hypothesized or known population mean to test against.
        alternative (str): Direction of the alternative hypothesis:
            - "two-sided": sample mean ≠ population mean (different in either direction)
            - "less": sample mean < population mean (sample is smaller)
            - "greater": sample mean > population mean (sample is larger)
        alpha (float): Significance level for the test (probability of Type I error). Reject null hypothesis if p_value below this threshold.
            Common values: 0.05 (5%), 0.01 (1%), 0.10 (10%)
        effect_thresholds (str): Three comma-separated values defining Cohen's d effect size boundaries.
            Format: "small_threshold,medium_threshold,large_threshold"
            Default "0.2,0.5,0.8" means: <0.2=negligible, 0.2-0.5=small, 0.5-0.8=medium, >0.8=large

    Returns:
        dict: Comprehensive test results with the following keys:
            - test_type (str): Always "One-sample t-test"
            - t_statistic (float): The calculated t-value measuring sample mean deviation from population mean
            - p_value (float): Probability of observing this result if null hypothesis is true
            - degrees_of_freedom (int): df = n - 1, where n is the sample size
            - cohens_d (float): Effect size measure. Positive means sample > population, negative means sample < population
            - pooled_std (float): Sample standard deviation (used in Cohen's d calculation)
            - group_stats (dict): Descriptive statistics for the sample (mean, std, n)
            - significant (bool): True if p_value < alpha
            - effect_size (str): Categorical interpretation of Cohen's d magnitude
            - alternative_hypothesis (str): Echo of alternative parameter
            - alpha (float): Echo of significance level used
            - effect_thresholds (List[float]): Echo of effect size thresholds used
        If the inputs are invalid or the test cannot be computed (for example the sample
        has zero variance), a dict with the single key "error" and a message is returned instead.
        Numeric results are plain Python float/int/bool values.
    """
    try:
        # Parse effect size thresholds
        try:
            thresholds = [float(x.strip()) for x in effect_thresholds.split(',')]
        except (ValueError, AttributeError, TypeError):
            return {"error": "Invalid effect thresholds format. Use 'small,medium,large' (e.g., '0.2,0.5,0.8')"}
        if len(thresholds) != 3:
            return {"error": "Effect thresholds must be three comma-separated numbers (small,medium,large)"}

        # Method 1: DataFrame input (preferred for LLMs and data pipelines).
        # A non-empty DataFrame always has at least one column, so no column-count check is needed.
        if dataframe is not None and not dataframe.empty:
            # Extract and validate data from first column
            try:
                # Convert to numeric, coercing errors to NaN
                col1_numeric = pd.to_numeric(dataframe.iloc[:, 0], errors='coerce')

                # Remove NaN values and convert to list
                group = col1_numeric.dropna().tolist()

                # Check if we lost too much data due to non-numeric values
                original_count = len(dataframe.iloc[:, 0].dropna())
                if len(group) < original_count * 0.5:  # Lost more than 50% of data
                    return {"error": f"Column 1 contains too many non-numeric values. Only {len(group)} out of {original_count} values could be converted to numbers."}
            except Exception as e:
                return {"error": f"Error processing DataFrame column: {str(e)}. Ensure column contains numeric data."}

        # Method 2: String input (preferred for humans and simple use cases)
        elif group_str and group_str.strip():
            try:
                group = parse_numeric_input(group_str)
            except ValueError as e:
                return {"error": f"String parsing error: {str(e)}"}
        else:
            return {"error": "Please provide either a DataFrame with data OR a comma-separated string for the sample. Do not leave input empty."}

        # Validate extracted data
        if len(group) < 2:
            return {"error": f"Sample must have at least 2 observations. Found {len(group)} values."}

        # Convert to numpy array for calculations
        data = np.array(group, dtype=float)

        # float() accepts "nan"/"inf" and dropna() keeps inf; either would turn every
        # statistic into NaN and make the effect size fall through to "large".
        if not np.isfinite(data).all():
            return {"error": "Sample must contain only finite numbers (no NaN or infinite values)."}

        # Descriptive statistics (sample standard deviation, ddof=1)
        n = len(data)
        sample_mean = np.mean(data)
        sample_std = np.std(data, ddof=1)
        if sample_std == 0:
            # A constant sample has no standard error: t and Cohen's d divide by zero.
            return {"error": "Cannot compute one-sample t-test: the sample has zero variance (all values are identical)."}

        # Perform one-sample t-test
        t_stat, p_value = stats.ttest_1samp(data, population_mean, alternative=alternative)

        # Degrees of freedom
        df = n - 1

        # Effect size (Cohen's d for one-sample: (sample_mean - population_mean) / sample_std)
        cohens_d = (sample_mean - population_mean) / sample_std

        # Interpretation using the supplied effect-size thresholds
        significant = bool(p_value < alpha)
        abs_d = abs(cohens_d)
        small_threshold, medium_threshold, large_threshold = thresholds
        if abs_d < small_threshold:
            effect_size_interp = "negligible"
        elif abs_d < medium_threshold:
            effect_size_interp = "small"
        elif abs_d < large_threshold:
            effect_size_interp = "medium"
        else:
            effect_size_interp = "large"

        # Convert numpy scalars to native Python types so the result matches the
        # documented types and can be JSON-serialized (np.bool_ cannot).
        return {
            "test_type": "One-sample t-test",
            "t_statistic": float(t_stat),
            "p_value": float(p_value),
            "degrees_of_freedom": df,
            "cohens_d": float(cohens_d),
            "pooled_std": float(sample_std),
            "group_stats": {"mean": float(sample_mean), "std": float(sample_std), "n": n},
            "significant": significant,
            "effect_size": effect_size_interp,
            "alternative_hypothesis": alternative,
            "alpha": alpha,
            "effect_thresholds": thresholds
        }
    except Exception as e:
        return {"error": f"Unexpected error in one-sample t-test: {str(e)}"}
def one_way_anova(
    dataframe: Optional[pd.DataFrame] = None,
    groups_str: Optional[str] = None,
    alpha: float = 0.05,
    effect_thresholds: str = "0.01,0.06,0.14"
) -> Dict[str, Any]:
    """
    Accepts multiple groups of numeric data as semicolon-separated groups or DataFrame columns and performs a one-way ANOVA
    (Analysis of Variance). This test determines whether there are statistically significant differences between the means
    of two or more (typically three or more) independent groups. ANOVA tests the null hypothesis that all group means are
    equal against the alternative that at least one group mean differs from the others. The test calculates an F-statistic
    by comparing the variance between groups to the variance within groups. A significant result (p-value < alpha) indicates
    that at least one group differs, but does not identify which specific groups differ (post-hoc tests needed for pairwise
    comparisons). Eta-squared (η²) measures effect size as the proportion of total variance explained by group membership,
    with interpretation: η² < 0.01 = negligible, 0.01-0.06 = small, 0.06-0.14 = medium, >0.14 = large (custom thresholds
    may be used).

    EXAMPLE USE CASES: comparing means across multiple treatment conditions, testing differences between multiple demographic groups,
    evaluating performance across several experimental conditions.

    Args:
        dataframe (Optional[pd.DataFrame]): DataFrame containing group data in columns. All columns will be treated as separate groups.
            If provided, groups_str will be ignored.
        groups_str (Optional[str]): Multiple groups separated by semicolons, with each group containing comma-separated values.
            Example: "85.2,90.1,78.5;88.1,85.7,91.2;82.3,87.4,89.1" (3 groups with their respective values)
            Only used if dataframe is None or empty.
        alpha (float): Significance level for the test (probability of Type I error). Reject null hypothesis if p_value below this threshold.
            Common values: 0.05 (5%), 0.01 (1%), 0.10 (10%)
        effect_thresholds (str): Three comma-separated values defining eta-squared effect size boundaries.
            Format: "small_threshold,medium_threshold,large_threshold"
            Default "0.01,0.06,0.14" means: <0.01=negligible, 0.01-0.06=small, 0.06-0.14=medium, >0.14=large

    Returns:
        dict: Comprehensive test results with the following keys (on invalid input, a dict with a single
        "error" key describing the problem is returned instead):
            - test_type (str): Always "One-way ANOVA"
            - f_statistic (float): The calculated F-value comparing between-group to within-group variance
            - p_value (float): Probability of observing this result if null hypothesis is true
            - degrees_of_freedom (dict): Contains df_between (groups-1) and df_within (total_n - groups)
            - eta_squared (float): Effect size measure (proportion of variance explained by groups)
            - group_stats (List[dict]): Descriptive statistics for each group (mean, std, n)
            - significant (bool): True if p_value < alpha
            - effect_size (str): Categorical interpretation of eta-squared magnitude
            - alpha (float): Echo of significance level used
            - effect_thresholds (List[float]): Echo of effect size thresholds used
    """
    try:
        # Parse effect size thresholds. Only the failures a malformed value can
        # cause are caught: a bad number (ValueError) or a non-string (AttributeError).
        try:
            thresholds = [float(x.strip()) for x in effect_thresholds.split(',')]
        except (ValueError, AttributeError):
            return {"error": "Invalid effect thresholds format. Use 'small,medium,large' (e.g., '0.01,0.06,0.14')"}
        if len(thresholds) != 3:
            return {"error": "Effect thresholds must be three comma-separated numbers (small,medium,large)"}

        groups = []

        # Method 1: DataFrame input (preferred for LLMs and data pipelines)
        if dataframe is not None and not dataframe.empty:
            # Use all columns as separate groups
            if len(dataframe.columns) < 2:
                return {"error": f"DataFrame must have at least 2 columns for ANOVA. Found {len(dataframe.columns)} columns."}

            # Extract and validate data from all columns (positional access so
            # duplicate column labels cannot confuse the lookup)
            try:
                for col_idx in range(len(dataframe.columns)):
                    column = dataframe.iloc[:, col_idx]
                    group_data = pd.to_numeric(column, errors='coerce').dropna().tolist()

                    # Reject the column if coercion discarded more than half of its entries
                    original_count = len(column.dropna())
                    if len(group_data) < original_count * 0.5:
                        return {"error": f"Column {col_idx+1} contains too many non-numeric values. Only {len(group_data)} out of {original_count} values could be converted to numbers."}
                    if len(group_data) < 2:
                        return {"error": f"Column {col_idx+1} must have at least 2 observations. Found {len(group_data)} values."}
                    groups.append(group_data)
            except Exception as e:
                return {"error": f"Error processing DataFrame columns: {str(e)}. Ensure columns contain numeric data."}

        # Method 2: String input (preferred for humans and simple use cases)
        elif groups_str and groups_str.strip():
            try:
                # Split by semicolon to separate groups
                group_strings = [group.strip() for group in groups_str.split(';') if group.strip()]
                if len(group_strings) < 2:
                    return {"error": "ANOVA requires at least 2 groups. Please provide groups separated by semicolons (;)."}
                for i, group_str in enumerate(group_strings):
                    try:
                        group_data = parse_numeric_input(group_str)
                    except ValueError as e:
                        return {"error": f"String parsing error for group {i+1}: {str(e)}"}
                    if len(group_data) < 2:
                        return {"error": f"Group {i+1} must have at least 2 observations. Found {len(group_data)} values."}
                    groups.append(group_data)
            except Exception as e:
                return {"error": f"Error parsing groups string: {str(e)}. Use format 'group1_values;group2_values;group3_values' where each group contains comma-separated numbers."}
        else:
            return {"error": "Please provide either a DataFrame with data OR a semicolon-separated string of groups. Do not leave input empty."}

        # Validate we have enough groups
        if len(groups) < 2:
            return {"error": "ANOVA requires at least 2 groups. Please provide data for at least 2 groups."}

        # Perform one-way ANOVA
        numpy_groups = [np.asarray(group, dtype=float) for group in groups]
        f_stat, p_value = stats.f_oneway(*numpy_groups)

        # Descriptive statistics per group, converted to plain Python numbers
        # so the result serialises cleanly (e.g. to JSON)
        group_stats = [
            {
                "group": i + 1,
                "mean": float(np.mean(group)),
                "std": float(np.std(group, ddof=1)),
                "n": len(group),
            }
            for i, group in enumerate(numpy_groups)
        ]

        # Effect size (eta-squared) = SS_between / SS_total
        all_data = np.concatenate(numpy_groups)
        overall_mean = np.mean(all_data)
        ss_total = float(np.sum((all_data - overall_mean) ** 2))
        ss_between = float(sum(len(group) * (np.mean(group) - overall_mean) ** 2 for group in numpy_groups))
        # All values identical -> no variance to explain; report 0 rather than dividing by zero
        eta_squared = ss_between / ss_total if ss_total > 0 else 0.0

        # Degrees of freedom
        df_between = len(groups) - 1
        df_within = len(all_data) - len(groups)

        # Interpretation using effect size thresholds.
        # bool(...) turns numpy.bool_ into the native bool the docstring promises.
        significant = bool(p_value < alpha)
        small_threshold, medium_threshold, large_threshold = thresholds
        if eta_squared < small_threshold:
            effect_size_interp = "negligible"
        elif eta_squared < medium_threshold:
            effect_size_interp = "small"
        elif eta_squared < large_threshold:
            effect_size_interp = "medium"
        else:
            effect_size_interp = "large"

        return {
            "test_type": "One-way ANOVA",
            "f_statistic": float(f_stat),
            "p_value": float(p_value),
            "degrees_of_freedom": {"df_between": df_between, "df_within": df_within},
            "eta_squared": eta_squared,
            "group_stats": group_stats,
            "significant": significant,
            "effect_size": effect_size_interp,
            "alpha": alpha,
            "effect_thresholds": thresholds
        }
    except Exception as e:
        return {"error": f"Unexpected error in one-way ANOVA: {str(e)}"}
def multi_way_anova(
    dataframe: Optional[pd.DataFrame] = None,
    dependent_var: Optional[str] = None,
    factors: Optional[str] = None,
    alpha: float = 0.05,
    effect_thresholds: str = "0.01,0.06,0.14",
    include_interactions: bool = True,
    max_interaction_order: Optional[int] = None,
    sum_squares_type: int = 2
) -> Dict[str, Any]:
    """
    Accepts multiple categorical factors and performs Multi-Way ANOVA to determine whether there are
    statistically significant differences between group means when multiple factors are involved simultaneously.
    Multi-way ANOVA extends the one-way ANOVA framework to handle complex experimental designs with multiple
    categorical independent variables (factors), each with two or more levels. Unlike one-way ANOVA which tests
    a single factor, multi-way ANOVA can simultaneously test: (1) main effects of each individual factor,
    (2) interaction effects between factors, and (3) higher-order interactions. The test uses F-statistics to
    compare variance between groups to variance within groups for each effect. Eta-squared (η²) measures effect
    size as the proportion of total variance explained by each factor and interaction, with interpretation:
    η² < 0.01 = negligible, 0.01-0.06 = small, 0.06-0.14 = medium, >0.14 = large (custom thresholds may be used).

    EXAMPLE USE CASES: 2-way ANOVA for treatment × gender effects on blood pressure, 3-way ANOVA for teaching
    method × school type × student age on test scores, 4-way ANOVA for drug × dose × gender × age effects on recovery.

    Args:
        dataframe (Optional[pd.DataFrame]): DataFrame containing the experimental data with factors as columns
            and the dependent variable. All factors must be categorical.
            If provided, dependent_var and factors parameters are required.
        dependent_var (Optional[str]): Name of the dependent (outcome) variable column in the DataFrame.
            Must be a continuous numeric variable.
            Example: "test_score", "recovery_time", "blood_pressure"
        factors (Optional[str]): Comma-separated string of factor column names from the DataFrame.
            Format: "factor1,factor2,factor3"
            Example: "treatment,gender,age_group" for a 3-way ANOVA
            Each factor must be categorical with 2 or more levels.
        alpha (float): Significance level for the test (probability of Type I error). Reject null hypothesis if p_value below this threshold.
            Common values: 0.05 (5%), 0.01 (1%), 0.10 (10%)
        effect_thresholds (str): Three comma-separated values defining eta-squared effect size boundaries.
            Format: "small_threshold,medium_threshold,large_threshold"
            Default "0.01,0.06,0.14" means: <0.01=negligible, 0.01-0.06=small, 0.06-0.14=medium, >0.14=large
            These follow Cohen's conventions for eta-squared interpretation.
        include_interactions (bool): Whether to include interaction terms in the model.
            True (default): Tests main effects AND interactions
            False: Tests only main effects (additive model)
        max_interaction_order (Optional[int]): Maximum order of interactions to include in the model.
            If None, includes all possible interactions up to the number of factors.
            Example: For 4 factors, setting to 2 includes only 2-way interactions.
            Values below 2 result in a main-effects-only model; values above the number of
            factors are clamped to the number of factors.
        sum_squares_type (int): Type of sum of squares calculation for the ANOVA table.
            Type 1: Sequential (depends on order of factors)
            Type 2: Marginal (recommended for balanced designs, default)
            Type 3: Partial (recommended for unbalanced designs)

    Returns:
        dict: Comprehensive test results with the following keys (on invalid input or a failed fit,
        a dict with a single "error" key describing the problem is returned instead):
            - test_type (str): Description of the multi-way ANOVA performed (e.g., "3-way ANOVA with interactions")
            - anova_table (pd.DataFrame): Complete ANOVA table with sum of squares, F-statistics, p-values, etc.
            - significant_effects (List[str]): List of statistically significant main effects and interactions
            - effect_sizes (Dict[str, float]): Eta-squared values for each effect measuring proportion of variance explained
            - effect_interpretations (Dict[str, str]): Categorical interpretation of each effect size ("negligible", "small", "medium", "large")
            - factor_summaries (Dict[str, dict]): Descriptive statistics for each factor level
            - model_summary (dict): Overall model statistics (R², F-statistic, AIC, BIC, etc.)
            - formula_used (str): The statsmodels formula string used for the analysis
            - design_summary (dict): Information about the experimental design (balanced/unbalanced, sample sizes)
            - alpha (float): Echo of significance level used
            - factors_analyzed (List[str]): Echo of factors included in the analysis
            - sum_squares_type (int): Echo of sum of squares type used
            - effect_thresholds (List[float]): Echo of effect size thresholds used
    """
    # Imported locally so the function does not depend on a module-level import.
    import itertools

    try:
        # Parse effect size thresholds
        try:
            thresholds = [float(x.strip()) for x in effect_thresholds.split(',')]
        except (ValueError, AttributeError):
            return {"error": "Invalid effect thresholds format. Use 'small,medium,large' (e.g., '0.01,0.06,0.14')"}
        if len(thresholds) != 3:
            return {"error": "Effect thresholds must be three comma-separated numbers (small,medium,large)"}

        # Validate inputs
        if dataframe is None or dataframe.empty:
            return {"error": "DataFrame cannot be None or empty"}
        if not dependent_var:
            return {"error": "Dependent variable name is required"}
        if dependent_var not in dataframe.columns:
            return {"error": f"Dependent variable '{dependent_var}' not found in DataFrame columns"}
        if not factors:
            return {"error": "Factor names are required. Provide as comma-separated string (e.g., 'factor1,factor2,factor3')"}

        # Parse factors
        try:
            factor_list = [f.strip() for f in factors.split(',') if f.strip()]
        except AttributeError:
            return {"error": "Invalid factors format. Use comma-separated factor names (e.g., 'treatment,gender,age_group')"}
        if len(factor_list) < 2:
            return {"error": "At least 2 factors are required for multi-way ANOVA"}

        # Check factors exist in DataFrame
        missing_factors = [f for f in factor_list if f not in dataframe.columns]
        if missing_factors:
            return {"error": f"Factors not found in DataFrame: {missing_factors}"}

        # Validate sum of squares type
        if sum_squares_type not in [1, 2, 3]:
            return {"error": "sum_squares_type must be 1, 2, or 3"}

        # UI number widgets may deliver floats (e.g. 2.0); range() below needs an int
        if max_interaction_order is not None:
            try:
                max_interaction_order = int(max_interaction_order)
            except (TypeError, ValueError):
                return {"error": "max_interaction_order must be an integer or None"}

        # Clean and prepare the data
        analysis_columns = [dependent_var] + factor_list
        analysis_df = dataframe[analysis_columns].copy()

        # Remove rows with missing values
        initial_rows = len(analysis_df)
        analysis_df = analysis_df.dropna()
        final_rows = len(analysis_df)
        if final_rows < initial_rows * 0.5:
            return {"error": f"Too much missing data: only {final_rows} out of {initial_rows} rows usable"}
        if final_rows < 20:
            return {"error": f"Insufficient data after removing missing values: {final_rows} rows remaining (minimum 20 required)"}

        # Validate dependent variable is numeric
        try:
            analysis_df[dependent_var] = pd.to_numeric(analysis_df[dependent_var])
        except (ValueError, TypeError):
            return {"error": f"Dependent variable '{dependent_var}' must be numeric"}

        # Ensure factors are categorical and check levels
        factor_level_counts = {}
        for factor in factor_list:
            analysis_df[factor] = analysis_df[factor].astype('category')
            unique_levels = len(analysis_df[factor].cat.categories)
            factor_level_counts[factor] = unique_levels
            if unique_levels < 2:
                return {"error": f"Factor '{factor}' must have at least 2 levels. Found {unique_levels} level(s)"}
            if unique_levels > 20:
                return {"error": f"Factor '{factor}' has too many levels ({unique_levels}). Consider combining levels or using a different analysis method"}

        # Check for sufficient observations per factor combination.
        # observed=False keeps unobserved combinations in the result with size 0
        # regardless of the pandas default; empty cells are reported before the
        # generic "too few observations" message, which would otherwise mask them.
        try:
            cell_counts = analysis_df.groupby(factor_list, observed=False).size()
            min_cell_size = cell_counts.min()
            empty_cells = int((cell_counts == 0).sum())
            if empty_cells > 0:
                return {"error": f"Missing data: {empty_cells} factor combinations have no observations"}
            if min_cell_size < 2:
                return {"error": f"Some factor combinations have fewer than 2 observations. Minimum cell size: {min_cell_size}"}
        except Exception as e:
            return {"error": f"Error checking experimental design: {str(e)}"}

        # Highest interaction order actually fitted (1 == main effects only).
        # Computed once so the formula and the reported description always agree.
        n_factors = len(factor_list)
        interaction_order = 1
        if include_interactions:
            requested_order = n_factors if max_interaction_order is None else max_interaction_order
            interaction_order = min(requested_order, n_factors)

        # Build formula components: main effects are always included
        formula_terms = [f"C({factor})" for factor in factor_list]
        for order in range(2, interaction_order + 1):
            for combination in itertools.combinations(factor_list, order):
                formula_terms.append(":".join(f"C({factor})" for factor in combination))

        # Build the complete formula
        formula = f"{dependent_var} ~ " + " + ".join(formula_terms)

        # statsmodels is imported lazily so the validation above still reports
        # precise errors when the package is unavailable.
        try:
            import statsmodels.api as sm
            from statsmodels.formula.api import ols
        except ImportError:
            return {"error": "statsmodels is required for multi-way ANOVA but is not installed"}

        # Fit the model
        try:
            model = ols(formula, data=analysis_df).fit()
        except Exception as e:
            return {"error": f"Model fitting failed: {str(e)}. This may indicate perfect multicollinearity or insufficient data variation"}

        # Generate ANOVA table
        try:
            anova_table = sm.stats.anova_lm(model, typ=int(sum_squares_type))
        except Exception as e:
            return {"error": f"ANOVA table generation failed: {str(e)}"}

        # Rows that are real effects. Type 3 tables also carry an 'Intercept' row,
        # which is neither a factor effect nor part of the variance to be explained.
        effect_rows = anova_table.drop(index=['Residual', 'Intercept'], errors='ignore')
        total_ss = float(anova_table['sum_sq'].drop(labels=['Intercept'], errors='ignore').sum())

        # Calculate effect sizes (eta-squared) and their interpretation
        small_threshold, medium_threshold, large_threshold = thresholds
        effect_sizes = {}
        effect_interpretations = {}
        for effect_name, row in effect_rows.iterrows():
            eta_squared = float(row['sum_sq'] / total_ss) if total_ss > 0 else 0.0
            effect_sizes[effect_name] = eta_squared
            if eta_squared < small_threshold:
                effect_interpretations[effect_name] = "negligible"
            elif eta_squared < medium_threshold:
                effect_interpretations[effect_name] = "small"
            elif eta_squared < large_threshold:
                effect_interpretations[effect_name] = "medium"
            else:
                effect_interpretations[effect_name] = "large"

        # Identify significant effects (a NaN p-value compares False and is skipped)
        significant_effects = [
            effect_name
            for effect_name, row in effect_rows.iterrows()
            if row['PR(>F)'] < alpha
        ]

        # Calculate factor summaries
        factor_summaries = {}
        for factor in factor_list:
            factor_stats = (
                analysis_df.groupby(factor, observed=False)[dependent_var]
                .agg(['mean', 'std', 'count'])
                .round(4)
            )
            factor_summaries[factor] = factor_stats.to_dict('index')

        # Model summary statistics (plain Python numbers for clean serialisation)
        model_summary = {
            "r_squared": float(model.rsquared),
            "adj_r_squared": float(model.rsquared_adj),
            "f_statistic": float(model.fvalue),
            "f_pvalue": float(model.f_pvalue),
            "aic": float(model.aic),
            "bic": float(model.bic),
            "df_model": float(model.df_model),
            "df_resid": float(model.df_resid),
            "n_observations": int(model.nobs),
            "mse_resid": float(model.mse_resid)
        }

        # Design summary
        total_combinations = int(np.prod(list(factor_level_counts.values())))
        observed_combinations = len(cell_counts)
        balanced = len(cell_counts.unique()) == 1  # All cells have same count
        design_summary = {
            "n_factors": n_factors,
            "factor_levels": factor_level_counts,
            "total_possible_combinations": total_combinations,
            "observed_combinations": observed_combinations,
            "is_balanced": balanced,
            "min_cell_size": int(min_cell_size),
            "max_cell_size": int(cell_counts.max()),
            "mean_cell_size": round(float(cell_counts.mean()), 2)
        }

        # Describe the model that was actually fitted
        test_description = f"{n_factors}-way ANOVA"
        if interaction_order >= 2:
            test_description += f" with interactions (up to {interaction_order}-way)"
        else:
            test_description += " (main effects only)"

        return {
            "test_type": test_description,
            "anova_table": anova_table,
            "significant_effects": significant_effects,
            "effect_sizes": effect_sizes,
            "effect_interpretations": effect_interpretations,
            "factor_summaries": factor_summaries,
            "model_summary": model_summary,
            "formula_used": formula,
            "design_summary": design_summary,
            "alpha": alpha,
            "factors_analyzed": factor_list,
            "sum_squares_type": sum_squares_type,
            "effect_thresholds": thresholds
        }
    except Exception as e:
        return {"error": f"Unexpected error in multi-way ANOVA: {str(e)}"}
def chi_square_test(
    dataframe: Optional[pd.DataFrame] = None,
    observed_str: Optional[str] = None,
    expected_str: Optional[str] = None,
    alpha: float = 0.05,
    effect_thresholds: str = "0.1,0.3,0.5"
) -> Dict[str, Any]:
    """
    Accepts observed frequencies (and optionally expected frequencies) as comma-separated strings or DataFrame columns
    and performs a chi-square goodness of fit test. This test determines whether observed categorical data frequencies
    differ significantly from expected frequencies. If no expected frequencies are provided, the test assumes equal
    distribution across all categories. The test calculates a chi-square statistic measuring the discrepancy between
    observed and expected frequencies. A significant result (p-value < alpha) indicates that the observed distribution
    differs from the expected distribution. Cramér's V measures effect size as the strength of association, with
    interpretation: V < 0.1 = negligible, 0.1-0.3 = small, 0.3-0.5 = medium, >0.5 = large (custom thresholds may be used).

    Expected values whose total differs from the observed total (for example proportions such as "0.2,0.3,0.5")
    are rescaled proportionally to the observed total before testing; this is reported via `expected_rescaled`.

    EXAMPLE USE CASES: testing if dice rolls follow uniform distribution, comparing observed vs expected sales across
    categories, analyzing survey response distributions.

    Args:
        dataframe (Optional[pd.DataFrame]): DataFrame containing frequency data in first column (observed) and
            optionally second column (expected). If provided, string parameters will be ignored.
            A second column with no entries at all is treated as "no expected frequencies given".
        observed_str (Optional[str]): Comma-separated string of observed frequencies.
            Example: "25,30,20,15" (frequencies for 4 categories)
            Only used if dataframe is None or empty.
        expected_str (Optional[str]): Comma-separated string of expected frequencies (optional).
            Example: "22.5,22.5,22.5,22.5" (equal distribution)
            If not provided, assumes equal distribution. Only used if dataframe is None or empty.
        alpha (float): Significance level for the test (probability of Type I error). Reject null hypothesis if p_value below this threshold.
            Common values: 0.05 (5%), 0.01 (1%), 0.10 (10%)
        effect_thresholds (str): Three comma-separated values defining Cramér's V effect size boundaries.
            Format: "small_threshold,medium_threshold,large_threshold"
            Default "0.1,0.3,0.5" means: <0.1=negligible, 0.1-0.3=small, 0.3-0.5=medium, >0.5=large

    Returns:
        dict: Comprehensive test results with the following keys (on invalid input, a dict with a single
        "error" key describing the problem is returned instead):
            - test_type (str): Always "Chi-square goodness of fit test"
            - chi_square_statistic (float): The calculated chi-square value measuring discrepancy from expected
            - p_value (float): Probability of observing this result if null hypothesis is true
            - degrees_of_freedom (int): df = categories - 1
            - cramers_v (float): Effect size measure (strength of association)
            - significant (bool): True if p_value < alpha
            - effect_size (str): Categorical interpretation of Cramér's V magnitude
            - alpha (float): Echo of significance level used
            - effect_thresholds (List[float]): Echo of effect size thresholds used
            - expected_rescaled (bool): True if the expected values were rescaled to match the observed total
    """
    try:
        # Parse effect size thresholds
        try:
            thresholds = [float(x.strip()) for x in effect_thresholds.split(',')]
        except (ValueError, AttributeError):
            return {"error": "Invalid effect thresholds format. Use 'small,medium,large' (e.g., '0.1,0.3,0.5')"}
        if len(thresholds) != 3:
            return {"error": "Effect thresholds must be three comma-separated numbers (small,medium,large)"}

        # None means "no expected frequencies supplied": an equal distribution is used
        expected = None

        # Method 1: DataFrame input (preferred for LLMs and data pipelines)
        if dataframe is not None and not dataframe.empty:
            # Use first column for observed, second column for expected (if available)
            if len(dataframe.columns) < 1:
                return {"error": f"DataFrame must have at least 1 column. Found {len(dataframe.columns)} columns."}
            try:
                # Convert first column to numeric (observed frequencies)
                raw_observed = dataframe.iloc[:, 0]
                col1_numeric = pd.to_numeric(raw_observed, errors='coerce')
                observed_mask = col1_numeric.notna()
                observed = col1_numeric[observed_mask].tolist()

                # Check if we lost too much data
                original_count1 = len(raw_observed.dropna())
                if len(observed) < original_count1 * 0.5:
                    return {"error": f"Column 1 contains too many non-numeric values. Only {len(observed)} out of {original_count1} values could be converted to numbers."}

                # Second column (expected frequencies), unless it is completely empty
                if len(dataframe.columns) >= 2 and dataframe.iloc[:, 1].notna().any():
                    col2_numeric = pd.to_numeric(dataframe.iloc[:, 1], errors='coerce')
                    expected_mask = col2_numeric.notna()
                    if int(expected_mask.sum()) != len(observed):
                        return {"error": "Observed and expected columns must have the same number of valid entries."}
                    # Equal counts are not enough: values must sit in the same rows,
                    # otherwise categories would be paired with the wrong expectation.
                    if not (expected_mask.to_numpy() == observed_mask.to_numpy()).all():
                        return {"error": "Observed and expected columns must have valid numeric entries in the same rows."}
                    expected = col2_numeric[expected_mask].tolist()
            except Exception as e:
                return {"error": f"Error processing DataFrame columns: {str(e)}. Ensure columns contain numeric data."}

        # Method 2: String input (preferred for humans and simple use cases)
        elif observed_str and observed_str.strip():
            try:
                observed = parse_numeric_input(observed_str)
                if expected_str and expected_str.strip():
                    expected = parse_numeric_input(expected_str)
                    if len(observed) != len(expected):
                        return {"error": "Observed and expected must have the same number of categories."}
            except ValueError as e:
                return {"error": f"String parsing error: {str(e)}"}
        else:
            return {"error": "Please provide either a DataFrame with data OR a comma-separated string for observed frequencies. Do not leave input empty."}

        # Validate extracted data (done before any division by the category count)
        if len(observed) < 2:
            return {"error": f"Need at least 2 categories for chi-square test. Found {len(observed)} categories."}

        if expected is None:
            # Equal distribution across all categories
            total = sum(observed)
            expected = [total / len(observed)] * len(observed)

        # Check for non-negative frequencies
        if any(x < 0 for x in observed) or any(x < 0 for x in expected):
            return {"error": "Frequencies cannot be negative."}

        # Check for zero expected frequencies
        if any(x <= 0 for x in expected):
            return {"error": "Expected frequencies must be greater than zero."}

        observed_array = np.asarray(observed, dtype=float)
        expected_array = np.asarray(expected, dtype=float)
        observed_total = float(observed_array.sum())
        expected_total = float(expected_array.sum())
        if observed_total <= 0:
            return {"error": "Observed frequencies must sum to a positive value."}

        # scipy.stats.chisquare requires both totals to agree. Expected values
        # given as proportions (or on another scale) are rescaled to the observed total.
        expected_rescaled = not bool(np.isclose(observed_total, expected_total, rtol=1e-8, atol=0.0))
        if expected_rescaled:
            expected_array = expected_array * (observed_total / expected_total)

        # Perform chi-square goodness of fit test
        chi2_stat, p_value = stats.chisquare(observed_array, expected_array)

        # Degrees of freedom
        df = len(observed) - 1

        # Effect size (Cramér's V for goodness of fit)
        cramers_v = float(np.sqrt(chi2_stat / (observed_total * df))) if df > 0 else 0.0

        # Interpretation using effect size thresholds.
        # bool(...) turns numpy.bool_ into the native bool the docstring promises.
        significant = bool(p_value < alpha)
        small_threshold, medium_threshold, large_threshold = thresholds
        if cramers_v < small_threshold:
            effect_size_interp = "negligible"
        elif cramers_v < medium_threshold:
            effect_size_interp = "small"
        elif cramers_v < large_threshold:
            effect_size_interp = "medium"
        else:
            effect_size_interp = "large"

        return {
            "test_type": "Chi-square goodness of fit test",
            "chi_square_statistic": float(chi2_stat),
            "p_value": float(p_value),
            "degrees_of_freedom": df,
            "cramers_v": cramers_v,
            "significant": significant,
            "effect_size": effect_size_interp,
            "alpha": alpha,
            "effect_thresholds": thresholds,
            "expected_rescaled": expected_rescaled
        }
    except Exception as e:
        return {"error": f"Unexpected error in chi-square test: {str(e)}"}
def correlation_test(
    dataframe: Optional[pd.DataFrame] = None,
    group1_str: Optional[str] = None,
    group2_str: Optional[str] = None,
    method: str = "pearson",
    alpha: float = 0.05,
    effect_thresholds: str = "0.1,0.3,0.5"
) -> Dict[str, Any]:
    """
    Accepts two variables as comma-separated strings or DataFrame columns and performs correlation analysis.
    This test determines the strength and direction of the relationship between two continuous variables.
    Pearson correlation measures linear relationships, Spearman correlation measures monotonic relationships
    (rank-based), and Kendall's tau is robust to outliers and suitable for small samples. The test calculates
    a correlation coefficient ranging from -1 (perfect negative correlation) to +1 (perfect positive correlation),
    with 0 indicating no relationship. A significant result (p-value < alpha) indicates that the observed
    correlation is statistically different from zero. Effect size interpretation: |r| < 0.1 = negligible,
    0.1-0.3 = small, 0.3-0.5 = medium, >0.5 = large (custom thresholds may be used).

    EXAMPLE USE CASES: examining relationship between height and weight, analyzing correlation between study time
    and test scores, investigating association between variables in research.

    Args:
        dataframe (Optional[pd.DataFrame]): DataFrame containing two variables in first two columns.
            If provided, group1_str and group2_str will be ignored.
        group1_str (Optional[str]): Comma-separated string of numeric values for the first variable (X).
            Example: "5.2,6.1,4.8,7.3" (hours studied)
            Only used if dataframe is None or empty.
        group2_str (Optional[str]): Comma-separated string of numeric values for the second variable (Y).
            Example: "78,85,72,92" (test scores)
            Only used if dataframe is None or empty.
        method (str): Correlation method to use (case-insensitive):
            - "pearson": Pearson product-moment correlation (linear relationships)
            - "spearman": Spearman rank correlation (monotonic relationships)
            - "kendall": Kendall's tau (robust to outliers, good for small samples)
        alpha (float): Significance level for the test (probability of Type I error). Reject null hypothesis if p_value below this threshold.
            Common values: 0.05 (5%), 0.01 (1%), 0.10 (10%)
        effect_thresholds (str): Three comma-separated values defining correlation effect size boundaries.
            Format: "small_threshold,medium_threshold,large_threshold"
            Default "0.1,0.3,0.5" means: <0.1=negligible, 0.1-0.3=small, 0.3-0.5=medium, >0.5=large

    Returns:
        dict: Comprehensive test results with the following keys (on invalid input, including a variable
        with zero variance, a dict with a single "error" key describing the problem is returned instead):
            - test_type (str): Type of correlation test performed
            - correlation_coefficient (float): The calculated correlation coefficient (-1 to +1)
            - p_value (float): Probability of observing this result if null hypothesis (no correlation) is true
            - sample_size (int): Number of paired observations
            - significant (bool): True if p_value < alpha
            - effect_size (str): Categorical interpretation of correlation magnitude
            - method (str): Echo of correlation method used
            - alpha (float): Echo of significance level used
            - effect_thresholds (List[float]): Echo of effect size thresholds used
            - group1_stats (dict): Descriptive statistics for first variable (mean, std, n)
            - group2_stats (dict): Descriptive statistics for second variable (mean, std, n)
    """
    try:
        # Parse effect size thresholds
        try:
            thresholds = [float(x.strip()) for x in effect_thresholds.split(',')]
        except (ValueError, AttributeError):
            return {"error": "Invalid effect thresholds format. Use 'small,medium,large' (e.g., '0.1,0.3,0.5')"}
        if len(thresholds) != 3:
            return {"error": "Effect thresholds must be three comma-separated numbers (small,medium,large)"}

        # Method 1: DataFrame input (preferred for LLMs and data pipelines)
        if dataframe is not None and not dataframe.empty:
            # Use first two columns
            if len(dataframe.columns) < 2:
                return {"error": f"DataFrame must have at least 2 columns for correlation. Found {len(dataframe.columns)} columns."}
            try:
                # Convert to numeric, coercing errors to NaN
                col1_numeric = pd.to_numeric(dataframe.iloc[:, 0], errors='coerce')
                col2_numeric = pd.to_numeric(dataframe.iloc[:, 1], errors='coerce')

                # Keep only rows where both values are present so pairs stay aligned
                valid_mask = ~(col1_numeric.isna() | col2_numeric.isna())
                group1 = col1_numeric[valid_mask].tolist()
                group2 = col2_numeric[valid_mask].tolist()

                # Check if we lost too much data
                original_count = len(dataframe)
                if len(group1) < original_count * 0.5:
                    return {"error": f"Too many non-numeric values in the data. Only {len(group1)} out of {original_count} rows could be used."}
            except Exception as e:
                return {"error": f"Error processing DataFrame columns: {str(e)}. Ensure columns contain numeric data."}

        # Method 2: String input (preferred for humans and simple use cases)
        elif group1_str and group2_str and group1_str.strip() and group2_str.strip():
            try:
                group1 = parse_numeric_input(group1_str)
                group2 = parse_numeric_input(group2_str)
            except ValueError as e:
                return {"error": f"String parsing error: {str(e)}"}
            if len(group1) != len(group2):
                return {"error": f"Variables must have the same number of observations. Variable 1 has {len(group1)}, Variable 2 has {len(group2)}."}
        else:
            return {"error": "Please provide either a DataFrame with data OR comma-separated strings for both variables. Do not leave inputs empty."}

        # Validate extracted data
        if len(group1) < 3:
            return {"error": f"Need at least 3 paired observations for correlation. Found {len(group1)} pairs."}

        # Choose correlation method
        correlation_methods = {
            "pearson": (stats.pearsonr, "Pearson correlation"),
            "spearman": (stats.spearmanr, "Spearman rank correlation"),
            "kendall": (stats.kendalltau, "Kendall's tau correlation"),
        }
        method_lower = str(method).strip().lower()
        if method_lower not in correlation_methods:
            return {"error": "Method must be 'pearson', 'spearman', or 'kendall'"}
        correlate, test_name = correlation_methods[method_lower]

        data1 = np.asarray(group1, dtype=float)
        data2 = np.asarray(group2, dtype=float)

        # A constant variable has zero variance, so every coefficient is undefined
        # (NaN). Without this guard NaN fails all '<' comparisons below and would
        # be labelled a "large" effect.
        if data1.max() == data1.min() or data2.max() == data2.min():
            return {"error": "Correlation is undefined when a variable is constant (all values identical). Provide data with some variation in both variables."}

        # Perform correlation analysis
        corr_coef, p_value = correlate(data1, data2)
        corr_coef = float(corr_coef)
        p_value = float(p_value)
        if not np.isfinite(corr_coef):
            return {"error": "Correlation coefficient could not be computed for this data (result is not a finite number)."}

        # Calculate descriptive statistics
        desc1 = {"mean": float(np.mean(data1)), "std": float(np.std(data1, ddof=1)), "n": len(data1)}
        desc2 = {"mean": float(np.mean(data2)), "std": float(np.std(data2, ddof=1)), "n": len(data2)}

        # Interpretation using effect size thresholds.
        # bool(...) turns numpy.bool_ into the native bool the docstring promises.
        significant = bool(p_value < alpha)
        abs_corr = abs(corr_coef)
        small_threshold, medium_threshold, large_threshold = thresholds
        if abs_corr < small_threshold:
            effect_size_interp = "negligible"
        elif abs_corr < medium_threshold:
            effect_size_interp = "small"
        elif abs_corr < large_threshold:
            effect_size_interp = "medium"
        else:
            effect_size_interp = "large"

        return {
            "test_type": test_name,
            "correlation_coefficient": corr_coef,
            "p_value": p_value,
            "sample_size": len(data1),
            "significant": significant,
            "effect_size": effect_size_interp,
            "method": method_lower,
            "alpha": alpha,
            "effect_thresholds": thresholds,
            "group1_stats": desc1,
            "group2_stats": desc2
        }
    except Exception as e:
        return {"error": f"Unexpected error in correlation test: {str(e)}"}
# SHARED UTILITY FUNCTIONS (Hidden from MCP)
def load_uploaded_file(file_path, has_header_flag):
    """Load an uploaded CSV/Excel file and return its first two columns.

    Args:
        file_path: Path of the uploaded file, or None when nothing is uploaded.
        has_header_flag: True when the first row of the file holds column names.

    Returns:
        tuple: ``(dataframe, preview)``.
            * No file: ``(None, None)``.
            * Success: ``dataframe`` holds the first two columns coerced to
              numeric (non-numeric cells become NaN, rows where both values
              are NaN are dropped) and ``preview`` is its first 10 rows.
            * Failure: ``dataframe`` is None and ``preview`` is a DataFrame
              with a single ``Error`` column describing the problem.
    """
    if file_path is None:
        return None, None
    try:
        # Determine header parameter for pandas
        header_param = 0 if has_header_flag else None
        # Compare the extension case-insensitively so "DATA.CSV" is accepted
        lowered_path = str(file_path).lower()
        if lowered_path.endswith('.csv'):
            df = pd.read_csv(file_path, header=header_param)
        elif lowered_path.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file_path, header=header_param)
        else:
            return None, pd.DataFrame({'Error': ['Unsupported file format']})

        if len(df.columns) < 2:
            error_df = pd.DataFrame({'Error': ['File must have at least 2 columns']})
            return None, error_df

        # Take only first two columns
        df_subset = df.iloc[:, :2].copy()
        if has_header_flag and not str(df_subset.columns[0]).startswith('Unnamed'):
            # Keep original column names if they exist and aren't auto-generated
            df_subset.columns = [str(name) for name in df_subset.columns]
        else:
            # Use default names
            df_subset.columns = ['Group1', 'Group2']

        # Convert columns to numeric, replacing non-numeric with NaN. The
        # columns are rebuilt rather than assigned through .iloc, because
        # .iloc assignment sets values in place on recent pandas and can
        # leave a text column as object dtype.
        df_subset = df_subset.apply(pd.to_numeric, errors='coerce')
        # Remove rows where both values are NaN
        df_subset = df_subset.dropna(how='all')

        # Return full dataframe for processing and preview for display
        preview = df_subset.head(10)  # Show first 10 rows
        return df_subset, preview
    except Exception as e:
        # Any read/parse failure is reported through the preview table
        error_df = pd.DataFrame({'Error': [f"Failed to load file: {str(e)}"]})
        return None, error_df
def toggle_input_method(method):
    """Return visibility updates for the (file upload, text input) sections.

    The file section is shown only when ``method`` is "File Upload"; for any
    other value the text section is shown instead.
    """
    show_file_section = method == "File Upload"
    file_update = gr.update(visible=show_file_section)
    text_update = gr.update(visible=not show_file_section)
    return file_update, text_update
def clear_all():
    """Return the default value of every two-group form field.

    The tuple order matches the ``outputs`` list wired to the "Clear All"
    button: input_method, loaded_dataframe, data_preview, group1_str,
    group2_str, alternative, alpha, effect_thresholds, output.
    """
    defaults = [
        ("input_method", "File Upload"),
        ("loaded_dataframe", None),
        ("data_preview", None),
        ("group1_str", ""),
        ("group2_str", ""),
        ("alternative", "two-sided"),
        ("alpha", 0.05),
        ("effect_thresholds", "0.2,0.5,0.8"),
        ("output", {}),
    ]
    return tuple(value for _, value in defaults)
def load_example():
    """Build the demo two-group dataset.

    Returns:
        tuple: ("File Upload", example DataFrame, preview of its first 10
        rows, "", "") - the two empty strings blank the text inputs.
    """
    treatment_scores = [85.2, 90.1, 78.5, 92.3, 88.7, 86.4, 89.2]
    control_scores = [88.1, 85.7, 91.2, 87.4, 89.3, 90.8, 86.9]
    example_df = pd.DataFrame({
        'Treatment': treatment_scores,
        'Control': control_scores,
    })
    return "File Upload", example_df, example_df.head(10), "", ""
# COMPONENT FACTORY FUNCTIONS
def create_input_components():
    """Build the data-entry widgets used by the two-group test tabs.

    Components are created in display order: the input-method radio, the
    file-upload group (visible by default) and the text-input group (hidden
    by default).

    Returns:
        tuple: (input_method, file_section, text_section, file_upload,
        has_header, data_preview, group1_str, group2_str)
    """
    method_choices = ["File Upload", "Text Input"]
    accepted_extensions = [".csv", ".xlsx", ".xls"]

    # Radio that switches between the two sections below
    method_radio = gr.Radio(
        choices=method_choices,
        value="File Upload",
        label="Choose Input Method",
        info="Select how you want to provide your data"
    )

    # Upload section: file picker + header checkbox side by side, preview below
    with gr.Group(visible=True) as upload_group:
        gr.Markdown("### File Upload")
        gr.Markdown("*Upload CSV or Excel file - first two columns will be used as Group 1 and Group 2*")
        with gr.Row():
            uploader = gr.File(
                label="Upload CSV/Excel File",
                file_types=accepted_extensions,
                type="filepath"
            )
            header_toggle = gr.Checkbox(
                label="File has header row",
                value=True,
                info="Check if first row contains column names"
            )
        preview_table = gr.Dataframe(
            label="Data Preview (first two columns)",
            interactive=False,
            row_count=5
        )

    # Manual entry section: one textbox per group
    with gr.Group(visible=False) as manual_group:
        gr.Markdown("### Text Input")
        gr.Markdown("*Enter comma-separated numbers for each group*")
        first_group_box = gr.Textbox(
            placeholder="85.2,90.1,78.5,92.3,88.7",
            label="Group 1 Data",
            info="Comma-separated numbers (e.g., test scores for condition A)"
        )
        second_group_box = gr.Textbox(
            placeholder="88.1,85.7,91.2,87.4,89.3",
            label="Group 2 Data",
            info="Comma-separated numbers (e.g., test scores for condition B)"
        )

    return (
        method_radio,
        upload_group,
        manual_group,
        uploader,
        header_toggle,
        preview_table,
        first_group_box,
        second_group_box,
    )
def create_parameter_components():
    """Build the hypothesis-test parameter widgets used by the two-group tabs.

    Returns:
        tuple: (alternative, alpha, effect_thresholds)
    """
    hypothesis_choices = ["two-sided", "less", "greater"]

    gr.Markdown("### Test Parameters")
    with gr.Row():
        alternative_dropdown = gr.Dropdown(
            choices=hypothesis_choices,
            value="two-sided",
            label="Alternative Hypothesis",
            info="two-sided: groups differ; less: group1 < group2; greater: group1 > group2"
        )
        alpha_input = gr.Number(
            value=0.05,
            minimum=0,
            maximum=1,
            step=0.01,
            label="Significance Level (α)",
            info="Probability threshold for statistical significance (typically 0.05)"
        )
    thresholds_box = gr.Textbox(
        value="0.2,0.5,0.8",
        label="Effect Size Thresholds",
        info="Cohen's d boundaries: small,medium,large (Cohen's canonical values)"
    )
    return alternative_dropdown, alpha_input, thresholds_box
def create_t_test_tab(test_function, test_name, description):
    """
    Factory function to create a complete two-group t-test tab with all components and handlers.

    Args:
        test_function: The statistical function run by the "Run" button. It is called
            with (dataframe, group1_str, group2_str, alternative, alpha,
            effect_thresholds); this file passes student_t_test, welch_t_test
            and paired_t_test.
        test_name: Display name for the tab (e.g., "Student's T-Test")
        description: Markdown description to show at the top of the tab

    Returns:
        dict: Dictionary containing all created components and state for external reference
    """
    with gr.TabItem(test_name):
        gr.Markdown(description)

        # Create input components
        (input_method, file_section, text_section, file_upload,
         has_header, data_preview, group1_str, group2_str) = create_input_components()

        # Create parameter components
        alternative, alpha, effect_thresholds = create_parameter_components()

        # Create action buttons
        with gr.Row():
            run_button = gr.Button(f"Run {test_name}", variant="primary", scale=1)
            clear_button = gr.Button("Clear All", variant="secondary", scale=1)

        # Output display
        output = gr.JSON(label="Statistical Test Results")

        # Example data section
        with gr.Row():
            gr.Markdown("### Quick Examples")
            # NOTE(review): "outline" is not among the documented gr.Button variants - confirm
            example_button = gr.Button("Load Example Data", variant="outline")

        # State management: parsed DataFrame of the uploaded/example data
        loaded_dataframe = gr.State(value=None)

        # EVENT HANDLERS
        # Toggle between input methods
        input_method.change(
            fn=toggle_input_method,
            inputs=input_method,
            outputs=[file_section, text_section],
            show_api=False  # Hide UI helper from MCP
        )

        # File upload handlers (re-parse when the file or the header flag changes)
        file_upload.change(
            fn=load_uploaded_file,
            inputs=[file_upload, has_header],
            outputs=[loaded_dataframe, data_preview],
            show_api=False  # Hide UI helper from MCP
        )
        has_header.change(
            fn=load_uploaded_file,
            inputs=[file_upload, has_header],
            outputs=[loaded_dataframe, data_preview],
            show_api=False  # Hide UI helper from MCP
        )

        # MAIN STATISTICAL FUNCTION CALL - This will be exposed to MCP!
        run_button.click(
            fn=test_function,  # Direct call to the statistical function
            inputs=[
                loaded_dataframe,   # dataframe
                group1_str,         # group1_str
                group2_str,         # group2_str
                alternative,        # alternative
                alpha,              # alpha
                effect_thresholds   # effect_thresholds
            ],
            outputs=output
            # Note: No show_api=False here - we want the main function exposed to MCP!
        )

        # Clear form handler
        def clear_tab():
            """Reset the form and empty the upload widget.

            Without the trailing None the File widget would keep showing the
            uploaded file although the parsed DataFrame has been dropped.
            """
            return (*clear_all(), None)

        clear_button.click(
            fn=clear_tab,
            outputs=[
                input_method, loaded_dataframe, data_preview,
                group1_str, group2_str, alternative,
                alpha, effect_thresholds, output,
                file_upload
            ],
            show_api=False  # Hide UI helper from MCP
        )

        # Example data handler
        example_button.click(
            fn=load_example,
            outputs=[input_method, loaded_dataframe, data_preview,
                     group1_str, group2_str],
            show_api=False  # Hide UI helper from MCP
        )

    # Return components for external reference if needed
    return {
        'input_method': input_method,
        'file_upload': file_upload,
        'has_header': has_header,
        'data_preview': data_preview,
        'group1_str': group1_str,
        'group2_str': group2_str,
        'alternative': alternative,
        'alpha': alpha,
        'effect_thresholds': effect_thresholds,
        'run_button': run_button,
        'clear_button': clear_button,
        'example_button': example_button,
        'output': output,
        'loaded_dataframe': loaded_dataframe
    }
def create_one_sample_t_test_tab():
    """Create a complete one-sample t-test tab with all components and handlers.

    The "Run" button calls one_sample_t_test with (dataframe, group_str,
    population_mean, alternative, alpha, effect_thresholds). Returns None.
    """
    with gr.TabItem("One-Sample T-Test"):
        gr.Markdown("**Test a sample against a known population mean**")

        # Input method selector
        input_method = gr.Radio(
            choices=["File Upload", "Text Input"],
            value="File Upload",
            label="Choose Input Method",
            info="Select how you want to provide your data"
        )

        # File upload input section
        with gr.Group(visible=True) as file_section:
            gr.Markdown("### File Upload")
            gr.Markdown("*Upload CSV or Excel file - first column will be used as sample data*")
            with gr.Row():
                file_upload = gr.File(
                    label="Upload CSV/Excel File",
                    file_types=[".csv", ".xlsx", ".xls"],
                    type="filepath"
                )
                has_header = gr.Checkbox(
                    label="File has header row",
                    value=True,
                    info="Check if first row contains column names"
                )
            # Display loaded data preview
            data_preview = gr.Dataframe(
                label="Data Preview (first column)",
                interactive=False,
                row_count=5
            )

        # Text input section
        with gr.Group(visible=False) as text_section:
            gr.Markdown("### Text Input")
            gr.Markdown("*Enter comma-separated numbers for your sample*")
            group_str = gr.Textbox(
                placeholder="85.2,90.1,78.5,92.3,88.7",
                label="Sample Data",
                info="Comma-separated numbers (e.g., test scores, measurements)"
            )

        # Test parameters
        gr.Markdown("### Test Parameters")
        with gr.Row():
            population_mean = gr.Number(
                value=0.0,
                label="Population Mean (μ₀)",
                info="Known or hypothesized population mean to test against"
            )
            alternative = gr.Dropdown(
                choices=["two-sided", "less", "greater"],
                value="two-sided",
                label="Alternative Hypothesis",
                info="two-sided: sample ≠ population; less: sample < population; greater: sample > population"
            )
        with gr.Row():
            alpha = gr.Number(
                value=0.05,
                minimum=0,
                maximum=1,
                step=0.01,
                label="Significance Level (α)",
                info="Probability threshold for statistical significance (typically 0.05)"
            )
            effect_thresholds = gr.Textbox(
                value="0.2,0.5,0.8",
                label="Effect Size Thresholds",
                info="Cohen's d boundaries: small,medium,large"
            )

        # Action buttons
        with gr.Row():
            run_button = gr.Button("Run One-Sample T-Test", variant="primary", scale=1)
            clear_button = gr.Button("Clear All", variant="secondary", scale=1)

        # Output display
        output = gr.JSON(label="Statistical Test Results")

        # Example data section
        with gr.Row():
            gr.Markdown("### Quick Examples")
            # NOTE(review): "outline" is not among the documented gr.Button variants - confirm
            example_button = gr.Button("Load Example Data", variant="outline")

        # State management: parsed DataFrame of the uploaded data
        loaded_dataframe = gr.State(value=None)

        def load_sample_file(file_path, has_header_flag):
            """Load an uploaded file for this tab and return (dataframe, preview).

            Unlike the shared two-group loader, a single-column file is
            accepted, because this tab only needs one sample column. Up to
            two columns are kept so multi-column files yield the same columns
            as they did with the shared loader.
            """
            if file_path is None:
                return None, None
            try:
                header_param = 0 if has_header_flag else None
                lowered_path = str(file_path).lower()
                if lowered_path.endswith('.csv'):
                    df = pd.read_csv(file_path, header=header_param)
                elif lowered_path.endswith(('.xlsx', '.xls')):
                    df = pd.read_excel(file_path, header=header_param)
                else:
                    return None, pd.DataFrame({'Error': ['Unsupported file format']})
                if len(df.columns) < 1:
                    return None, pd.DataFrame({'Error': ['File must have at least 1 column']})
                df_subset = df.iloc[:, :2].copy()
                if has_header_flag and not str(df_subset.columns[0]).startswith('Unnamed'):
                    df_subset.columns = [str(name) for name in df_subset.columns]
                else:
                    df_subset.columns = ['Group1', 'Group2'][:len(df_subset.columns)]
                # Non-numeric cells become NaN; fully empty rows are dropped
                df_subset = df_subset.apply(pd.to_numeric, errors='coerce')
                df_subset = df_subset.dropna(how='all')
                return df_subset, df_subset.head(10)
            except Exception as e:
                error_df = pd.DataFrame({'Error': [f"Failed to load file: {str(e)}"]})
                return None, error_df

        # EVENT HANDLERS
        # Toggle between input methods
        input_method.change(
            fn=toggle_input_method,
            inputs=input_method,
            outputs=[file_section, text_section],
            show_api=False
        )

        # File upload handlers (re-parse when the file or the header flag changes)
        file_upload.change(
            fn=load_sample_file,
            inputs=[file_upload, has_header],
            outputs=[loaded_dataframe, data_preview],
            show_api=False
        )
        has_header.change(
            fn=load_sample_file,
            inputs=[file_upload, has_header],
            outputs=[loaded_dataframe, data_preview],
            show_api=False
        )

        # MAIN STATISTICAL FUNCTION CALL - Exposed to MCP!
        run_button.click(
            fn=one_sample_t_test,
            inputs=[
                loaded_dataframe,   # dataframe
                group_str,          # group_str
                population_mean,    # population_mean
                alternative,        # alternative
                alpha,              # alpha
                effect_thresholds   # effect_thresholds
            ],
            outputs=output
        )

        # Clear form handler
        def clear_one_sample():
            return (
                "File Upload",   # input_method
                None,            # loaded_dataframe
                None,            # data_preview
                "",              # group_str
                0.0,             # population_mean
                "two-sided",     # alternative
                0.05,            # alpha
                "0.2,0.5,0.8",   # effect_thresholds
                {},              # output
                None             # file_upload: keep the widget in sync with the cleared state
            )

        clear_button.click(
            fn=clear_one_sample,
            outputs=[
                input_method, loaded_dataframe, data_preview,
                group_str, population_mean, alternative,
                alpha, effect_thresholds, output,
                file_upload
            ],
            show_api=False
        )

        # Example data handler
        def load_one_sample_example():
            example_data = "100,105,98,102,97,103,99,101,96,104"
            return "Text Input", None, None, example_data, 100.0

        example_button.click(
            fn=load_one_sample_example,
            outputs=[input_method, loaded_dataframe, data_preview, group_str, population_mean],
            show_api=False
        )
def create_anova_tab():
    """Create a complete one-way ANOVA tab with all components and handlers.

    The "Run" button calls one_way_anova with (dataframe, groups_str, alpha,
    effect_thresholds). Returns None.
    """
    with gr.TabItem("One-Way ANOVA"):
        gr.Markdown("**Compare means across multiple independent groups**")

        # Input method selector
        input_method = gr.Radio(
            choices=["File Upload", "Text Input"],
            value="File Upload",
            label="Choose Input Method",
            info="Select how you want to provide your data"
        )

        # File upload input section
        with gr.Group(visible=True) as file_section:
            gr.Markdown("### File Upload")
            gr.Markdown("*Upload CSV or Excel file - each column will be treated as a separate group*")
            with gr.Row():
                file_upload = gr.File(
                    label="Upload CSV/Excel File",
                    file_types=[".csv", ".xlsx", ".xls"],
                    type="filepath"
                )
                has_header = gr.Checkbox(
                    label="File has header row",
                    value=True,
                    info="Check if first row contains column names"
                )
            # Display loaded data preview
            data_preview = gr.Dataframe(
                label="Data Preview (all columns as groups)",
                interactive=False,
                row_count=5
            )

        # Text input section
        with gr.Group(visible=False) as text_section:
            gr.Markdown("### Text Input")
            gr.Markdown("*Enter groups separated by semicolons (;) with comma-separated values within each group*")
            groups_str = gr.Textbox(
                placeholder="85.2,90.1,78.5;88.1,85.7,91.2;82.3,87.4,89.1",
                label="Groups Data",
                info="Format: group1_values;group2_values;group3_values (e.g., treatment A;treatment B;control)",
                lines=3
            )
            gr.Markdown("**Example**: `85.2,90.1,78.5;88.1,85.7,91.2;82.3,87.4,89.1` represents 3 groups with their respective measurements")

        # Test parameters
        gr.Markdown("### Test Parameters")
        with gr.Row():
            alpha = gr.Number(
                value=0.05,
                minimum=0,
                maximum=1,
                step=0.01,
                label="Significance Level (α)",
                info="Probability threshold for statistical significance (typically 0.05)"
            )
            effect_thresholds = gr.Textbox(
                value="0.01,0.06,0.14",
                label="Effect Size Thresholds",
                info="Eta-squared (η²) boundaries: small,medium,large"
            )

        # Action buttons
        with gr.Row():
            run_button = gr.Button("Run One-Way ANOVA", variant="primary", scale=1)
            clear_button = gr.Button("Clear All", variant="secondary", scale=1)

        # Output display
        output = gr.JSON(label="Statistical Test Results")

        # Interpretation note
        gr.Markdown("""
        ### Post-Hoc Note
        If ANOVA shows significant differences (p < α), consider running post-hoc tests to identify which specific groups differ from each other.
        """)

        # Example data section
        with gr.Row():
            gr.Markdown("### Quick Examples")
            # NOTE(review): "outline" is not among the documented gr.Button variants - confirm
            example_button = gr.Button("Load Example Data", variant="outline")

        # State management: parsed DataFrame of the uploaded data
        loaded_dataframe = gr.State(value=None)

        def load_groups_file(file_path, has_header_flag):
            """Load an uploaded file with every column as one group.

            The shared two-group loader keeps only the first two columns,
            which would silently discard any further groups. This loader
            keeps all columns, coerces them to numeric and drops columns
            that contain no numeric value at all. Returns (dataframe, preview).
            """
            if file_path is None:
                return None, None
            try:
                header_param = 0 if has_header_flag else None
                lowered_path = str(file_path).lower()
                if lowered_path.endswith('.csv'):
                    df = pd.read_csv(file_path, header=header_param)
                elif lowered_path.endswith(('.xlsx', '.xls')):
                    df = pd.read_excel(file_path, header=header_param)
                else:
                    return None, pd.DataFrame({'Error': ['Unsupported file format']})
                if has_header_flag and len(df.columns) > 0 and not str(df.columns[0]).startswith('Unnamed'):
                    df.columns = [str(name) for name in df.columns]
                else:
                    df.columns = [f'Group{i + 1}' for i in range(len(df.columns))]
                # Non-numeric cells become NaN
                df = df.apply(pd.to_numeric, errors='coerce')
                # Drop columns without any numeric value, then fully empty rows
                df = df.dropna(axis=1, how='all').dropna(how='all')
                if len(df.columns) < 2:
                    return None, pd.DataFrame({'Error': ['File must have at least 2 columns containing numeric data']})
                return df, df.head(10)
            except Exception as e:
                error_df = pd.DataFrame({'Error': [f"Failed to load file: {str(e)}"]})
                return None, error_df

        # EVENT HANDLERS
        # Toggle between input methods
        input_method.change(
            fn=toggle_input_method,
            inputs=input_method,
            outputs=[file_section, text_section],
            show_api=False
        )

        # File upload handlers (re-parse when the file or the header flag changes)
        file_upload.change(
            fn=load_groups_file,
            inputs=[file_upload, has_header],
            outputs=[loaded_dataframe, data_preview],
            show_api=False
        )
        has_header.change(
            fn=load_groups_file,
            inputs=[file_upload, has_header],
            outputs=[loaded_dataframe, data_preview],
            show_api=False
        )

        # MAIN STATISTICAL FUNCTION CALL - Exposed to MCP!
        run_button.click(
            fn=one_way_anova,
            inputs=[
                loaded_dataframe,   # dataframe
                groups_str,         # groups_str
                alpha,              # alpha
                effect_thresholds   # effect_thresholds
            ],
            outputs=output
        )

        # Clear form handler
        def clear_anova():
            return (
                "File Upload",      # input_method
                None,               # loaded_dataframe
                None,               # data_preview
                "",                 # groups_str
                0.05,               # alpha
                "0.01,0.06,0.14",   # effect_thresholds
                {},                 # output
                None                # file_upload: keep the widget in sync with the cleared state
            )

        clear_button.click(
            fn=clear_anova,
            outputs=[
                input_method, loaded_dataframe, data_preview,
                groups_str, alpha, effect_thresholds, output,
                file_upload
            ],
            show_api=False
        )

        # Example data handler
        def load_anova_example():
            example_data = "85.2,90.1,78.5,92.3;88.1,85.7,91.2,87.4;82.3,87.4,89.1,83.7"
            return "Text Input", None, None, example_data

        example_button.click(
            fn=load_anova_example,
            outputs=[input_method, loaded_dataframe, data_preview, groups_str],
            show_api=False
        )
def create_multi_way_anova_tab():
    """Create a complete multi-way ANOVA tab with all components and handlers.

    The "Run" button calls multi_way_anova with (dataframe, dependent_var,
    factors, alpha, effect_thresholds, include_interactions,
    max_interaction_order, sum_squares_type). Returns None.
    """
    with gr.TabItem("Multi-Way ANOVA"):
        gr.Markdown("**Compare means across multiple categorical factors simultaneously**")

        # Input method selector (file upload is the only option on this tab)
        input_method = gr.Radio(
            choices=["File Upload"],
            value="File Upload",
            label="Input Method",
            info="Multi-way ANOVA requires structured data - file upload recommended"
        )

        # File upload input section
        with gr.Group(visible=True) as file_section:
            gr.Markdown("### File Upload")
            gr.Markdown("*Upload CSV or Excel file with dependent variable and multiple categorical factors*")
            with gr.Row():
                file_upload = gr.File(
                    label="Upload CSV/Excel File",
                    file_types=[".csv", ".xlsx", ".xls"],
                    type="filepath"
                )
                has_header = gr.Checkbox(
                    label="File has header row",
                    value=True,
                    info="Check if first row contains column names"
                )
            # Display loaded data preview
            data_preview = gr.Dataframe(
                label="Data Preview",
                interactive=False,
                row_count=10
            )

        # Variable specification
        gr.Markdown("### Variable Specification")
        with gr.Row():
            dependent_var = gr.Dropdown(
                label="Dependent Variable",
                info="Select the continuous outcome variable",
                interactive=True
            )
            factors = gr.Textbox(
                label="Factors (comma-separated)",
                placeholder="treatment,gender,age_group",
                info="Enter factor column names separated by commas",
                lines=2
            )

        # Advanced options
        gr.Markdown("### Analysis Options")
        with gr.Row():
            include_interactions = gr.Checkbox(
                label="Include Interactions",
                value=True,
                info="Test for interaction effects between factors"
            )
            max_interaction_order = gr.Number(
                label="Max Interaction Order",
                value=None,
                minimum=2,
                step=1,
                info="Maximum interaction order (leave empty for all interactions)"
            )
        with gr.Row():
            sum_squares_type = gr.Dropdown(
                choices=[1, 2, 3],
                value=2,
                label="Sum of Squares Type",
                info="Type 2 for balanced, Type 3 for unbalanced designs"
            )
            alpha = gr.Number(
                value=0.05,
                minimum=0,
                maximum=1,
                step=0.01,
                label="Significance Level (α)",
                info="Probability threshold for statistical significance"
            )
        with gr.Row():
            effect_thresholds = gr.Textbox(
                value="0.01,0.06,0.14",
                label="Effect Size Thresholds",
                info="Eta-squared boundaries: small,medium,large"
            )

        # Action buttons
        with gr.Row():
            run_button = gr.Button("Run Multi-Way ANOVA", variant="primary", scale=1)
            clear_button = gr.Button("Clear All", variant="secondary", scale=1)

        # Output display
        output = gr.JSON(label="Multi-Way ANOVA Results")

        # Information section
        with gr.Accordion("Multi-Way ANOVA Information", open=False):
            gr.Markdown("""
            ### What is Multi-Way ANOVA?
            Multi-way ANOVA extends one-way ANOVA to handle multiple categorical factors simultaneously:
            **Main Effects**: How each factor independently affects the outcome
            **Interaction Effects**: How factors work together (non-additively)
            ### Example Designs:
            - **2-way**: Treatment (A,B,C) × Gender (Male,Female) → 6 combinations
            - **3-way**: Drug (A,B) × Dose (Low,High) × Age (Young,Old) → 8 combinations
            - **4-way**: Method (A,B) × School (Public,Private) × Gender (M,F) × Grade (1st,2nd) → 16 combinations
            ### Requirements:
            - All factors must be categorical (not continuous)
            - Dependent variable must be continuous
            - At least 2 observations per factor combination
            - Independence, normality, and equal variances assumptions
            """)

        # Example data section
        with gr.Row():
            gr.Markdown("### Quick Examples")
            # NOTE(review): "outline" is not among the documented gr.Button variants - confirm
            example_button = gr.Button("Load Example Data", variant="outline")

        # State management: full DataFrame of the uploaded/example data
        loaded_dataframe = gr.State(value=None)

        # Helper function to load and preview file data
        def load_multi_way_file(file_path, has_header_flag):
            """Load the upload and return (dataframe, preview, dropdown update).

            The third item updates the *choices* of the dependent-variable
            dropdown with the file's column names and resets its selection;
            returning the bare list would set it as the dropdown's value.
            """
            no_columns = gr.update(choices=[], value=None)
            if file_path is None:
                return None, None, no_columns
            try:
                # Determine header parameter
                header_param = 0 if has_header_flag else None
                lowered_path = str(file_path).lower()
                if lowered_path.endswith('.csv'):
                    df = pd.read_csv(file_path, header=header_param)
                elif lowered_path.endswith(('.xlsx', '.xls')):
                    df = pd.read_excel(file_path, header=header_param)
                else:
                    return None, pd.DataFrame({'Error': ['Unsupported file format']}), no_columns
                # Set column names if no header
                if not has_header_flag:
                    df.columns = [f'Column_{i+1}' for i in range(len(df.columns))]
                # Get column options for dropdown
                column_options = list(df.columns)
                # Return dataframe, preview, and column options
                preview = df.head(15)
                return df, preview, gr.update(choices=column_options, value=None)
            except Exception as e:
                error_df = pd.DataFrame({'Error': [f"Failed to load file: {str(e)}"]})
                return None, error_df, no_columns

        # Clear form function
        def clear_multi_way():
            """Return the default for every field, in clear_button's output order."""
            return (
                None,                                  # loaded_dataframe
                None,                                  # data_preview
                gr.update(choices=[], value=None),     # dependent_var (choices and value)
                "",                                    # factors
                True,                                  # include_interactions
                None,                                  # max_interaction_order
                2,                                     # sum_squares_type
                0.05,                                  # alpha
                "0.01,0.06,0.14",                      # effect_thresholds
                {},                                    # output
                None                                   # file_upload: keep the widget in sync
            )

        # Example data function
        def load_multi_way_example():
            """Generate a reproducible 3-way example (treatment × gender × age_group)."""
            # Local generator seeded with 42: yields the same draws as seeding
            # the global NumPy RNG, without altering global random state.
            rng = np.random.RandomState(42)
            treatments = ['Control', 'Treatment_A', 'Treatment_B']
            genders = ['Male', 'Female']
            ages = ['Young', 'Old']
            treatment_effects = {'Control': 0, 'Treatment_A': 8, 'Treatment_B': 12}
            gender_effects = {'Male': 3, 'Female': -3}
            age_effects = {'Young': 5, 'Old': -5}
            base_score = 50
            n_per_cell = 15
            data = []
            for treatment in treatments:
                for gender in genders:
                    for age in ages:
                        # Add interaction: Treatment_B works better for older patients
                        interaction_effect = 0
                        if treatment == 'Treatment_B' and age == 'Old':
                            interaction_effect = 6
                        mean_score = (
                            base_score
                            + treatment_effects[treatment]
                            + gender_effects[gender]
                            + age_effects[age]
                            + interaction_effect
                        )
                        scores = rng.normal(mean_score, 6, n_per_cell)
                        for score in scores:
                            data.append({
                                'test_score': round(score, 2),
                                'treatment': treatment,
                                'gender': gender,
                                'age_group': age
                            })
            df = pd.DataFrame(data)
            preview = df.head(15)
            column_options = list(df.columns)
            # One update sets both the dropdown's choices and its selection
            dropdown_update = gr.update(choices=column_options, value='test_score')
            return df, preview, dropdown_update, 'treatment,gender,age_group'

        # EVENT HANDLERS
        # File upload handlers (re-parse when the file or the header flag changes)
        file_upload.change(
            fn=load_multi_way_file,
            inputs=[file_upload, has_header],
            outputs=[loaded_dataframe, data_preview, dependent_var],
            show_api=False
        )
        has_header.change(
            fn=load_multi_way_file,
            inputs=[file_upload, has_header],
            outputs=[loaded_dataframe, data_preview, dependent_var],
            show_api=False
        )

        # MAIN STATISTICAL FUNCTION CALL - Exposed to MCP!
        run_button.click(
            fn=multi_way_anova,
            inputs=[
                loaded_dataframe,        # dataframe
                dependent_var,           # dependent_var
                factors,                 # factors
                alpha,                   # alpha
                effect_thresholds,       # effect_thresholds
                include_interactions,    # include_interactions
                max_interaction_order,   # max_interaction_order
                sum_squares_type         # sum_squares_type
            ],
            outputs=output
        )

        # Clear form handler (each component appears once in outputs)
        clear_button.click(
            fn=clear_multi_way,
            outputs=[
                loaded_dataframe, data_preview, dependent_var,
                factors, include_interactions, max_interaction_order,
                sum_squares_type, alpha, effect_thresholds, output,
                file_upload
            ],
            show_api=False
        )

        # Example data handler
        example_button.click(
            fn=load_multi_way_example,
            outputs=[loaded_dataframe, data_preview, dependent_var, factors],
            show_api=False
        )
def create_chi_square_tab():
    """Create a complete chi-square goodness of fit test tab with all components and handlers.

    The "Run" button calls chi_square_test with (dataframe, observed_str,
    expected_str, alpha, effect_thresholds). Returns None.
    """
    with gr.TabItem("Chi-Square Test"):
        gr.Markdown("**Test if observed frequencies differ from expected frequencies**")

        # Input method selector
        input_method = gr.Radio(
            choices=["File Upload", "Text Input"],
            value="File Upload",
            label="Choose Input Method",
            info="Select how you want to provide your data"
        )

        # File upload input section
        with gr.Group(visible=True) as file_section:
            gr.Markdown("### File Upload")
            gr.Markdown("*Upload CSV or Excel file - first column: observed frequencies, second column: expected frequencies (optional)*")
            with gr.Row():
                file_upload = gr.File(
                    label="Upload CSV/Excel File",
                    file_types=[".csv", ".xlsx", ".xls"],
                    type="filepath"
                )
                has_header = gr.Checkbox(
                    label="File has header row",
                    value=True,
                    info="Check if first row contains column names"
                )
            # Display loaded data preview
            data_preview = gr.Dataframe(
                label="Data Preview (observed and expected frequencies)",
                interactive=False,
                row_count=5
            )

        # Text input section
        with gr.Group(visible=False) as text_section:
            gr.Markdown("### Text Input")
            gr.Markdown("*Enter comma-separated frequency values*")
            observed_str = gr.Textbox(
                placeholder="25,30,20,15",
                label="Observed Frequencies",
                info="Comma-separated observed frequencies for each category"
            )
            expected_str = gr.Textbox(
                placeholder="22.5,22.5,22.5,22.5",
                label="Expected Frequencies (Optional)",
                info="Comma-separated expected frequencies. Leave empty for equal distribution"
            )

        # Test parameters
        gr.Markdown("### Test Parameters")
        with gr.Row():
            alpha = gr.Number(
                value=0.05,
                minimum=0,
                maximum=1,
                step=0.01,
                label="Significance Level (α)",
                info="Probability threshold for statistical significance (typically 0.05)"
            )
            effect_thresholds = gr.Textbox(
                value="0.1,0.3,0.5",
                label="Effect Size Thresholds",
                info="Cramér's V boundaries: small,medium,large"
            )

        # Action buttons
        with gr.Row():
            run_button = gr.Button("Run Chi-Square Test", variant="primary", scale=1)
            clear_button = gr.Button("Clear All", variant="secondary", scale=1)

        # Output display
        output = gr.JSON(label="Statistical Test Results")

        # Example data section
        with gr.Row():
            gr.Markdown("### Quick Examples")
            # NOTE(review): "outline" is not among the documented gr.Button variants - confirm
            example_button = gr.Button("Load Example Data", variant="outline")

        # State management: parsed DataFrame of the uploaded data
        loaded_dataframe = gr.State(value=None)

        def load_frequency_file(file_path, has_header_flag):
            """Load an uploaded frequency file and return (dataframe, preview).

            The expected-frequency column is optional on this tab, so a
            single-column file is accepted here; the shared two-group loader
            would reject it. At most the first two columns are kept.
            """
            if file_path is None:
                return None, None
            try:
                header_param = 0 if has_header_flag else None
                lowered_path = str(file_path).lower()
                if lowered_path.endswith('.csv'):
                    df = pd.read_csv(file_path, header=header_param)
                elif lowered_path.endswith(('.xlsx', '.xls')):
                    df = pd.read_excel(file_path, header=header_param)
                else:
                    return None, pd.DataFrame({'Error': ['Unsupported file format']})
                if len(df.columns) < 1:
                    return None, pd.DataFrame({'Error': ['File must have at least 1 column']})
                df_subset = df.iloc[:, :2].copy()
                if has_header_flag and not str(df_subset.columns[0]).startswith('Unnamed'):
                    df_subset.columns = [str(name) for name in df_subset.columns]
                else:
                    df_subset.columns = ['Group1', 'Group2'][:len(df_subset.columns)]
                # Non-numeric cells become NaN; fully empty rows are dropped
                df_subset = df_subset.apply(pd.to_numeric, errors='coerce')
                df_subset = df_subset.dropna(how='all')
                return df_subset, df_subset.head(10)
            except Exception as e:
                error_df = pd.DataFrame({'Error': [f"Failed to load file: {str(e)}"]})
                return None, error_df

        # EVENT HANDLERS
        # Toggle between input methods
        input_method.change(
            fn=toggle_input_method,
            inputs=input_method,
            outputs=[file_section, text_section],
            show_api=False
        )

        # File upload handlers (re-parse when the file or the header flag changes)
        file_upload.change(
            fn=load_frequency_file,
            inputs=[file_upload, has_header],
            outputs=[loaded_dataframe, data_preview],
            show_api=False
        )
        has_header.change(
            fn=load_frequency_file,
            inputs=[file_upload, has_header],
            outputs=[loaded_dataframe, data_preview],
            show_api=False
        )

        # MAIN STATISTICAL FUNCTION CALL - Exposed to MCP!
        run_button.click(
            fn=chi_square_test,
            inputs=[
                loaded_dataframe,   # dataframe
                observed_str,       # observed_str
                expected_str,       # expected_str
                alpha,              # alpha
                effect_thresholds   # effect_thresholds
            ],
            outputs=output
        )

        # Clear form handler
        def clear_chi_square():
            return (
                "File Upload",   # input_method
                None,            # loaded_dataframe
                None,            # data_preview
                "",              # observed_str
                "",              # expected_str
                0.05,            # alpha
                "0.1,0.3,0.5",   # effect_thresholds
                {},              # output
                None             # file_upload: keep the widget in sync with the cleared state
            )

        clear_button.click(
            fn=clear_chi_square,
            outputs=[
                input_method, loaded_dataframe, data_preview,
                observed_str, expected_str, alpha, effect_thresholds, output,
                file_upload
            ],
            show_api=False
        )

        # Example data handler
        def load_chi_square_example():
            observed_example = "25,30,20,15"
            expected_example = "22.5,22.5,22.5,22.5"
            return "Text Input", None, None, observed_example, expected_example

        example_button.click(
            fn=load_chi_square_example,
            outputs=[input_method, loaded_dataframe, data_preview, observed_str, expected_str],
            show_api=False
        )
def create_correlation_tab():
    """Create a complete correlation analysis tab with all components and handlers.

    The "Run" button calls correlation_test with (dataframe, group1_str,
    group2_str, method, alpha, effect_thresholds). Returns None.
    """
    with gr.TabItem("Correlation Test"):
        gr.Markdown("**Analyze the relationship between two continuous variables**")

        # Input method selector
        input_method = gr.Radio(
            choices=["File Upload", "Text Input"],
            value="File Upload",
            label="Choose Input Method",
            info="Select how you want to provide your data"
        )

        # File upload input section
        with gr.Group(visible=True) as file_section:
            gr.Markdown("### File Upload")
            gr.Markdown("*Upload CSV or Excel file - first two columns will be used as the two variables*")
            with gr.Row():
                file_upload = gr.File(
                    label="Upload CSV/Excel File",
                    file_types=[".csv", ".xlsx", ".xls"],
                    type="filepath"
                )
                has_header = gr.Checkbox(
                    label="File has header row",
                    value=True,
                    info="Check if first row contains column names"
                )
            # Display loaded data preview
            data_preview = gr.Dataframe(
                label="Data Preview (first two columns as variables)",
                interactive=False,
                row_count=5
            )

        # Text input section
        with gr.Group(visible=False) as text_section:
            gr.Markdown("### Text Input")
            gr.Markdown("*Enter comma-separated values for each variable*")
            group1_str = gr.Textbox(
                placeholder="5.2,6.1,4.8,7.3,5.9",
                label="Variable 1 (X)",
                info="Comma-separated numbers (e.g., hours studied, height, age)"
            )
            group2_str = gr.Textbox(
                placeholder="78,85,72,92,81",
                label="Variable 2 (Y)",
                info="Comma-separated numbers (e.g., test scores, weight, income)"
            )

        # Test parameters
        gr.Markdown("### Test Parameters")
        with gr.Row():
            method = gr.Dropdown(
                choices=["pearson", "spearman", "kendall"],
                value="pearson",
                label="Correlation Method",
                info="pearson: linear relationships; spearman: monotonic; kendall: robust to outliers"
            )
            alpha = gr.Number(
                value=0.05,
                minimum=0,
                maximum=1,
                step=0.01,
                label="Significance Level (α)",
                info="Probability threshold for statistical significance (typically 0.05)"
            )
        with gr.Row():
            effect_thresholds = gr.Textbox(
                value="0.1,0.3,0.5",
                label="Effect Size Thresholds",
                info="Correlation coefficient boundaries: small,medium,large"
            )

        # Action buttons
        with gr.Row():
            run_button = gr.Button("Run Correlation Test", variant="primary", scale=1)
            clear_button = gr.Button("Clear All", variant="secondary", scale=1)

        # Output display
        output = gr.JSON(label="Statistical Test Results")

        # Example data section
        with gr.Row():
            gr.Markdown("### Quick Examples")
            # NOTE(review): "outline" is not among the documented gr.Button variants - confirm
            example_button = gr.Button("Load Example Data", variant="outline")

        # State management: parsed DataFrame of the uploaded data
        loaded_dataframe = gr.State(value=None)

        # EVENT HANDLERS
        # Toggle between input methods
        input_method.change(
            fn=toggle_input_method,
            inputs=input_method,
            outputs=[file_section, text_section],
            show_api=False
        )

        # File upload handlers (re-parse when the file or the header flag changes)
        file_upload.change(
            fn=load_uploaded_file,
            inputs=[file_upload, has_header],
            outputs=[loaded_dataframe, data_preview],
            show_api=False
        )
        has_header.change(
            fn=load_uploaded_file,
            inputs=[file_upload, has_header],
            outputs=[loaded_dataframe, data_preview],
            show_api=False
        )

        # MAIN STATISTICAL FUNCTION CALL - Exposed to MCP!
        run_button.click(
            fn=correlation_test,
            inputs=[
                loaded_dataframe,   # dataframe
                group1_str,         # group1_str
                group2_str,         # group2_str
                method,             # method
                alpha,              # alpha
                effect_thresholds   # effect_thresholds
            ],
            outputs=output
        )

        # Clear form handler
        def clear_correlation():
            return (
                "File Upload",   # input_method
                None,            # loaded_dataframe
                None,            # data_preview
                "",              # group1_str
                "",              # group2_str
                "pearson",       # method
                0.05,            # alpha
                "0.1,0.3,0.5",   # effect_thresholds
                {},              # output
                None             # file_upload: keep the widget in sync with the cleared state
            )

        clear_button.click(
            fn=clear_correlation,
            outputs=[
                input_method, loaded_dataframe, data_preview,
                group1_str, group2_str, method, alpha, effect_thresholds, output,
                file_upload
            ],
            show_api=False
        )

        # Example data handler
        def load_correlation_example():
            x_example = "5.2,6.1,4.8,7.3,5.9,6.8,4.5,7.1"
            y_example = "78,85,72,92,81,89,70,88"
            return "Text Input", None, None, x_example, y_example

        example_button.click(
            fn=load_correlation_example,
            outputs=[input_method, loaded_dataframe, data_preview, group1_str, group2_str],
            show_api=False
        )
def create_t_test_interface():
    """Assemble the Blocks app with one tab per statistical test.

    Tabs are created in this order: Student's, Welch's and paired t-tests
    (via create_t_test_tab), then one-sample t-test, one-way ANOVA, multi-way
    ANOVA, chi-square and correlation.

    Returns:
        The gr.Blocks instance holding all tabs.
    """
    # (test function, tab name, tab description) for the two-group t-test tabs
    two_group_tabs = (
        (
            student_t_test,
            "Student's T-Test",
            "**t-test between independent groups assuming equal population variances**",
        ),
        (
            welch_t_test,
            "Welch's T-Test",
            "**t-test between independent groups that does not assume equal population variances**",
        ),
        (
            paired_t_test,
            "Paired T-Test",
            "**t-test between paired groups**",
        ),
    )
    # Builders for the remaining tabs, in display order
    other_tab_builders = (
        create_one_sample_t_test_tab,
        create_anova_tab,
        create_multi_way_anova_tab,
        create_chi_square_tab,
        create_correlation_tab,
    )

    with gr.Blocks(title="T-Test Analysis", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # Statistical Analysis MCP
        """)
        with gr.Tabs():
            for stat_function, tab_name, tab_description in two_group_tabs:
                create_t_test_tab(
                    test_function=stat_function,
                    test_name=tab_name,
                    description=tab_description
                )
            for build_tab in other_tab_builders:
                build_tab()
    return demo
# Main execution
if __name__ == "__main__":
    # Build the tabbed Blocks app only when this file is run as a script
    demo = create_t_test_interface()
    # Start serving; mcp_server=True is forwarded to Gradio's launch()
    # (the run-button handlers above are the functions meant to be exposed to MCP)
    demo.launch(mcp_server=True)