# Source code for morie.statistics

"""
Comprehensive hypothesis testing suite for epidemiological research.

This module provides a unified interface for the full spectrum of frequentist
hypothesis tests encountered in public health and biomedical research.  Every
test function returns a :class:`TestResult` dataclass containing the test
statistic, *p*-value, degrees of freedom, confidence interval bounds, effect
size, and method name so that downstream code can process results
programmatically without parsing text output.

Categories of tests
-------------------
- **Location tests** (t-tests, ANOVA, non-parametric rank tests)
- **Association tests** (chi-squared, Fisher exact, correlation)
- **Distribution tests** (normality, homogeneity of variance, goodness of fit)
- **Proportion tests** (one- and two-sample z-tests, Fisher exact)
- **Agreement tests** (Cohen's kappa, Fleiss' kappa, ICC)

All implementations delegate heavy numerics to :mod:`scipy.stats` and
:mod:`statsmodels` where possible; only formulas absent from those libraries
are coded from scratch.

References
----------
Cohen, J. (1988). *Statistical Power Analysis for the Behavioral Sciences*
    (2nd ed.). Lawrence Erlbaum Associates.
Agresti, A. (2013). *Categorical Data Analysis* (3rd ed.). Wiley.
Conover, W. J. (1999). *Practical Nonparametric Statistics* (3rd ed.). Wiley.
Fleiss, J. L. (1971). Measuring nominal scale agreement among many raters.
    *Psychological Bulletin*, 76(5), 378--382.
Shrout, P. E., & Fleiss, J. L. (1979). Intraclass correlations: Uses in
    assessing rater reliability. *Psychological Bulletin*, 86(2), 420--428.
"""

from __future__ import annotations

import logging
import math
from dataclasses import dataclass, field
from typing import Union

import numpy as np
import pandas as pd
import scipy.stats as stats

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Result container
# ---------------------------------------------------------------------------


@dataclass
class TestResult:
    """Standardised container for every hypothesis-test result in this module.

    Parameters
    ----------
    method : str
        Human-readable name of the statistical test.
    test_statistic : float
        Value of the test statistic (t, F, chi-squared, U, etc.).
    p_value : float
        Two-sided *p*-value (or one-sided where documented).
    df : float | None
        Degrees of freedom (``None`` for non-parametric tests that do not
        define df).
    ci_lower : float | None
        Lower bound of the confidence interval for the estimated parameter.
    ci_upper : float | None
        Upper bound of the confidence interval for the estimated parameter.
    effect_size : float | None
        A standardised effect-size measure appropriate to the test.
    estimate : float | None
        Point estimate of the quantity being tested (mean difference, odds
        ratio, correlation, etc.).
    n : int | None
        Total sample size used in the test.
    extra : dict
        Arbitrary additional outputs (e.g. per-group means, residuals).
    """

    # Required fields: every test populates these three.
    method: str
    test_statistic: float
    p_value: float
    # Optional fields: populated only where the test defines them.
    df: float | None = None
    ci_lower: float | None = None
    ci_upper: float | None = None
    effect_size: float | None = None
    estimate: float | None = None
    n: int | None = None
    # Mutable default handled via default_factory to avoid cross-instance sharing.
    extra: dict = field(default_factory=dict)
# =================================================================== # Internal helpers # =================================================================== def _validate_array(x: Union[np.ndarray, pd.Series, list], name: str = "x") -> np.ndarray: """Convert input to a float64 numpy array, dropping NaN values.""" arr = np.asarray(x, dtype=np.float64).ravel() mask = np.isfinite(arr) n_dropped = int((~mask).sum()) if n_dropped: logger.debug("Dropped %d non-finite values from %s", n_dropped, name) return arr[mask] def _cohens_d_ind(x: np.ndarray, y: np.ndarray) -> float: """Cohen's *d* for independent samples (pooled SD denominator).""" nx, ny = len(x), len(y) sp = math.sqrt(((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / (nx + ny - 2)) if sp == 0: return 0.0 return float((x.mean() - y.mean()) / sp) def _cohens_d_one(x: np.ndarray, mu0: float) -> float: """Cohen's *d* for a one-sample test.""" s = x.std(ddof=1) if s == 0: return 0.0 return float((x.mean() - mu0) / s) def _cohens_d_paired(d: np.ndarray) -> float: """Cohen's *d* for paired samples.""" sd = d.std(ddof=1) if sd == 0: return 0.0 return float(d.mean() / sd) def _mean_ci(x: np.ndarray, confidence: float = 0.95) -> tuple[float, float]: """CI for the mean of *x* using the *t*-distribution.""" n = len(x) se = x.std(ddof=1) / math.sqrt(n) t_crit = stats.t.ppf((1 + confidence) / 2, n - 1) return float(x.mean() - t_crit * se), float(x.mean() + t_crit * se) def _diff_ci(x: np.ndarray, y: np.ndarray, confidence: float = 0.95, equal_var: bool = True) -> tuple[float, float]: """CI for the difference in means (x - y).""" nx, ny = len(x), len(y) diff = float(x.mean() - y.mean()) if equal_var: sp2 = ((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / (nx + ny - 2) se = math.sqrt(sp2 * (1 / nx + 1 / ny)) df_val = nx + ny - 2 else: s1, s2 = x.var(ddof=1), y.var(ddof=1) se = math.sqrt(s1 / nx + s2 / ny) num = (s1 / nx + s2 / ny) ** 2 denom = (s1 / nx) ** 2 / (nx - 1) + (s2 / ny) ** 2 / (ny - 1) df_val = num / 
denom if denom > 0 else 1.0 t_crit = stats.t.ppf((1 + confidence) / 2, df_val) return diff - t_crit * se, diff + t_crit * se # =================================================================== # T-TESTS # ===================================================================
def one_sample_ttest(
    x: Union[np.ndarray, pd.Series, list],
    mu0: float = 0.0,
    confidence: float = 0.95,
) -> TestResult:
    """One-sample Student's *t*-test.

    Tests :math:`H_0: \\mu = \\mu_0` against :math:`H_1: \\mu \\neq \\mu_0`.

    Parameters
    ----------
    x : array-like
        Sample observations (non-finite values are dropped).
    mu0 : float, default 0.0
        Hypothesised population mean.
    confidence : float, default 0.95
        Confidence level for the CI around the sample mean.

    Returns
    -------
    TestResult

    Raises
    ------
    ValueError
        If fewer than 2 finite observations remain after cleaning.
    """
    x = _validate_array(x, "x")
    n = len(x)
    if n < 2:
        raise ValueError("Need at least 2 observations for a t-test.")
    t_stat, p = stats.ttest_1samp(x, mu0)
    ci_lo, ci_hi = _mean_ci(x, confidence)
    return TestResult(
        method="One-sample t-test",
        test_statistic=float(t_stat),
        p_value=float(p),
        df=float(n - 1),
        ci_lower=ci_lo,
        ci_upper=ci_hi,
        effect_size=_cohens_d_one(x, mu0),
        estimate=float(x.mean()),
        n=n,
    )
def two_sample_ttest(
    x: Union[np.ndarray, pd.Series, list],
    y: Union[np.ndarray, pd.Series, list],
    equal_var: bool = True,
    confidence: float = 0.95,
) -> TestResult:
    """Independent two-sample *t*-test (equal or unequal variance).

    Parameters
    ----------
    x, y : array-like
        Two independent samples (non-finite values are dropped).
    equal_var : bool, default True
        If ``False``, use Welch's approximation for unequal variances.
    confidence : float, default 0.95
        Confidence level for the CI of the mean difference (x - y).

    Returns
    -------
    TestResult
        ``estimate`` is the mean difference; ``effect_size`` is Cohen's *d*
        with a pooled-SD denominator (even in the Welch branch).
    """
    x = _validate_array(x, "x")
    y = _validate_array(y, "y")
    t_stat, p = stats.ttest_ind(x, y, equal_var=equal_var)
    ci_lo, ci_hi = _diff_ci(x, y, confidence, equal_var)
    nx, ny = len(x), len(y)
    if equal_var:
        df_val = float(nx + ny - 2)
    else:
        # Welch-Satterthwaite degrees of freedom (mirrors _diff_ci).
        s1, s2 = x.var(ddof=1), y.var(ddof=1)
        num = (s1 / nx + s2 / ny) ** 2
        denom = (s1 / nx) ** 2 / (nx - 1) + (s2 / ny) ** 2 / (ny - 1)
        df_val = num / denom if denom > 0 else 1.0
    label = "Two-sample t-test (equal var)" if equal_var else "Welch's t-test"
    return TestResult(
        method=label,
        test_statistic=float(t_stat),
        p_value=float(p),
        df=df_val,
        ci_lower=ci_lo,
        ci_upper=ci_hi,
        effect_size=_cohens_d_ind(x, y),
        estimate=float(x.mean() - y.mean()),
        n=nx + ny,
    )
def welch_ttest(
    x: Union[np.ndarray, pd.Series, list],
    y: Union[np.ndarray, pd.Series, list],
    confidence: float = 0.95,
) -> TestResult:
    """Welch's unequal-variance *t*-test.

    Thin convenience wrapper that delegates to :func:`two_sample_ttest`
    with ``equal_var=False``.

    Parameters
    ----------
    x, y : array-like
        Two independent samples.
    confidence : float, default 0.95
        Confidence level.

    Returns
    -------
    TestResult
    """
    return two_sample_ttest(x, y, confidence=confidence, equal_var=False)
def paired_ttest(
    x: Union[np.ndarray, pd.Series, list],
    y: Union[np.ndarray, pd.Series, list],
    confidence: float = 0.95,
) -> TestResult:
    """Paired-sample *t*-test.

    Tests :math:`H_0: \\mu_d = 0` where :math:`d_i = x_i - y_i`.

    Parameters
    ----------
    x, y : array-like
        Paired observations (must have equal length after NaN removal;
        note NaNs are dropped from each array independently, so callers
        should pre-align pairs).
    confidence : float, default 0.95
        Confidence level for the mean-difference CI.

    Returns
    -------
    TestResult

    Raises
    ------
    ValueError
        If the cleaned arrays differ in length.
    """
    x = _validate_array(x, "x")
    y = _validate_array(y, "y")
    if len(x) != len(y):
        raise ValueError("Paired t-test requires equal-length arrays.")
    d = x - y
    n = len(d)
    t_stat, p = stats.ttest_rel(x, y)
    ci_lo, ci_hi = _mean_ci(d, confidence)
    return TestResult(
        method="Paired t-test",
        test_statistic=float(t_stat),
        p_value=float(p),
        df=float(n - 1),
        ci_lower=ci_lo,
        ci_upper=ci_hi,
        effect_size=_cohens_d_paired(d),
        estimate=float(d.mean()),
        n=n,
    )
# ===================================================================
# ANOVA FAMILY
# ===================================================================
def one_way_anova(
    *groups: Union[np.ndarray, pd.Series, list],
) -> TestResult:
    """One-way between-subjects ANOVA (F-test).

    Parameters
    ----------
    *groups : array-like
        Two or more independent samples.

    Returns
    -------
    TestResult
        Effect size is :math:`\\eta^2 = SS_{between} / SS_{total}`;
        ``extra`` carries the between/within degrees of freedom.

    Raises
    ------
    ValueError
        If fewer than 2 groups are supplied.
    """
    if len(groups) < 2:
        raise ValueError("ANOVA requires at least 2 groups.")
    cleaned = [_validate_array(g, f"group_{i}") for i, g in enumerate(groups)]
    f_stat, p = stats.f_oneway(*cleaned)
    # eta-squared: between-group SS as a fraction of total SS.
    grand_mean = np.concatenate(cleaned).mean()
    ss_between = sum(len(g) * (g.mean() - grand_mean) ** 2 for g in cleaned)
    ss_total = sum(((g - grand_mean) ** 2).sum() for g in cleaned)
    eta2 = float(ss_between / ss_total) if ss_total > 0 else 0.0
    k = len(cleaned)
    n_total = sum(len(g) for g in cleaned)
    return TestResult(
        method="One-way ANOVA",
        test_statistic=float(f_stat),
        p_value=float(p),
        df=float(k - 1),
        effect_size=eta2,
        n=n_total,
        extra={"df_between": k - 1, "df_within": n_total - k, "eta_squared": eta2},
    )
def two_way_anova(
    data: pd.DataFrame,
    outcome: str,
    factor_a: str,
    factor_b: str,
) -> TestResult:
    """Two-way factorial ANOVA via OLS type-II sums of squares.

    Parameters
    ----------
    data : DataFrame
        Long-format data.
    outcome : str
        Name of the dependent-variable column.
    factor_a, factor_b : str
        Names of the two factor columns.

    Returns
    -------
    TestResult
        NOTE: ``test_statistic``/``p_value`` report the A x B *interaction*
        (NaN if statsmodels drops the interaction row), while
        ``effect_size`` is eta-squared for *factor_a* only -- confirm this
        asymmetry is intended by callers.  The full table is in
        ``extra["anova_table"]``.
    """
    # statsmodels imported lazily so the module imports without it.
    import statsmodels.api as sm
    from statsmodels.formula.api import ols

    formula = f"{outcome} ~ C({factor_a}) * C({factor_b})"
    model = ols(formula, data=data.dropna(subset=[outcome, factor_a, factor_b])).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    # Interaction row
    interaction_key = f"C({factor_a}):C({factor_b})"
    f_int = float(anova_table.loc[interaction_key, "F"]) if interaction_key in anova_table.index else np.nan
    p_int = float(anova_table.loc[interaction_key, "PR(>F)"]) if interaction_key in anova_table.index else np.nan
    # ss_total includes the residual row, so eta2_a is a fraction of total SS.
    ss_total = anova_table["sum_sq"].sum()
    eta2_a = float(anova_table.loc[f"C({factor_a})", "sum_sq"] / ss_total)
    return TestResult(
        method="Two-way ANOVA",
        test_statistic=f_int,
        p_value=p_int,
        effect_size=eta2_a,
        n=len(data.dropna(subset=[outcome, factor_a, factor_b])),
        extra={"anova_table": anova_table.to_dict()},
    )
def repeated_measures_anova(
    data: pd.DataFrame,
    outcome: str,
    subject: str,
    within: str,
) -> TestResult:
    """One-way repeated-measures ANOVA.

    Sphericity is *assumed*: despite what earlier versions of this docstring
    claimed, no Greenhouse--Geisser (or other epsilon) correction is applied
    -- the F-test uses the uncorrected ``(k-1, (n-1)(k-1))`` degrees of
    freedom.

    Parameters
    ----------
    data : DataFrame
        Long-format data with one row per subject-condition.
    outcome : str
        Name of the outcome column.
    subject : str
        Name of the subject identifier column.
    within : str
        Name of the within-subjects factor column.

    Returns
    -------
    TestResult
        Effect size is partial eta-squared
        :math:`SS_{cond} / (SS_{cond} + SS_{error})`.

    Raises
    ------
    ValueError
        If the within factor has fewer than 2 levels.
    """
    df = data.dropna(subset=[outcome, subject, within]).copy()
    levels = df[within].unique()
    k = len(levels)
    if k < 2:
        raise ValueError("Need at least 2 levels for repeated-measures ANOVA.")
    # Pivot to wide format; subjects missing any condition are dropped entirely.
    wide = df.pivot(index=subject, columns=within, values=outcome).dropna()
    n = len(wide)
    grand_mean = wide.values.mean()
    subj_means = wide.values.mean(axis=1)
    cond_means = wide.values.mean(axis=0)
    # Classic SS decomposition: total = between-subjects + condition + error.
    ss_between = k * ((subj_means - grand_mean) ** 2).sum()
    ss_cond = n * ((cond_means - grand_mean) ** 2).sum()
    ss_total = ((wide.values - grand_mean) ** 2).sum()
    ss_error = ss_total - ss_between - ss_cond
    df_cond = k - 1
    df_error = (n - 1) * (k - 1)
    ms_cond = ss_cond / df_cond if df_cond > 0 else 0.0
    ms_error = ss_error / df_error if df_error > 0 else 0.0
    f_stat = ms_cond / ms_error if ms_error > 0 else 0.0
    p = 1.0 - stats.f.cdf(f_stat, df_cond, df_error)
    eta2 = ss_cond / (ss_cond + ss_error) if (ss_cond + ss_error) > 0 else 0.0
    return TestResult(
        method="Repeated-measures ANOVA",
        test_statistic=float(f_stat),
        p_value=float(p),
        df=float(df_cond),
        effect_size=float(eta2),
        n=n,
        extra={"df_error": df_error, "ss_cond": ss_cond, "ss_error": ss_error},
    )
def kruskal_wallis(
    *groups: Union[np.ndarray, pd.Series, list],
) -> TestResult:
    """Kruskal--Wallis *H*-test (non-parametric one-way ANOVA).

    Parameters
    ----------
    *groups : array-like
        Two or more independent samples (scipy raises for fewer than 2).

    Returns
    -------
    TestResult
        Effect size is :math:`\\eta^2_H = (H - k + 1) / (N - k)`, clipped
        at zero since the formula can go slightly negative for small *H*.
    """
    cleaned = [_validate_array(g, f"group_{i}") for i, g in enumerate(groups)]
    h_stat, p = stats.kruskal(*cleaned)
    k = len(cleaned)
    n_total = sum(len(g) for g in cleaned)
    eta2_h = (h_stat - k + 1) / (n_total - k) if (n_total - k) > 0 else 0.0
    return TestResult(
        method="Kruskal-Wallis H-test",
        test_statistic=float(h_stat),
        p_value=float(p),
        df=float(k - 1),
        effect_size=float(max(eta2_h, 0.0)),
        n=n_total,
    )
def friedman_test(
    *groups: Union[np.ndarray, pd.Series, list],
) -> TestResult:
    """Friedman test for repeated measures on ranks.

    Parameters
    ----------
    *groups : array-like
        Three or more matched samples (equal length; scipy enforces the
        minimum of three).

    Returns
    -------
    TestResult
        Effect size is Kendall's :math:`W = \\chi^2 / (n (k - 1))`.

    Raises
    ------
    ValueError
        If the cleaned groups differ in length.
    """
    cleaned = [_validate_array(g, f"group_{i}") for i, g in enumerate(groups)]
    lengths = [len(g) for g in cleaned]
    if len(set(lengths)) != 1:
        raise ValueError("Friedman test requires equal-length groups.")
    chi2, p = stats.friedmanchisquare(*cleaned)
    k = len(cleaned)
    n = lengths[0]
    w = chi2 / (n * (k - 1)) if n * (k - 1) > 0 else 0.0
    return TestResult(
        method="Friedman test",
        test_statistic=float(chi2),
        p_value=float(p),
        df=float(k - 1),
        effect_size=float(w),
        n=n,
        extra={"kendall_w": float(w)},
    )
# ===================================================================
# CHI-SQUARED FAMILY
# ===================================================================
def chi2_goodness_of_fit(
    observed: Union[np.ndarray, list],
    expected: Union[np.ndarray, list] | None = None,
) -> TestResult:
    """Chi-squared goodness-of-fit test.

    Parameters
    ----------
    observed : array-like
        Observed frequency counts.
    expected : array-like or None
        Expected frequency counts; must sum to the same total as
        ``observed`` (scipy enforces this).  If ``None``, a uniform
        distribution over the categories is assumed.

    Returns
    -------
    TestResult
        Effect size is Cohen's :math:`w = \\sqrt{\\chi^2 / n}`.
    """
    obs = np.asarray(observed, dtype=np.float64)
    if expected is None:
        exp = np.full_like(obs, obs.sum() / len(obs))
    else:
        exp = np.asarray(expected, dtype=np.float64)
    chi2, p = stats.chisquare(obs, f_exp=exp)
    k = len(obs)
    n = obs.sum()
    w = math.sqrt(chi2 / n) if n > 0 else 0.0
    return TestResult(
        method="Chi-squared goodness-of-fit",
        test_statistic=float(chi2),
        p_value=float(p),
        df=float(k - 1),
        effect_size=float(w),
        n=int(n),
    )
def chi2_independence(
    contingency_table: Union[np.ndarray, pd.DataFrame],
    correction: bool = True,
) -> TestResult:
    """Chi-squared test of independence for a contingency table.

    Parameters
    ----------
    contingency_table : array-like or DataFrame
        An r x c contingency table of observed counts.
    correction : bool, default True
        Apply Yates's continuity correction (scipy only applies it to
        2x2 tables).

    Returns
    -------
    TestResult
        Effect size is Cramer's
        :math:`V = \\sqrt{\\chi^2 / (n \\, (\\min(r, c) - 1))}`;
        ``extra`` carries the expected-count matrix.
    """
    table = np.asarray(contingency_table, dtype=np.float64)
    chi2, p, dof, expected = stats.chi2_contingency(table, correction=correction)
    n = table.sum()
    k = min(table.shape) - 1
    v = math.sqrt(chi2 / (n * k)) if n * k > 0 else 0.0
    return TestResult(
        method="Chi-squared test of independence",
        test_statistic=float(chi2),
        p_value=float(p),
        df=float(dof),
        effect_size=float(v),
        n=int(n),
        extra={"expected": expected.tolist(), "cramers_v": float(v)},
    )
def mcnemar_test(
    contingency_table: Union[np.ndarray, pd.DataFrame],
    exact: bool = False,
) -> TestResult:
    """McNemar's test for paired nominal data (2x2 table).

    Only the discordant cells *b* and *c* enter the test.

    Parameters
    ----------
    contingency_table : array-like
        A 2x2 contingency table [[a, b], [c, d]].
    exact : bool, default False
        If ``True``, use the exact binomial test instead of the
        continuity-corrected chi-squared approximation.

    Returns
    -------
    TestResult
        In the exact branch ``test_statistic`` is the number of discordant
        pairs ``b + c`` (the binomial n), not a chi-squared value.

    Raises
    ------
    ValueError
        If the table is not 2x2.
    """
    table = np.asarray(contingency_table, dtype=np.float64)
    if table.shape != (2, 2):
        raise ValueError("McNemar test requires a 2x2 table.")
    b, c = table[0, 1], table[1, 0]
    n = table.sum()
    if exact:
        # scipy.stats.binom_test was deprecated in 1.10 and removed in 1.12;
        # binomtest (SciPy >= 1.7) is the supported replacement and returns
        # the same two-sided exact p-value.
        if (b + c) > 0:
            p = float(stats.binomtest(int(min(b, c)), int(b + c), 0.5).pvalue)
        else:
            p = 1.0
        chi2_stat = float(b + c)
    else:
        # Edwards continuity correction: (|b - c| - 1)^2 / (b + c).
        chi2_stat = (abs(b - c) - 1) ** 2 / (b + c) if (b + c) > 0 else 0.0
        p = 1.0 - stats.chi2.cdf(chi2_stat, 1) if (b + c) > 0 else 1.0
    return TestResult(
        method="McNemar's test" + (" (exact)" if exact else ""),
        test_statistic=float(chi2_stat),
        p_value=float(p),
        df=1.0,
        n=int(n),
    )
def cochrans_q(
    *groups: Union[np.ndarray, pd.Series, list],
) -> TestResult:
    """Cochran's Q test for related samples with binary outcomes.

    Parameters
    ----------
    *groups : array-like
        Three or more matched binary samples.
        NOTE(review): values are assumed to be 0/1 but are not validated
        here -- non-binary input silently produces a meaningless Q.

    Returns
    -------
    TestResult

    Raises
    ------
    ValueError
        If the groups differ in length.
    """
    cleaned = [np.asarray(g, dtype=np.float64).ravel() for g in groups]
    n = len(cleaned[0])
    k = len(cleaned)
    if any(len(g) != n for g in cleaned):
        raise ValueError("All groups must have the same length.")
    # Standard Q formula: rows are subjects, columns are treatments.
    data_matrix = np.column_stack(cleaned)
    row_sums = data_matrix.sum(axis=1)
    col_sums = data_matrix.sum(axis=0)
    T_total = data_matrix.sum()
    num = (k - 1) * (k * (col_sums**2).sum() - T_total**2)
    denom = k * T_total - (row_sums**2).sum()
    q_stat = num / denom if denom > 0 else 0.0
    # Q is asymptotically chi-squared with k-1 df.
    p = 1.0 - stats.chi2.cdf(q_stat, k - 1)
    return TestResult(
        method="Cochran's Q test",
        test_statistic=float(q_stat),
        p_value=float(p),
        df=float(k - 1),
        n=n,
    )
# ===================================================================
# CORRELATION
# ===================================================================
def pearson_correlation(
    x: Union[np.ndarray, pd.Series, list],
    y: Union[np.ndarray, pd.Series, list],
    confidence: float = 0.95,
) -> TestResult:
    """Pearson product-moment correlation with Fisher *z* CI.

    Parameters
    ----------
    x, y : array-like
        Two continuous variables.
        NOTE(review): each array is NaN-cleaned independently and then
        truncated to the shorter length, which can silently misalign
        pairs if NaNs were dropped unevenly -- confirm callers pre-align.
    confidence : float, default 0.95
        Confidence level for the Fisher-z interval.

    Returns
    -------
    TestResult
        ``effect_size`` is :math:`r^2` (shared variance).
    """
    x = _validate_array(x, "x")
    y = _validate_array(y, "y")
    n = min(len(x), len(y))
    x, y = x[:n], y[:n]
    r, p = stats.pearsonr(x, y)
    # Fisher z CI: arctanh(r) is approximately normal with SE 1/sqrt(n-3).
    z = np.arctanh(r)
    se_z = 1.0 / math.sqrt(n - 3) if n > 3 else np.inf
    z_crit = stats.norm.ppf((1 + confidence) / 2)
    ci_lo = float(np.tanh(z - z_crit * se_z))
    ci_hi = float(np.tanh(z + z_crit * se_z))
    return TestResult(
        method="Pearson correlation",
        test_statistic=float(r),
        p_value=float(p),
        df=float(n - 2),
        ci_lower=ci_lo,
        ci_upper=ci_hi,
        effect_size=float(r**2),
        estimate=float(r),
        n=n,
    )
def spearman_correlation(
    x: Union[np.ndarray, pd.Series, list],
    y: Union[np.ndarray, pd.Series, list],
    confidence: float = 0.95,
) -> TestResult:
    """Spearman rank correlation with an approximate Fisher *z* CI.

    Parameters
    ----------
    x, y : array-like
        Two variables (NaN-cleaned independently, then truncated to the
        shorter length -- see the alignment caveat on
        :func:`pearson_correlation`).
    confidence : float, default 0.95
        Confidence level.  The Fisher-z interval is only approximate for
        rank correlations.

    Returns
    -------
    TestResult
    """
    x = _validate_array(x, "x")
    y = _validate_array(y, "y")
    n = min(len(x), len(y))
    x, y = x[:n], y[:n]
    rho, p = stats.spearmanr(x, y)
    # Fisher z transformation applied to rho (approximation).
    z = np.arctanh(rho)
    se_z = 1.0 / math.sqrt(n - 3) if n > 3 else np.inf
    z_crit = stats.norm.ppf((1 + confidence) / 2)
    ci_lo = float(np.tanh(z - z_crit * se_z))
    ci_hi = float(np.tanh(z + z_crit * se_z))
    return TestResult(
        method="Spearman correlation",
        test_statistic=float(rho),
        p_value=float(p),
        df=float(n - 2),
        ci_lower=ci_lo,
        ci_upper=ci_hi,
        effect_size=float(rho**2),
        estimate=float(rho),
        n=n,
    )
def kendall_correlation(
    x: Union[np.ndarray, pd.Series, list],
    y: Union[np.ndarray, pd.Series, list],
) -> TestResult:
    """Kendall's tau-b rank correlation.

    Parameters
    ----------
    x, y : array-like
        Two variables (NaN-cleaned independently, then truncated to the
        shorter length -- see the alignment caveat on
        :func:`pearson_correlation`).

    Returns
    -------
    TestResult
        No CI or df is reported for tau.
    """
    x = _validate_array(x, "x")
    y = _validate_array(y, "y")
    n = min(len(x), len(y))
    x, y = x[:n], y[:n]
    tau, p = stats.kendalltau(x, y)
    return TestResult(
        method="Kendall tau-b",
        test_statistic=float(tau),
        p_value=float(p),
        estimate=float(tau),
        n=n,
    )
def point_biserial_correlation(
    binary: Union[np.ndarray, pd.Series, list],
    continuous: Union[np.ndarray, pd.Series, list],
    confidence: float = 0.95,
) -> TestResult:
    """Point-biserial correlation between a binary and continuous variable.

    Parameters
    ----------
    binary : array-like
        Variable with exactly two distinct values (validated below).
    continuous : array-like
        Continuous variable.
    confidence : float, default 0.95
        Confidence level for the Fisher-z interval.

    Returns
    -------
    TestResult

    Raises
    ------
    ValueError
        If ``binary`` does not have exactly 2 unique values after cleaning.
    """
    b = _validate_array(binary, "binary")
    c = _validate_array(continuous, "continuous")
    n = min(len(b), len(c))
    b, c = b[:n], c[:n]
    unique = np.unique(b)
    if len(unique) != 2:
        raise ValueError("Binary variable must have exactly 2 unique values.")
    r, p = stats.pointbiserialr(b, c)
    # Fisher z CI, same construction as for Pearson r.
    z = np.arctanh(r)
    se_z = 1.0 / math.sqrt(n - 3) if n > 3 else np.inf
    z_crit = stats.norm.ppf((1 + confidence) / 2)
    return TestResult(
        method="Point-biserial correlation",
        test_statistic=float(r),
        p_value=float(p),
        df=float(n - 2),
        ci_lower=float(np.tanh(z - z_crit * se_z)),
        ci_upper=float(np.tanh(z + z_crit * se_z)),
        effect_size=float(r**2),
        estimate=float(r),
        n=n,
    )
def partial_correlation(
    x: Union[np.ndarray, pd.Series, list],
    y: Union[np.ndarray, pd.Series, list],
    covariates: Union[np.ndarray, pd.DataFrame],
    confidence: float = 0.95,
) -> TestResult:
    """Partial Pearson correlation controlling for covariates.

    Uses OLS residualisation: regress both *x* and *y* on the covariates,
    then correlate the residuals.

    Parameters
    ----------
    x, y : array-like
        Variables of interest.
    covariates : array-like or DataFrame
        Matrix of control variables (n x p).
    confidence : float, default 0.95
        Confidence level for the Fisher-z interval.

    Returns
    -------
    TestResult
        The *p*-value is computed from the *t*-distribution with
        ``df = n - 2 - p`` (one df lost per covariate); ``pearsonr``'s own
        p-value would incorrectly use ``df = n - 2``.
    """
    x = _validate_array(x, "x")
    y = _validate_array(y, "y")
    Z = np.asarray(covariates, dtype=np.float64)
    if Z.ndim == 1:
        Z = Z.reshape(-1, 1)
    n = min(len(x), len(y), len(Z))
    x, y, Z = x[:n], y[:n], Z[:n]
    # Add intercept column so residuals are centred.
    Z_aug = np.column_stack([np.ones(n), Z])
    # Residualise both variables on the covariates.
    beta_x = np.linalg.lstsq(Z_aug, x, rcond=None)[0]
    beta_y = np.linalg.lstsq(Z_aug, y, rcond=None)[0]
    res_x = x - Z_aug @ beta_x
    res_y = y - Z_aug @ beta_y
    r, _ = stats.pearsonr(res_x, res_y)
    p_vars = Z.shape[1]
    df_val = n - 2 - p_vars
    # Recompute the p-value with the covariate-adjusted df: pearsonr assumes
    # df = n - 2, but a partial correlation loses one extra df per covariate.
    if df_val > 0 and abs(r) < 1.0:
        t_stat = r * math.sqrt(df_val / (1.0 - r * r))
        p = 2.0 * stats.t.sf(abs(t_stat), df_val)
    elif abs(r) >= 1.0:
        p = 0.0
    else:
        p = float("nan")
    z = np.arctanh(r)
    se_z = 1.0 / math.sqrt(df_val) if df_val > 0 else np.inf
    z_crit = stats.norm.ppf((1 + confidence) / 2)
    return TestResult(
        method="Partial correlation",
        test_statistic=float(r),
        p_value=float(p),
        df=float(df_val),
        ci_lower=float(np.tanh(z - z_crit * se_z)),
        ci_upper=float(np.tanh(z + z_crit * se_z)),
        effect_size=float(r**2),
        estimate=float(r),
        n=n,
    )
def semi_partial_correlation(
    x: Union[np.ndarray, pd.Series, list],
    y: Union[np.ndarray, pd.Series, list],
    covariates: Union[np.ndarray, pd.DataFrame],
) -> TestResult:
    """Semi-partial (part) correlation.

    Residualises only *x* on the covariates and correlates the residual
    with the raw *y*.

    Parameters
    ----------
    x, y : array-like
        Variables of interest.
    covariates : array-like or DataFrame
        Matrix of control variables.

    Returns
    -------
    TestResult
        NOTE(review): the *p*-value comes straight from ``pearsonr`` and
        therefore uses ``df = n - 2``; it is not adjusted for the number
        of covariates -- confirm this is intended.
    """
    x = _validate_array(x, "x")
    y = _validate_array(y, "y")
    Z = np.asarray(covariates, dtype=np.float64)
    if Z.ndim == 1:
        Z = Z.reshape(-1, 1)
    n = min(len(x), len(y), len(Z))
    x, y, Z = x[:n], y[:n], Z[:n]
    # Residualise x only (that is what makes it semi-partial).
    Z_aug = np.column_stack([np.ones(n), Z])
    beta_x = np.linalg.lstsq(Z_aug, x, rcond=None)[0]
    res_x = x - Z_aug @ beta_x
    r, p = stats.pearsonr(res_x, y)
    return TestResult(
        method="Semi-partial correlation",
        test_statistic=float(r),
        p_value=float(p),
        effect_size=float(r**2),
        estimate=float(r),
        n=n,
    )
# ===================================================================
# NON-PARAMETRIC TESTS
# ===================================================================
def mann_whitney_u(
    x: Union[np.ndarray, pd.Series, list],
    y: Union[np.ndarray, pd.Series, list],
    alternative: str = "two-sided",
) -> TestResult:
    """Mann--Whitney *U* test (Wilcoxon rank-sum test).

    Parameters
    ----------
    x, y : array-like
        Two independent samples.
    alternative : str, default "two-sided"
        One of ``"two-sided"``, ``"less"``, ``"greater"``.

    Returns
    -------
    TestResult
        Effect size is the rank-biserial correlation
        :math:`r = 1 - 2U / (n_1 n_2)`.
    """
    x = _validate_array(x, "x")
    y = _validate_array(y, "y")
    u_stat, p = stats.mannwhitneyu(x, y, alternative=alternative)
    nx, ny = len(x), len(y)
    r_rb = 1.0 - 2.0 * u_stat / (nx * ny) if nx * ny > 0 else 0.0
    return TestResult(
        method="Mann-Whitney U test",
        test_statistic=float(u_stat),
        p_value=float(p),
        effect_size=float(r_rb),
        n=nx + ny,
        extra={"rank_biserial": float(r_rb)},
    )
def wilcoxon_signed_rank(
    x: Union[np.ndarray, pd.Series, list],
    y: Union[np.ndarray, pd.Series, list] | None = None,
    alternative: str = "two-sided",
) -> TestResult:
    """Wilcoxon signed-rank test for paired samples or one sample.

    Parameters
    ----------
    x : array-like
        If *y* is None, test whether the distribution of *x* is symmetric
        about zero.  Otherwise, test paired differences *x - y*.
    y : array-like or None
        Second sample for the paired test (must match *x* in length).
    alternative : str, default "two-sided"
        One of ``"two-sided"``, ``"less"``, ``"greater"``.

    Returns
    -------
    TestResult
        Effect size is :math:`r = |Z| / \\sqrt{N}` with *Z* recovered from
        the *p*-value via the normal quantile function.

    Raises
    ------
    ValueError
        If *x* and *y* differ in length for the paired test.
    """
    x = _validate_array(x, "x")
    if y is not None:
        y = _validate_array(y, "y")
        if len(x) != len(y):
            raise ValueError("x and y must have equal length for paired test.")
        d = x - y
    else:
        d = x
    stat, p = stats.wilcoxon(d, alternative=alternative)
    n = len(d)
    # Effect size r = |Z| / sqrt(N).  Recover Z from the p-value: only a
    # two-sided p must be halved to get a single tail; a one-sided p already
    # is a tail probability (the old code halved it again, inflating r).
    tail_p = p / 2 if alternative == "two-sided" else p
    z_approx = stats.norm.ppf(tail_p)
    r = abs(z_approx) / math.sqrt(n) if n > 0 else 0.0
    return TestResult(
        method="Wilcoxon signed-rank test",
        test_statistic=float(stat),
        p_value=float(p),
        effect_size=float(r),
        n=n,
    )
def ks_test_one_sample(
    x: Union[np.ndarray, pd.Series, list],
    cdf: str = "norm",
    args: tuple = (),
) -> TestResult:
    """One-sample Kolmogorov--Smirnov test against a named distribution.

    Parameters
    ----------
    x : array-like
        Sample data.
    cdf : str, default "norm"
        Name of a :mod:`scipy.stats` distribution (e.g. ``"norm"``,
        ``"expon"``).
    args : tuple
        Extra shape/loc/scale arguments passed to the CDF.  Note the test
        assumes these are fixed a priori, not estimated from *x*; for
        estimated normal parameters use :func:`lilliefors_test`.

    Returns
    -------
    TestResult
    """
    x = _validate_array(x, "x")
    d_stat, p = stats.kstest(x, cdf, args=args)
    return TestResult(
        method=f"KS test (1-sample, {cdf})",
        test_statistic=float(d_stat),
        p_value=float(p),
        n=len(x),
    )
def ks_test_two_sample(
    x: Union[np.ndarray, pd.Series, list],
    y: Union[np.ndarray, pd.Series, list],
) -> TestResult:
    """Two-sample Kolmogorov--Smirnov test.

    Parameters
    ----------
    x, y : array-like
        Two independent samples (need not be equal length).

    Returns
    -------
    TestResult
    """
    x = _validate_array(x, "x")
    y = _validate_array(y, "y")
    d_stat, p = stats.ks_2samp(x, y)
    return TestResult(
        method="KS test (2-sample)",
        test_statistic=float(d_stat),
        p_value=float(p),
        n=len(x) + len(y),
    )
def anderson_darling(
    x: Union[np.ndarray, pd.Series, list],
    dist: str = "norm",
) -> TestResult:
    """Anderson--Darling test for a specified distribution family.

    Parameters
    ----------
    x : array-like
        Sample data.
    dist : str, default "norm"
        Distribution name (``"norm"``, ``"expon"``, ``"logistic"``,
        ``"gumbel"``).

    Returns
    -------
    TestResult
        The *p*-value is a coarse upper bound derived from scipy's
        tabulated critical values, not an exact p-value; ``extra``
        carries the critical values and their significance levels.
    """
    x = _validate_array(x, "x")
    result = stats.anderson(x, dist=dist)
    # Approximate p-value from the critical-value table: scipy lists
    # significance levels in descending order as critical values ascend, so
    # the last level whose critical value is exceeded (no break needed) is
    # the tightest bound; 1.0 means the statistic cleared no critical value.
    sig_levels = result.significance_level
    crit_vals = result.critical_values
    p_approx = 1.0
    for sl, cv in zip(sig_levels, crit_vals):
        if result.statistic > cv:
            p_approx = sl / 100.0
    return TestResult(
        method=f"Anderson-Darling test ({dist})",
        test_statistic=float(result.statistic),
        p_value=float(p_approx),
        n=len(x),
        extra={
            "critical_values": crit_vals.tolist(),
            "significance_levels": sig_levels.tolist(),
        },
    )
def levene_test(
    *groups: Union[np.ndarray, pd.Series, list],
    center: str = "median",
) -> TestResult:
    """Levene's test for equality of variances.

    Parameters
    ----------
    *groups : array-like
        Two or more samples.
    center : str, default "median"
        ``"median"`` (Brown--Forsythe variant, robust to non-normality),
        ``"mean"`` (classic Levene), or ``"trimmed"``.

    Returns
    -------
    TestResult
    """
    cleaned = [_validate_array(g, f"group_{i}") for i, g in enumerate(groups)]
    stat, p = stats.levene(*cleaned, center=center)
    k = len(cleaned)
    n_total = sum(len(g) for g in cleaned)
    return TestResult(
        method=f"Levene's test (center={center})",
        test_statistic=float(stat),
        p_value=float(p),
        df=float(k - 1),
        n=n_total,
    )
def bartlett_test(
    *groups: Union[np.ndarray, pd.Series, list],
) -> TestResult:
    """Bartlett's test for equality of variances.

    More powerful than Levene's test under normality, but sensitive to
    departures from it.

    Parameters
    ----------
    *groups : array-like
        Two or more samples.

    Returns
    -------
    TestResult
    """
    cleaned = [_validate_array(g, f"group_{i}") for i, g in enumerate(groups)]
    stat, p = stats.bartlett(*cleaned)
    return TestResult(
        method="Bartlett's test",
        test_statistic=float(stat),
        p_value=float(p),
        df=float(len(cleaned) - 1),
        n=sum(len(g) for g in cleaned),
    )
def runs_test(
    x: Union[np.ndarray, pd.Series, list],
    cutoff: float | None = None,
) -> TestResult:
    """Wald--Wolfowitz runs test for randomness.

    Dichotomises *x* at *cutoff* and tests whether the number of runs is
    consistent with a random sequence via a normal approximation.

    Parameters
    ----------
    x : array-like
        Numeric sequence.
    cutoff : float or None
        If ``None``, uses the median to dichotomise *x* (values >= cutoff
        count as 1).

    Returns
    -------
    TestResult
        ``test_statistic`` is the normal-approximation *z*; ``extra``
        carries the observed and expected run counts.  Degenerate inputs
        (all values on one side of the cutoff, or zero variance) return
        ``p_value=1.0``.
    """
    x = _validate_array(x, "x")
    n = len(x)
    if cutoff is None:
        cutoff = float(np.median(x))
    binary = (x >= cutoff).astype(int)
    n1 = int(binary.sum())
    n0 = n - n1
    # All observations on one side: no run structure to test.
    if n1 == 0 or n0 == 0:
        return TestResult(method="Runs test", test_statistic=0.0, p_value=1.0, n=n)
    # Count runs: one run plus one per sign change.
    runs = 1 + int(np.sum(np.diff(binary) != 0))
    # Exact mean and variance of the run count under randomness.
    mu = 1 + 2 * n0 * n1 / n
    var = 2 * n0 * n1 * (2 * n0 * n1 - n) / (n**2 * (n - 1)) if n > 1 else 0
    if var <= 0:
        return TestResult(method="Runs test", test_statistic=float(runs), p_value=1.0, n=n)
    z = (runs - mu) / math.sqrt(var)
    p = 2 * stats.norm.sf(abs(z))
    return TestResult(
        method="Runs test",
        test_statistic=float(z),
        p_value=float(p),
        n=n,
        extra={"n_runs": runs, "expected_runs": mu},
    )
# ===================================================================
# NORMALITY TESTS
# ===================================================================
def shapiro_wilk(
    x: Union[np.ndarray, pd.Series, list],
) -> TestResult:
    """Shapiro--Wilk test for normality.

    Parameters
    ----------
    x : array-like
        Sample data (scipy recommends n <= 5000; above that the p-value
        may be inaccurate).

    Returns
    -------
    TestResult
    """
    x = _validate_array(x, "x")
    stat, p = stats.shapiro(x)
    return TestResult(
        method="Shapiro-Wilk test",
        test_statistic=float(stat),
        p_value=float(p),
        n=len(x),
    )
def dagostino_pearson(
    x: Union[np.ndarray, pd.Series, list],
) -> TestResult:
    """D'Agostino--Pearson omnibus normality test.

    Combines skewness and kurtosis into a chi-squared statistic with 2 df.

    Parameters
    ----------
    x : array-like
        Sample data (n >= 20 recommended for the asymptotics).

    Returns
    -------
    TestResult
    """
    x = _validate_array(x, "x")
    stat, p = stats.normaltest(x)
    return TestResult(
        method="D'Agostino-Pearson test",
        test_statistic=float(stat),
        p_value=float(p),
        df=2.0,
        n=len(x),
    )
def jarque_bera(
    x: Union[np.ndarray, pd.Series, list],
) -> TestResult:
    """Jarque--Bera test for normality (based on skewness and kurtosis).

    Parameters
    ----------
    x : array-like
        Sample data.

    Returns
    -------
    TestResult
        The statistic is asymptotically chi-squared with 2 df.
    """
    x = _validate_array(x, "x")
    stat, p = stats.jarque_bera(x)
    return TestResult(
        method="Jarque-Bera test",
        test_statistic=float(stat),
        p_value=float(p),
        df=2.0,
        n=len(x),
    )
def lilliefors_test(
    x: Union[np.ndarray, pd.Series, list],
) -> TestResult:
    """Lilliefors test for normality (KS test with estimated mean and SD).

    Uses the Kolmogorov--Smirnov statistic with parameters estimated from
    the data.  When :mod:`statsmodels` is available its ``lilliefors``
    implementation supplies a properly corrected *p*-value; otherwise the
    function falls back to a plain KS test, whose *p*-value is
    anti-conservative because the estimated parameters are treated as
    fixed.

    Parameters
    ----------
    x : array-like
        Sample data.

    Returns
    -------
    TestResult
    """
    x = _validate_array(x, "x")
    try:
        from statsmodels.stats.diagnostic import lilliefors as _lilliefors

        stat, p = _lilliefors(x, dist="norm")
    except ImportError:
        # Fall back to KS with estimated params; p-value is approximate.
        stat, p = stats.kstest(x, "norm", args=(x.mean(), x.std(ddof=1)))
        logger.warning("statsmodels not available; Lilliefors p-value is approximate (plain KS).")
    return TestResult(
        method="Lilliefors test",
        test_statistic=float(stat),
        p_value=float(p),
        n=len(x),
    )
# =================================================================== # PROPORTION TESTS # ===================================================================
def one_proportion_ztest(
    count: int,
    nobs: int,
    value: float = 0.5,
    confidence: float = 0.95,
) -> TestResult:
    """One-sample *z*-test for a proportion.

    Parameters
    ----------
    count : int
        Number of successes.
    nobs : int
        Number of observations.
    value : float, default 0.5
        Hypothesised proportion under *H_0*.
    confidence : float, default 0.95
        Confidence level for the Wilson score interval.

    Returns
    -------
    TestResult
        ``estimate`` is the observed proportion; the CI is the Wilson
        score interval (degenerate ``[0, 0]`` when ``nobs == 0``).
    """
    p_hat = count / nobs if nobs > 0 else 0.0
    se = math.sqrt(value * (1 - value) / nobs) if nobs > 0 else 0.0
    z = (p_hat - value) / se if se > 0 else 0.0
    p_val = 2 * stats.norm.sf(abs(z))

    # Wilson score CI.
    # FIX: the statistic above was guarded against nobs == 0, but the CI
    # divided by nobs unguarded and raised ZeroDivisionError; return a
    # degenerate [0, 0] interval for an empty sample instead.
    z_crit = stats.norm.ppf((1 + confidence) / 2)
    if nobs > 0:
        denom = 1 + z_crit**2 / nobs
        centre = (p_hat + z_crit**2 / (2 * nobs)) / denom
        margin = (
            z_crit
            * math.sqrt(p_hat * (1 - p_hat) / nobs + z_crit**2 / (4 * nobs**2))
            / denom
        )
    else:
        centre = 0.0
        margin = 0.0
    return TestResult(
        method="One-proportion z-test",
        test_statistic=float(z),
        p_value=float(p_val),
        ci_lower=float(centre - margin),
        ci_upper=float(centre + margin),
        estimate=float(p_hat),
        n=nobs,
    )
def two_proportion_ztest(
    count1: int,
    nobs1: int,
    count2: int,
    nobs2: int,
    confidence: float = 0.95,
) -> TestResult:
    """Two-sample *z*-test for the difference between two proportions.

    Parameters
    ----------
    count1, nobs1 : int
        Successes and observations in sample 1.
    count2, nobs2 : int
        Successes and observations in sample 2.
    confidence : float, default 0.95
        Confidence level.

    Returns
    -------
    TestResult
        ``estimate`` is ``p1 - p2``; the CI is a Wald interval for the
        difference (unpooled SE).
    """
    p1 = count1 / nobs1 if nobs1 > 0 else 0.0
    p2 = count2 / nobs2 if nobs2 > 0 else 0.0
    total = nobs1 + nobs2
    p_pool = (count1 + count2) / total if total > 0 else 0.0
    # FIX: the pooled SE divides by each sample size individually, so it
    # needs BOTH nobs1 > 0 and nobs2 > 0.  The original guard only checked
    # nobs1 + nobs2 > 0 and raised ZeroDivisionError when exactly one
    # sample was empty.
    if nobs1 > 0 and nobs2 > 0:
        se = math.sqrt(p_pool * (1 - p_pool) * (1 / nobs1 + 1 / nobs2))
    else:
        se = 0.0
    z = (p1 - p2) / se if se > 0 else 0.0
    p_val = 2 * stats.norm.sf(abs(z))

    # Wald CI for the difference (NOTE: the original comment said
    # "Newcombe" but the formula is the plain Wald interval).
    z_crit = stats.norm.ppf((1 + confidence) / 2)
    se_diff = (
        math.sqrt(p1 * (1 - p1) / nobs1 + p2 * (1 - p2) / nobs2)
        if (nobs1 > 0 and nobs2 > 0)
        else 0.0
    )
    diff = p1 - p2
    return TestResult(
        method="Two-proportion z-test",
        test_statistic=float(z),
        p_value=float(p_val),
        ci_lower=float(diff - z_crit * se_diff),
        ci_upper=float(diff + z_crit * se_diff),
        estimate=float(diff),
        n=nobs1 + nobs2,
    )
def fisher_exact_test(
    contingency_table: Union[np.ndarray, list],
    alternative: str = "two-sided",
) -> TestResult:
    """Fisher's exact test on a 2x2 contingency table.

    Parameters
    ----------
    contingency_table : array-like
        A 2x2 table of counts.
    alternative : str, default "two-sided"
        One of ``"two-sided"``, ``"less"``, or ``"greater"``.

    Returns
    -------
    TestResult
        Both ``test_statistic`` and ``estimate`` carry the odds ratio.
    """
    counts = np.asarray(contingency_table, dtype=np.int64)
    if counts.shape != (2, 2):
        raise ValueError("Fisher exact test requires a 2x2 table.")
    odds_ratio, p_val = stats.fisher_exact(counts, alternative=alternative)
    return TestResult(
        method="Fisher's exact test",
        test_statistic=float(odds_ratio),
        p_value=float(p_val),
        estimate=float(odds_ratio),
        n=int(counts.sum()),
    )
# =================================================================== # AGREEMENT # ===================================================================
def cohens_kappa(
    rater1: Union[np.ndarray, pd.Series, list],
    rater2: Union[np.ndarray, pd.Series, list],
    confidence: float = 0.95,
) -> TestResult:
    """Cohen's kappa: chance-corrected agreement between two raters.

    Parameters
    ----------
    rater1, rater2 : array-like
        Categorical ratings from two raters (same length).
    confidence : float, default 0.95
        Confidence level.

    Returns
    -------
    TestResult
        ``effect_size``/``estimate`` carry kappa; the SE (and hence the
        z-test and CI) uses the rough Fleiss (1981) approximation.
    """
    a = np.asarray(rater1).ravel()
    b = np.asarray(rater2).ravel()
    if len(a) != len(b):
        raise ValueError("Raters must have the same number of observations.")
    n = len(a)
    categories = np.unique(np.concatenate([a, b]))
    k = len(categories)

    # Confusion matrix: np.unique returns the categories sorted, so
    # searchsorted gives each rating's category index directly.
    row_idx = np.searchsorted(categories, a)
    col_idx = np.searchsorted(categories, b)
    confusion = np.zeros((k, k), dtype=np.float64)
    np.add.at(confusion, (row_idx, col_idx), 1)

    p_o = np.trace(confusion) / n                  # observed agreement
    marg_rows = confusion.sum(axis=1) / n
    marg_cols = confusion.sum(axis=0) / n
    p_e = float((marg_rows * marg_cols).sum())     # chance agreement
    kappa_val = (p_o - p_e) / (1 - p_e) if (1 - p_e) > 0 else 0.0

    # Approximate SE (Fleiss, 1981).
    se = math.sqrt(p_e / (n * (1 - p_e) ** 2)) if (1 - p_e) > 0 and n > 0 else 0.0
    z_stat = kappa_val / se if se > 0 else 0.0
    p_val = float(2 * stats.norm.sf(abs(z_stat))) if se > 0 else 1.0
    z_crit = stats.norm.ppf((1 + confidence) / 2)
    return TestResult(
        method="Cohen's kappa",
        test_statistic=float(z_stat),
        p_value=p_val,
        ci_lower=float(kappa_val - z_crit * se),
        ci_upper=float(kappa_val + z_crit * se),
        effect_size=float(kappa_val),
        estimate=float(kappa_val),
        n=n,
    )
def fleiss_kappa(
    ratings_matrix: Union[np.ndarray, pd.DataFrame],
) -> TestResult:
    """Fleiss' kappa for agreement among multiple raters.

    Parameters
    ----------
    ratings_matrix : array-like
        An n x k count matrix: rows are subjects, columns are rating
        categories, and cell (i, j) is the number of raters who assigned
        subject *i* to category *j*.  The rater count is read from the
        first row and assumed constant across subjects.

    Returns
    -------
    TestResult

    References
    ----------
    Fleiss, J. L. (1971). Measuring nominal scale agreement among many
    raters. *Psychological Bulletin*, 76(5), 378--382.
    """
    counts = np.asarray(ratings_matrix, dtype=np.float64)
    n_subjects, n_categories = counts.shape
    m = counts[0].sum()  # raters per subject; assumed constant across rows

    # Marginal proportion of ratings falling in each category (p_j).
    cat_props = counts.sum(axis=0) / (n_subjects * m)
    # Per-subject agreement (P_i), then its mean (P-bar).
    per_subject = (np.sum(counts**2, axis=1) - m) / (m * (m - 1))
    p_bar = per_subject.mean()
    p_exp = float((cat_props**2).sum())
    kappa_val = (p_bar - p_exp) / (1 - p_exp) if (1 - p_exp) > 0 else 0.0

    # SE (Fleiss et al., 2003).
    leading = 2.0 / (n_subjects * m * (m - 1))
    spread = (cat_props * (1 - cat_props)).sum() ** 2
    denom = (1 - p_exp) ** 2
    se = math.sqrt(leading * (spread / denom)) if denom > 0 else 0.0
    z = kappa_val / se if se > 0 else 0.0
    return TestResult(
        method="Fleiss' kappa",
        test_statistic=float(z),
        p_value=float(2 * stats.norm.sf(abs(z))),
        effect_size=float(kappa_val),
        estimate=float(kappa_val),
        n=n_subjects,
        extra={"n_raters": int(m), "n_categories": n_categories},
    )
def intraclass_correlation(
    data: pd.DataFrame,
    targets: str,
    raters: str,
    ratings: str,
    icc_type: str = "ICC3k",
) -> TestResult:
    """Intraclass correlation coefficient (ICC).

    Implements the Shrout & Fleiss (1979) taxonomy:

    - **ICC1**: One-way random, single measures
    - **ICC1k**: One-way random, average measures
    - **ICC2**: Two-way random, single measures
    - **ICC2k**: Two-way random, average measures
    - **ICC3**: Two-way mixed, single measures
    - **ICC3k**: Two-way mixed, average measures

    Parameters
    ----------
    data : DataFrame
        Long-format data.
    targets : str
        Column identifying subjects/targets.
    raters : str
        Column identifying raters.
    ratings : str
        Column containing ratings (numeric).
    icc_type : str, default "ICC3k"
        Which ICC form to compute.

    Returns
    -------
    TestResult
        ``effect_size``/``estimate`` carry the ICC; the F-test for
        significance matches the chosen form (one-way vs two-way).

    References
    ----------
    Shrout, P. E., & Fleiss, J. L. (1979). Intraclass correlations: Uses in
    assessing rater reliability. *Psychological Bulletin*, 86(2), 420--428.
    """
    df = data.dropna(subset=[targets, raters, ratings]).copy()
    # Complete-case wide layout: subjects x raters.
    wide = df.pivot(index=targets, columns=raters, values=ratings).dropna()
    n = wide.shape[0]  # subjects
    k = wide.shape[1]  # raters
    Y = wide.values.astype(np.float64)
    grand_mean = Y.mean()
    subj_means = Y.mean(axis=1)
    rater_means = Y.mean(axis=0)

    # Sums of squares for the two-way layout.
    ss_total = ((Y - grand_mean) ** 2).sum()
    ss_rows = k * ((subj_means - grand_mean) ** 2).sum()   # between subjects (BMS)
    ss_cols = n * ((rater_means - grand_mean) ** 2).sum()  # between raters (JMS)
    ss_error = ss_total - ss_rows - ss_cols                # residual (EMS)
    ms_rows = ss_rows / (n - 1) if n > 1 else 0.0
    ms_cols = ss_cols / (k - 1) if k > 1 else 0.0
    ms_error = ss_error / ((n - 1) * (k - 1)) if (n - 1) * (k - 1) > 0 else 0.0
    # One-way within-subjects MS (WMS), used by the ICC1 forms.
    ms_within = (ss_cols + ss_error) / (n * (k - 1)) if n * (k - 1) > 0 else 0.0

    if icc_type == "ICC1":
        icc = (ms_rows - ms_within) / (ms_rows + (k - 1) * ms_within)
    elif icc_type == "ICC1k":
        icc = (ms_rows - ms_within) / ms_rows if ms_rows > 0 else 0.0
    elif icc_type == "ICC2":
        icc = (ms_rows - ms_error) / (
            ms_rows + (k - 1) * ms_error + k * (ms_cols - ms_error) / n
        )
    elif icc_type == "ICC2k":
        icc = (ms_rows - ms_error) / (ms_rows + (ms_cols - ms_error) / n)
    elif icc_type == "ICC3":
        icc = (ms_rows - ms_error) / (ms_rows + (k - 1) * ms_error)
    elif icc_type == "ICC3k":
        icc = (ms_rows - ms_error) / ms_rows if ms_rows > 0 else 0.0
    else:
        raise ValueError(f"Unknown ICC type: {icc_type}. Use ICC1, ICC1k, ICC2, ICC2k, ICC3, ICC3k.")

    # F-test for significance.
    # FIX: the one-way forms (ICC1/ICC1k) use F = BMS / WMS with
    # df2 = n(k-1); the original always used the two-way MS_error with
    # df2 = (n-1)(k-1), which is only correct for ICC2*/ICC3*
    # (Shrout & Fleiss, 1979).
    df1 = n - 1
    if icc_type in ("ICC1", "ICC1k"):
        f_stat = ms_rows / ms_within if ms_within > 0 else 0.0
        df2 = n * (k - 1)
    else:
        f_stat = ms_rows / ms_error if ms_error > 0 else 0.0
        df2 = (n - 1) * (k - 1)
    # sf() keeps precision for very small p-values, unlike 1 - cdf().
    p_val = stats.f.sf(f_stat, df1, df2)
    return TestResult(
        method=f"Intraclass correlation ({icc_type})",
        test_statistic=float(f_stat),
        p_value=float(p_val),
        df=float(df1),
        effect_size=float(icc),
        estimate=float(icc),
        n=n,
        extra={"icc_type": icc_type, "n_raters": k, "ms_rows": ms_rows, "ms_error": ms_error},
    )
# =================================================================== # CONVENIENCE / BATCH HELPERS # ===================================================================
def normality_battery(
    x: Union[np.ndarray, pd.Series, list],
) -> list[TestResult]:
    """Run every available normality test on one sample.

    Shapiro--Wilk is skipped below n = 3 and D'Agostino--Pearson below
    n = 20 (their minimum sample sizes); Jarque--Bera and Lilliefors
    always run.

    Parameters
    ----------
    x : array-like
        Sample data.

    Returns
    -------
    list[TestResult]
        Results from Shapiro--Wilk, D'Agostino--Pearson, Jarque--Bera,
        and Lilliefors tests (in that order, where applicable).
    """
    sample = _validate_array(x, "x")
    size = len(sample)
    battery: list[TestResult] = []
    if size >= 3:
        battery.append(shapiro_wilk(sample))
    if size >= 20:
        battery.append(dagostino_pearson(sample))
    battery.append(jarque_bera(sample))
    battery.append(lilliefors_test(sample))
    return battery
def variance_equality_battery(
    *groups: Union[np.ndarray, pd.Series, list],
) -> list[TestResult]:
    """Run Levene's (median-centred) and Bartlett's homogeneity-of-variance tests.

    Parameters
    ----------
    *groups : array-like
        Two or more samples.

    Returns
    -------
    list[TestResult]
        Levene result first, Bartlett result second.
    """
    battery: list[TestResult] = []
    battery.append(levene_test(*groups, center="median"))
    battery.append(bartlett_test(*groups))
    return battery
def correlation_matrix(
    data: pd.DataFrame,
    method: str = "pearson",
) -> pd.DataFrame:
    """Pairwise correlation matrix with *p*-values.

    Each pair of numeric columns is correlated on the rows where both are
    non-missing (pairwise complete observations).

    Parameters
    ----------
    data : DataFrame
        Numeric columns only.
    method : str, default "pearson"
        ``"pearson"``, ``"spearman"``, or ``"kendall"``.

    Returns
    -------
    DataFrame
        MultiIndex columns with ``("r", col)`` and ``("p", col)`` levels.
    """
    func_map = {"pearson": stats.pearsonr, "spearman": stats.spearmanr, "kendall": stats.kendalltau}
    if method not in func_map:
        raise ValueError(f"Unknown method: {method}. Use pearson, spearman, or kendall.")
    corr_func = func_map[method]

    cols = data.select_dtypes(include=[np.number]).columns.tolist()
    size = len(cols)
    r_mat = np.eye(size)            # diagonal r = 1
    p_mat = np.zeros((size, size))  # diagonal p = 0
    for i in range(size):
        for j in range(i + 1, size):
            left = data[cols[i]].dropna()
            right = data[cols[j]].dropna()
            shared = left.index.intersection(right.index)
            r, p = corr_func(left.loc[shared].values, right.loc[shared].values)
            r_mat[i, j] = r_mat[j, i] = r
            p_mat[i, j] = p_mat[j, i] = p

    header = pd.MultiIndex.from_product([["r", "p"], cols])
    return pd.DataFrame(np.column_stack([r_mat, p_mat]), index=cols, columns=header)
def auto_test(
    x: Union[np.ndarray, pd.Series, list],
    y: Union[np.ndarray, pd.Series, list] | None = None,
    paired: bool = False,
    confidence: float = 0.95,
) -> TestResult:
    """Automatically pick and run the most appropriate location test.

    Decision logic:

    1. *y* is ``None`` -- one-sample t-test against zero.
    2. ``paired=True`` -- paired t-test when the differences look normal
       (Shapiro--Wilk p >= 0.05), otherwise Wilcoxon signed-rank.
    3. Two independent samples -- if both look normal, Student's or
       Welch's t (chosen via Levene's median-centred test), otherwise
       Mann--Whitney U.

    Parameters
    ----------
    x : array-like
        First sample.
    y : array-like or None
        Second sample (if applicable).
    paired : bool, default False
        Whether samples are paired.
    confidence : float, default 0.95
        Confidence level.

    Returns
    -------
    TestResult
    """
    x = _validate_array(x, "x")
    if y is None:
        return one_sample_ttest(x, mu0=0.0, confidence=confidence)
    y = _validate_array(y, "y")

    if paired:
        if len(x) != len(y):
            raise ValueError("Paired comparison requires equal-length arrays.")
        if stats.shapiro(x - y).pvalue >= 0.05:
            return paired_ttest(x, y, confidence=confidence)
        return wilcoxon_signed_rank(x, y)

    # Independent samples: per-group normality (normaltest above scipy's
    # recommended shapiro limit of 5000).
    def _normal_p(sample):
        result = stats.shapiro(sample) if len(sample) <= 5000 else stats.normaltest(sample)
        return result.pvalue

    if _normal_p(x) >= 0.05 and _normal_p(y) >= 0.05:
        equal_var = stats.levene(x, y, center="median").pvalue >= 0.05
        return two_sample_ttest(x, y, equal_var=equal_var, confidence=confidence)
    return mann_whitney_u(x, y)