Source code for pertpy.tools._distances._distance_tests

from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
from rich.progress import track
from sklearn.metrics import pairwise_distances
from statsmodels.stats.multitest import multipletests

from ._distances import Distance

if TYPE_CHECKING:
    from anndata import AnnData



[docs]
class DistanceTest:
    """Run permutation tests using a distance of choice between groups of cells.

    Performs Monte-Carlo permutation testing using a distance metric of choice
    as test statistic. Tests all groups of cells against a specified contrast
    group (which normally would be your "control" cells).

    Args:
        metric: Distance metric to use between groups of cells.
        n_perms: Number of permutations to run. Defaults to 1000.
        layer_key: Name of the counts layer containing raw counts to calculate distances for.
                   Mutually exclusive with 'obsm_key'.
                   Defaults to None and is then not used.
        obsm_key: Name of embedding in adata.obsm to use.
                  Mutually exclusive with 'counts_layer_key'.
                  Defaults to None, but is set to "X_pca" if not set explicitly internally.
        alpha: Significance level. Defaults to 0.05.
        correction: Multiple testing correction method. Defaults to 'holm-sidak'.
        cell_wise_metric: Metric to use between single cells. Defaults to "euclidean".

    Examples:
        >>> import pertpy as pt
        >>> adata = pt.dt.distance_example()
        >>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
        >>> tab = distance_test(adata, groupby="perturbation", contrast="control")
    """

    def __init__(
        self,
        metric: str,
        n_perms: int = 1000,
        layer_key: str = None,
        obsm_key: str = None,
        alpha: float = 0.05,
        correction: str = "holm-sidak",
        cell_wise_metric=None,
    ):
        self.metric = metric
        self.n_perms = n_perms

        if layer_key and obsm_key:
            raise ValueError(
                "Cannot use 'counts_layer_key' and 'obsm_key' at the same time.\n"
                "Please provide only one of the two keys."
            )
        if not layer_key and not obsm_key:
            obsm_key = "X_pca"
        self.layer_key = layer_key
        self.obsm_key = obsm_key
        self.alpha = alpha
        self.correction = correction
        self.cell_wise_metric = (
            cell_wise_metric if cell_wise_metric else Distance(self.metric, self.obsm_key).cell_wise_metric
        )

        self.distance = Distance(
            self.metric, layer_key=self.layer_key, obsm_key=self.obsm_key, cell_wise_metric=self.cell_wise_metric
        )

    def __call__(
        self,
        adata: AnnData,
        groupby: str,
        contrast: str,
        show_progressbar: bool = True,
    ) -> pd.DataFrame:
        """Run a permutation test using the specified distance metric, testing
        all groups of cells against a specified contrast group ("control").

        Args:
            adata: Annotated data matrix.
            groupby: Key in adata.obs for grouping cells.
            contrast: Name of the contrast group.
            show_progressbar: Whether to print progress. Defaults to True.

        Returns:
            pandas.DataFrame: Results of the permutation test, with columns:
                - distance: distance between the contrast group and the group
                - pvalue: p-value of the permutation test
                - significant: whether the group is significantly different from the contrast group
                - pvalue_adj: p-value after multiple testing correction
                - significant_adj: whether the group is significantly different from the contrast group after multiple testing correction

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.distance_example()
            >>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
            >>> tab = distance_test(adata, groupby="perturbation", contrast="control")
        """
        if self.distance.metric_fct.accepts_precomputed:
            # Much faster if the metric can be called on the precomputed
            # distance matrix, but not all metrics can do that.
            return self.test_precomputed(adata, groupby, contrast, show_progressbar)
        else:
            return self.test_xy(adata, groupby, contrast, show_progressbar)


[docs]
    def test_xy(self, adata: AnnData, groupby: str, contrast: str, show_progressbar: bool = True) -> pd.DataFrame:
        """Run permutation test for metric not supporting precomputed distances.

        Runs a permutation test for a metric that can not be computed using
        precomputed pairwise distances, but need the actual data points. This is
        generally slower than test_precomputed.

        Args:
            adata: Annotated data matrix.
            groupby: Key in adata.obs for grouping cells.
            contrast: Name of the contrast group.
            show_progressbar: Whether to print progress. Defaults to True.

        Returns:
            pandas.DataFrame: Results of the permutation test, with columns:
                - distance: distance between the contrast group and the group
                - pvalue: p-value of the permutation test
                - significant: whether the group is significantly different from the contrast group
                - pvalue_adj: p-value after multiple testing correction
                - significant_adj: whether the group is significantly different from the contrast group after multiple testing correction

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.distance_example()
            >>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
            >>> test_results = distance_test.test_xy(adata, groupby="perturbation", contrast="control")
        """
        groups = adata.obs[groupby].unique()
        if contrast not in groups:
            raise ValueError(f"Contrast group {contrast} not found in {groupby} of adata.obs.")
        fct = track if show_progressbar else lambda iterable: iterable
        embedding = adata.obsm[self.obsm_key]

        # Generate the null distribution
        results = []
        for _permutation in fct(range(self.n_perms)):
            # per perturbation, shuffle with control and compute e-distance
            df = pd.DataFrame(index=groups, columns=["distance"], dtype=float)
            for group in groups:
                if group == contrast:
                    continue
                # Shuffle the labels of the groups
                mask = adata.obs[groupby].isin([group, contrast])
                labels = adata.obs[groupby].values[mask]
                rng = np.random.default_rng()
                shuffled_labels = rng.permutation(labels)
                idx = shuffled_labels == group

                X = embedding[mask][idx]  # shuffled group
                Y = embedding[mask][~idx]  # shuffled contrast
                dist = self.distance(X, Y)

                df.loc[group, "distance"] = dist
            results.append(df.sort_index())

        # Generate the empirical distribution
        for group in groups:
            if group == contrast:
                continue
            X = embedding[adata.obs[groupby] == group]
            Y = embedding[adata.obs[groupby] == contrast]
            df.loc[group, "distance"] = self.distance(X, Y)

        # Evaluate the test
        # count times shuffling resulted in larger distance
        comparison_results = np.array(
            pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0, dtype=int
        )
        n_failures = pd.Series(np.clip(np.sum(comparison_results, axis=1), 1, np.inf), index=df.index)
        pvalues = n_failures / self.n_perms

        # Apply multiple testing correction
        significant_adj, pvalue_adj, _, _ = multipletests(pvalues.values, alpha=self.alpha, method=self.correction)

        # Aggregate results
        tab = pd.DataFrame(
            {
                "distance": df["distance"],
                "pvalue": pvalues,
                "significant": pvalues < self.alpha,
                "pvalue_adj": pvalue_adj,
                "significant_adj": significant_adj,
            },
            index=df.index,
        )

        # Set the contrast group
        tab.loc[contrast, "distance"] = 0
        tab.loc[contrast, "pvalue"] = 1
        tab.loc[contrast, "significant"] = False
        tab.loc[contrast, "pvalue_adj"] = 1
        tab.loc[contrast, "significant_adj"] = False

        return tab



[docs]
    def test_precomputed(self, adata: AnnData, groupby: str, contrast: str, verbose: bool = True) -> pd.DataFrame:
        """Run permutation test for metrics that take precomputed distances.

        Args:
            adata: Annotated data matrix.
            groupby: Key in adata.obs for grouping cells.
            contrast: Name of the contrast group.
            cell_wise_metric: Metric to use for pairwise distances.
            verbose: Whether to print progress. Defaults to True.

        Returns:
            pandas.DataFrame: Results of the permutation test, with columns:
                - distance: distance between the contrast group and the group
                - pvalue: p-value of the permutation test
                - significant: whether the group is significantly different from the contrast group
                - pvalue_adj: p-value after multiple testing correction
                - significant_adj: whether the group is significantly different from the contrast group after multiple testing correction

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.distance_example()
            >>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
            >>> test_results = distance_test.test_precomputed(adata, groupby="perturbation", contrast="control")
        """
        if not self.distance.metric_fct.accepts_precomputed:
            raise ValueError(f"Metric {self.metric} does not accept precomputed distances.")

        groups = adata.obs[groupby].unique()
        if contrast not in groups:
            raise ValueError(f"Contrast group {contrast} not found in {groupby} of adata.obs.")
        fct = track if verbose else lambda iterable: iterable

        # Precompute the pairwise distances
        precomputed_distances = {}
        for group in groups:
            if self.layer_key:
                cells = adata[adata.obs[groupby].isin([group, contrast])].layers[self.layer_key].copy()
            else:
                cells = adata[adata.obs[groupby].isin([group, contrast])].obsm[self.obsm_key].copy()
            pwd = pairwise_distances(cells, cells, metric=self.distance.cell_wise_metric)
            precomputed_distances[group] = pwd

        # Generate the null distribution
        results = []
        for _permutation in fct(range(self.n_perms)):
            # per perturbation, shuffle with control and compute e-distance
            df = pd.DataFrame(index=groups, columns=["distance"], dtype=float)
            for group in groups:
                if group == contrast:
                    continue
                # Shuffle the labels of the groups
                mask = adata.obs[groupby].isin([group, contrast])
                labels = adata.obs[groupby].values[mask]
                rng = np.random.default_rng()
                shuffled_labels = rng.permutation(labels)
                idx = shuffled_labels == group

                precomputed_distance = precomputed_distances[group]
                distance_result = self.distance.metric_fct.from_precomputed(precomputed_distance, idx)

                df.loc[group, "distance"] = distance_result
            results.append(df.sort_index())

        # Generate the empirical distribution
        for group in groups:
            if group == contrast:
                continue
            mask = adata.obs[groupby].isin([group, contrast])
            labels = adata.obs[groupby].values[mask]
            idx = labels == group

            precomputed_distance = precomputed_distances[group]
            distance_result = self.distance.metric_fct.from_precomputed(precomputed_distance, idx)

            df.loc[group, "distance"] = distance_result

        # Evaluate the test
        # count times shuffling resulted in larger distance
        comparison_results = np.array(
            pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0, dtype=int
        )
        n_failures = pd.Series(np.clip(np.sum(comparison_results, axis=1), 1, np.inf), index=df.index)
        pvalues = n_failures / self.n_perms

        # Apply multiple testing correction
        significant_adj, pvalue_adj, _, _ = multipletests(pvalues.values, alpha=self.alpha, method=self.correction)

        # Aggregate results
        tab = pd.DataFrame(
            {
                "distance": df["distance"],
                "pvalue": pvalues,
                "significant": pvalues < self.alpha,
                "pvalue_adj": pvalue_adj,
                "significant_adj": significant_adj,
            },
            index=df.index,
        )

        # Set the contrast group
        tab.loc[contrast, "distance"] = 0
        tab.loc[contrast, "pvalue"] = 1
        tab.loc[contrast, "significant"] = False
        tab.loc[contrast, "pvalue_adj"] = 1
        tab.loc[contrast, "significant_adj"] = False
        return tab