Source code for pertpy.tools._distances._distance_tests

from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
from rich.progress import track
from sklearn.metrics import pairwise_distances
from statsmodels.stats.multitest import multipletests

from ._distances import Distance

if TYPE_CHECKING:
    from anndata import AnnData


class DistanceTest:
    """Run permutation tests using a distance of choice between groups of cells.

    Performs Monte-Carlo permutation testing using a distance metric of choice as test statistic.
    Tests all groups of cells against a specified contrast group (which normally would be your "control" cells).

    Args:
        metric: Distance metric to use between groups of cells.
        n_perms: Number of permutations to run. Defaults to 1000.
        layer_key: Name of the counts layer containing raw counts to calculate distances for.
            Mutually exclusive with 'obsm_key'. Defaults to None and is then not used.
        obsm_key: Name of embedding in adata.obsm to use. Mutually exclusive with 'layer_key'.
            Defaults to None, but is set to "X_pca" internally if not set explicitly.
        alpha: Significance level. Defaults to 0.05.
        correction: Multiple testing correction method. Defaults to 'holm-sidak'.
        cell_wise_metric: Metric to use between single cells. Defaults to None, in which case
            the default cell-wise metric of the chosen distance metric is used.

    Examples:
        >>> import pertpy as pt
        >>> adata = pt.dt.distance_example()
        >>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
        >>> tab = distance_test(adata, groupby="perturbation", contrast="control")
    """

    def __init__(
        self,
        metric: str,
        n_perms: int = 1000,
        layer_key: str | None = None,
        obsm_key: str | None = None,
        alpha: float = 0.05,
        correction: str = "holm-sidak",
        cell_wise_metric: str | None = None,
    ):
        self.metric = metric
        self.n_perms = n_perms
        if layer_key and obsm_key:
            raise ValueError(
                "Cannot use 'layer_key' and 'obsm_key' at the same time.\n"
                "Please provide only one of the two keys."
            )
        if not layer_key and not obsm_key:
            obsm_key = "X_pca"
        self.layer_key = layer_key
        self.obsm_key = obsm_key
        self.alpha = alpha
        self.correction = correction
        self.cell_wise_metric = (
            cell_wise_metric if cell_wise_metric else Distance(self.metric, self.obsm_key).cell_wise_metric
        )
        self.distance = Distance(
            self.metric, layer_key=self.layer_key, obsm_key=self.obsm_key, cell_wise_metric=self.cell_wise_metric
        )

    def __call__(
        self,
        adata: AnnData,
        groupby: str,
        contrast: str,
        show_progressbar: bool = True,
    ) -> pd.DataFrame:
        """Run a permutation test using the specified distance metric, testing
        all groups of cells against a specified contrast group ("control").

        Args:
            adata: Annotated data matrix.
            groupby: Key in adata.obs for grouping cells.
            contrast: Name of the contrast group.
            show_progressbar: Whether to print progress. Defaults to True.

        Returns:
            pandas.DataFrame: Results of the permutation test, with columns:
                - distance: distance between the contrast group and the group
                - pvalue: p-value of the permutation test
                - significant: whether the group is significantly different from the contrast group
                - pvalue_adj: p-value after multiple testing correction
                - significant_adj: whether the group is significantly different from the contrast group after multiple testing correction

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.distance_example()
            >>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
            >>> tab = distance_test(adata, groupby="perturbation", contrast="control")
        """
        if self.distance.metric_fct.accepts_precomputed:
            # Much faster if the metric can be called on the precomputed
            # distance matrix, but not all metrics can do that.
            return self.test_precomputed(adata, groupby, contrast, show_progressbar)
        else:
            return self.test_xy(adata, groupby, contrast, show_progressbar)
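
    # Both test_xy and test_precomputed follow the same scheme: the group/contrast
    # labels are shuffled n_perms times to build a null distribution of distances,
    # the observed distance is compared against it, and the resulting p-values
    # (never smaller than 1/n_perms) are corrected for multiple testing with
    # statsmodels' multipletests.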

    def test_xy(self, adata: AnnData, groupby: str, contrast: str, show_progressbar: bool = True) -> pd.DataFrame:
        """Run permutation test for metrics that do not support precomputed distances.

        Runs a permutation test for a metric that cannot be computed from precomputed
        pairwise distances and therefore needs the actual data points. This is generally
        slower than test_precomputed.

        Args:
            adata: Annotated data matrix.
            groupby: Key in adata.obs for grouping cells.
            contrast: Name of the contrast group.
            show_progressbar: Whether to print progress. Defaults to True.

        Returns:
            pandas.DataFrame: Results of the permutation test, with columns:
                - distance: distance between the contrast group and the group
                - pvalue: p-value of the permutation test
                - significant: whether the group is significantly different from the contrast group
                - pvalue_adj: p-value after multiple testing correction
                - significant_adj: whether the group is significantly different from the contrast group after multiple testing correction

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.distance_example()
            >>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
            >>> test_results = distance_test.test_xy(adata, groupby="perturbation", contrast="control")
        """
        groups = adata.obs[groupby].unique()
        if contrast not in groups:
            raise ValueError(f"Contrast group {contrast} not found in {groupby} of adata.obs.")
        fct = track if show_progressbar else lambda iterable: iterable
        embedding = adata.obsm[self.obsm_key]

        # Generate the null distribution
        results = []
        for _permutation in fct(range(self.n_perms)):
            # per perturbation, shuffle with control and compute e-distance
            df = pd.DataFrame(index=groups, columns=["distance"], dtype=float)
            for group in groups:
                if group == contrast:
                    continue
                # Shuffle the labels of the groups
                mask = adata.obs[groupby].isin([group, contrast])
                labels = adata.obs[groupby].values[mask]
                rng = np.random.default_rng()
                shuffled_labels = rng.permutation(labels)
                idx = shuffled_labels == group

                X = embedding[mask][idx]  # shuffled group
                Y = embedding[mask][~idx]  # shuffled contrast
                dist = self.distance(X, Y)

                df.loc[group, "distance"] = dist
            results.append(df.sort_index())

        # Generate the empirical distribution
        for group in groups:
            if group == contrast:
                continue
            X = embedding[adata.obs[groupby] == group]
            Y = embedding[adata.obs[groupby] == contrast]
            df.loc[group, "distance"] = self.distance(X, Y)

        # Evaluate the test
        # count times shuffling resulted in larger distance
        comparison_results = np.array(
            pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0, dtype=int
        )
        n_failures = pd.Series(np.clip(np.sum(comparison_results, axis=1), 1, np.inf), index=df.index)
        pvalues = n_failures / self.n_perms

        # Apply multiple testing correction
        significant_adj, pvalue_adj, _, _ = multipletests(pvalues.values, alpha=self.alpha, method=self.correction)

        # Aggregate results
        tab = pd.DataFrame(
            {
                "distance": df["distance"],
                "pvalue": pvalues,
                "significant": pvalues < self.alpha,
                "pvalue_adj": pvalue_adj,
                "significant_adj": significant_adj,
            },
            index=df.index,
        )

        # Set the contrast group
        tab.loc[contrast, "distance"] = 0
        tab.loc[contrast, "pvalue"] = 1
        tab.loc[contrast, "significant"] = False
        tab.loc[contrast, "pvalue_adj"] = 1
        tab.loc[contrast, "significant_adj"] = False
        return tab

    def test_precomputed(self, adata: AnnData, groupby: str, contrast: str, verbose: bool = True) -> pd.DataFrame:
        """Run permutation test for metrics that take precomputed distances.

        Args:
            adata: Annotated data matrix.
            groupby: Key in adata.obs for grouping cells.
            contrast: Name of the contrast group.
            verbose: Whether to print progress. Defaults to True.

        Returns:
            pandas.DataFrame: Results of the permutation test, with columns:
                - distance: distance between the contrast group and the group
                - pvalue: p-value of the permutation test
                - significant: whether the group is significantly different from the contrast group
                - pvalue_adj: p-value after multiple testing correction
                - significant_adj: whether the group is significantly different from the contrast group after multiple testing correction

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.distance_example()
            >>> distance_test = pt.tl.DistanceTest("edistance", n_perms=1000)
            >>> test_results = distance_test.test_precomputed(adata, groupby="perturbation", contrast="control")
        """
        if not self.distance.metric_fct.accepts_precomputed:
            raise ValueError(f"Metric {self.metric} does not accept precomputed distances.")

        groups = adata.obs[groupby].unique()
        if contrast not in groups:
            raise ValueError(f"Contrast group {contrast} not found in {groupby} of adata.obs.")
        fct = track if verbose else lambda iterable: iterable

        # Precompute the pairwise distances
        precomputed_distances = {}
        for group in groups:
            if self.layer_key:
                cells = adata[adata.obs[groupby].isin([group, contrast])].layers[self.layer_key].copy()
            else:
                cells = adata[adata.obs[groupby].isin([group, contrast])].obsm[self.obsm_key].copy()
            pwd = pairwise_distances(cells, cells, metric=self.distance.cell_wise_metric)
            precomputed_distances[group] = pwd

        # Generate the null distribution
        results = []
        for _permutation in fct(range(self.n_perms)):
            # per perturbation, shuffle with control and compute e-distance
            df = pd.DataFrame(index=groups, columns=["distance"], dtype=float)
            for group in groups:
                if group == contrast:
                    continue
                # Shuffle the labels of the groups
                mask = adata.obs[groupby].isin([group, contrast])
                labels = adata.obs[groupby].values[mask]
                rng = np.random.default_rng()
                shuffled_labels = rng.permutation(labels)
                idx = shuffled_labels == group

                precomputed_distance = precomputed_distances[group]
                distance_result = self.distance.metric_fct.from_precomputed(precomputed_distance, idx)
                df.loc[group, "distance"] = distance_result
            results.append(df.sort_index())

        # Generate the empirical distribution
        for group in groups:
            if group == contrast:
                continue
            mask = adata.obs[groupby].isin([group, contrast])
            labels = adata.obs[groupby].values[mask]
            idx = labels == group
            precomputed_distance = precomputed_distances[group]
            distance_result = self.distance.metric_fct.from_precomputed(precomputed_distance, idx)
            df.loc[group, "distance"] = distance_result

        # Evaluate the test
        # count times shuffling resulted in larger distance
        comparison_results = np.array(
            pd.concat([r["distance"] - df["distance"] for r in results], axis=1) > 0, dtype=int
        )
        n_failures = pd.Series(np.clip(np.sum(comparison_results, axis=1), 1, np.inf), index=df.index)
        pvalues = n_failures / self.n_perms

        # Apply multiple testing correction
        significant_adj, pvalue_adj, _, _ = multipletests(pvalues.values, alpha=self.alpha, method=self.correction)

        # Aggregate results
        tab = pd.DataFrame(
            {
                "distance": df["distance"],
                "pvalue": pvalues,
                "significant": pvalues < self.alpha,
                "pvalue_adj": pvalue_adj,
                "significant_adj": significant_adj,
            },
            index=df.index,
        )

        # Set the contrast group
        tab.loc[contrast, "distance"] = 0
        tab.loc[contrast, "pvalue"] = 1
        tab.loc[contrast, "significant"] = False
        tab.loc[contrast, "pvalue_adj"] = 1
        tab.loc[contrast, "significant_adj"] = False
        return tab