Source code for pertpy.tools._augur

from __future__ import annotations

import random
from collections import defaultdict
from dataclasses import dataclass
from math import floor, nan
from typing import TYPE_CHECKING, Any, Literal

import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import statsmodels.api as sm
from anndata import AnnData
from joblib import Parallel, delayed
from lamin_utils import logger
from rich import print
from rich.progress import track
from scipy import sparse, stats
from sklearn.base import is_classifier, is_regressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    explained_variance_score,
    f1_score,
    make_scorer,
    mean_squared_error,
    precision_score,
    r2_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import LabelEncoder
from skmisc.loess import loess
from statsmodels.stats.multitest import fdrcorrection

if TYPE_CHECKING:
    from matplotlib.axes import Axes
    from matplotlib.figure import Figure


@dataclass
class Params:
    """Type signature for random forest and logistic regression parameters.

    Parameters:
        n_estimators: defines the number of trees in the forest
        max_depth: specifies the maximal depth of each tree
        max_features: specifies the maximal number of features considered when looking for the best split.

            * if int then consider max_features features at each split
            * if float then consider round(max_features*n_features) features at each split
            * if `auto` then max_features=n_features
            * if `log2` then max_features=log2(n_features)
            * if `sqrt` then max_features=sqrt(n_features)

        penalty: defines the norm of the penalty used in logistic regression

            * if `l1` then L1 penalty is added
            * if `l2` then L2 penalty is added (default)
            * if `elasticnet` both L1 and L2 penalties are added
            * if `none` no penalty is added

        random_state: sets random model seed
    """

    n_estimators: int = 100
    max_depth: int | None = None
    max_features: Literal["auto"] | Literal["log2"] | Literal["sqrt"] | int | float = 2
    penalty: Literal["l1"] | Literal["l2"] | Literal["elasticnet"] | Literal["none"] = "l2"
    random_state: int | None = None


class Augur:
    """Python implementation of Augur."""

    def __init__(
        self,
        estimator: Literal["random_forest_classifier"]
        | Literal["random_forest_regressor"]
        | Literal["logistic_regression_classifier"],
        params: Params | None = None,
    ):
        self.estimator = self.create_estimator(classifier=estimator, params=params)
    def load(
        self,
        input: AnnData | pd.DataFrame,
        meta: pd.DataFrame | None = None,
        label_col: str = "label_col",
        cell_type_col: str = "cell_type_col",
        condition_label: str | None = None,
        treatment_label: str | None = None,
    ) -> AnnData:
        """Loads the input data.

        Args:
            input: AnnData or matrix containing gene expression values (cells in rows, genes in columns)
                and optionally meta data about each cell.
            meta: Optional Pandas DataFrame containing meta data about each cell.
            label_col: column of the meta DataFrame or the AnnData or matrix containing the condition labels
                for each cell in the cell-by-gene expression matrix.
            cell_type_col: column of the meta DataFrame or the AnnData or matrix containing the cell type labels
                for each cell in the cell-by-gene expression matrix.
            condition_label: in the case of more than two labels, this label is used in the analysis.
            treatment_label: in the case of more than two labels, this label is used in the analysis.

        Returns:
            AnnData object containing gene expression values (cells in rows, genes in columns)
            and cell type, label and y dummy variables as obs.

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.sc_sim_augur()
            >>> ag_rfc = pt.tl.Augur("random_forest_classifier")
            >>> loaded_data = ag_rfc.load(adata)
        """
        if isinstance(input, AnnData):
            input.obs = input.obs.rename(columns={cell_type_col: "cell_type", label_col: "label"})
            adata = input
        elif isinstance(input, pd.DataFrame):
            if meta is None:
                try:
                    _ = input[cell_type_col]
                    _ = input[label_col]
                except KeyError:
                    logger.error("No column names matching cell_type_col and label_col.")
            label = input[label_col] if meta is None else meta[label_col]
            cell_type = input[cell_type_col] if meta is None else meta[cell_type_col]
            x = input.drop([label_col, cell_type_col], axis=1) if meta is None else input
            adata = AnnData(X=x, obs=pd.DataFrame({"cell_type": cell_type, "label": label}))

        if len(adata.obs["label"].unique()) < 2:
            raise ValueError("Less than two unique labels in dataset. At least two are needed for the analysis.")

        # dummy variables for categorical data
        if adata.obs["label"].dtype.name == "category":
            # filter samples according to label
            if condition_label is not None and treatment_label is not None:
                logger.info(f"Filtering samples with {condition_label} and {treatment_label} labels.")
                adata = ad.concat(
                    [adata[adata.obs["label"] == condition_label], adata[adata.obs["label"] == treatment_label]]
                )
            label_encoder = LabelEncoder()
            adata.obs["y_"] = label_encoder.fit_transform(adata.obs["label"])
        else:
            y = adata.obs["label"].to_frame()
            y = y.rename(columns={"label": "y_"})
            adata.obs = pd.concat([adata.obs, y], axis=1)

        return adata
    def create_estimator(
        self,
        classifier: (
            Literal["random_forest_classifier"]
            | Literal["random_forest_regressor"]
            | Literal["logistic_regression_classifier"]
        ),
        params: Params | None = None,
    ) -> RandomForestClassifier | RandomForestRegressor | LogisticRegression:
        """Creates a model object of the provided type and populates it with desired parameters.

        Args:
            classifier: classifier to use in calculating the area under the curve.
                Either random forest classifier or logistic regression for categorical data
                or random forest regressor for continuous data.
            params: parameters used to populate the model object. Default values are
                `n_estimators` = 100, `max_depth` = None, `max_features` = 2, `penalty` = `l2`, `random_state` = None.

        Returns:
            Estimator object.

        Examples:
            >>> import pertpy as pt
            >>> augur = pt.tl.Augur("random_forest_classifier")
            >>> estimator = augur.create_estimator("logistic_regression_classifier")
        """
        if params is None:
            params = Params()
        if classifier == "random_forest_classifier":
            return RandomForestClassifier(
                n_estimators=params.n_estimators,
                max_depth=params.max_depth,
                max_features=params.max_features,
                random_state=params.random_state,
            )
        elif classifier == "random_forest_regressor":
            return RandomForestRegressor(
                n_estimators=params.n_estimators,
                max_depth=params.max_depth,
                max_features=params.max_features,
                random_state=params.random_state,
            )
        elif classifier == "logistic_regression_classifier":
            return LogisticRegression(penalty=params.penalty, random_state=params.random_state)
        else:
            raise ValueError("Invalid classifier")
    def sample(self, adata: AnnData, categorical: bool, subsample_size: int, random_state: int, features: list):
        """Sample AnnData observations.

        Args:
            adata: AnnData with obs `label` and `cell_type` for label and cell type and dummy variable `y_`
                columns used as target.
            categorical: `True` if target values are categorical.
            subsample_size: number of cells to subsample randomly per type from each experimental condition.
            random_state: set numpy random seed and sampling seed.
            features: features of the returned AnnData object.

        Returns:
            Subsample of AnnData object of size subsample_size with given features.

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.sc_sim_augur()
            >>> ag_rfc = pt.tl.Augur("random_forest_classifier")
            >>> loaded_data = ag_rfc.load(adata)
            >>> ag_rfc.select_highly_variable(loaded_data)
            >>> features = loaded_data.var_names
            >>> subsample = ag_rfc.sample(
            ...     loaded_data, categorical=True, subsample_size=20, random_state=42, features=loaded_data.var_names
            ... )
        """
        # subsample observations (per label for categorical targets)
        random.seed(random_state)
        if categorical:
            label_subsamples = []
            y_encodings = adata.obs["y_"].unique()
            for code in y_encodings:
                label_subsamples.append(
                    sc.pp.subsample(
                        adata[adata.obs["y_"] == code, features],
                        n_obs=subsample_size,
                        copy=True,
                        random_state=random_state,
                    )
                )
            subsample = ad.concat([*label_subsamples], index_unique=None)
        else:
            subsample = sc.pp.subsample(adata[:, features], n_obs=subsample_size, copy=True, random_state=random_state)

        # filter features with 0 variance
        subsample.var["highly_variable"] = False
        subsample.var["means"] = np.ravel(subsample.X.mean(axis=0))
        # the syntax for element-wise powers differs for numpy arrays -> use the sparse syntax as default
        if isinstance(subsample.X, np.ndarray):
            subsample.X = sparse.csc_matrix(subsample.X)
        subsample.var["var"] = np.ravel(subsample.X.power(2).mean(axis=0) - np.power(subsample.X.mean(axis=0), 2))
        # remove all features with 0 variance
        subsample.var.loc[subsample.var["var"] > 0, "highly_variable"] = True

        return subsample[:, subsample.var["highly_variable"]]
    def draw_subsample(
        self,
        adata: AnnData,
        augur_mode: str,
        subsample_size: int,
        feature_perc: float,
        categorical: bool,
        random_state: int,
    ) -> AnnData:
        """Subsample and select random features of anndata object.

        Args:
            adata: AnnData with obs `label` and `cell_type` for label and cell type and dummy variable `y_`
                columns used as target.
            augur_mode: one of default, velocity or permute. Setting augur_mode = "velocity" disables feature selection,
                assuming feature selection has been performed by the RNA velocity procedure to produce the input matrix,
                while setting augur_mode = "permute" will generate a null distribution of AUCs for each cell type by
                permuting the labels.
            subsample_size: number of cells to subsample randomly per type from each experimental condition.
            feature_perc: proportion of genes that are randomly selected as features for input to the classifier in each
                subsample using the random gene filter.
            categorical: `True` if target values are categorical.
            random_state: set numpy random seed and sampling seed.

        Returns:
            Subsample of anndata object of size subsample_size.

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.sc_sim_augur()
            >>> ag_rfc = pt.tl.Augur("random_forest_classifier")
            >>> loaded_data = ag_rfc.load(adata)
            >>> ag_rfc.select_highly_variable(loaded_data)
            >>> subsample = ag_rfc.draw_subsample(adata, augur_mode="default", subsample_size=20, feature_perc=0.5, \
                categorical=True, random_state=42)
        """
        random.seed(random_state)
        if augur_mode == "permute":
            # shuffle labels
            adata = adata.copy()
            y_columns = [col for col in adata.obs if col.startswith("y_")]
            adata.obs[y_columns] = adata.obs[y_columns].sample(frac=1, random_state=random_state).values

        if augur_mode == "velocity":
            # no feature selection, assuming this has already happened when calculating velocity
            features = adata.var_names
        else:
            # randomly sample features from highly variable genes
            highly_variable_genes = adata.var_names[adata.var["highly_variable"]].tolist()
            features = random.sample(highly_variable_genes, floor(len(highly_variable_genes) * feature_perc))

        # randomly sample cells for each label
        return self.sample(
            adata=adata,
            categorical=categorical,
            subsample_size=subsample_size,
            random_state=random_state,
            features=features,
        )
    def cross_validate_subsample(
        self,
        adata: AnnData,
        augur_mode: str,
        subsample_size: int,
        folds: int,
        feature_perc: float,
        subsample_idx: int,
        random_state: int | None,
        zero_division: int | str,
    ) -> dict:
        """Cross validate subsample anndata object.

        Args:
            adata: AnnData with obs `label` and `cell_type` for label and cell type and dummy variable `y_`
                columns used as target.
            augur_mode: one of default, velocity or permute. Setting augur_mode = "velocity" disables feature selection,
                assuming feature selection has been performed by the RNA velocity procedure to produce the input matrix,
                while setting augur_mode = "permute" will generate a null distribution of AUCs for each cell type by
                permuting the labels.
            subsample_size: number of cells to subsample randomly per type from each experimental condition.
            folds: number of folds to run cross validation on.
            feature_perc: proportion of genes that are randomly selected as features for input to the classifier in each
                subsample using the random gene filter.
            subsample_idx: index of the subsample.
            random_state: set numpy random seed, sampling seed and fold seed.
            zero_division: 0 or 1 or "warn"; sets the value to return when there is a zero division. If set to "warn",
                this acts as 0, but warnings are also raised. Precision metric parameter.

        Returns:
            Results for each cross validation fold.

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.sc_sim_augur()
            >>> ag_rfc = pt.tl.Augur("random_forest_classifier")
            >>> loaded_data = ag_rfc.load(adata)
            >>> ag_rfc.select_highly_variable(loaded_data)
            >>> results = ag_rfc.cross_validate_subsample(loaded_data, augur_mode="default", subsample_size=20, \
                folds=3, feature_perc=0.5, subsample_idx=0, random_state=42, zero_division=0)
        """
        subsample = self.draw_subsample(
            adata,
            augur_mode,
            subsample_size,
            feature_perc=feature_perc,
            categorical=is_classifier(self.estimator),
            random_state=subsample_idx,
        )
        results = self.run_cross_validation(
            subsample=subsample,
            folds=folds,
            subsample_idx=subsample_idx,
            random_state=random_state,
            zero_division=zero_division,
        )
        return results
    def ccc_score(self, y_true, y_pred) -> float:
        """Implementation of Lin's Concordance correlation coefficient, based on https://gitlab.com/-/snippets/1730605.

        Args:
            y_true: array-like of shape (n_samples), ground truth (correct) target values.
            y_pred: array-like of shape (n_samples), estimated target values.

        Returns:
            Concordance correlation coefficient.
        """
        # covariance between y_true and y_pred
        s_xy = np.cov([y_true, y_pred])[0, 1]
        # means
        x_m = np.mean(y_true)
        y_m = np.mean(y_pred)
        # variances
        s_x_sq = np.var(y_true)
        s_y_sq = np.var(y_pred)

        # concordance correlation coefficient
        ccc = (2.0 * s_xy) / (s_x_sq + s_y_sq + (x_m - y_m) ** 2)

        return ccc
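
    # Illustrative sketch (toy arrays chosen for demonstration, not from the original
    # module): `ccc_score` can be called directly on two 1-D vectors and is symmetric
    # in its arguments.
    #
    #   >>> ag = Augur("random_forest_regressor")
    #   >>> score = ag.ccc_score(np.array([1.0, 2.0, 3.0]), np.array([1.2, 1.9, 3.1]))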
    def set_scorer(
        self,
        multiclass: bool,
        zero_division: int | str,
    ) -> dict[str, Any]:
        """Set scoring functions for cross-validation based on estimator.

        Args:
            multiclass: `True` if there are more than two target classes.
            zero_division: 0 or 1 or "warn"; sets the value to return when there is a zero division. If set to "warn",
                this acts as 0, but warnings are also raised. Precision metric parameter.

        Returns:
            Dict linking name to scorer object and string name.

        Examples:
            >>> import pertpy as pt
            >>> ag_rfc = pt.tl.Augur("random_forest_classifier")
            >>> scorer = ag_rfc.set_scorer(True, 0)
        """
        if multiclass:
            return {
                "augur_score": make_scorer(roc_auc_score, multi_class="ovo", response_method="predict_proba"),
                "auc": make_scorer(roc_auc_score, multi_class="ovo", response_method="predict_proba"),
                "accuracy": make_scorer(accuracy_score),
                "precision": make_scorer(precision_score, average="macro", zero_division=zero_division),
                "f1": make_scorer(f1_score, average="macro"),
                "recall": make_scorer(recall_score, average="macro"),
            }
        return (
            {
                "augur_score": make_scorer(roc_auc_score, response_method="predict_proba"),
                "auc": make_scorer(roc_auc_score, response_method="predict_proba"),
                "accuracy": make_scorer(accuracy_score),
                "precision": make_scorer(precision_score, average="binary", zero_division=zero_division),
                "f1": make_scorer(f1_score, average="binary"),
                "recall": make_scorer(recall_score, average="binary"),
            }
            if isinstance(self.estimator, (RandomForestClassifier, LogisticRegression))
            else {
                "augur_score": make_scorer(self.ccc_score),
                "r2": make_scorer(r2_score),
                "ccc": make_scorer(self.ccc_score),
                "neg_mean_squared_error": make_scorer(mean_squared_error),
                "explained_variance": make_scorer(explained_variance_score),
            }
        )
    def run_cross_validation(
        self,
        subsample: AnnData,
        subsample_idx: int,
        folds: int,
        random_state: int | None,
        zero_division: int | str,
    ) -> dict:
        """Perform cross validation on given subsample.

        Args:
            subsample: subsample of gene expression matrix of size subsample_size.
            subsample_idx: index of subsample.
            folds: number of folds.
            random_state: set random fold seed.
            zero_division: 0 or 1 or "warn"; sets the value to return when there is a zero division. If set to "warn",
                this acts as 0, but warnings are also raised. Precision metric parameter.

        Returns:
            Dictionary containing prediction metrics and estimator for each fold.

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.sc_sim_augur()
            >>> ag_rfc = pt.tl.Augur("random_forest_classifier")
            >>> loaded_data = ag_rfc.load(adata)
            >>> ag_rfc.select_highly_variable(loaded_data)
            >>> subsample = ag_rfc.draw_subsample(adata, augur_mode="default", subsample_size=20, feature_perc=0.5, \
                categorical=True, random_state=42)
            >>> results = ag_rfc.run_cross_validation(subsample=subsample, folds=3, subsample_idx=0, random_state=42, zero_division=0)
        """
        x = subsample.to_df()
        y = subsample.obs["y_"]
        scorer = self.set_scorer(multiclass=len(y.unique()) > 2, zero_division=zero_division)
        folds = StratifiedKFold(n_splits=folds, random_state=random_state, shuffle=True)

        results = cross_validate(
            estimator=self.estimator,
            X=x,
            y=y.values.ravel(),
            scoring=scorer,
            cv=folds,
            return_estimator=True,
        )

        results["subsample_idx"] = subsample_idx
        for score in scorer.keys():
            results[f"mean_{score}"] = results[f"test_{score}"].mean()

        # feature importances
        feature_importances = defaultdict(list)
        if isinstance(self.estimator, (RandomForestClassifier, RandomForestRegressor)):
            for fold, estimator in list(zip(range(len(results["estimator"])), results["estimator"], strict=False)):
                feature_importances["genes"].extend(x.columns.tolist())
                feature_importances["feature_importances"].extend(estimator.feature_importances_.tolist())
                feature_importances["subsample_idx"].extend(len(x.columns) * [subsample_idx])
                feature_importances["fold"].extend(len(x.columns) * [fold])

        # standardized coefficients with Agresti method
        # cf. https://think-lab.github.io/d/205/#3
        if isinstance(self.estimator, LogisticRegression):
            for fold, estimator in list(zip(range(len(results["estimator"])), results["estimator"], strict=False)):
                feature_importances["genes"].extend(x.columns.tolist())
                feature_importances["feature_importances"].extend(
                    (estimator.coef_ * estimator.coef_.std()).flatten().tolist()
                )
                feature_importances["subsample_idx"].extend(len(x.columns) * [subsample_idx])
                feature_importances["fold"].extend(len(x.columns) * [fold])

        results["feature_importances"] = feature_importances

        return results
    def average_metrics(self, cell_cv_results: list[Any]) -> dict[Any, Any]:
        """Calculate the average metric of cross validation runs done for one cell type.

        Args:
            cell_cv_results: list of all cross validation runs of one cell type.

        Returns:
            Dict containing the average result for each metric of one cell type.
        """
        metric_names = [metric for metric in [*cell_cv_results[0].keys()] if metric.startswith("mean")]
        metric_list: dict[Any, Any] = {}
        for subsample_cv_result in cell_cv_results:
            for metric in metric_names:
                metric_list[metric] = metric_list.get(metric, []) + [subsample_cv_result[metric]]

        return {metric: np.mean(values) for metric, values in metric_list.items()}
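
    # Illustrative sketch (hypothetical toy input, not from the original module): only
    # keys starting with "mean" in the first run are averaged across all runs, so
    #
    #   >>> ag = Augur("random_forest_classifier")
    #   >>> ag.average_metrics([{"mean_auc": 0.8, "fit_time": 1.0}, {"mean_auc": 0.6, "fit_time": 2.0}])
    #
    # returns approximately {"mean_auc": 0.7} and drops "fit_time".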
    def select_highly_variable(self, adata: AnnData) -> AnnData:
        """Feature selection by variance using scanpy's highly variable genes function.

        Args:
            adata: AnnData object containing gene expression values (cells in rows, genes in columns).

        Returns:
            AnnData object with highly variable genes annotated in var.

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.sc_sim_augur()
            >>> ag_rfc = pt.tl.Augur("random_forest_classifier")
            >>> loaded_data = ag_rfc.load(adata)
            >>> ag_rfc.select_highly_variable(loaded_data)
        """
        min_features_for_selection = 1000

        if len(adata.var_names) - 2 > min_features_for_selection:
            try:
                sc.pp.highly_variable_genes(adata)
            except ValueError:
                logger.warning("Data not normalized. Normalizing now using scanpy log1p normalize.")
                sc.pp.log1p(adata)
                sc.pp.highly_variable_genes(adata)

        return adata
    def cox_compare(self, loess1, loess2):
        """Cox compare test on two models.

        Based on: https://www.statsmodels.org/dev/generated/statsmodels.stats.diagnostic.compare_cox.html
        Info: Tests of non-nested hypothesis might not provide unambiguous answers.
        The test should be performed in both directions and it is possible
        that both or neither test rejects.

        Args:
            loess1: fitted loess regression object
            loess2: fitted loess regression object

        Returns:
            t-statistic for the test that including the fitted values of the first model in the second model
            has no effect and two-sided pvalue for the t-statistic.
        """
        x = loess1.inputs.x
        z = loess2.inputs.x

        nobs = loess1.inputs.y.shape[0]

        # coxtest
        sigma2_x = np.sum(np.power(loess1.outputs.fitted_residuals, 2)) / nobs
        sigma2_z = np.sum(np.power(loess2.outputs.fitted_residuals, 2)) / nobs
        yhat_x = loess1.outputs.fitted_values
        res_dx = sm.OLS(yhat_x, z).fit()
        err_zx = res_dx.resid
        res_xzx = sm.OLS(err_zx, x).fit()
        err_xzx = res_xzx.resid

        sigma2_zx = sigma2_x + np.dot(err_zx.T, err_zx) / nobs
        c01 = nobs / 2.0 * (np.log(sigma2_z) - np.log(sigma2_zx))
        v01 = sigma2_x * np.dot(err_xzx.T, err_xzx) / sigma2_zx**2
        q = c01 / np.sqrt(v01)
        pval = 2 * stats.norm.sf(np.abs(q))

        return q, pval
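
    # Illustrative sketch (assumed toy data, not from the original module): compare two
    # loess fits of the same response against raw and log-transformed predictors, as
    # done in select_variance() below.
    #
    #   >>> x = np.arange(1, 101, dtype=float)
    #   >>> y = np.log(x) + np.random.default_rng(0).normal(scale=0.1, size=100)
    #   >>> fit_raw = loess(x, y, span=0.75)
    #   >>> fit_raw.fit()
    #   >>> fit_log = loess(np.log(x), y, span=0.75)
    #   >>> fit_log.fit()
    #   >>> ag = Augur("random_forest_classifier")
    #   >>> t_stat, p_value = ag.cox_compare(fit_raw, fit_log)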
    def select_variance(self, adata: AnnData, var_quantile: float, filter_negative_residuals: bool, span: float = 0.75):
        """Feature selection based on Augur implementation.

        Args:
            adata: AnnData object.
            var_quantile: The quantile below which features will be filtered, based on their residuals in a loess model.
            filter_negative_residuals: if `True`, filter residuals at a fixed threshold of zero, instead of `var_quantile`.
            span: Smoothing factor, as a fraction of the number of points to take into account.
                Should be in the range (0, 1]. Defaults to 0.75.

        Returns:
            AnnData object with additional select_variance column in var.

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.sc_sim_augur()
            >>> ag_rfc = pt.tl.Augur("random_forest_classifier")
            >>> loaded_data = ag_rfc.load(adata)
            >>> ag_rfc.select_variance(loaded_data, var_quantile=0.5, filter_negative_residuals=False, span=0.75)
        """
        adata.var["highly_variable"] = False
        adata.var["means"] = np.ravel(adata.X.mean(axis=0))
        # the syntax for element-wise powers differs for numpy arrays -> use the sparse syntax as default
        if isinstance(adata.X, np.ndarray):
            adata.X = sparse.csc_matrix(adata.X)
        adata.var["sds"] = np.ravel(np.sqrt(adata.X.power(2).mean(axis=0) - np.power(adata.X.mean(axis=0), 2)))
        # remove all features with 0 variance
        adata.var.loc[adata.var["sds"] > 0, "highly_variable"] = True
        cvs = adata.var.loc[adata.var["highly_variable"], "means"] / adata.var.loc[adata.var["highly_variable"], "sds"]

        # clip outliers, get best prediction model, get residuals
        lower = np.quantile(cvs, 0.01)
        upper = np.quantile(cvs, 0.99)
        keep = cvs.loc[cvs.between(lower, upper)].index

        cv0 = cvs.loc[keep]
        mean0 = adata.var.loc[keep, "means"]

        if any(mean0 < 0):
            # if there are negative values, don't bother comparing to log-transformed
            # means - fit on normalized data directly
            model = loess(mean0, cv0, span=span)
        else:
            fit1 = loess(mean0, cv0, span=span)
            fit1.fit()
            fit2 = loess(np.log(mean0), cv0, span=span)
            fit2.fit()

            # use a cox test to guess whether we should be fitting the raw or
            # log-transformed means
            cox1 = self.cox_compare(fit1, fit2)
            cox2 = self.cox_compare(fit2, fit1)

            # compare pvalues
            if cox1[1] < cox2[1]:
                model = fit1
            else:
                model = fit2

        residuals = model.outputs.fitted_residuals

        # select features by quantile (or override w/positive residuals)
        genes = (
            keep[residuals > 0]
            if filter_negative_residuals
            else keep[residuals > np.quantile(residuals, var_quantile)]
        )

        adata.var["highly_variable"] = [x in genes for x in adata.var.index]

        return adata
    def predict(
        self,
        adata: AnnData,
        n_subsamples: int = 50,
        subsample_size: int = 20,
        folds: int = 3,
        min_cells: int | None = None,
        feature_perc: float = 0.5,
        var_quantile: float = 0.5,
        span: float = 0.75,
        filter_negative_residuals: bool = False,
        n_threads: int = 4,
        augur_mode: Literal["permute"] | Literal["default"] | Literal["velocity"] = "default",
        select_variance_features: bool = True,
        key_added: str = "augurpy_results",
        random_state: int | None = None,
        zero_division: int | str = 0,
    ) -> tuple[AnnData, dict[str, Any]]:
        """Calculates the Area under the Curve using the given classifier.

        Args:
            adata: AnnData with obs `label` and `cell_type` for label and cell type and dummy variable `y_`
                columns used as target.
            n_subsamples: number of random subsamples to draw from complete dataset for each cell type.
            subsample_size: number of cells to subsample randomly per type from each experimental condition.
            folds: number of folds to run cross validation on. Be careful changing this parameter without also changing
                `subsample_size`.
            min_cells: minimum number of cells for a particular cell type in each condition in order to retain that type
                for analysis (deprecated).
            feature_perc: proportion of genes that are randomly selected as features for input to the classifier in each
                subsample using the random gene filter.
            var_quantile: The quantile below which features will be filtered, based on their residuals in a loess model.
                Defaults to 0.5.
            span: Smoothing factor, as a fraction of the number of points to take into account.
                Should be in the range (0, 1]. Defaults to 0.75.
            filter_negative_residuals: if `True`, filter residuals at a fixed threshold of zero, instead of `var_quantile`.
            n_threads: number of threads to use for parallelization.
            augur_mode: One of 'default', 'velocity' or 'permute'. Setting augur_mode = "velocity" disables feature
                selection, assuming feature selection has been performed by the RNA velocity procedure to produce
                the input matrix, while setting augur_mode = "permute" will generate a null distribution of AUCs
                for each cell type by permuting the labels. Note that when setting augur_mode = "permute",
                n_subsamples values less than 100 will be set to 500.
            select_variance_features: Whether to select genes based on the original Augur implementation (True)
                or using scanpy's highly_variable_genes (False). Defaults to True.
            key_added: Key to add results to in .uns.
            random_state: set numpy random seed, sampling seed and fold seed.
            zero_division: 0 or 1 or "warn"; sets the value to return when there is a zero division. If set to "warn",
                this acts as 0, but warnings are also raised. Precision metric parameter.

        Returns:
            A tuple of the updated AnnData object with mean_augur_score metrics in obs and a dictionary containing the
            following keys:

                * summary_metrics: Pandas DataFrame containing mean metrics for each cell type
                * feature_importances: Pandas DataFrame containing feature importances of genes across all cross validation runs
                * full_results: Dict containing merged results of individual cross validation runs for each cell type
                * [cell_types]: Cross validation runs of the cell type called

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.sc_sim_augur()
            >>> ag_rfc = pt.tl.Augur("random_forest_classifier")
            >>> loaded_data = ag_rfc.load(adata)
            >>> h_adata, h_results = ag_rfc.predict(loaded_data, subsample_size=20, n_threads=4)
        """
        adata = adata.copy()
        if augur_mode == "permute" and n_subsamples < 100:
            n_subsamples = 500
        if is_regressor(self.estimator) and len(adata.obs["y_"].unique()) <= 3:
            raise ValueError(
                f"Regressors cannot be used on {len(adata.obs['label'].unique())} labels. Try a classifier."
            )
        if isinstance(self.estimator, LogisticRegression) and len(adata.obs["y_"].unique()) > 2:
            raise ValueError(
                "Logistic regression cannot be used for multiclass classification. "
                + "Use a random forest classifier or filter labels in load()."
            )
        if min_cells is None:
            min_cells = n_subsamples

        results: dict[str, Any] = {
            "summary_metrics": {},
            "feature_importances": defaultdict(list),
            "full_results": defaultdict(list),
        }
        if select_variance_features:
            logger.warning("Set smaller span value in the case of a `segmentation fault` error.")
            logger.warning("Set larger span in case of svddc or other near singularities error.")
        adata.obs["augur_score"] = nan
        for cell_type in track(adata.obs["cell_type"].unique(), description="Processing data..."):
            cell_type_subsample = adata[adata.obs["cell_type"] == cell_type].copy()
            if augur_mode == "default" or augur_mode == "permute":
                cell_type_subsample = (
                    self.select_highly_variable(cell_type_subsample)
                    if not select_variance_features
                    else self.select_variance(
                        cell_type_subsample,
                        var_quantile=var_quantile,
                        filter_negative_residuals=filter_negative_residuals,
                        span=span,
                    )
                )
            if len(cell_type_subsample) < min_cells:
                logger.warning(
                    f"Skipping {cell_type} cell type - {len(cell_type_subsample)} samples is less than min_cells {min_cells}."
                )
            elif (
                cell_type_subsample.obs.groupby(
                    ["cell_type", "label"],
                    observed=True,
                ).y_.count()
                < subsample_size
            ).any():
                logger.warning(
                    f"Skipping {cell_type} cell type - the number of samples for at least one class type is less than "
                    f"subsample size {subsample_size}."
                )
            else:
                results[cell_type] = Parallel(n_jobs=n_threads)(
                    delayed(self.cross_validate_subsample)(
                        adata=cell_type_subsample,
                        augur_mode=augur_mode,
                        subsample_size=subsample_size,
                        folds=folds,
                        feature_perc=feature_perc,
                        subsample_idx=i,
                        random_state=random_state,
                        zero_division=zero_division,
                    )
                    for i in range(n_subsamples)
                )
                # summarize scores for cell type
                results["summary_metrics"][cell_type] = self.average_metrics(results[cell_type])

                # add scores as observation to anndata
                mask = adata.obs["cell_type"].str.startswith(cell_type)
                adata.obs.loc[mask, "augur_score"] = results["summary_metrics"][cell_type]["mean_augur_score"]

                # concatenate feature importances for each subsample cv
                subsample_feature_importances_dicts = [cv["feature_importances"] for cv in results[cell_type]]

                for dictionary in subsample_feature_importances_dicts:
                    for key, value in dictionary.items():
                        results["feature_importances"][key].extend(value)
                results["feature_importances"]["cell_type"].extend(
                    [cell_type]
                    * (
                        len(results["feature_importances"]["genes"])
                        - len(results["feature_importances"]["cell_type"])
                    )
                )

                for idx, cv in zip(range(n_subsamples), results[cell_type], strict=False):
                    results["full_results"]["idx"].extend([idx] * folds)
                    results["full_results"]["augur_score"].extend(cv["test_augur_score"])
                    results["full_results"]["folds"].extend(range(folds))
                results["full_results"]["cell_type"].extend([cell_type] * folds * n_subsamples)

        # make sure one cell type worked
        if len(results) <= 2:
            logger.warning("No cell types had more than min_cells needed. Please adjust data or min_cells parameter.")

        results["summary_metrics"] = pd.DataFrame(results["summary_metrics"])
        results["feature_importances"] = pd.DataFrame(results["feature_importances"])
        results["full_results"] = pd.DataFrame(results["full_results"])
        adata.uns["summary_metrics"] = pd.DataFrame(results["summary_metrics"])
        adata.uns[key_added] = results

        return adata, results
    def predict_differential_prioritization(
        self,
        augur_results1: dict[str, Any],
        augur_results2: dict[str, Any],
        permuted_results1: dict[str, Any],
        permuted_results2: dict[str, Any],
        n_subsamples: int = 50,
        n_permutations: int = 1000,
    ) -> pd.DataFrame:
        """Predicts the differential prioritization by performing permutation tests on samples.

        Performs permutation tests that identify cell types with statistically significant differences in
        `augur_score` between two conditions respectively compared to the control.

        Args:
            augur_results1: Augurpy results from condition 1, obtained from `predict()[1]`.
            augur_results2: Augurpy results from condition 2, obtained from `predict()[1]`.
            permuted_results1: permuted Augurpy results from condition 1, obtained from `predict()` with argument `augur_mode=permute`.
            permuted_results2: permuted Augurpy results from condition 2, obtained from `predict()` with argument `augur_mode=permute`.
            n_subsamples: number of subsamples to pool when calculating the mean augur score for each permutation.
                Defaults to 50.
            n_permutations: the total number of mean augur scores to calculate from a background distribution.

        Returns:
            Results object containing mean augur scores.

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.bhattacherjee()
            >>> ag_rfc = pt.tl.Augur("random_forest_classifier")

            >>> data_15 = ag_rfc.load(adata, condition_label="Maintenance_Cocaine", treatment_label="withdraw_15d_Cocaine")
            >>> adata_15, results_15 = ag_rfc.predict(data_15, random_state=None, n_threads=4)
            >>> adata_15_permute, results_15_permute = ag_rfc.predict(data_15, augur_mode="permute", n_subsamples=100, random_state=None, n_threads=4)

            >>> data_48 = ag_rfc.load(adata, condition_label="Maintenance_Cocaine", treatment_label="withdraw_48h_Cocaine")
            >>> adata_48, results_48 = ag_rfc.predict(data_48, random_state=None, n_threads=4)
            >>> adata_48_permute, results_48_permute = ag_rfc.predict(data_48, augur_mode="permute", n_subsamples=100, random_state=None, n_threads=4)

            >>> pvals = ag_rfc.predict_differential_prioritization(augur_results1=results_15, augur_results2=results_48, \
                permuted_results1=results_15_permute, permuted_results2=results_48_permute)
        """
        # compare available cell types
        cell_types = (
            set(augur_results1["summary_metrics"].columns)
            & set(augur_results2["summary_metrics"].columns)
            & set(permuted_results1["summary_metrics"].columns)
            & set(permuted_results2["summary_metrics"].columns)
        )
        cell_types_list = list(cell_types)

        # mean augur scores
        augur_score1 = (
            augur_results1["summary_metrics"]
            .loc["mean_augur_score", cell_types_list]
            .reset_index()
            .rename(columns={"index": "cell_type"})
        )
        augur_score2 = (
            augur_results2["summary_metrics"]
            .loc["mean_augur_score", cell_types_list]
            .reset_index()
            .rename(columns={"index": "cell_type"})
        )

        # mean permuted scores over cross validation runs
        permuted_cv_augur1 = (
            permuted_results1["full_results"][permuted_results1["full_results"]["cell_type"].isin(cell_types_list)]
            .groupby(["cell_type", "idx"], as_index=False)
            .mean()
        )
        permuted_cv_augur2 = (
            permuted_results2["full_results"][permuted_results2["full_results"]["cell_type"].isin(cell_types_list)]
            .groupby(["cell_type", "idx"], as_index=False)
            .mean()
        )

        sampled_permuted_cv_augur1 = []
        sampled_permuted_cv_augur2 = []

        # draw mean aucs for permute1 and permute2
        for celltype in permuted_cv_augur1["cell_type"].unique():
            df1 = permuted_cv_augur1[permuted_cv_augur1["cell_type"] == celltype]
            df2 = permuted_cv_augur2[permuted_cv_augur2["cell_type"] == celltype]
            for permutation_idx in range(n_permutations):
                # subsample
                sample1 = df1.sample(n=n_subsamples, random_state=permutation_idx, axis="index")
                sampled_permuted_cv_augur1.append(
                    pd.DataFrame(
                        {
                            "cell_type": [celltype],
                            "permutation_idx": [permutation_idx],
                            "mean": [sample1["augur_score"].mean(axis=0)],
                            "std": [sample1["augur_score"].std(axis=0)],
                        }
                    )
                )

                sample2 = df2.sample(n=n_subsamples, random_state=permutation_idx, axis="index")
                sampled_permuted_cv_augur2.append(
                    pd.DataFrame(
                        {
                            "cell_type": [celltype],
                            "permutation_idx": [permutation_idx],
                            "mean": [sample2["augur_score"].mean(axis=0)],
                            "std": [sample2["augur_score"].std(axis=0)],
                        }
                    )
                )

        permuted_samples1 = pd.concat(sampled_permuted_cv_augur1)
        permuted_samples2 = pd.concat(sampled_permuted_cv_augur2)

        # delta between augur scores
        delta = augur_score1.merge(augur_score2, on=["cell_type"], suffixes=("1", "2")).assign(
            delta_augur=lambda x: x.mean_augur_score2 - x.mean_augur_score1
        )

        # delta between permutation scores
        delta_rnd = permuted_samples1.merge(
            permuted_samples2, on=["cell_type", "permutation_idx"], suffixes=("1", "2")
        ).assign(delta_rnd=lambda x: x.mean2 - x.mean1)

        # number of values where permutations are larger than test statistic
        delta["b"] = (
            pd.merge(
                delta_rnd[["cell_type", "delta_rnd"]], delta[["cell_type", "delta_augur"]], on="cell_type", how="left"
            )
            .assign(b=lambda x: (x.delta_rnd >= x.delta_augur))
            .groupby("cell_type", as_index=False)
            .sum()["b"]
        )
        delta["m"] = n_permutations
        delta["z"] = (
            delta["delta_augur"] - delta_rnd.groupby("cell_type", as_index=False).mean()["delta_rnd"]
        ) / delta_rnd.groupby("cell_type", as_index=False).std()["delta_rnd"]

        # calculate pvalues
        delta["pval"] = np.minimum(
            2 * (delta["b"] + 1) / (delta["m"] + 1), 2 * (delta["m"] - delta["b"] + 1) / (delta["m"] + 1)
        )
        delta["padj"] = fdrcorrection(delta["pval"])[1]

        return delta
    def plot_dp_scatter(
        self,
        results: pd.DataFrame,
        top_n: int = None,
        return_fig: bool | None = None,
        ax: Axes = None,
        show: bool | None = None,
        save: str | bool | None = None,
    ) -> Axes | Figure | None:
        """Plot scatterplot of differential prioritization.

        Args:
            results: Results after running differential prioritization.
            top_n: optionally, the number of top prioritized cell types to label in the plot.
            return_fig: if `True`, returns the figure of the plot.
            ax: optionally, axes used to draw plot.
            show: if `True`, shows the plot.
            save: if `True` or a path, saves the plot.

        Returns:
            Axes of the plot.

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.bhattacherjee()
            >>> ag_rfc = pt.tl.Augur("random_forest_classifier")

            >>> data_15 = ag_rfc.load(adata, condition_label="Maintenance_Cocaine", treatment_label="withdraw_15d_Cocaine")
            >>> adata_15, results_15 = ag_rfc.predict(data_15, random_state=None, n_threads=4)
            >>> adata_15_permute, results_15_permute = ag_rfc.predict(data_15, augur_mode="permute", n_subsamples=100, random_state=None, n_threads=4)

            >>> data_48 = ag_rfc.load(adata, condition_label="Maintenance_Cocaine", treatment_label="withdraw_48h_Cocaine")
            >>> adata_48, results_48 = ag_rfc.predict(data_48, random_state=None, n_threads=4)
            >>> adata_48_permute, results_48_permute = ag_rfc.predict(data_48, augur_mode="permute", n_subsamples=100, random_state=None, n_threads=4)

            >>> pvals = ag_rfc.predict_differential_prioritization(augur_results1=results_15, augur_results2=results_48, \
                permuted_results1=results_15_permute, permuted_results2=results_48_permute)

            >>> ag_rfc.plot_dp_scatter(pvals)

        Preview:
            .. image:: /_static/docstring_previews/augur_dp_scatter.png
        """
        x = results["mean_augur_score1"]
        y = results["mean_augur_score2"]

        if ax is None:
            fig, ax = plt.subplots()
        scatter = ax.scatter(x, y, c=results.z, cmap="Greens")

        # adding optional labels
        top_n_index = results.sort_values(by="pval").index[:top_n]
        for idx in top_n_index:
            ax.annotate(
                results.loc[idx, "cell_type"],
                (results.loc[idx, "mean_augur_score1"], results.loc[idx, "mean_augur_score2"]),
            )

        # add diagonal
        limits = max(ax.get_xlim(), ax.get_ylim())
        (_,) = ax.plot(limits, limits, ls="--", c=".3")

        # formatting and details
        plt.xlabel("Augur scores 1")
        plt.ylabel("Augur scores 2")
        legend1 = ax.legend(*scatter.legend_elements(), loc="center left", title="z-scores", bbox_to_anchor=(1, 0.5))
        ax.add_artist(legend1)

        if save:
            plt.savefig(save, bbox_inches="tight")
        if show:
            plt.show()
        if return_fig:
            return plt.gcf()
        if not (show or save):
            return ax
        return None
    def plot_important_features(
        self,
        data: dict[str, Any],
        key: str = "augurpy_results",
        top_n: int = 10,
        return_fig: bool | None = None,
        ax: Axes = None,
        show: bool | None = None,
        save: str | bool | None = None,
    ) -> Axes | None:
        """Plot a lollipop plot of the n features with largest feature importances.

        Args:
            data: results after running `predict()` as dictionary or the AnnData object.
            key: Key in the AnnData object of the results.
            top_n: number of feature importance values to plot. Default is 10.
            return_fig: if `True`, returns the figure of the plot.
            ax: optionally, axes used to draw plot.
            show: if `True`, shows the plot.
            save: if `True` or a path, saves the plot.

        Returns:
            Axes of the plot.

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.sc_sim_augur()
            >>> ag_rfc = pt.tl.Augur("random_forest_classifier")
            >>> loaded_data = ag_rfc.load(adata)
            >>> v_adata, v_results = ag_rfc.predict(
            ...     loaded_data, subsample_size=20, select_variance_features=True, n_threads=4
            ... )
            >>> ag_rfc.plot_important_features(v_results)

        Preview:
            .. image:: /_static/docstring_previews/augur_important_features.png
        """
        if isinstance(data, AnnData):
            results = data.uns[key]
        else:
            results = data
        n_features = (
            results["feature_importances"]
            .groupby("genes", as_index=False)
            .feature_importances.mean()
            .sort_values(by="feature_importances")[-top_n:]
        )

        if ax is None:
            fig, ax = plt.subplots()
        y_axes_range = range(1, top_n + 1)
        ax.hlines(
            y_axes_range,
            xmin=0,
            xmax=n_features["feature_importances"],
        )

        ax.plot(n_features["feature_importances"], y_axes_range, "o")

        plt.xlabel("Mean Feature Importance")
        plt.ylabel("Gene")
        plt.yticks(y_axes_range, n_features["genes"])

        if save:
            plt.savefig(save, bbox_inches="tight")
        if show:
            plt.show()
        if return_fig:
            return plt.gcf()
        if not (show or save):
            return ax
        return None
    def plot_lollipop(
        self,
        data: dict[str, Any],
        key: str = "augurpy_results",
        return_fig: bool | None = None,
        ax: Axes = None,
        show: bool | None = None,
        save: str | bool | None = None,
    ) -> Axes | Figure | None:
        """Plot a lollipop plot of the mean augur values.

        Args:
            data: results after running `predict()` as dictionary or the AnnData object.
            key: Key in the AnnData object of the results.
            return_fig: if `True`, returns the figure of the plot.
            ax: optionally, axes used to draw plot.
            show: if `True`, shows the plot.
            save: if `True` or a path, saves the plot.

        Returns:
            Axes of the plot.

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.sc_sim_augur()
            >>> ag_rfc = pt.tl.Augur("random_forest_classifier")
            >>> loaded_data = ag_rfc.load(adata)
            >>> v_adata, v_results = ag_rfc.predict(
            ...     loaded_data, subsample_size=20, select_variance_features=True, n_threads=4
            ... )
            >>> ag_rfc.plot_lollipop(v_results)

        Preview:
            .. image:: /_static/docstring_previews/augur_lollipop.png
        """
        if isinstance(data, AnnData):
            results = data.uns[key]
        else:
            results = data
        if ax is None:
            fig, ax = plt.subplots()
        y_axes_range = range(1, len(results["summary_metrics"].columns) + 1)
        ax.hlines(
            y_axes_range,
            xmin=0,
            xmax=results["summary_metrics"].sort_values("mean_augur_score", axis=1).loc["mean_augur_score"],
        )

        ax.plot(
            results["summary_metrics"].sort_values("mean_augur_score", axis=1).loc["mean_augur_score"],
            y_axes_range,
            "o",
        )

        plt.xlabel("Mean Augur Score")
        plt.ylabel("Cell Type")
        plt.yticks(y_axes_range, results["summary_metrics"].sort_values("mean_augur_score", axis=1).columns)

        if save:
            plt.savefig(save, bbox_inches="tight")
        if show:
            plt.show()
        if return_fig:
            return plt.gcf()
        if not (show or save):
            return ax
        return None
    def plot_scatterplot(
        self,
        results1: dict[str, Any],
        results2: dict[str, Any],
        top_n: int = None,
        return_fig: bool | None = None,
        show: bool | None = None,
        save: str | bool | None = None,
    ) -> Axes | Figure | None:
        """Create scatterplot with two augur results.

        Args:
            results1: results after running `predict()`.
            results2: results after running `predict()`.
            top_n: optionally, the number of top prioritized cell types to label in the plot.
            return_fig: if `True`, returns the figure of the plot.
            show: if `True`, shows the plot.
            save: if `True` or a path, saves the plot.

        Returns:
            Axes of the plot.

        Examples:
            >>> import pertpy as pt
            >>> adata = pt.dt.sc_sim_augur()
            >>> ag_rfc = pt.tl.Augur("random_forest_classifier")
            >>> loaded_data = ag_rfc.load(adata)
            >>> h_adata, h_results = ag_rfc.predict(loaded_data, subsample_size=20, n_threads=4)
            >>> v_adata, v_results = ag_rfc.predict(
            ...     loaded_data, subsample_size=20, select_variance_features=True, n_threads=4
            ... )
            >>> ag_rfc.plot_scatterplot(v_results, h_results)

        Preview:
            .. image:: /_static/docstring_previews/augur_scatterplot.png
        """
        cell_types = results1["summary_metrics"].columns

        fig, ax = plt.subplots()
        ax.scatter(
            results1["summary_metrics"].loc["mean_augur_score", cell_types],
            results2["summary_metrics"].loc["mean_augur_score", cell_types],
        )

        # adding optional labels
        top_n_cell_types = (
            (results1["summary_metrics"].loc["mean_augur_score"] - results2["summary_metrics"].loc["mean_augur_score"])
            .sort_values(ascending=False)
            .index[:top_n]
        )
        for txt in top_n_cell_types:
            ax.annotate(
                txt,
                (
                    results1["summary_metrics"].loc["mean_augur_score", txt],
                    results2["summary_metrics"].loc["mean_augur_score", txt],
                ),
            )

        # adding diagonal
        limits = max(ax.get_xlim(), ax.get_ylim())
        (diag_line,) = ax.plot(limits, limits, ls="--", c=".3")

        plt.xlabel("Augur scores 1")
        plt.ylabel("Augur scores 2")

        if save:
            plt.savefig(save, bbox_inches="tight")
        if show:
            plt.show()
        if return_fig:
            return plt.gcf()
        if not (show or save):
            return ax
        return None