from __future__ import annotations
from typing import TYPE_CHECKING
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.stats as ss
import seaborn as sns
import sklearn.metrics
from ott.geometry import pointcloud
from ott.problems.linear import linear_problem
from ott.solvers.linear import sinkhorn, sinkhorn_lr
from scanpy.plotting import _utils
from scipy.sparse import issparse
from sklearn.decomposition import FastICA
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import NearestNeighbors
if TYPE_CHECKING:
from anndata import AnnData
from matplotlib.axes import Axes
from statsmodels.tools.typing import ArrayLike
class Cinemaot:
    """CINEMA-OT is a causal framework for perturbation effect analysis to identify individual treatment effects and synergy."""

    def __init__(self):
        # Stateless tool class: all inputs are passed explicitly to each method.
        pass
[docs]
def causaleffect(
self,
adata: AnnData,
pert_key: str,
control: str,
return_matching: bool = False,
cf_rep: str = "cf",
use_rep: str = "X_pca",
batch_size: int | None = None,
dim: int | None = 20,
thres: float = 0.15,
smoothness: float = 1e-4,
rank: int = 200,
eps: float = 1e-3,
solver: str = "Sinkhorn",
preweight_label: str | None = None,
):
"""Calculate the confounding variation, optimal transport counterfactual pairs, and single-cell level treatment effects.
Args:
adata: The annotated data object.
pert_key: The column of `.obs` with perturbation categories, should also contain `control`.
control: Control category from the `pert_key` column.
return_matching: Whether to return the matching matrix in the returned de.obsm['ot'].
cf_rep: the place to put the confounder embedding in the original adata.obsm.
use_rep: Use the indicated representation. `'X'` or any key for `.obsm` is valid.
batch_size: Size of batch to calculate the optimal transport map.
dim: Use the first dim components in use_rep.
If none, use a biwhitening procedure on the raw count matrix to derive a reasonable rank.
thres: the threshold for the rank-dependence metric.
smoothness: the coefficient determining the smooth level in entropic optimal transport problem.
rank: Only used if the solver "LRSinkhorn" is used. Specifies the rank number of the transport map.
eps: Tolerate error of the optimal transport.
solver: Either "Sinkhorn" or "LRSinkhorn". The ott-jax solver used.
preweight_label: The annotated label (e.g. cell type) that is used to assign weights for treated
and control cells to balance across the label. Helps overcome the differential abundance issue.
Returns:
Returns an AnnData object that contains the single-cell level treatment effect as de.X and the
corresponding low dimensional embedding in de.obsm['X_embedding'], and optional matching matrix
stored in the de.obsm['ot']. Also puts the confounding variation in adata.obsm[cf_rep].
Examples:
>>> import pertpy as pt
>>> adata = pt.dt.cinemaot_example()
>>> model = pt.tl.Cinemaot()
>>> out_adata = model.causaleffect(
>>> adata, pert_key="perturbation", control="No stimulation", return_matching=True,
>>> thres=0.5, smoothness=1e-5, eps=1e-3, solver="Sinkhorn", preweight_label="cell_type0528")
"""
available_solvers = ["Sinkhorn", "LRSinkhorn"]
if solver not in available_solvers:
raise ValueError(f"solver = {solver} is not one of the supported solvers:" f" {available_solvers}")
if dim is None:
dim = self.get_dim(adata, use_rep=use_rep)
transformer = FastICA(n_components=dim, random_state=0, whiten="arbitrary-variance")
X_transformed = transformer.fit_transform(adata.obsm[use_rep][:, :dim])
groupvec = (adata.obs[pert_key] == control * 1).values # control
xi = np.zeros(dim)
j = 0
for source_row in X_transformed.T:
xi_obj = Xi(source_row, groupvec * 1)
xi[j] = xi_obj.correlation
j = j + 1
cf = X_transformed[:, xi < thres]
cf1 = cf[adata.obs[pert_key] == control, :]
cf2 = cf[adata.obs[pert_key] != control, :]
if sum(xi < thres) == 1:
sklearn.metrics.pairwise_distances(cf1.reshape(-1, 1), cf2.reshape(-1, 1))
elif sum(xi < thres) == 0:
raise ValueError("No confounder components identified. Please try a higher threshold.")
else:
sklearn.metrics.pairwise_distances(cf1, cf2)
e = smoothness * sum(xi < thres)
geom = pointcloud.PointCloud(cf1, cf2, epsilon=e, batch_size=batch_size)
if preweight_label is None:
ot_prob = linear_problem.LinearProblem(geom, a=None, b=None)
else:
# Implement a simple function here, taking adata.obs, output inverse prob weight.
# For consistency, c is still the empirical distribution, while r is weighted.
a = np.zeros(cf1.shape[0])
b = np.zeros(cf2.shape[0])
adata1 = adata[adata.obs[pert_key] == control, :].copy()
adata2 = adata[adata.obs[pert_key] != control, :].copy()
for label in adata1.obs[pert_key].unique():
for ct in adata1.obs[preweight_label].unique():
a[((adata1.obs[preweight_label] == ct) & (adata1.obs[pert_key] == label)).values] = np.sum(
(adata2.obs[preweight_label] == ct).values
) / np.sum((adata1.obs[preweight_label] == ct).values)
a[(adata1.obs[pert_key] == label).values] = a[(adata1.obs[pert_key] == label).values] / np.sum(
a[(adata1.obs[pert_key] == label).values]
)
a = a / np.sum(a)
b[:] = 1 / cf2.shape[0]
ot_prob = linear_problem.LinearProblem(geom, a=a, b=b)
if solver == "LRSinkhorn":
if rank is None:
rank = int(min(cf1.shape[0], cf2.shape[0]) / 2)
_solver = sinkhorn_lr.LRSinkhorn(rank=rank, threshold=eps)
ot_sink = _solver(ot_prob)
embedding = (
X_transformed[adata.obs[pert_key] != control, :]
- ot_sink.apply(X_transformed[adata.obs[pert_key] == control, :].T).T
/ ot_sink.apply(np.ones_like(X_transformed[adata.obs[pert_key] == control, :].T)).T
)
if issparse(adata.X):
te2 = (
adata.X.toarray()[adata.obs[pert_key] != control, :]
- ot_sink.apply(adata.X.toarray()[adata.obs[pert_key] == control, :].T).T
/ ot_sink.apply(np.ones_like(adata.X.toarray()[adata.obs[pert_key] == control, :].T)).T
)
else:
te2 = (
adata.X[adata.obs[pert_key] != control, :]
- ot_sink.apply(adata.X[adata.obs[pert_key] == control, :].T).T
/ ot_sink.apply(np.ones_like(adata.X[adata.obs[pert_key] == control, :].T)).T
)
adata.obsm[cf_rep] = cf
adata.obsm[cf_rep][adata.obs[pert_key] != control, :] = (
ot_sink.apply(adata.obsm[cf_rep][adata.obs[pert_key] == control, :].T).T
/ ot_sink.apply(np.ones_like(adata.obsm[cf_rep][adata.obs[pert_key] == control, :].T)).T
)
else:
_solver = sinkhorn.Sinkhorn(threshold=eps)
ot_sink = _solver(ot_prob)
ot_matrix = ot_sink.matrix.T
embedding = X_transformed[adata.obs[pert_key] != control, :] - np.matmul(
ot_matrix / np.sum(ot_matrix, axis=1)[:, None], X_transformed[adata.obs[pert_key] == control, :]
)
if issparse(adata.X):
te2 = adata.X.toarray()[adata.obs[pert_key] != control, :] - np.matmul(
ot_matrix / np.sum(ot_matrix, axis=1)[:, None], adata.X.toarray()[adata.obs[pert_key] == control, :]
)
else:
te2 = adata.X[adata.obs[pert_key] != control, :] - np.matmul(
ot_matrix / np.sum(ot_matrix, axis=1)[:, None], adata.X[adata.obs[pert_key] == control, :]
)
adata.obsm[cf_rep] = cf
adata.obsm[cf_rep][adata.obs[pert_key] != control, :] = np.matmul(
ot_matrix / np.sum(ot_matrix, axis=1)[:, None], adata.obsm[cf_rep][adata.obs[pert_key] == control, :]
)
TE = sc.AnnData(np.array(te2), obs=adata[adata.obs[pert_key] != control, :].obs.copy(), var=adata.var.copy())
TE.obsm["X_embedding"] = embedding
if return_matching:
TE.obsm["ot"] = ot_sink.matrix.T
return TE
else:
return TE
[docs]
def causaleffect_weighted(
self,
adata: AnnData,
pert_key: str,
control: str,
return_matching: bool = False,
cf_rep: str = "cf",
use_rep: str = "X_pca",
batch_size: int | None = None,
k: int = 20,
dim: int | None = 20,
thres: float = 0.15,
smoothness: float = 1e-4,
rank: int = 200,
eps: float = 1e-3,
solver: str = "Sinkhorn",
resolution: float = 1.0,
):
"""The resampling CINEMA-OT algorithm that allows tackling the differential abundance in an unsupervised manner.
Args:
adata: The annotated data object.
pert_key: The column of `.obs` with perturbation categories, should also contain `control`.
control: Control category from the `pert_key` column.
return_matching: Whether to return the matching matrix in the returned de.obsm['ot'].
cf_rep: the place to put the confounder embedding in the original adata.obsm.
use_rep: Use the indicated representation. `'X'` or any key for `.obsm` is valid.
batch_size: Size of batch to calculate the optimal transport map.
k: the number of neighbors used in the k-NN matching phase.
dim: Use the first dim components in use_rep.
If None, use a biwhitening procedure on the raw count matrix to derive a reasonable rank.
thres: the threshold for the rank-dependence metric.
smoothness: the coefficient determining the smooth level in entropic optimal transport problem.
rank: Only used if the solver "LRSinkhorn" is used. Specifies the rank number of the transport map.
eps: Tolerate error of the optimal transport.
solver: Either "Sinkhorn" or "LRSinkhorn". The ott-jax solver used.
resolution: the clustering resolution used in the sampling phase.
Returns:
Returns an anndata object that contains the single-cell level treatment effect as de.X and the
corresponding low dimensional embedding in de.obsm['X_embedding'], and optional matching matrix
stored in the de.obsm['ot']. Also puts the confounding variation in adata.obsm[cf_rep].
Examples:
>>> import pertpy as pt
>>> adata = pt.dt.cinemaot_example()
>>> model = pt.tl.Cinemaot()
>>> ad, de = model.causaleffect_weighted(
>>> adata, pert_key="perturbation", control="No stimulation", return_matching=True,
>>> thres=0.5, smoothness=1e-5, eps=1e-3, solver="Sinkhorn")
"""
available_solvers = ["Sinkhorn", "LRSinkhorn"]
assert solver in available_solvers, (
f"solver = {solver} is not one of the supported solvers:" f" {available_solvers}"
)
if dim is None:
dim = self.get_dim(adata, use_rep=use_rep)
adata.obs_names_make_unique()
idx = self.get_weightidx(adata, pert_key=pert_key, control=control, k=k, use_rep=use_rep, resolution=resolution)
adata_ = adata[idx].copy()
TE = self.causaleffect(
adata_,
pert_key,
control,
return_matching=return_matching,
cf_rep=cf_rep,
use_rep=use_rep,
batch_size=batch_size,
dim=dim,
thres=thres,
smoothness=smoothness,
rank=rank,
eps=eps,
solver=solver,
)
return adata_, TE
[docs]
def generate_pseudobulk(
self,
adata: AnnData,
de: AnnData,
pert_key: str,
control: str,
label_list: list,
cf_rep: str = "cf",
de_rep: str = "X_embedding",
cf_resolution: float = 0.5,
de_resolution: float = 0.5,
use_raw: bool = True,
):
"""Generating pseudobulk anndata considering the differential response behaviors revealed by CINEMA-OT.
Requires running Cinemaot.causaleffect() or Cinemaot.causaleffect_weighted() first.
Args:
adata: The annotated data object.
de: The anndata output from Cinemaot.causaleffect() or Cinemaot.causaleffect_weighted().
pert_key: The column of `.obs` with perturbation categories, should also contain `control`.
control: Control category from the `pert_key` column.
label_list: Additional covariate labels used to segragate pseudobulk.
Should at least contain sample information (sample 1, sample 2,..., etc).
cf_rep: the place to put the confounder embedding in the original adata.obsm.
de_rep: Use the indicated representation in de.obsm.
assign_cf: If a str is passed, a label in adata.obs instead of confounder Leiden label is used.
cf_resolution: The leiden clustering resolution for the confounder.
de_resolution: The leiden clustering resolution for the differential response.
use_raw: If true, use adata.raw.X to aggregate the pseudobulk profiles. Otherwise use adata.X.
Returns:
Returns an anndata object that contains aggregated pseudobulk profiles and associated metadata.
Examples:
>>> import pertpy as pt
>>> adata = pt.dt.cinemaot_example()
>>> model = pt.tl.Cinemaot()
>>> de = model.causaleffect(
>>> adata, pert_key="perturbation", control="No stimulation", return_matching=True, thres=0.5,
>>> smoothness=1e-5, eps=1e-3, solver="Sinkhorn", preweight_label="cell_type0528")
>>> adata_pb = model.generate_pseudobulk(
>>> adata, de, pert_key="perturbation", control="No stimulation", label_list=None)
"""
sc.pp.neighbors(de, use_rep=de_rep)
sc.tl.leiden(de, resolution=de_resolution)
if use_raw:
if issparse(adata.raw.X):
df = pd.DataFrame(adata.raw.X.toarray(), columns=adata.raw.var_names, index=adata.raw.obs_names)
else:
df = pd.DataFrame(adata.raw.X, columns=adata.raw.var_names, index=adata.raw.obs_names)
else:
if issparse(adata.X):
df = pd.DataFrame(adata.X.toarray(), columns=adata.var_names, index=adata.obs_names)
else:
df = pd.DataFrame(adata.X, columns=adata.var_names, index=adata.obs_names)
if label_list is None:
label_list = ["ct"]
sc.pp.neighbors(adata, use_rep=cf_rep)
sc.tl.leiden(adata, resolution=cf_resolution)
df["ct"] = adata.obs["leiden"].astype(str)
df["ptb"] = "control"
df["ptb"][adata.obs[pert_key] != control] = de.obs["leiden"].astype(str)
label_list.append("ptb")
df = df.groupby(label_list).sum()
new_index = df.index.map(lambda x: "_".join(map(str, x)))
df_ = df.set_index(new_index)
adata_pb = sc.AnnData(df_)
adata_pb.obs = pd.DataFrame(
df.index.to_frame().values,
index=adata_pb.obs_names,
columns=df.index.to_frame().columns,
)
return adata_pb
[docs]
def get_dim(
self,
adata: AnnData,
c: float = 0.5,
use_rep: str = "X_pca",
):
"""Estimating the rank of the count matrix. Always use adata.raw.X. Make sure it is the raw count matrix.
Args:
adata: The annotated data object.
c: the parameter regarding the quadratic variance distribution. c=0 means Poisson count matrices.
use_rep: the embedding used to give a upper bound for the estimated rank.
Returns:
Returns the estimated dimension number.
Examples:
>>> import pertpy as pt
>>> adata = pt.dt.cinemaot_example()
>>> model = pt.tl.Cinemaot()
>>> dim = model.get_dim(adata)
"""
sk = SinkhornKnopp()
if issparse(adata.raw.X):
data = adata.raw.X.toarray()
else:
data = adata.raw.X
vm = (1e-3 + data + c * data * data) / (1 + c)
sk.fit(vm)
wm = np.dot(np.dot(np.sqrt(sk._D1), vm), np.sqrt(sk._D2))
u, s, vt = np.linalg.svd(wm)
dim = min(sum(s > (np.sqrt(data.shape[0]) + np.sqrt(data.shape[1]))), adata.obsm[use_rep].shape[1])
return dim
[docs]
def get_weightidx(
self,
adata: AnnData,
pert_key: str,
control: str,
use_rep: str = "X_pca",
k: int = 20,
resolution: float = 1.0,
):
"""Generating the resampled indices that balances covariates across treatment conditions.
Args:
adata: The annotated data object.
c: the parameter regarding the quadratic variance distribution. c=0 means Poisson count matrices.
use_rep: the embedding used to give a upper bound for the estimated rank.
k: the number of neighbors used in the k-NN matching phase.
resolution: the clustering resolution used in the sampling phase.
Returns:
Returns the indices.
Examples:
>>> import pertpy as pt
>>> adata = pt.dt.cinemaot_example()
>>> model = pt.tl.Cinemaot()
>>> idx = model.get_weightidx(adata, pert_key="perturbation", control="No stimulation")
"""
adata_ = adata.copy()
X_pca1 = adata_.obsm[use_rep][adata_.obs[pert_key] == control, :]
X_pca2 = adata_.obsm[use_rep][adata_.obs[pert_key] != control, :]
nbrs = NearestNeighbors(n_neighbors=k, algorithm="ball_tree").fit(X_pca1)
mixscape_pca = adata.obsm[use_rep].copy()
mixscapematrix = nbrs.kneighbors_graph(X_pca2).toarray()
mixscape_pca[adata_.obs[pert_key] != control, :] = (
np.dot(mixscapematrix, mixscape_pca[adata_.obs[pert_key] == control, :]) / k
)
adata_.obsm["X_mpca"] = mixscape_pca
sc.pp.neighbors(adata_, use_rep="X_mpca")
sc.tl.leiden(adata_, resolution=resolution)
j = 0
ref_label = "noncontrol"
expr_label = "control"
adata_.obs["ct"] = ref_label
adata_.obs["ct"][adata_.obs[pert_key] == control] = expr_label
pert_key = "ct"
z = np.zeros(adata_.shape[0]) + 1
for i in adata_.obs["leiden"].cat.categories:
if (
adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == ref_label)].shape[0]
>= adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == expr_label)].shape[0]
):
z[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == ref_label)] = (
adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == expr_label)].shape[0]
/ adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == ref_label)].shape[0]
)
if j == 0:
idx = sc.pp.subsample(
adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == ref_label)],
n_obs=adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == expr_label)].shape[0],
copy=True,
).obs.index
idx = idx.append(
adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == expr_label)].obs.index
)
j = j + 1
else:
idx_tmp = sc.pp.subsample(
adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == ref_label)],
n_obs=adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == expr_label)].shape[0],
copy=True,
).obs.index
idx_tmp = idx_tmp.append(
adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == expr_label)].obs.index
)
idx = idx.append(idx_tmp)
else:
z[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == expr_label)] = (
adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == ref_label)].shape[0]
/ adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == expr_label)].shape[0]
)
if j == 0:
idx = sc.pp.subsample(
adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == expr_label)],
n_obs=adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == ref_label)].shape[0],
copy=True,
).obs.index
idx = idx.append(
adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == ref_label)].obs.index
)
j = j + 1
else:
idx_tmp = sc.pp.subsample(
adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == expr_label)],
n_obs=adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == ref_label)].shape[0],
copy=True,
).obs.index
idx_tmp = idx_tmp.append(
adata_[(adata_.obs["leiden"] == i) & (adata_.obs[pert_key] == ref_label)].obs.index
)
idx = idx.append(idx_tmp)
return idx
[docs]
def synergy(
self,
adata: AnnData,
pert_key: str,
base: str,
A: str,
B: str,
AB: str,
dim: int | None = 20,
thres: float = 0.15,
smoothness: float = 1e-4,
preweight_label: str | None = None,
**kwargs,
):
"""A wrapper for computing the synergy matrices.
Args:
adata: The annotated data object.
pert_key: The column of `.obs` with perturbation categories, should also contain `control`.
base: Control category from the `pert_key` column.
A: the category for perturbation A.
B: the category for perturbation B.
AB: the category for the combinatorial perturbation A+B.
dim: Use the first dim components in use_rep.
If none, use a biwhitening procedure on the raw count matrix to derive a reasonable rank.
thres: the threshold for the rank-dependence metric.
smoothness: the coefficient determining the smooth level in entropic optimal transport problem.
eps: Tolerate error of the optimal transport.
preweight_label: the annotated label (e.g. cell type) that is used to assign weights for treated
and control cells to balance across the label. Helps overcome the differential abundance issue.
**kwargs: other parameters that can be passed to Cinemaot.causaleffect()
Returns:
Returns an AnnData object that contains the single-cell level synergy matrix de.X and the embedding.
Examples:
>>> import pertpy as pt
>>> adata = pt.dt.dong_2023()
>>> sc.pp.pca(adata)
>>> model = pt.tl.Cinemaot()
>>> combo = model.synergy(adata, pert_key='perturbation', base='No stimulation', A='IFNb', B='IFNg',
>>> AB='IFNb+ IFNg', thres=0.5, smoothness=1e-5, eps=1e-3, solver='Sinkhorn')
"""
adata1 = adata[adata.obs[pert_key].isin([base, A]), :].copy()
adata2 = adata[adata.obs[pert_key].isin([B, AB]), :].copy()
adata_link = adata[adata.obs[pert_key].isin([base, B]), :].copy()
de1 = self.causaleffect(
adata1,
pert_key=pert_key,
control=A,
return_matching=True,
dim=dim,
thres=thres,
smoothness=smoothness,
preweight_label=preweight_label,
**kwargs,
)
ot1 = de1.obsm["ot"] # noqa: F841
de2 = self.causaleffect(
adata2,
pert_key=pert_key,
control=AB,
return_matching=True,
dim=dim,
thres=thres,
smoothness=smoothness,
preweight_label=preweight_label,
**kwargs,
)
ot2 = de2.obsm["ot"] # noqa: F841
de0 = self.causaleffect(
adata_link,
pert_key=pert_key,
control=B,
return_matching=True,
dim=dim,
thres=thres,
smoothness=smoothness,
preweight_label=preweight_label,
**kwargs,
)
ot0 = de0.obsm["ot"]
syn = sc.AnnData(
np.array(-((ot0 / np.sum(ot0, axis=1)[:, None]) @ de2.X - de1.X)), obs=de1.obs.copy(), var=de1.var.copy()
)
syn.obsm["X_embedding"] = (ot0 / np.sum(ot0, axis=1)[:, None]) @ de2.obsm["X_embedding"] - de1.obsm[
"X_embedding"
]
return syn
[docs]
def attribution_scatter(
self,
adata: AnnData,
pert_key: str,
control: str,
cf_rep: str = "cf",
use_raw: bool = False,
):
"""A simple function for computing confounder-specific genes.
Args:
adata: The annotated data object.
pert_key: The column of `.obs` with perturbation categories, should also contain `control`.
control: Control category from the `pert_key` column.
cf_rep: the place to put the confounder embedding in the original adata.obsm.
use_raw: If true, use adata.raw.X to aggregate the pseudobulk profiles. Otherwise use adata.X.
Returns:
Returns the confounder effect (c_effect) and the residual effect (s_effect).
Examples:
>>> import pertpy as pt
>>> adata = pt.dt.cinemaot_example()
>>> model = pt.tl.Cinemaot()
>>> c_effect, s_effect = model.attribution_scatter(adata, pert_key="perturbation", control="No stimulation")
"""
cf = adata.obsm[cf_rep]
if use_raw:
if issparse(adata.X):
Y0 = adata.raw.X.toarray()[adata.obs[pert_key] == control, :]
Y1 = adata.raw.X.toarray()[adata.obs[pert_key] != control, :]
else:
Y0 = adata.raw.X[adata.obs[pert_key] == control, :]
Y1 = adata.raw.X[adata.obs[pert_key] != control, :]
else:
if issparse(adata.X):
Y0 = adata.X.toarray()[adata.obs[pert_key] == control, :]
Y1 = adata.X.toarray()[adata.obs[pert_key] != control, :]
else:
Y0 = adata.X[adata.obs[pert_key] == control, :]
Y1 = adata.X[adata.obs[pert_key] != control, :]
X0 = cf[adata.obs[pert_key] == control, :]
X1 = cf[adata.obs[pert_key] != control, :]
ols0 = LinearRegression()
ols0.fit(X0, Y0)
ols1 = LinearRegression()
ols1.fit(X1, Y1)
c0 = ols0.predict(X0) - np.mean(ols0.predict(X0), axis=0)
c1 = ols1.predict(X1) - np.mean(ols1.predict(X1), axis=0)
e0 = Y0 - ols0.predict(X0)
e1 = Y1 - ols1.predict(X1)
c_effect = (np.linalg.norm(c1, axis=0) + 1e-6) / (np.linalg.norm(c0, axis=0) + 1e-6)
s_effect = (np.linalg.norm(e1, axis=0) + 1e-6) / (np.linalg.norm(e0, axis=0) + 1e-6)
return c_effect, s_effect
[docs]
def plot_vis_matching(
self,
adata: AnnData,
de: AnnData,
pert_key: str,
control: str,
de_label: str,
source_label: str,
matching_rep: str = "ot",
resolution: float = 0.5,
normalize: str = "col",
title: str = "CINEMA-OT matching matrix",
min_val: float = 0.01,
show: bool = True,
save: str | None = None,
ax: Axes | None = None,
**kwargs,
) -> None:
"""Visualize the CINEMA-OT matching matrix.
Args:
adata: the original anndata after running cinemaot.causaleffect or cinemaot.causaleffect_weighted.
de: The anndata output from Cinemaot.causaleffect() or Cinemaot.causaleffect_weighted().
pert_key: The column of `.obs` with perturbation categories, should also contain `control`.
control: Control category from the `pert_key` column.
de_label: the label for differential response. If none, use leiden cluster labels at resolution 1.0.
source_label: the confounder / cell type label.
matching_rep: the place that stores the matching matrix. default de.obsm['ot'].
normalize: normalize the coarse-grained matching matrix by row / column.
title: the title for the figure.
min_val: The min value to truncate the matching matrix.
show: Show the plot, do not return axis.
save: If `True` or a `str`, save the figure. A string is appended to the default filename.
Infer the filetype if ending on {`'.pdf'`, `'.png'`, `'.svg'`}.
**kwargs: Other parameters to input for seaborn.heatmap.
Examples:
>>> import pertpy as pt
>>> adata = pt.dt.cinemaot_example()
>>> cot = pt.tl.Cinemaot()
>>> de = cot.causaleffect(
>>> adata, pert_key="perturbation", control="No stimulation", return_matching=True,
>>> thres=0.5, smoothness=1e-5, eps=1e-3, solver="Sinkhorn", preweight_label="cell_type0528")
>>> cot.plot_vis_matching(
>>> adata, de, pert_key="perturbation",control="No stimulation", de_label=None, source_label="cell_type0528")
"""
adata_ = adata[adata.obs[pert_key] == control]
df = pd.DataFrame(de.obsm[matching_rep])
if de_label is None:
de_label = "leiden"
sc.pp.neighbors(de, use_rep="X_embedding")
sc.tl.leiden(de, resolution=resolution)
df["de_label"] = de.obs[de_label].astype(str).values
df["de_label"] = "Response " + df["de_label"]
df = df.groupby("de_label").sum().T
df["source_label"] = adata_.obs[source_label].astype(str).values
df = df.groupby("source_label").sum()
if normalize == "col":
df = df / df.sum(axis=0)
else:
df = (df.T / df.sum(axis=1)).T
df = df.clip(lower=min_val) - min_val
if normalize == "col":
df = df / df.sum(axis=0)
else:
df = (df.T / df.sum(axis=1)).T
g = sns.heatmap(df, annot=True, ax=ax, **kwargs)
plt.title(title)
_utils.savefig_or_show("matching_heatmap", show=show, save=save)
if not show:
if ax is not None:
return ax
else:
return g
class Xi:
"""
A fast implementation of cross-rank dependence metric used in CINEMA-OT.
"""
def __init__(self, x, y):
self.x = x
self.y = y
@property
def sample_size(self):
return len(self.x)
@property
def x_ordered_rank(self):
# PI is the rank vector for x, with ties broken at random
# Not mine: source (https://stackoverflow.com/a/47430384/1628971)
# random shuffling of the data - reason to use random.choice is that
# pd.sample(frac=1) uses the same randomizing algorithm
len_x = len(self.x)
rng = np.random.default_rng()
randomized_indices = rng.choice(np.arange(len_x), len_x, replace=False)
randomized = [self.x[idx] for idx in randomized_indices]
# same as pandas rank method 'first'
rankdata = ss.rankdata(randomized, method="ordinal")
# Reindexing based on pairs of indices before and after
unrandomized = [rankdata[j] for i, j in sorted(zip(randomized_indices, range(len_x), strict=False))]
return unrandomized
@property
def y_rank_max(self):
# f[i] is number of j s.t. y[j] <= y[i], divided by n.
return ss.rankdata(self.y, method="max") / self.sample_size
@property
def g(self):
# g[i] is number of j s.t. y[j] >= y[i], divided by n.
return ss.rankdata([-i for i in self.y], method="max") / self.sample_size
@property
def x_ordered(self):
# order of the x's, ties broken at random.
return np.argsort(self.x_ordered_rank)
@property
def x_rank_max_ordered(self):
x_ordered_result = self.x_ordered
y_rank_max_result = self.y_rank_max
# Rearrange f according to ord.
return [y_rank_max_result[i] for i in x_ordered_result]
@property
def mean_absolute(self):
x1 = self.x_rank_max_ordered[0 : (self.sample_size - 1)]
x2 = self.x_rank_max_ordered[1 : self.sample_size]
return (
np.mean(
np.abs(
[
x - y
for x, y in zip(
x1,
x2,
strict=False,
)
]
)
)
* (self.sample_size - 1)
/ (2 * self.sample_size)
)
@property
def inverse_g_mean(self):
gvalue = self.g
return np.mean(gvalue * (1 - gvalue))
@property
def correlation(self):
"""xi correlation"""
return 1 - self.mean_absolute / self.inverse_g_mean
@classmethod
def xi(cls, x, y):
return cls(x, y)
def pval_asymptotic(self, ties: bool = False):
"""Returns p values of the correlation.
Args:
ties: boolean
If ties is true, the algorithm assumes that the data has ties
and employs the more elaborated theory for calculating
the P-value. Otherwise, it uses the simpler theory. There is
no harm in setting tiles True, even if there are no ties.
Returns:
p value
"""
# If there are no ties, return xi and theoretical P-value:
if ties:
return 1 - ss.norm.cdf(np.sqrt(self.sample_size) * self.correlation / np.sqrt(2 / 5))
# If there are ties, and the theoretical method is to be used for calculation P-values:
# The following steps calculate the theoretical variance in the presence of ties:
sorted_ordered_x_rank = sorted(self.x_rank_max_ordered)
ind = [i + 1 for i in range(self.sample_size)]
ind2 = [2 * self.sample_size - 2 * ind[i - 1] + 1 for i in ind]
a = np.mean([i * j * j for i, j in zip(ind2, sorted_ordered_x_rank, strict=False)]) / self.sample_size
c = np.mean([i * j for i, j in zip(ind2, sorted_ordered_x_rank, strict=False)]) / self.sample_size
cq = np.cumsum(sorted_ordered_x_rank)
m = [
(i + (self.sample_size - j) * k) / self.sample_size
for i, j, k in zip(cq, ind, sorted_ordered_x_rank, strict=False)
]
b = np.mean([np.square(i) for i in m])
v = (a - 2 * b + np.square(c)) / np.square(self.inverse_g_mean)
return 1 - ss.norm.cdf(np.sqrt(self.sample_size) * self.correlation / np.sqrt(v))
class SinkhornKnopp:
"""
An simple implementation of Sinkhorn iteration used in the biwhitening approach.
"""
def __init__(self, max_iter: float = 1000, setr: int = 0, setc: float = 0, epsilon: float = 1e-3):
if max_iter < 0:
raise ValueError(f"max_iter is {max_iter} but must be greater than 0.")
self._max_iter = int(max_iter)
if not epsilon > 0 and epsilon < 1:
raise ValueError(f"epsilon is {epsilon} but must be between 0 and 1 exclusive.")
self._epsilon = epsilon
self._setr = setr
self._setc = setc
self._stopping_condition: str | None = None
self._iterations = 0
self._D1 = np.ones(1)
self._D2 = np.ones(1)
def fit(self, P: ArrayLike):
"""Fit the diagonal matrices in Sinkhorn Knopp's algorithm.
Args:
P: 2d array-like
Must be a square non-negative 2d array-like object, that
is convertible to a numpy array. The matrix must not be
equal to 0 and it must have total support for the algorithm to converge.
Returns:
A double stochastic matrix.
"""
P = np.asarray(P)
assert np.all(P >= 0)
assert P.ndim == 2
N = P.shape[0]
if np.sum(abs(self._setr)) == 0:
rsum = P.shape[1]
else:
rsum = self._setr
if np.sum(abs(self._setc)) == 0:
csum = P.shape[0]
else:
csum = self._setc
max_threshr = rsum + self._epsilon
min_threshr = rsum - self._epsilon
max_threshc = csum + self._epsilon
min_threshc = csum - self._epsilon
# Initialize r and c, the diagonals of D1 and D2
# and warn if the matrix does not have support.
r = np.ones((N, 1))
pdotr = P.T.dot(r)
c = 1 / pdotr
pdotc = P.dot(c)
r = 1 / pdotc
del pdotr, pdotc
P_eps = np.copy(P)
while (
np.any(np.sum(P_eps, axis=1) < min_threshr)
or np.any(np.sum(P_eps, axis=1) > max_threshr)
or np.any(np.sum(P_eps, axis=0) < min_threshc)
or np.any(np.sum(P_eps, axis=0) > max_threshc)
):
c = csum / P.T.dot(r)
r = rsum / P.dot(c)
self._D1 = np.diag(np.squeeze(r))
self._D2 = np.diag(np.squeeze(c))
P_eps = np.diag(self._D1)[:, None] * P * np.diag(self._D2)[None, :]
self._iterations += 1
if self._iterations >= self._max_iter:
self._stopping_condition = "max_iter"
break
if not self._stopping_condition:
self._stopping_condition = "epsilon"
self._D1 = np.diag(np.squeeze(r))
self._D2 = np.diag(np.squeeze(c))
P_eps = np.diag(self._D1)[:, None] * P * np.diag(self._D2)[None, :]
return P_eps