Source code for pertpy.tools._differential_gene_expression._pydeseq2
import os
import numpy as np
import pandas as pd
from anndata import AnnData
from numpy import ndarray
from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats
from scipy.sparse import issparse
from ._base import LinearModelBase
from ._checks import check_is_integer_matrix
[docs]
class PyDESeq2(LinearModelBase):
"""Differential expression test using a PyDESeq2."""
def __init__(
self, adata: AnnData, design: str | ndarray, *, mask: str | None = None, layer: str | None = None, **kwargs
):
super().__init__(adata, design, mask=mask, layer=layer, **kwargs)
# work around pydeseq2 issue with sparse matrices
# see also https://github.com/owkin/PyDESeq2/issues/25
if issparse(self.data):
if self.layer is None:
self.adata.X = self.adata.X.toarray()
else:
self.adata.layers[self.layer] = self.adata.layers[self.layer].toarray()
def _check_counts(self):
check_is_integer_matrix(self.data)
[docs]
def fit(self, **kwargs) -> pd.DataFrame:
"""Fit dds model using pydeseq2.
Note: this creates its own AnnData object for downstream processing.
Args:
**kwargs: Keyword arguments specific to DeseqDataSet(), except for `n_cpus` which will use all available CPUs minus one if the argument is not passed.
"""
try:
usable_cpus = len(os.sched_getaffinity(0)) # type: ignore # os.sched_getaffinity is not available on Windows and macOS
except AttributeError:
usable_cpus = os.cpu_count() or 1
inference = DefaultInference(n_cpus=kwargs.pop("n_cpus", usable_cpus))
adata_for_dds = self.adata
if self.layer is not None:
# pydeseq2 always uses `adata.X` as the count matrix, so ensure that `X`
# reflects the chosen layer without mutating the user-provided `.X`.
adata_for_dds = AnnData(X=self.data, obs=self.adata.obs, var=self.adata.var)
dds = DeseqDataSet(
adata=adata_for_dds,
design=self.design, # initialize using design matrix, not formula
refit_cooks=True,
inference=inference,
**kwargs,
)
dds.deseq2()
self.dds = dds
def _test_single_contrast(self, contrast, alpha=0.05, *, lfc_shrink=None, **kwargs) -> pd.DataFrame:
"""Conduct a specific test and returns a Pandas DataFrame.
Args:
contrast: list of three strings of the form `["variable", "tested level", "reference level"]`.
alpha: p value threshold used for controlling fdr with independent hypothesis weighting
lfc_shrink: If given, apply apeGLM LFC shrinkage to the named coefficient (must be a column of the fitted `DeseqStats.LFC` matrix).
Opt-in because apeGLM shrinks individual coefficients rather than arbitrary contrasts, so the right coefficient depends on the design and cannot be inferred from a numerical contrast vector.
**kwargs: extra arguments to pass to DeseqStats()
"""
contrast = np.array(contrast)
stat_res = DeseqStats(self.dds, contrast=contrast, alpha=alpha, **kwargs)
# Calling `.summary()` is required to fill the `results_df` data frame
stat_res.summary()
if lfc_shrink is not None:
stat_res.lfc_shrink(coeff=lfc_shrink)
res_df = (
pd.DataFrame(stat_res.results_df)
.rename(columns={"pvalue": "p_value", "padj": "adj_p_value", "log2FoldChange": "log_fc"})
.sort_values("p_value")
)
res_df.index.name = "variable"
res_df = res_df.reset_index()
return res_df