Source code for tools.sc._sc

#!/usr/bin/env python
# @Author: Kelvin
# @Date:   2022-07-18 11:33:46
# @Last Modified by:   Kelvin
# @Last Modified time: 2022-11-17 16:09:57
"""Miscellaneous single-cell functions."""
import functools
import math
import scipy.sparse

import numpy as np
import scanpy as sc
import pandas as pd

from anndata import AnnData
from pandas import DataFrame
from scanpy.pl._dotplot import DotPlot
from typing import List, Union


[docs] def exportDEres( adata: AnnData, column: str = None, filename: str = None, remove_mito_ribo: bool = True, key: str = "rank_genes_groups", ) -> DataFrame: """ Export DE results from scanpy. Parameters ---------- adata : AnnData AnnData object with `sc.tl.rank_genes_groups` performed. column : Optional[str], optional specific contrast to return. filename : Optional[str], optional if provided, save as file. Otherwise, return as DataFrame. remove_mito_ribo : bool, optional whether to filter all mito and ribo genes in the output. key : str, optional name in `.uns` to retrieve DE results. Returns ------- DataFrame `DataFrame` of DE results. """ if column is None: column = list(adata.uns[key]["scores"].dtype.fields.keys())[0] else: column = column if filename is not None: df_final = returnDEres( adata, column=column, remove_mito_ribo=remove_mito_ribo, key=key, ) df_final.to_csv(filename, sep="\t") else: df = returnDEres( adata, column=column, remove_mito_ribo=remove_mito_ribo, key=key, ) return df
[docs] def returnDEres( adata: AnnData, column: str = None, remove_mito_ribo: bool = True, key: str = "rank_genes_groups", ) -> DataFrame: """Summary Parameters ---------- adata : AnnData AnnData object with `sc.tl.rank_genes_groups` performed. column : Optional[str], optional specific contrast to return. remove_mito_ribo : bool, optional whether to filter all mito and ribo genes in the output. key : str, optional name in `.uns` to retrieve DE results. Returns ------- DataFrame `DataFrame` of DE results. """ if key is None: key = "rank_genes_groups" else: key = key if column is None: column = list(adata.uns[key]["scores"].dtype.fields.keys())[0] else: column = column reference = adata.uns["rank_genes_groups"]["params"]["reference"] scores = DataFrame( data=adata.uns[key]["scores"][column], index=adata.uns[key]["names"][column] ) lfc = DataFrame( data=adata.uns[key]["logfoldchanges"][column], index=adata.uns[key]["names"][column], ) pvals = DataFrame( data=adata.uns[key]["pvals"][column], index=adata.uns[key]["names"][column] ) padj = DataFrame( data=adata.uns[key]["pvals_adj"][column], index=adata.uns[key]["names"][column] ) try: pts = DataFrame( data=adata.uns[key]["pts"][column], index=adata.uns[key]["names"][column] ) ptsx = DataFrame( data=adata.uns[key]["pts_" + reference][column], index=adata.uns[key]["names"][column], ) except: pass scores = scores.loc[scores.index.dropna()] lfc = lfc.loc[lfc.index.dropna()] pvals = pvals.loc[pvals.index.dropna()] padj = padj.loc[padj.index.dropna()] try: pts = pts.loc[pts.index.dropna()] ptsx = ptsx.loc[ptsx.index.dropna()] except: pass try: dfs = [scores, lfc, pvals, padj, pts, ptsx] except: dfs = [scores, lfc, pvals, padj] try: df_final = functools.reduce( lambda left, right: pd.merge( left, right, left_index=True, right_index=True ), dfs, ) except: df_final = pd.concat(dfs, axis=1) try: df_final.columns = [ "scores", "logfoldchanges", "pvals", "pvals_adj", "pts" + "_" + column, "pts_" + reference, ] except: df_final.columns = ["scores", "logfoldchanges", "pvals", "pvals_adj"] if remove_mito_ribo: df_final = df_final[ ~df_final.index.isin( list(df_final.filter(regex="^RPL|^RPS|^MRPS|^MRPL|^MT-", axis=0).index) ) ] df_final = df_final[ ~df_final.index.isin( list(df_final.filter(regex="^Rpl|^Rps|^Mrps|^Mrpl|^mt-", axis=0).index) ) ] return df_final
[docs] def vmax(adata: AnnData, genes: Union[List, str], pct: float) -> List: """ Extract the maximum expression value from list of genes in `AnnData` at the specified `pct`. Parameters ---------- adata : AnnData input `AnnData` object. genes : Union[List, str] gene(s) to query from `AnnData` object. pct : float percentage to cut-off and return. Returns ------- List List of maximum values. """ if type(genes) is not list: genes = [genes] vm = [] for g in genes: try: idx = adata.raw.var.index.get_loc(g) except: idx = adata.var.index.get_loc(g) vm.append( math.ceil(np.quantile(adata.raw.X[:, idx].toarray(), pct) * 100.0) / 100.0 ) return vm
[docs] def vmin(adata: AnnData, genes: Union[List, str], pct: float) -> List: """ Extract the minimum expression value from list of genes in `AnnData` at the specified `pct`. Parameters ---------- adata : AnnData input `AnnData` object. genes : Union[List, str] gene(s) to query from `AnnData` object. pct : float percentage to cut-off and return. Returns ------- List List of minimum values. """ if type(genes) is not list: genes = [genes] vm = [] for g in genes: try: idx = adata.raw.var.index.get_loc(g) except: idx = adata.var.index.get_loc(g) vm.append( math.ceil(np.quantile(adata.raw.X[:, idx].toarray(), 1 - pct) * 100.0) / 100.0 ) return vm
[docs] def cell_cycle_scoring(adata: AnnData, human: bool = False): """ Run cell cycle scoring on `AnnData` object. Parameters ---------- adata : AnnData input `AnnData` object. human : bool, optional whether the data is human or not (mouse). """ # cell cycle scoring adata_cc = adata.copy() if adata_cc.raw is not None: adata_cc = adata_cc.raw.to_adata() if float(np.max(adata_cc.X)).is_integer(): # raw integer counts sc.pp.normalize_total(adata_cc, target_sum=1e4) sc.pp.log1p(adata_cc) sc.pp.scale(adata_cc) elif np.min(adata_cc.X) == 0: if "log1p" not in adata_cc.uns: sc.pp.log1p(adata_cc) # not scaled sc.pp.scale(adata_cc) else: raise ValueError("Please provide either raw integer or normalised data.") if not human: s_genes = [ "Mcm5", "Pcna", "Tyms", "Fen1", "Mcm2", "Mcm4", "Rrm1", "Ung", "Gins2", "Mcm6", "Cdca7", "Dtl", "Prim1", "Uhrf1", "Hells", "Rfc2", "Rpa2", "Nasp", "Rad51ap1", "Gmnn", "Wdr76", "Slbp", "Ccne2", "Msh2", "Rad51", "Rrm2", "Cdc45", "Cdc6", "Exo1", "Tipin", "Dscc1", "Blm", "Casp8ap2", "Usp1", "Clspn", "Pola1", "Chaf1b", "Brip1", "E2f8", ] g2m_genes = [ "Hmgb2", "Cdk1", "Nusap1", "Ube2c", "Birc5", "Tpx2", "Top2a", "Ndc80", "Cks2", "Nuf2", "Cks1b", "Mki67", "Tmpo", "Cenpf", "Tacc3", "Smc4", "Ccnb2", "Ckap2l", "Ckap2", "Aurkb", "Bub1", "Kif11", "Anp32e", "Tubb4b", "Gtse1", "Kif20b", "Hjurp", "Cdca3", "Cdc20", "Ttk", "Cdc25c", "Kif2c", "Rangap1", "Ncapd2", "Dlgap5", "Cdca2", "Cdca8", "Ect2", "Kif23", "Hmmr", "Aurka", "Psrc1", "Anln", "Lbr", "Ckap5", "Cenpe", "Ctcf", "Nek2", "G2e3", "Gas2l3", "Cbx5", "Cenpa", ] else: s_genes = [ "MCM5", "PCNA", "TYMS", "FEN1", "MCM2", "MCM4", "RRM1", "UNG", "GINS2", "MCM6", "CDCA7", "DTL", "PRIM1", "UHRF1", "MLF1IP", "HELLS", "RFC2", "RPA2", "NASP", "RAD51AP1", "GMNN", "WDR76", "SLBP", "CCNE2", "UBR7", "POLD3", "MSH2", "ATAD2", "RAD51", "RRM2", "CDC45", "CDC6", "EXO1", "TIPIN", "DSCC1", "BLM", "CASP8AP2", "USP1", "CLSPN", "POLA1", "CHAF1B", "BRIP1", "E2F8", ] g2m_genes = [ "HMGB2", "CDK1", "NUSAP1", "UBE2C", "BIRC5", "TPX2", "TOP2A", "NDC80", "CKS2", "NUF2", "CKS1B", "MKI67", "TMPO", "CENPF", "TACC3", "FAM64A", "SMC4", "CCNB2", "CKAP2L", "CKAP2", "AURKB", "BUB1", "KIF11", "ANP32E", "TUBB4B", "GTSE1", "KIF20B", "HJURP", "CDCA3", "HN1", "CDC20", "TTK", "CDC25C", "KIF2C", "RANGAP1", "NCAPD2", "DLGAP5", "CDCA2", "CDCA8", "ECT2", "KIF23", "HMMR", "AURKA", "PSRC1", "ANLN", "LBR", "CKAP5", "CENPE", "CTCF", "NEK2", "G2E3", "GAS2L3", "CBX5", "CENPA", ] sc.tl.score_genes_cell_cycle( adata_cc, s_genes=s_genes, g2m_genes=g2m_genes, use_raw=False ) for x in ["S_score", "G2M_score", "phase"]: adata.obs[x] = adata_cc.obs[x]
[docs] def combine_two_categories(adata: AnnData, A: str, B: str, sep: str = "_") -> None: """Combine two categories in place, respecting the order of the concatenation. Parameters ---------- adata : AnnData Input anndata object. A : str Column name for first category. B : str Column name for second category. sep : str, optional The separator to combine the names. """ comb_cat = A + sep + B adata.obs[comb_cat] = [a + "_" + b for a, b in zip(adata.obs[A], adata.obs[B])] adata.obs[A] = adata.obs[A].astype("category") adata.obs[B] = adata.obs[B].astype("category") a_cat = adata.obs[A].cat.categories b_cat = adata.obs[B].cat.categories cats = [] for a in a_cat: for b in b_cat: cats.append(a + "_" + b) adata.obs[comb_cat] = adata.obs[comb_cat].astype("category") adata.obs[comb_cat] = adata.obs[comb_cat].cat.reorder_categories( [c for c in cats if c in adata.obs[comb_cat].cat.categories] )
[docs] def dotplot_2obs( adata, gene, x_axis, y_axis, x_order=None, y_order=None, use_raw=True, fill_na=True, show_plot=True, **kwargs ): """ A function that extracts the expression for a provided gene, and subsequently prepares a dotplot where the two axes are two categoricals from the obs of the object. Can be controlled between .raw and .X via use_raw (defaults to raw) By default fills nonexistent obs combinations as zeroes in the plot (as otherwise the plot can error in weird ways sometimes). Thanks kp9! """ # prepare dfs with dot sizes and colours # start off by making a helper df with all the info obs_tidy_master = adata.obs[[y_axis, x_axis]].set_index(y_axis) # add gene expression if use_raw: if scipy.sparse.issparse(adata.raw.X): obs_tidy_master[gene] = adata.raw[:, gene].X.todense() else: obs_tidy_master[gene] = np.array(adata.raw[:, gene].X) else: if scipy.sparse.issparse(adata.X): obs_tidy_master[gene] = adata[:, gene].X.todense() else: obs_tidy_master[gene] = np.array(adata[:, gene].X) # this is where our computed values will go dot_size_df = pd.DataFrame( 0, index=np.unique(obs_tidy_master.index), columns=np.unique(obs_tidy_master[x_axis]), ) # reorder the groups if specified if x_order is not None: dot_size_df = dot_size_df[x_order] if y_order is not None: dot_size_df = dot_size_df.loc[y_order, :] dot_color_df = dot_size_df.copy() # compute the sizes and colors, replicating the logic found within DotPlot() for cat in dot_size_df.columns: obs_tidy = obs_tidy_master.loc[obs_tidy_master[x_axis] == cat, gene] obs_bool = obs_tidy > 0 dot_color_df[cat] = obs_tidy.groupby(level=0).mean().fillna(0) dot_size_df[cat] = ( obs_bool.groupby(level=0).sum() / obs_bool.groupby(level=0).count() ) # some combinations may be absent, and there will be NaNs there, and this sometimes kills the plot # fill with zeroes if specified if fill_na: dot_color_df.fillna(0, inplace=True) dot_size_df.fillna(0, inplace=True) # in order to actually plot this, we need to make a dummy anndata object # just how DotPlot() is wired internally bdata = AnnData(np.zeros(dot_size_df.shape)) bdata.var_names = dot_size_df.columns # this needs to be turned to a list or it whines about a conflict later bdata.obs_names = list(dot_size_df.index) # the grouping variable needs to be present in the dummy object # its existence is checked, it actually does nothing when we insert the dot sizes bdata.obs[y_axis] = dot_size_df.index # actually make the dotplot, with the best colour scheme :P dp = DotPlot( bdata, dot_size_df.columns, y_axis, dot_size_df=dot_size_df, dot_color_df=dot_color_df, title=gene, **kwargs, ) if show_plot: dp.show() return dp