Single cell data analysis using scanpy

import numpy as np
import pandas as pd
import scanpy as sc

# !mkdir data
# !wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz -O data/pbmc3k_filtered_gene_bc_matrices.tar.gz
# !cd data; tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz
# !mkdir write

sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

scanpy==1.6.1 anndata==0.7.5 umap==0.4.6 numpy==1.19.2 scipy==1.5.2 pandas==1.2.1 scikit-learn==0.23.2 statsmodels==0.12.1 python-igraph==0.8.3 leidenalg==0.8.3

results_file = 'write/pbmc3k.h5ad'  # the file that will store the analysis results

adata = sc.read_10x_mtx(
    'filtered_gene_bc_matrices/hg19/',  # the directory with the `.mtx` file
    var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
    cache=True)                              # write a cache file for faster subsequent reading

... writing an h5ad cache file to speedup reading next time

adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

adata.obs

	n_genes	n_genes_by_counts	total_counts	total_counts_mt	pct_counts_mt
AAACATACAACCAC-1	781	779	2419.0	73.0	3.017776
AAACATTGAGCTAC-1	1352	1352	4903.0	186.0	3.793596
AAACATTGATCAGC-1	1131	1129	3147.0	28.0	0.889736
AAACCGTGCTTCCG-1	960	960	2639.0	46.0	1.743085
AAACCGTGTATGCG-1	522	521	980.0	12.0	1.224490
…	…	…	…	…	…
TTTCGAACTCTCAT-1	1155	1153	3459.0	73.0	2.110436
TTTCTACTGAGGCA-1	1227	1224	3443.0	32.0	0.929422
TTTCTACTTCCTCG-1	622	622	1684.0	37.0	2.197150
TTTGCATGAGAGGC-1	454	452	1022.0	21.0	2.054795
TTTGCATGCCTCAC-1	724	723	1984.0	16.0	0.806452

2700 rows × 5 columns

sc.pl.highest_expr_genes(adata, n_top=20, )

normalizing counts per cell
    finished (0:00:00)

png

sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

filtered out 19024 genes that are detected in less than 3 cells

adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

png

sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

png

adata = adata[adata.obs.n_genes_by_counts < 2500, :]
adata = adata[adata.obs.pct_counts_mt < 5, :]

sc.pp.normalize_total(adata, target_sum=1e4)

normalizing counts per cell
    finished (0:00:00)



  view_to_actual(adata)

sc.pp.log1p(adata)

sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

extracting highly variable genes
    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

sc.pl.highly_variable_genes(adata)

png

adata.raw = adata

adata = adata[:, adata.var.highly_variable]

sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])

regressing out ['total_counts', 'pct_counts_mt']
    sparse input is densified and may lead to high memory use
    finished (0:00:08)

sc.pp.scale(adata, max_value=10)

sc.tl.pca(adata, svd_solver='arpack')

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:00)

sc.pl.pca(adata, color='CST3')

png

sc.pl.pca_variance_ratio(adata, log=True)

png

adata.write(results_file)

adata

AnnData object with n_obs × n_vars = 2638 × 1838
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'gene_ids', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)

computing neighbors
    using 'X_pca' with n_pcs = 40
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)

sc.tl.umap(adata)

computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:03)

sc.pl.umap(adata, color=['CST3', 'NKG7', 'PPBP'])

png

sc.pl.umap(adata, color=['CST3', 'NKG7', 'PPBP'], use_raw=False)

png

sc.tl.leiden(adata)

running Leiden clustering
    finished: found 8 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)

sc.pl.umap(adata, color=['leiden', 'CST3', 'NKG7'])

png

adata.write(results_file)

sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:00)

png

sc.settings.verbosity = 2  # reduce the verbosity

sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

ranking genes
    finished (0:00:03)

png

adata.write(results_file)

sc.tl.rank_genes_groups(adata, 'leiden', method='logreg')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

ranking genes
    finished (0:00:05)




Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

png

marker_genes = ['IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14',
                'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1',
                'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP']

adata = sc.read(results_file)

pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(5)

	0	1	2	3	4	5	6	7
0	RPS12	LYZ	CD74	CCL5	NKG7	LST1	HLA-DPA1	PF4
1	LDHB	S100A9	CD79A	NKG7	GNLY	FCER1G	HLA-DPB1	SDPR
2	RPS25	S100A8	HLA-DRA	B2M	GZMB	AIF1	HLA-DRA	GNG11
3	RPS27	TYROBP	CD79B	CST7	CTSW	COTL1	HLA-DRB1	PPBP
4	RPS6	FTL	HLA-DPB1	IL32	PRF1	FCGR3A	CD74	NRGN

result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names
pd.DataFrame(
    {group + '_' + key[:1]: result[key][group]
    for group in groups for key in ['names', 'pvals']}).head(5)

	0_n	0_p	1_n	1_p	2_n	2_p	3_n	3_p	4_n	4_p	5_n	5_p	6_n	6_p	7_n	7_p
0	RPS12	4.803005e-219	LYZ	7.634876e-249	CD74	3.043536e-182	CCL5	2.841576e-120	NKG7	1.203971e-96	LST1	1.322111e-88	HLA-DPA1	5.422417e-21	PF4	4.722886e-10
1	LDHB	5.645430e-216	S100A9	4.626358e-246	CD79A	6.860832e-170	NKG7	2.045965e-96	GNLY	1.257170e-88	FCER1G	6.259712e-85	HLA-DPB1	7.591860e-21	SDPR	4.733899e-10
2	RPS25	3.358266e-195	S100A8	1.622835e-238	HLA-DRA	8.389292e-166	B2M	1.480838e-82	GZMB	1.429027e-88	AIF1	1.348814e-83	HLA-DRA	1.306768e-19	GNG11	4.733899e-10
3	RPS27	3.322679e-185	TYROBP	2.957652e-220	CD79B	1.171444e-153	CST7	3.420909e-78	CTSW	4.144726e-87	COTL1	5.974694e-82	HLA-DRB1	1.865104e-19	PPBP	4.744938e-10
4	RPS6	2.398577e-183	FTL	2.479195e-214	HLA-DPB1	6.167786e-148	IL32	3.687167e-73	PRF1	1.692100e-85	FCGR3A	1.392377e-77	CD74	5.853161e-19	NRGN	4.800511e-10

sc.tl.rank_genes_groups(adata, 'leiden', groups=['0'], reference='1', method='wilcoxon')
sc.pl.rank_genes_groups(adata, groups=['0'], n_genes=20)

ranking genes
    finished (0:00:01)

png

sc.pl.rank_genes_groups_violin(adata, groups='0', n_genes=8)

png

sc.pl.rank_genes_groups_violin(adata, groups='0', n_genes=8)

png

sc.pl.violin(adata, ['CST3', 'NKG7', 'PPBP'], groupby='leiden')

png

new_cluster_names = [
    'CD4 T', 'CD14 Monocytes',
    'B', 'CD8 T',
    'NK', 'FCGR3A Monocytes',
    'Dendritic', 'Megakaryocytes']
adata.rename_categories('leiden', new_cluster_names)

Omitting rank_genes_groups/names as old categories do not match.
Omitting rank_genes_groups/scores as old categories do not match.
Omitting rank_genes_groups/pvals as old categories do not match.
Omitting rank_genes_groups/pvals_adj as old categories do not match.
Omitting rank_genes_groups/logfoldchanges as old categories do not match.

sc.pl.umap(adata, color='leiden', legend_loc='on data', title='', frameon=False, save='.pdf')

WARNING: saving figure to file figures/umap.pdf

png

sc.pl.dotplot(adata, marker_genes, groupby='leiden');

png

sc.pl.stacked_violin(adata, marker_genes, groupby='leiden', rotation=90);

png

adata.write(results_file, compression='gzip')  # `compression='gzip'` saves disk space, but slows down writing and subsequent reading

adata.raw.to_adata().write('./data/write/pbmc3k_withoutX.h5ad')