5 min read

Single cell data analysis using scanpy

import numpy as np
import pandas as pd
import scanpy as sc
# !mkdir data
# !wget http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz -O data/pbmc3k_filtered_gene_bc_matrices.tar.gz
# !cd data; tar -xzf pbmc3k_filtered_gene_bc_matrices.tar.gz
# !mkdir write
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')
scanpy==1.6.1 anndata==0.7.5 umap==0.4.6 numpy==1.19.2 scipy==1.5.2 pandas==1.2.1 scikit-learn==0.23.2 statsmodels==0.12.1 python-igraph==0.8.3 leidenalg==0.8.3
results_file = 'write/pbmc3k.h5ad'  # the file that will store the analysis results
adata = sc.read_10x_mtx(
    'filtered_gene_bc_matrices/hg19/',  # the directory with the `.mtx` file
    var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
    cache=True)                              # write a cache file for faster subsequent reading
... writing an h5ad cache file to speedup reading next time
adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`
adata.obs
n_genes n_genes_by_counts total_counts total_counts_mt pct_counts_mt
AAACATACAACCAC-1 781 779 2419.0 73.0 3.017776
AAACATTGAGCTAC-1 1352 1352 4903.0 186.0 3.793596
AAACATTGATCAGC-1 1131 1129 3147.0 28.0 0.889736
AAACCGTGCTTCCG-1 960 960 2639.0 46.0 1.743085
AAACCGTGTATGCG-1 522 521 980.0 12.0 1.224490
TTTCGAACTCTCAT-1 1155 1153 3459.0 73.0 2.110436
TTTCTACTGAGGCA-1 1227 1224 3443.0 32.0 0.929422
TTTCTACTTCCTCG-1 622 622 1684.0 37.0 2.197150
TTTGCATGAGAGGC-1 454 452 1022.0 21.0 2.054795
TTTGCATGCCTCAC-1 724 723 1984.0 16.0 0.806452

2700 rows × 5 columns

sc.pl.highest_expr_genes(adata, n_top=20, )
normalizing counts per cell
    finished (0:00:00)

png

sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
filtered out 19024 genes that are detected in less than 3 cells
adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

png

sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

png

png

adata = adata[adata.obs.n_genes_by_counts < 2500, :]
adata = adata[adata.obs.pct_counts_mt < 5, :]
sc.pp.normalize_total(adata, target_sum=1e4)
normalizing counts per cell
    finished (0:00:00)



  view_to_actual(adata)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
extracting highly variable genes
    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
sc.pl.highly_variable_genes(adata)

png

adata.raw = adata
adata = adata[:, adata.var.highly_variable]
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
regressing out ['total_counts', 'pct_counts_mt']
    sparse input is densified and may lead to high memory use
    finished (0:00:08)
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:00)
sc.pl.pca(adata, color='CST3')

png

sc.pl.pca_variance_ratio(adata, log=True)

png

adata.write(results_file)
adata
AnnData object with n_obs × n_vars = 2638 × 1838
    obs: 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'gene_ids', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
computing neighbors
    using 'X_pca' with n_pcs = 40
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
sc.tl.umap(adata)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:03)
sc.pl.umap(adata, color=['CST3', 'NKG7', 'PPBP'])

png

sc.pl.umap(adata, color=['CST3', 'NKG7', 'PPBP'], use_raw=False)

png

sc.tl.leiden(adata)
running Leiden clustering
    finished: found 8 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
sc.pl.umap(adata, color=['leiden', 'CST3', 'NKG7'])

png

adata.write(results_file)
sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:00)

png

sc.settings.verbosity = 2  # reduce the verbosity
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
ranking genes
    finished (0:00:03)

png

adata.write(results_file)
sc.tl.rank_genes_groups(adata, 'leiden', method='logreg')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
ranking genes
    finished (0:00:05)




Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

png

marker_genes = ['IL7R', 'CD79A', 'MS4A1', 'CD8A', 'CD8B', 'LYZ', 'CD14',
                'LGALS3', 'S100A8', 'GNLY', 'NKG7', 'KLRB1',
                'FCGR3A', 'MS4A7', 'FCER1A', 'CST3', 'PPBP']
adata = sc.read(results_file)
pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(5)
0 1 2 3 4 5 6 7
0 RPS12 LYZ CD74 CCL5 NKG7 LST1 HLA-DPA1 PF4
1 LDHB S100A9 CD79A NKG7 GNLY FCER1G HLA-DPB1 SDPR
2 RPS25 S100A8 HLA-DRA B2M GZMB AIF1 HLA-DRA GNG11
3 RPS27 TYROBP CD79B CST7 CTSW COTL1 HLA-DRB1 PPBP
4 RPS6 FTL HLA-DPB1 IL32 PRF1 FCGR3A CD74 NRGN
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names
pd.DataFrame(
    {group + '_' + key[:1]: result[key][group]
    for group in groups for key in ['names', 'pvals']}).head(5)
0_n 0_p 1_n 1_p 2_n 2_p 3_n 3_p 4_n 4_p 5_n 5_p 6_n 6_p 7_n 7_p
0 RPS12 4.803005e-219 LYZ 7.634876e-249 CD74 3.043536e-182 CCL5 2.841576e-120 NKG7 1.203971e-96 LST1 1.322111e-88 HLA-DPA1 5.422417e-21 PF4 4.722886e-10
1 LDHB 5.645430e-216 S100A9 4.626358e-246 CD79A 6.860832e-170 NKG7 2.045965e-96 GNLY 1.257170e-88 FCER1G 6.259712e-85 HLA-DPB1 7.591860e-21 SDPR 4.733899e-10
2 RPS25 3.358266e-195 S100A8 1.622835e-238 HLA-DRA 8.389292e-166 B2M 1.480838e-82 GZMB 1.429027e-88 AIF1 1.348814e-83 HLA-DRA 1.306768e-19 GNG11 4.733899e-10
3 RPS27 3.322679e-185 TYROBP 2.957652e-220 CD79B 1.171444e-153 CST7 3.420909e-78 CTSW 4.144726e-87 COTL1 5.974694e-82 HLA-DRB1 1.865104e-19 PPBP 4.744938e-10
4 RPS6 2.398577e-183 FTL 2.479195e-214 HLA-DPB1 6.167786e-148 IL32 3.687167e-73 PRF1 1.692100e-85 FCGR3A 1.392377e-77 CD74 5.853161e-19 NRGN 4.800511e-10
sc.tl.rank_genes_groups(adata, 'leiden', groups=['0'], reference='1', method='wilcoxon')
sc.pl.rank_genes_groups(adata, groups=['0'], n_genes=20)
ranking genes
    finished (0:00:01)

png

sc.pl.rank_genes_groups_violin(adata, groups='0', n_genes=8)

png

sc.pl.rank_genes_groups_violin(adata, groups='0', n_genes=8)

png

sc.pl.violin(adata, ['CST3', 'NKG7', 'PPBP'], groupby='leiden')

png

new_cluster_names = [
    'CD4 T', 'CD14 Monocytes',
    'B', 'CD8 T',
    'NK', 'FCGR3A Monocytes',
    'Dendritic', 'Megakaryocytes']
adata.rename_categories('leiden', new_cluster_names)
Omitting rank_genes_groups/names as old categories do not match.
Omitting rank_genes_groups/scores as old categories do not match.
Omitting rank_genes_groups/pvals as old categories do not match.
Omitting rank_genes_groups/pvals_adj as old categories do not match.
Omitting rank_genes_groups/logfoldchanges as old categories do not match.
sc.pl.umap(adata, color='leiden', legend_loc='on data', title='', frameon=False, save='.pdf')
WARNING: saving figure to file figures/umap.pdf

png

sc.pl.dotplot(adata, marker_genes, groupby='leiden');

png

sc.pl.stacked_violin(adata, marker_genes, groupby='leiden', rotation=90);

png

adata.write(results_file, compression='gzip')  # `compression='gzip'` saves disk space, but slows down writing and subsequent reading
adata.raw.to_adata().write('./data/write/pbmc3k_withoutX.h5ad')