Peripheral Blood Mononuclear Cells (PBMC)¶

The PBMC multiome data was downloaded on the 10X genomics website. We downloaded the h5 file of scRNA-seq and scATAC-seq and processed with following codes.

Since there is no annotation in scRNA-seq data, we first annatate the scRNA-seq dataset. First, We used the scanpy to perform the basic QC for scRNA-seq.

import os
import sys

import numpy as np
import pandas as pd
import anndata as ad

import seaborn as sns
import scanpy as sc

from SCRIP.utilities.utils import print_log, safe_makedirs, read_SingleCellExperiment_rds, excute_info, read_pickle, write_to_mtx

rna_adata = sc.read_10x_h5('example/PBMC/data/pbmc_granulocyte_sorted_10k_raw_feature_bc_matrix.h5')
rna_adata.var_names_make_unique()
sc.pp.filter_cells(rna_adata, min_genes=200)
sc.pp.filter_genes(rna_adata, min_cells=3)
rna_adata.var['mt'] = rna_adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(rna_adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
sc.pl.violin(rna_adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], jitter=0.4, multi_panel=True)

We clustered the cells with the louvain algorithm.

rna_adata = rna_adata[rna_adata.obs.n_genes_by_counts < 6000, :]
rna_adata = rna_adata[rna_adata.obs.pct_counts_mt < 20, :]
rna_adata = rna_adata[keep_cell_index,:]
sc.pp.normalize_total(rna_adata, target_sum=1e4)
sc.pp.log1p(rna_adata)
sc.pp.highly_variable_genes(rna_adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
rna_adata.raw = rna_adata
sc.pp.regress_out(rna_adata, ['total_counts'])
sc.pp.scale(rna_adata, max_value=10)
sc.tl.pca(rna_adata, svd_solver='arpack')
sc.pp.neighbors(rna_adata, n_neighbors=10, n_pcs=40)
sc.tl.umap(rna_adata)
sc.tl.louvain(rna_adata)
sc.tl.rank_genes_groups(rna_adata, 'louvain', method='wilcoxon')
fig, ax = plt.subplots(1,1,figsize=(8,8))
sc.pl.umap(rna_adata, color=['louvain'], legend_loc='on data', title='PBMC RNA Cluster', legend_fontsize=15, ax=ax)
fig.show()

We annotated the cells with well-known gene markers.

marker_dict = {  'B': ['CD79A', 'CD79B', 'CD19', 'MS4A1', 'CR2'],
                'Plasma': ['SLAMF7', 'IGKC'],
                'T': ['CD3D', 'CD3G', 'CD3E', 'CD2'], # T
                'CD8T': ['CD8A', 'CD8B','GZMA','GZMB'], # cd8
                'CD4T': ['CD4', 'STAT4', 'STAT1'], # cd4
                'NK': ['KLRC1', 'KLRD1'], #NK
                'Mono': ['CD68', 'CSF1R', 'ADGRE1'],
                'CD14Mono': ['CD14'], #mono
                'CD16Mono': ['FCGR3A', 'TEK', 'SELL'], #mono
                'DC': ['CD86', 'ITGAX', 'FLT3', 'GZMB', 'IL3RA'], #dc
                'pDC': ['CLEC4C'], # pdc
                'Endothelial': ['PECAM1', 'NKAIN2'],  # endothelial
                'Others':['BCL11A', 'BCL11B']
                }


fig, ax = plt.subplots(1,1,figsize=(20,8))
sc.pl.dotplot(rna_adata,marker_dict, 'louvain', dendrogram=True, ax=ax)

rna_adata.obs['louvain_cell_type'] = rna_adata.obs['louvain'].astype("str")

cluster2annotation = {
    '0': 'CD14Mono',
    '1': 'CD4T',
    '2': 'CD4T',
    '3': 'CD8T',
    '4': 'CD14Mono',
    '5': 'B',
    '6': 'CD4T',
    '7': 'CD4T',
    '8': 'CD16Mono',
    '9': 'CD14Mono',
    '10': 'B',
    '11': 'CD8T',
    '12': 'CD8T',
    '13': 'CD4T',
    '14': 'CD8T',
    '15': 'CD4T',
    '16': 'pDC',
    '17': 'Endothelial',
    '18': 'Plasma'
}

rna_adata.obs['louvain_cell_type'] = rna_adata.obs['louvain'].map(cluster2annotation).astype('category')

fig, ax = plt.subplots(1,1,figsize=(8,8))
sc.pl.umap(rna_adata, color=['louvain_cell_type'], title='RNA Annotation', legend_fontsize=15,ax=ax)
fig.show()

We used the matched barcodes to migrate the cell annotations to scATAC-seq data.

rna_adata.obs.index = [i.split('-')[0] for i in rna_adata.obs.index]

with open('example/PBMC/737K-arc-v1_ATAC.txt', 'r') as atac_bc_file:
    atac_bc = [i.rstrip('\n') for i in atac_bc_file.readlines()]
with open('example/PBMC/737K-arc-v1_RNA.txt', 'r') as rna_bc_file:
    rna_bc = [i.rstrip('\n') for i in rna_bc_file.readlines()]
keys = pd.DataFrame(np.zeros([len(rna_bc),2]), columns=['ATAC','RNA'])
keys['ATAC'] = atac_bc
keys['RNA'] = rna_bc
keys.index = keys['RNA']
atac_adata = sc.read_10x_h5('example/PBMC/data/PBMC_granulocyte_sorted_10k_peak_count.h5', gex_only=False)
atac_index = list(set(atac_adata.obs.index).intersection(set(keys.loc[rna_adata.obs.index,'ATAC'])))
atac_adata = atac_adata[atac_index,:]
write_to_mtx(atac_adata, 'example/PBMC/data/ATAC/filtered_mtx')

MAESTRO provides the utility that can convert mtx to h5 format.

MAESTRO mtx-to-h5 -d . --outprefix PBMC_granulocyte_sorted_10k_filtered_peak_count

We can run SCRIP with peak count matrix in either h5 or mtx format.

SCRIP enrich -i data/ATAC/filtered_mtx/PBMC_granulocyte_sorted_10k_filtered_peak_count.h5 -s hs -p multiome_pbmc_SCRIP -t 32

To check the biological finding of SCRIP results, we use the MAESTRO to perform the basic analysis for scATAC-seq data.

library(MAESTRO)
library(Seurat)
library(SummarizedExperiment)
library(dplyr)
library(motifmatchr)

pbmc_inputMat <- Read10X_h5('example/PBMC/data/ATAC/filtered_mtx/PBMC_granulocyte_sorted_10k_filtered_peak_count.h5')
pbmc.ATAC.res <- ATACRunSeurat(inputMat = pbmc_inputMat,
                                project = "atac",
                                min.c = 50,
                                min.p = 500,
                                method = "LSI",
                                dims.use = 1:30,
                                cluster.res = 0.6,
                                only.pos = TRUE,
                                peaks.test.use = "presto",
                                peaks.cutoff = 1e-05,
                                peaks.pct = 0.1,
                                peaks.logfc = 0.2,
                                outdir = "example/PBMC/analysis/")
pbmc.ATAC.singlecellexperiment <- as.SingleCellExperiment(pbmc.ATAC.res$ATAC)
saveRDS(pbmc.ATAC.singlecellexperiment,'example/PBMC/pbmc_ATAC_singlecellexperiment.rds')

atac_adata = read_SingleCellExperiment_rds('example/PBMC/pbmc_ATAC_singlecellexperiment.rds')
atac_adata.obs['Celltype'] = atac_adata.obs['seurat_clusters'].astype('str')
keys.index = keys['ATAC']
for i in atac_adata.obs.index:
    rna_bc = keys.loc[i,'RNA']
    atac_adata.obs.loc[i, 'Celltype'] = str(rna_adata.obs.loc[rna_bc,'louvain_cell_type'])
atac_adata.obs['Celltype'] = atac_adata.obs['Celltype'].astype("category")
fig, ax = plt.subplots(1,1,figsize=(8,8))
sc.pl.umap(atac_adata, color=['Celltype'], title='ATAC Annotation',ax=ax)
fig.show()

We checked the TR enrichment in cell types.

script_result_table = read_pickle('example/PBMC/multiome_pbmc_SCRIP_20211219/enrichment/tf_cell_score_df.pk')
atac_adata.obs['SCRIP_BCL11A'] = script_result_table.T.loc[atac_adata.obs_names,'BCL11A']
atac_adata.obs['SCRIP_BCL11B'] = script_result_table.T.loc[atac_adata.obs_names,'BCL11B']

fig, ax = plt.subplots(1,1,figsize=(5,5))
sc.pl.umap(atac_adata, color=['SCRIP_BCL11A'], cmap='coolwarm', ax=ax)
fig.show()

fig, ax = plt.subplots(1,1,figsize=(5,5))
sc.pl.umap(atac_adata, color=['SCRIP_BCL11B'], cmap='coolwarm', ax=ax)
fig.show()