Label transfer using customized neuron cell reference¶
In this vignette, we will explore how to use hECA build customized reference dataset of the same cell type and perform label transfer to annotate other cells.
Step 1: load packages
library(Seurat)
library(SingleR)
library(tidyverse)
2.4 - Download cells following the tutorials of ECAUGT and save the results to csv file (in Python):
rows_to_get = pd.read_csv('keys.csv')
rows_to_get = [[('cid',i)] for i in rows_to_get['cid']]
result = ECAUGT.get_columnsbycell_para(rows_to_get = rows_to_get, cols_to_get=None,
col_filter=gene_condition, do_transfer = True,
thread_num = multiprocessing.cpu_count()-1)
genes = result.columns[:43878]
metaCols = result.columns[43878:43878+18]
expr = result.loc[:,genes]
meta = result.loc[:,metaCols]
expr.to_csv("hECA_exprs.csv", index=True)
meta.to_csv("hECA_metadata.csv", index=True)
2.5 - Load downloaded expression matrix and metadata as customized reference dataset (continue in R).
expr <- read.csv("hECA_exprs.csv", header=T, row.names=1)
meta <- read.csv("hECA_metadata.csv", header=T, row.names=1)
Step 3.1: uniform cell type names
que_file = "DER-22_Single_cell_expression_raw_UMI.tsv"
data.matrix <- read.table(que_file, h = T, row.names = 1)
cellid = colnames(data.matrix)
samp.devStage <- data.frame(samp.devStage = ifelse(grepl("^Fetal",cellid),"Fetal","Adult"))
rownames(samp.devStage) = cellid
true_query = data.frame(cellid = cellid) %>% mutate(
orig.celllabel = stringr::str_split_fixed(gsub("Fetal.","",cellid),"\\.",2)[,1],
orig.uhafcelltype = case_when(
grepl("^Ex",orig.celllabel) ~ "Excitatory neuron",
grepl("^In",orig.celllabel) ~ "Inhibitory neuron",
grepl("^Neuron",orig.celllabel) ~ "Neuron",
grepl("^quies",orig.celllabel) ~ "Neuron",
grepl("^trans",orig.celllabel) ~ "Neuron",
grepl("^repli",orig.celllabel) ~ "Neural progenitor cell",
grepl("^IPC",orig.celllabel) ~ "Neural progenitor cell",
grepl("^NEP",orig.celllabel) ~ "Neural progenitor cell",
grepl("^OPC",orig.celllabel) ~ "Oligodendrocyte progenitor cell",
grepl("^Oligo",orig.celllabel) ~ "Oligodendrocyte",
grepl("^Astro",orig.celllabel) ~ "Astrocyte",
grepl("^Microglia",orig.celllabel) ~ "Microglia",
grepl("^Endo",orig.celllabel) ~ "Endothelial cell",
grepl("^Per",orig.celllabel) ~ "Pericyte",
TRUE ~ "Unclassified"),
orig.uhaftissuetype = case_when(
orig.celltype %in% c("Neuron","Excitatory neuron","Inhibitory neuron","Astrocyte","Oligodendrocyte","Microglia","Neural progenitor cell","Oligodendrocyte progenitor cell") ~ "Nerve tissue",
orig.celltype %in% c("Endothelial cell","Pericyte") ~ "Epithelial tissue",
TRUE ~ as.character(orig.celltype)))
rownames(true_query) = cellid
Step 3.2: Create Seurat object and perform quality control using Seurat
# create Seurat object
query_obj <- CreateSeuratObject(counts = as.matrix(data.matrix), meta.data = samp.devStage)
query_obj <- AddMetaData(query_obj, metadata = true_query)
# QC
mito_genes <- rownames(query_obj)[grep("^[Mm][Tt]-",rownames(query_obj))]
query_obj <- PercentageFeatureSet(query_obj, "^MT-", col.name = "percent_mito")
ribo_genes <- rownames(query_obj)[grep("^RP[SL]",rownames(query_obj))]
query_obj <- PercentageFeatureSet(query_obj, "^RP[SL]", col.name = "percent_ribo")
hemoglobin_genes <- rownames(query_obj)[grep("^HB[^(P)]",rownames(query_obj))]
query_obj <- PercentageFeatureSet(query_obj, "^HB[^(P)]", col.name = "percent_hb")
selected_c <- WhichCells(query_obj, expression = nFeature_RNA > 200)# for UMI
selected_f <- rownames(query_obj)[Matrix::rowSums(query_obj.filt) > 3]
query_obj.filt <- subset(query_obj, features = selected_f, cells = selected_c)
Step 3.3: Run standard Seurat pipeline to obtain clusters
query_obj.filt <- NormalizeData(query_obj.filt, normalization.method = "LogNormalize", scale.factor = 10000)
query_obj.filt <- FindVariableFeatures(query_obj.filt, selection.method = "vst", nfeatures = 2000)
query_obj.filt <- ScaleData(query_obj.filt, features = VariableFeatures(query_obj.filt), vars.to.regress = c("nFeature_RNA")) # UMI dataset
query_obj.filt = RunPCA(query_obj.filt, npcs = 20, verbose = F)
query_obj.filt <- JackStraw(query_obj.filt, num.replicate = 100)
query_obj.filt <- ScoreJackStraw(query_obj.filt, dims = 1:20)
pcDim = 8
query_obj.filt <- FindNeighbors(query_obj.filt, dims = 1:pcDim, nn.eps = 0.5)
query_obj.filt <- FindClusters(query_obj.filt, resolution = 0.5, n.start = 10)
query_obj.filt = RunUMAP(query_obj.filt, dims = 1:pcDim, verbose = F)
# DimPlot(query_obj.filt, reduction="umap")
# Finding cluster markers
query_obj.markers <- FindAllMarkers(query_obj.filt, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
Step 3.4: Assign uHAF labels to clusters
uhaf.anno <- data.frame(seurat.clust = query_obj.filt$seurat_clusters) %>% mutate(uhaf.cell_type = case_when(
seurat.clust %in% c(0:2,4,7,16,17,20) ~ "Excitatory neuron",
seurat.clust == 3 ~ "PV inhibitory neuron",
seurat.clust == 18 ~ "Inhibitory neuron",
seurat.clust %in% c(8,10) ~ "VIP inhibitory neuron",
seurat.clust == 9 ~ "SST inhibitory neuron",
seurat.clust %in% c(5,12,19) ~ "Astrocyte",
seurat.clust %in% c(6,11,15) ~ "Oligodendrocyte",
seurat.clust %in% c(13,21) ~ "Oligodendrocyte progenitor cell",
seurat.clust == 14 ~ "Endothelial cell"))
rownames(uhaf.anno) = colnames(query_obj.filt)
levels(uhaf.anno$uhaf.cell_type) = levels(uhaf.anno$seurat.clust)
query_obj.filt <- AddMetaData(query_obj.filt, metadata = uhaf.anno)
Step 4.1: Train SingleR model
gene.use = intersect(row.names(expr),row.names(query_obj.filt))
expr <- exprs[gene.use,]
ct.ref <- meta$cell_type
trainedR <- trainSingleR(expr, ct.ref, de.method = "wilcox")
Step 4.2: Predict labels of query data
### predict labels for query
cell.use = grepl("neuron", query_obj.filt@meta.data[, "uhaf.cell_type"])
#obj.query = query_obj.filt[gene.use,cell.use]
obj.query = query_obj.filt[gene.use,]
predict <- classifySingleR(obj.query@assays$RNA@data,trainedR)
Step 4.3: Check prediction results
truth <- obj.query$uhaf.cell_type
confMat = caret::confusionMatrix(factor(predict$pruned.labels,levels=unique(predict$labels)),factor(truth,levels=unique(predict$labels)))
confMat
(Optional) Step 4.4: Add prediction label to query object
df.result <- predict[,c("pruned.labels","labels")]
df.result$truth <- obj.query$uhaf.cell_type
df.result$X <- rownames(df.result)
df.result <- data.frame(df.result)
row.names(df.result) = df.result$X
singleR.anno = df.result[,"pruned.labels", drop = F]
query_obj.filt <- AddMetaData(query_obj.filt, metadata = singleR.anno)
query_obj.filtNeuron <- AddMetaData(obj.query, metadata = singleR.anno)
(Optional) Step 5: Check the accuracy of DE genes after label transfer
## identify cell type-specific markers for uhaf cell type label
query_obj.filt <- SetIdent(query_obj.filt,cells = colnames(query_obj.filt), value = query_obj.filt$uhaf.cell_type)
uhafcelltype_markers <- FindAllMarkers(query_obj.filt, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
uhafcelltype_markers.top100 <- uhafcelltype_markers %>% group_by(cluster) %>% top_n(n = 100, wt = avg_log2FC)
## identify cell type-specific markers for singleR cell type label
query_obj.filt <- SetIdent(query_obj.filt, cells = colnames(query_obj.filt), value = query_obj.filt$pruned.labels)
saveRDS(query_obj.filt, file = "PsychEncode_UMI.query_obj.filt.rds")
singleRanno_markers <- FindAllMarkers(query_obj.filt, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
singleRanno_markers.top100 <- singleRanno_markers %>% group_by(cluster) %>% top_n(n = 100, wt = avg_log2FC)
## identify cell type-specific markers for singleR cell type label in non-neuron filtered query object
query_obj.filtNeuron <- SetIdent(query_obj.filtNeuron, cells = colnames(query_obj.filtNeuron), value = query_obj.filtNeuron$pruned.labels)
saveRDS(query_obj.filtNeuron, file = "PsychEncode_UMI.query_obj.filtNeuron.rds")
uhafNeuronOnly_singleRanno_markers <- FindAllMarkers(query_obj.filtNeuron, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
uhafNeuronOnly_singleRanno_markers.top100 <- uhafNeuronOnly_singleRanno_markers %>% group_by(cluster) %>% top_n(n = 100, wt = avg_log2FC)
## compare two sets of markers
uhafcelltypeMarkerSummary = uhafcelltype_markers %>% group_by(cluster) %>% summarise(markerNum = n())
singleRannoMarkerSummary = singleRanno_markers %>% group_by(cluster) %>% summarise(markerNum = n())
uhafNeuronOnlySingleRannoMarkerSummary = uhafNeuronOnly_singleRanno_markers %>% group_by(cluster) %>% summarise(markerNum = n())
marker = merge(uhafcelltype_markers,singleRanno_markers, by = "gene", suffixes = c(".manualAnno",".singlerAnno"))
markerSummary = marker %>% filter(cluster.manualAnno == cluster.singlerAnno) %>% group_by(cluster.manualAnno) %>% summarise(commonMarkerNum = n()) %>% rename(cluster = cluster.manualAnno) %>% inner_join(merge(uhafcelltypeMarkerSummary, singleRannoMarkerSummary, by = "cluster", suffixes = c(".manualAnno",".singlerAnno")), by = "cluster") %>% mutate(singleRannoMarkerAccuracy = commonMarkerNum/markerNum.singlerAnno)
## plotting
p10 <- cowplot::plot_grid(ncol = 2,
DimPlot(query_obj.filt, reduction = "umap", label = TRUE, group.by = "orig.celllabel") + NoAxes(),
DimPlot(query_obj.filt, reduction = "umap", label = TRUE, group.by = "seurat_clusters") + NoAxes(),
DimPlot(query_obj.filt, reduction = "umap", label = TRUE, group.by = "uhaf.cell_type") + NoAxes(),
DimPlot(query_obj.filt, reduction = "umap", label = TRUE, group.by = "pruned.labels") + NoAxes()
)
ggsave(paste0(que_name,".umapAll.tiff"), plot = p10, width=18, height=10, dpi = 300)