Label transfer using customized neuron cell reference

In this vignette, we will explore how to use hECA build customized reference dataset of the same cell type and perform label transfer to annotate other cells.

Step 1: load packages

library(Seurat)
library(SingleR)
library(tidyverse)
Step 2: build reference dataset from hECA
In this experiment we will build reference of neuron cells.
2.1 - Open hECA website, click “Cell Sorting” in the menu.
2.2 - “Add Filters” - “Cell Type” - type in “Neuron”, click to select and include subtypes as default
2.3 - Click “Apply”. After seconds, click “Download Data” to download the keys of sorted cells, a csv file “keys.csv” will be downloaded.

2.4 - Download cells following the tutorials of ECAUGT and save the results to csv file (in Python):

rows_to_get = pd.read_csv('keys.csv')
rows_to_get = [[('cid',i)] for i in rows_to_get['cid']]
result = ECAUGT.get_columnsbycell_para(rows_to_get = rows_to_get, cols_to_get=None,
                                       col_filter=gene_condition, do_transfer = True,
                                       thread_num = multiprocessing.cpu_count()-1)
genes = result.columns[:43878]
metaCols = result.columns[43878:43878+18]
expr = result.loc[:,genes]
meta = result.loc[:,metaCols]
expr.to_csv("hECA_exprs.csv", index=True)
meta.to_csv("hECA_metadata.csv", index=True)

2.5 - Load downloaded expression matrix and metadata as customized reference dataset (continue in R).

expr <- read.csv("hECA_exprs.csv", header=T, row.names=1)
meta <- read.csv("hECA_metadata.csv", header=T, row.names=1)
Step 3: load query data and perform pre-processing
In this step we will load an external dataset of neuron cells as query data, downloaded from PsychENCODE (https://explorer.nimhgenetics.org/).
Due to the differences between names of genes and cell types, some pre-processing steps are needed.

Step 3.1: uniform cell type names

que_file = "DER-22_Single_cell_expression_raw_UMI.tsv"
data.matrix <- read.table(que_file, h = T, row.names = 1)
cellid = colnames(data.matrix)
samp.devStage <- data.frame(samp.devStage = ifelse(grepl("^Fetal",cellid),"Fetal","Adult"))
rownames(samp.devStage) = cellid

true_query = data.frame(cellid = cellid) %>% mutate(
    orig.celllabel = stringr::str_split_fixed(gsub("Fetal.","",cellid),"\\.",2)[,1],
    orig.uhafcelltype = case_when(
        grepl("^Ex",orig.celllabel) ~ "Excitatory neuron",
        grepl("^In",orig.celllabel) ~ "Inhibitory neuron",
        grepl("^Neuron",orig.celllabel) ~ "Neuron",
        grepl("^quies",orig.celllabel) ~ "Neuron",
        grepl("^trans",orig.celllabel) ~ "Neuron",
        grepl("^repli",orig.celllabel) ~ "Neural progenitor cell",
        grepl("^IPC",orig.celllabel) ~ "Neural progenitor cell",
        grepl("^NEP",orig.celllabel) ~ "Neural progenitor cell",
        grepl("^OPC",orig.celllabel) ~ "Oligodendrocyte progenitor cell",
        grepl("^Oligo",orig.celllabel) ~ "Oligodendrocyte",
        grepl("^Astro",orig.celllabel) ~ "Astrocyte",
        grepl("^Microglia",orig.celllabel) ~ "Microglia",
        grepl("^Endo",orig.celllabel) ~ "Endothelial cell",
        grepl("^Per",orig.celllabel) ~ "Pericyte",
        TRUE ~ "Unclassified"),
    orig.uhaftissuetype = case_when(
        orig.celltype %in% c("Neuron","Excitatory neuron","Inhibitory neuron","Astrocyte","Oligodendrocyte","Microglia","Neural progenitor cell","Oligodendrocyte progenitor cell") ~ "Nerve tissue",
        orig.celltype %in% c("Endothelial cell","Pericyte") ~ "Epithelial tissue",
        TRUE ~ as.character(orig.celltype)))
rownames(true_query) = cellid

Step 3.2: Create Seurat object and perform quality control using Seurat

# create Seurat object
query_obj <- CreateSeuratObject(counts = as.matrix(data.matrix),  meta.data = samp.devStage)
query_obj <- AddMetaData(query_obj, metadata = true_query)

# QC
mito_genes <- rownames(query_obj)[grep("^[Mm][Tt]-",rownames(query_obj))]
query_obj <- PercentageFeatureSet(query_obj, "^MT-", col.name = "percent_mito")
ribo_genes <- rownames(query_obj)[grep("^RP[SL]",rownames(query_obj))]
query_obj <- PercentageFeatureSet(query_obj, "^RP[SL]", col.name = "percent_ribo")
hemoglobin_genes <- rownames(query_obj)[grep("^HB[^(P)]",rownames(query_obj))]
query_obj <- PercentageFeatureSet(query_obj, "^HB[^(P)]", col.name = "percent_hb")
selected_c <- WhichCells(query_obj, expression = nFeature_RNA > 200)# for UMI
selected_f <- rownames(query_obj)[Matrix::rowSums(query_obj.filt) > 3]
query_obj.filt <- subset(query_obj, features = selected_f, cells = selected_c)

Step 3.3: Run standard Seurat pipeline to obtain clusters

query_obj.filt <- NormalizeData(query_obj.filt, normalization.method = "LogNormalize", scale.factor = 10000)
query_obj.filt <- FindVariableFeatures(query_obj.filt, selection.method = "vst", nfeatures = 2000)
query_obj.filt <- ScaleData(query_obj.filt, features = VariableFeatures(query_obj.filt), vars.to.regress = c("nFeature_RNA")) # UMI dataset
query_obj.filt = RunPCA(query_obj.filt, npcs = 20, verbose = F)
query_obj.filt <- JackStraw(query_obj.filt, num.replicate = 100)
query_obj.filt <- ScoreJackStraw(query_obj.filt, dims = 1:20)
pcDim = 8

query_obj.filt <- FindNeighbors(query_obj.filt, dims = 1:pcDim, nn.eps = 0.5)
query_obj.filt <- FindClusters(query_obj.filt, resolution = 0.5, n.start = 10)
query_obj.filt = RunUMAP(query_obj.filt, dims = 1:pcDim, verbose = F)

# DimPlot(query_obj.filt, reduction="umap")

# Finding cluster markers
query_obj.markers <- FindAllMarkers(query_obj.filt, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

Step 3.4: Assign uHAF labels to clusters

uhaf.anno <- data.frame(seurat.clust = query_obj.filt$seurat_clusters) %>% mutate(uhaf.cell_type = case_when(
    seurat.clust %in% c(0:2,4,7,16,17,20) ~ "Excitatory neuron",
    seurat.clust == 3 ~ "PV inhibitory neuron",
    seurat.clust == 18 ~ "Inhibitory neuron",
    seurat.clust %in% c(8,10) ~ "VIP inhibitory neuron",
    seurat.clust == 9 ~ "SST inhibitory neuron",
    seurat.clust %in% c(5,12,19) ~ "Astrocyte",
    seurat.clust %in% c(6,11,15) ~ "Oligodendrocyte",
    seurat.clust %in% c(13,21) ~ "Oligodendrocyte progenitor cell",
    seurat.clust == 14 ~ "Endothelial cell"))
rownames(uhaf.anno) = colnames(query_obj.filt)
levels(uhaf.anno$uhaf.cell_type) = levels(uhaf.anno$seurat.clust)
query_obj.filt <- AddMetaData(query_obj.filt, metadata = uhaf.anno)
Step 4: perform label transfer with SingleR
Now we will use SingleR to transfer labels from reference data to query data.

Step 4.1: Train SingleR model

gene.use = intersect(row.names(expr),row.names(query_obj.filt))
expr <- exprs[gene.use,]
ct.ref <- meta$cell_type
trainedR <- trainSingleR(expr, ct.ref, de.method = "wilcox")

Step 4.2: Predict labels of query data

### predict labels for query
cell.use = grepl("neuron", query_obj.filt@meta.data[, "uhaf.cell_type"])
#obj.query = query_obj.filt[gene.use,cell.use]
obj.query = query_obj.filt[gene.use,]
predict <- classifySingleR(obj.query@assays$RNA@data,trainedR)

Step 4.3: Check prediction results

truth <- obj.query$uhaf.cell_type
confMat = caret::confusionMatrix(factor(predict$pruned.labels,levels=unique(predict$labels)),factor(truth,levels=unique(predict$labels)))
confMat

(Optional) Step 4.4: Add prediction label to query object

df.result <- predict[,c("pruned.labels","labels")]
df.result$truth <- obj.query$uhaf.cell_type
df.result$X <- rownames(df.result)
df.result <- data.frame(df.result)
row.names(df.result) = df.result$X
singleR.anno = df.result[,"pruned.labels", drop = F]
query_obj.filt <- AddMetaData(query_obj.filt, metadata = singleR.anno)
query_obj.filtNeuron <- AddMetaData(obj.query, metadata = singleR.anno)

(Optional) Step 5: Check the accuracy of DE genes after label transfer

## identify cell type-specific markers for uhaf cell type label
query_obj.filt <- SetIdent(query_obj.filt,cells = colnames(query_obj.filt), value = query_obj.filt$uhaf.cell_type)
uhafcelltype_markers <- FindAllMarkers(query_obj.filt, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
uhafcelltype_markers.top100 <- uhafcelltype_markers %>% group_by(cluster) %>% top_n(n = 100, wt = avg_log2FC)

## identify cell type-specific markers for singleR cell type label
query_obj.filt <- SetIdent(query_obj.filt, cells = colnames(query_obj.filt), value = query_obj.filt$pruned.labels)
saveRDS(query_obj.filt, file = "PsychEncode_UMI.query_obj.filt.rds")
singleRanno_markers <- FindAllMarkers(query_obj.filt, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
singleRanno_markers.top100 <- singleRanno_markers %>% group_by(cluster) %>% top_n(n = 100, wt = avg_log2FC)

## identify cell type-specific markers for singleR cell type label in non-neuron filtered query object
query_obj.filtNeuron <- SetIdent(query_obj.filtNeuron, cells = colnames(query_obj.filtNeuron), value = query_obj.filtNeuron$pruned.labels)
saveRDS(query_obj.filtNeuron, file = "PsychEncode_UMI.query_obj.filtNeuron.rds")
uhafNeuronOnly_singleRanno_markers <- FindAllMarkers(query_obj.filtNeuron, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
uhafNeuronOnly_singleRanno_markers.top100 <- uhafNeuronOnly_singleRanno_markers %>% group_by(cluster) %>% top_n(n = 100, wt = avg_log2FC)

## compare two sets of markers
uhafcelltypeMarkerSummary = uhafcelltype_markers %>% group_by(cluster) %>% summarise(markerNum = n())
singleRannoMarkerSummary = singleRanno_markers %>% group_by(cluster) %>% summarise(markerNum = n())
uhafNeuronOnlySingleRannoMarkerSummary = uhafNeuronOnly_singleRanno_markers %>% group_by(cluster) %>% summarise(markerNum = n())

marker = merge(uhafcelltype_markers,singleRanno_markers, by = "gene", suffixes = c(".manualAnno",".singlerAnno"))
markerSummary = marker %>% filter(cluster.manualAnno == cluster.singlerAnno) %>% group_by(cluster.manualAnno) %>% summarise(commonMarkerNum = n()) %>% rename(cluster = cluster.manualAnno) %>% inner_join(merge(uhafcelltypeMarkerSummary, singleRannoMarkerSummary, by = "cluster", suffixes = c(".manualAnno",".singlerAnno")), by = "cluster") %>% mutate(singleRannoMarkerAccuracy = commonMarkerNum/markerNum.singlerAnno)

## plotting
p10 <- cowplot::plot_grid(ncol = 2,
    DimPlot(query_obj.filt, reduction = "umap", label = TRUE, group.by = "orig.celllabel") + NoAxes(),
    DimPlot(query_obj.filt, reduction = "umap", label = TRUE, group.by = "seurat_clusters") + NoAxes(),
    DimPlot(query_obj.filt, reduction = "umap", label = TRUE, group.by = "uhaf.cell_type") + NoAxes(),
    DimPlot(query_obj.filt, reduction = "umap", label = TRUE, group.by = "pruned.labels") + NoAxes()
)
ggsave(paste0(que_name,".umapAll.tiff"), plot = p10, width=18, height=10, dpi = 300)