.. _Vignette_LabelTransferNeuron_label:

Label transfer using customized neuron cell reference
=====================================================

In this vignette, we will explore how to use hECA build customized
reference dataset of the same cell type and perform label transfer to
annotate other cells.

**Step 1: load packages**

.. code:: r

    library(Seurat)
    library(SingleR)
    library(tidyverse)

| **Step 2: build reference dataset from hECA**
| In this experiment we will build reference of neuron cells.
| 2.1 - Open `hECA website <http://eca.xglab.tech/>`__, click “Cell
  Sorting” in the menu.
| 2.2 - “Add Filters” - “Cell Type” - type in “Neuron”, click to select
  and include subtypes as default


| 2.3 - Click “Apply”. After seconds, click “Download Data” to download
  the keys of sorted cells, a csv file “keys.csv” will be downloaded.
| 

2.4 - Download cells following the tutorials of
`ECAUGT <http://159.226.47.242:60080/ecaugt/ECAUGT_CD19.html#get-gene-expression-data-and-metadata>`__
and save the results to csv file (in Python):

.. code:: python

   rows_to_get = pd.read_csv('keys.csv')
   rows_to_get = [[('cid',i)] for i in rows_to_get['cid']]
   result = ECAUGT.get_columnsbycell_para(rows_to_get = rows_to_get, cols_to_get=None,
                                          col_filter=gene_condition, do_transfer = True,
                                          thread_num = multiprocessing.cpu_count()-1)
   genes = result.columns[:43878]
   metaCols = result.columns[43878:43878+18]
   expr = result.loc[:,genes]
   meta = result.loc[:,metaCols]
   expr.to_csv("hECA_exprs.csv", index=True)
   meta.to_csv("hECA_metadata.csv", index=True)

2.5 - Load downloaded expression matrix and metadata as customized
reference dataset (continue in R).

.. code:: r

    expr <- read.csv("hECA_exprs.csv", header=T, row.names=1)
    meta <- read.csv("hECA_metadata.csv", header=T, row.names=1)

| **Step 3: load query data and perform pre-processing**
| In this step we will load an external dataset of neuron cells as query
  data, downloaded from PsychENCODE
  (https://explorer.nimhgenetics.org/).
| The download url is
  http://resource.psychencode.org/Datasets/Derived/SC_Decomp/DER-22_Single_cell_expression_raw_UMI.tsv
  .
| Due to the differences between names of genes and cell types, some
  pre-processing steps are needed.

Step 3.1: uniform cell type names

.. code:: r

    que_file = "DER-22_Single_cell_expression_raw_UMI.tsv"
    data.matrix <- read.table(que_file, h = T, row.names = 1)
    cellid = colnames(data.matrix)
    samp.devStage <- data.frame(samp.devStage = ifelse(grepl("^Fetal",cellid),"Fetal","Adult"))
    rownames(samp.devStage) = cellid
    
    true_query = data.frame(cellid = cellid) %>% mutate(
        orig.celllabel = stringr::str_split_fixed(gsub("Fetal.","",cellid),"\\.",2)[,1], 
        orig.uhafcelltype = case_when(
            grepl("^Ex",orig.celllabel) ~ "Excitatory neuron",
            grepl("^In",orig.celllabel) ~ "Inhibitory neuron",
            grepl("^Neuron",orig.celllabel) ~ "Neuron",
            grepl("^quies",orig.celllabel) ~ "Neuron",
            grepl("^trans",orig.celllabel) ~ "Neuron",
            grepl("^repli",orig.celllabel) ~ "Neural progenitor cell",
            grepl("^IPC",orig.celllabel) ~ "Neural progenitor cell",
            grepl("^NEP",orig.celllabel) ~ "Neural progenitor cell",
            grepl("^OPC",orig.celllabel) ~ "Oligodendrocyte progenitor cell",
            grepl("^Oligo",orig.celllabel) ~ "Oligodendrocyte",
            grepl("^Astro",orig.celllabel) ~ "Astrocyte",
            grepl("^Microglia",orig.celllabel) ~ "Microglia",
            grepl("^Endo",orig.celllabel) ~ "Endothelial cell",
            grepl("^Per",orig.celllabel) ~ "Pericyte",
            TRUE ~ "Unclassified"),
        orig.uhaftissuetype = case_when(
            orig.celltype %in% c("Neuron","Excitatory neuron","Inhibitory neuron","Astrocyte","Oligodendrocyte","Microglia","Neural progenitor cell","Oligodendrocyte progenitor cell") ~ "Nerve tissue",
            orig.celltype %in% c("Endothelial cell","Pericyte") ~ "Epithelial tissue",
            TRUE ~ as.character(orig.celltype)))
    rownames(true_query) = cellid


Step 3.2: Create Seurat object and perform quality control using Seurat

.. code:: r

    # create Seurat object
    query_obj <- CreateSeuratObject(counts = as.matrix(data.matrix),  meta.data = samp.devStage)
    query_obj <- AddMetaData(query_obj, metadata = true_query)
    
    # QC  
    mito_genes <- rownames(query_obj)[grep("^[Mm][Tt]-",rownames(query_obj))]
    query_obj <- PercentageFeatureSet(query_obj, "^MT-", col.name = "percent_mito")
    ribo_genes <- rownames(query_obj)[grep("^RP[SL]",rownames(query_obj))]
    query_obj <- PercentageFeatureSet(query_obj, "^RP[SL]", col.name = "percent_ribo")
    hemoglobin_genes <- rownames(query_obj)[grep("^HB[^(P)]",rownames(query_obj))]
    query_obj <- PercentageFeatureSet(query_obj, "^HB[^(P)]", col.name = "percent_hb")
    selected_c <- WhichCells(query_obj, expression = nFeature_RNA > 200)# for UMI
    selected_f <- rownames(query_obj)[Matrix::rowSums(query_obj.filt) > 3]
    query_obj.filt <- subset(query_obj, features = selected_f, cells = selected_c)


Step 3.3: Run standard Seurat pipeline to obtain clusters

.. code:: r

    query_obj.filt <- NormalizeData(query_obj.filt, normalization.method = "LogNormalize", scale.factor = 10000)
    query_obj.filt <- FindVariableFeatures(query_obj.filt, selection.method = "vst", nfeatures = 2000)
    query_obj.filt <- ScaleData(query_obj.filt, features = VariableFeatures(query_obj.filt), vars.to.regress = c("nFeature_RNA")) # UMI dataset
    query_obj.filt = RunPCA(query_obj.filt, npcs = 20, verbose = F)
    query_obj.filt <- JackStraw(query_obj.filt, num.replicate = 100)
    query_obj.filt <- ScoreJackStraw(query_obj.filt, dims = 1:20)
    pcDim = 8
    
    query_obj.filt <- FindNeighbors(query_obj.filt, dims = 1:pcDim, nn.eps = 0.5)
    query_obj.filt <- FindClusters(query_obj.filt, resolution = 0.5, n.start = 10)
    query_obj.filt = RunUMAP(query_obj.filt, dims = 1:pcDim, verbose = F)
    
    # DimPlot(query_obj.filt, reduction="umap")
    
    # Finding cluster markers 
    query_obj.markers <- FindAllMarkers(query_obj.filt, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)


Step 3.4: Assign uHAF labels to clusters

.. code:: r

    uhaf.anno <- data.frame(seurat.clust = query_obj.filt$seurat_clusters) %>% mutate(uhaf.cell_type = case_when(
        seurat.clust %in% c(0:2,4,7,16,17,20) ~ "Excitatory neuron",
        seurat.clust == 3 ~ "PV inhibitory neuron",
        seurat.clust == 18 ~ "Inhibitory neuron",
        seurat.clust %in% c(8,10) ~ "VIP inhibitory neuron",
        seurat.clust == 9 ~ "SST inhibitory neuron",
        seurat.clust %in% c(5,12,19) ~ "Astrocyte",
        seurat.clust %in% c(6,11,15) ~ "Oligodendrocyte",
        seurat.clust %in% c(13,21) ~ "Oligodendrocyte progenitor cell",
        seurat.clust == 14 ~ "Endothelial cell"))
    rownames(uhaf.anno) = colnames(query_obj.filt)
    levels(uhaf.anno$uhaf.cell_type) = levels(uhaf.anno$seurat.clust)
    query_obj.filt <- AddMetaData(query_obj.filt, metadata = uhaf.anno)


| **Step 4: perform label transfer with SingleR**
| Now we will use SingleR to transfer labels from reference data to
  query data.

Step 4.1: Train SingleR model

.. code:: r

    gene.use = intersect(row.names(expr),row.names(query_obj.filt))
    expr <- exprs[gene.use,]
    ct.ref <- meta$cell_type
    trainedR <- trainSingleR(expr, ct.ref, de.method = "wilcox")

Step 4.2: Predict labels of query data

.. code:: r

    ### predict labels for query
    cell.use = grepl("neuron", query_obj.filt@meta.data[, "uhaf.cell_type"]) 
    #obj.query = query_obj.filt[gene.use,cell.use]
    obj.query = query_obj.filt[gene.use,]
    predict <- classifySingleR(obj.query@assays$RNA@data,trainedR)

Step 4.3: Check prediction results

.. code:: r

    truth <- obj.query$uhaf.cell_type
    confMat = caret::confusionMatrix(factor(predict$pruned.labels,levels=unique(predict$labels)),factor(truth,levels=unique(predict$labels)))
    confMat

(Optional) Step 4.4: Add prediction label to query object

.. code:: r

    df.result <- predict[,c("pruned.labels","labels")]
    df.result$truth <- obj.query$uhaf.cell_type
    df.result$X <- rownames(df.result)
    df.result <- data.frame(df.result)
    row.names(df.result) = df.result$X
    singleR.anno = df.result[,"pruned.labels", drop = F]
    query_obj.filt <- AddMetaData(query_obj.filt, metadata = singleR.anno)
    query_obj.filtNeuron <- AddMetaData(obj.query, metadata = singleR.anno)

**(Optional) Step 5: Check the accuracy of DE genes after label
transfer**

.. code:: r

    ## identify cell type-specific markers for uhaf cell type label
    query_obj.filt <- SetIdent(query_obj.filt,cells = colnames(query_obj.filt), value = query_obj.filt$uhaf.cell_type)
    uhafcelltype_markers <- FindAllMarkers(query_obj.filt, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
    uhafcelltype_markers.top100 <- uhafcelltype_markers %>% group_by(cluster) %>% top_n(n = 100, wt = avg_log2FC) 
    
    ## identify cell type-specific markers for singleR cell type label
    query_obj.filt <- SetIdent(query_obj.filt, cells = colnames(query_obj.filt), value = query_obj.filt$pruned.labels)
    saveRDS(query_obj.filt, file = "PsychEncode_UMI.query_obj.filt.rds")
    singleRanno_markers <- FindAllMarkers(query_obj.filt, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
    singleRanno_markers.top100 <- singleRanno_markers %>% group_by(cluster) %>% top_n(n = 100, wt = avg_log2FC) 
    
    ## identify cell type-specific markers for singleR cell type label in non-neuron filtered query object
    query_obj.filtNeuron <- SetIdent(query_obj.filtNeuron, cells = colnames(query_obj.filtNeuron), value = query_obj.filtNeuron$pruned.labels)
    saveRDS(query_obj.filtNeuron, file = "PsychEncode_UMI.query_obj.filtNeuron.rds")
    uhafNeuronOnly_singleRanno_markers <- FindAllMarkers(query_obj.filtNeuron, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
    uhafNeuronOnly_singleRanno_markers.top100 <- uhafNeuronOnly_singleRanno_markers %>% group_by(cluster) %>% top_n(n = 100, wt = avg_log2FC) 
    
    ## compare two sets of markers
    uhafcelltypeMarkerSummary = uhafcelltype_markers %>% group_by(cluster) %>% summarise(markerNum = n())
    singleRannoMarkerSummary = singleRanno_markers %>% group_by(cluster) %>% summarise(markerNum = n())
    uhafNeuronOnlySingleRannoMarkerSummary = uhafNeuronOnly_singleRanno_markers %>% group_by(cluster) %>% summarise(markerNum = n())
    
    marker = merge(uhafcelltype_markers,singleRanno_markers, by = "gene", suffixes = c(".manualAnno",".singlerAnno"))
    markerSummary = marker %>% filter(cluster.manualAnno == cluster.singlerAnno) %>% group_by(cluster.manualAnno) %>% summarise(commonMarkerNum = n()) %>% rename(cluster = cluster.manualAnno) %>% inner_join(merge(uhafcelltypeMarkerSummary, singleRannoMarkerSummary, by = "cluster", suffixes = c(".manualAnno",".singlerAnno")), by = "cluster") %>% mutate(singleRannoMarkerAccuracy = commonMarkerNum/markerNum.singlerAnno)
    
    ## plotting
    p10 <- cowplot::plot_grid(ncol = 2, 
        DimPlot(query_obj.filt, reduction = "umap", label = TRUE, group.by = "orig.celllabel") + NoAxes(),
        DimPlot(query_obj.filt, reduction = "umap", label = TRUE, group.by = "seurat_clusters") + NoAxes(),
        DimPlot(query_obj.filt, reduction = "umap", label = TRUE, group.by = "uhaf.cell_type") + NoAxes(),
        DimPlot(query_obj.filt, reduction = "umap", label = TRUE, group.by = "pruned.labels") + NoAxes()
    )
    ggsave(paste0(que_name,".umapAll.tiff"), plot = p10, width=18, height=10, dpi = 300)