.. _LabelTransfer_batch1_label: 5. LabelTransfer batch1 ======================= .. code:: r suppressPackageStartupMessages({ library(reticulate) library(Seurat) library(dplyr) library(ggpubr) library(sceasy) library(future) }) # utilities o<- function(w,h) options(repr.plot.width=w, repr.plot.height=h) manycolors=c('#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#4f34ff', '#f340F0') covid.color.table= c( 'B'="#FFF3B0", 'Plasma'='#E09F3E', 'T'='#2F6D9D', 'T CD4'='#68BDDF', 'T CD8'='#FF046E', 'CD8 cytotoxic T'='#9D2A2B', 'T cycling'='#ADA7FF', 'Treg'='#493A9D', 'NKT'='#3300FF', 'NK'='#D543FF', 'gdT'='#6A24FF', 'Neu'='#312697', 'macrophage'='#429783', 'Mast'='#FF6B3E', 'cDC1'='#FF8F39', 'cDC2'='#E7FF3F', 'moDC'='#FFBCE1', 'pDC'='#FF72DB', 'DC'='#FF0022', 'Mye'='#19C3BE' ) heca.color.table=c( 'B cell'='#FFF3B0', 'CD8 T cell'='#FF046E', 'Dendritic cell'='#FF0022', 'Macrophage'='#429783', 'Mast cell'='#FF6B3E', 'Megakaryocyte'='gray', 'Monocyte'='#429753', 'Myeloid cell'='#19C3BE', 'NK cell'='#D543FF', 'Neutrophilic granunocyte'='#312697', 'Plasma B cell'='#E09F3E', 'T cell'='#2F6D9D' ) load data ========= load hECA data -------------- .. code:: r # convert h5ad to rds format so that R can read it sceasy::convertFormat('./lungimm_heca.h5ad', from="anndata", to="seurat", outFile='lungimm_heca.rds') .. parsed-literal:: X -> counts Warning message: “Keys should be one or more alphanumeric characters followed by an underscore, setting key from DMAPharmony_bydonor_ to DMAPharmonybydonor_” Warning message: “All keys should be one or more alphanumeric characters followed by an underscore '_', setting key to DMAPharmonybydonor_” Warning message: “Keys should be one or more alphanumeric characters followed by an underscore, setting key from PCAharmony_bydonor_ to PCAharmonybydonor_” Warning message: “All keys should be one or more alphanumeric characters followed by an underscore '_', setting key to PCAharmonybydonor_” .. parsed-literal:: An object of class Seurat 20492 features across 53299 samples within 1 assay Active assay: RNA (20492 features, 0 variable features) 6 dimensional reductions calculated: densmap, densmap_harmonyByDonor, dmap_harmony_bydonor, pca, pca_harmony_bydonor, umap .. code:: r # read saved hECA data heca = readRDS('./lungimm_heca.rds') .. code:: r # create a clean seurat object heca.full = CreateSeuratObject(heca@assays$RNA@data, meta.data = heca@meta.data) heca.full@assays$RNA@data <- heca.full@assays$RNA@counts heca.full[["umap"]] <- CreateDimReducObject(heca@reductions$umap@cell.embeddings, key = "umap_") .. parsed-literal:: Warning message: “No assay specified, setting assay as RNA by default.” .. code:: r # show different batches o(15,5) DimPlot(heca.full, group.by='ann',cols = heca.color.table, split.by = 'study_id', pt.size=0.01, label=T, repel=T) # print study_id unique(heca.full@meta.data$study_id) .. raw:: html

10.1038/s41591-019-0468-5
10.1186/s13059-019-1906-x
10.1038/s41586-020-2157-4

Levels:

'10.1038/s41586-020-2157-4'
'10.1038/s41591-019-0468-5'
'10.1186/s13059-019-1906-x'

.. image:: output_6_1.png :width: 900px :height: 300px .. code:: r heca1 = subset(heca.full, study_id=='10.1038/s41586-020-2157-4') .. code:: r #heca2 = subset(heca.full, study_id=='10.1186/s13059-019-1906-x') load COVID data --------------- .. code:: r load("./nCoV.integrated.annotated.rda") .. code:: r o(9,8) DimPlot(immune.combined, group.by='persample_ann', label=T, repel=T, cols=covid.color.table) + ggtitle( "COVID19 lung immune cells")& theme(axis.title = element_blank(), axis.text = element_blank(), axis.ticks=element_blank()) cairo_pdf(filename = 'covid19.umap.pdf', width = 9,height = 8) DimPlot(immune.combined, group.by='persample_ann', label=T, repel=T, cols=covid.color.table) + ggtitle( "COVID19 lung immune cells")& theme(axis.title = element_blank(), axis.text = element_blank(), axis.ticks=element_blank()) dev.off() .. raw:: html png: 2 .. image:: output_11_1.png :width: 540px :height: 480px batch-1 label transfer ====================== .. code:: r # find genes for label transfer DefaultAssay(heca1)<-'RNA' heca1 = FindVariableFeatures(heca1, selection.method="vst", nfeatures=2000, verbose=F) DefaultAssay(immune.combined)<-'RNA' immune.combined = FindVariableFeatures(immune.combined, selection.method="vst", nfeatures=2000, verbose=F) .. code:: r #mask horizontal genes x=heca1 masked.genes<-c( c(grep("^RPL", rownames(x), value = T),grep("^RPS", rownames(x), value = T)), grep("^MT-", rownames(x), value = T), c(grep("^IFI", rownames(x), value = T),grep("^ISG", rownames(x), value = T)), grep("^SMC[0-9]*", rownames(x), value = T), grep("^MCM[0-9]*", rownames(x), value = T), c(grep("^TUBA", rownames(x), value = T), grep("^TUBB", rownames(x), value = T), grep("^TUBD[0-9]", rownames(x), value = T), grep("^TUBE[0-9]", rownames(x), value = T), grep("^TUBG[0-9]", rownames(x), value = T) ), c(Seurat::cc.genes.updated.2019$s.genes, Seurat::cc.genes.updated.2019$g2m.genes ), c("H1-0","H1-1","H1-10","H1-12P","H1-2","H1-3","H1-4","H1-5","H1-6","H1-7","H1-8", "H1-9P","H2AB1","H2AB2","H2AB3","H2AC1","H2AC10P","H2AC11","H2AC12","H2AC13","H2AC14", "H2AC15","H2AC16","H2AC17","H2AC18","H2AC19","H2AC20","H2AC21","H2AC2P","H2AC3P","H2AC4", "H2AC5P","H2AC6","H2AC7","H2AC8","H2AC9P","H2AJ","H2AL1MP","H2AL1Q","H2AL3", "H2AP","H2AQ1P","H2AW","H2AX","H2AZ1","H2AZ2","MACROH2A1","MACROH2A2","H2BC1","H2BC10", "H2BC11","H2BC12","H2BC13","H2BC14","H2BC15","H2BC16P","H2BC17","H2BC18", "H2BC19P","H2BC20P","H2BC21","H2BC2P","H2BC3","H2BC4","H2BC5","H2BC6","H2BC7","H2BC8", "H2BC9","H2BK1","H2BL1P","H2BN1","H2BC12L","H2BU1","H2BU2P","H2BW1","H2BW2", "H2BW3P","H2BW4P","H3-7","H3-3A","H3-3B","H3-4","H3-5","H3C1","H3C10","H3C11","H3C12", "H3C13","H3C14","H3C15","H3C2","H3C3","H3C4","H3C5P","H3C6","H3C7","H3C8", "H3C9P","H3Y1","H3Y2","CENPA","H4-16","H4C1","H4C10P","H4C11","H4C12","H4C13","H4C14", "H4C15","H4C2","H4C3","H4C4","H4C5","H4C6","H4C7","H4C8","H4C9"), grep("^HIST", rownames(x), value=T) ) goi1 = setdiff( VariableFeatures(heca1), masked.genes) # mask horizontal genes x=immune.combined masked.genes<-c( c(grep("^RPL", rownames(x), value = T),grep("^RPS", rownames(x), value = T)), grep("^MT-", rownames(x), value = T), c(grep("^IFI", rownames(x), value = T),grep("^ISG", rownames(x), value = T)), grep("^SMC[0-9]*", rownames(x), value = T), grep("^MCM[0-9]*", rownames(x), value = T), c(grep("^TUBA", rownames(x), value = T), grep("^TUBB", rownames(x), value = T), grep("^TUBD[0-9]", rownames(x), value = T), grep("^TUBE[0-9]", rownames(x), value = T), grep("^TUBG[0-9]", rownames(x), value = T) ), c(Seurat::cc.genes.updated.2019$s.genes, Seurat::cc.genes.updated.2019$g2m.genes ), c("H1-0","H1-1","H1-10","H1-12P","H1-2","H1-3","H1-4","H1-5","H1-6","H1-7","H1-8", "H1-9P","H2AB1","H2AB2","H2AB3","H2AC1","H2AC10P","H2AC11","H2AC12","H2AC13","H2AC14", "H2AC15","H2AC16","H2AC17","H2AC18","H2AC19","H2AC20","H2AC21","H2AC2P","H2AC3P","H2AC4", "H2AC5P","H2AC6","H2AC7","H2AC8","H2AC9P","H2AJ","H2AL1MP","H2AL1Q","H2AL3", "H2AP","H2AQ1P","H2AW","H2AX","H2AZ1","H2AZ2","MACROH2A1","MACROH2A2","H2BC1","H2BC10", "H2BC11","H2BC12","H2BC13","H2BC14","H2BC15","H2BC16P","H2BC17","H2BC18", "H2BC19P","H2BC20P","H2BC21","H2BC2P","H2BC3","H2BC4","H2BC5","H2BC6","H2BC7","H2BC8", "H2BC9","H2BK1","H2BL1P","H2BN1","H2BC12L","H2BU1","H2BU2P","H2BW1","H2BW2", "H2BW3P","H2BW4P","H3-7","H3-3A","H3-3B","H3-4","H3-5","H3C1","H3C10","H3C11","H3C12", "H3C13","H3C14","H3C15","H3C2","H3C3","H3C4","H3C5P","H3C6","H3C7","H3C8", "H3C9P","H3Y1","H3Y2","CENPA","H4-16","H4C1","H4C10P","H4C11","H4C12","H4C13","H4C14", "H4C15","H4C2","H4C3","H4C4","H4C5","H4C6","H4C7","H4C8","H4C9"), grep("^HIST", rownames(x), value=T) ) goi2 = setdiff( VariableFeatures(immune.combined), masked.genes) .. code:: r goi= unique(c(goi1,goi2)) length(goi) .. raw:: html 3254 .. code:: r plan("sequential") options(future.globals.maxSize = 1000 * 1024^2*4) plan("multiprocess", workers = 14) heca1 <- ScaleData(heca1, features = goi) immune.combined <- ScaleData(immune.combined, features = goi) plan("sequential") .. parsed-literal:: Warning message: “Strategy 'multiprocess' is deprecated in future (>= 1.20.0). Instead, explicitly specify either 'multisession' or 'multicore'. In the current R session, 'multiprocess' equals 'multicore'.” Centering and scaling data matrix Centering and scaling data matrix .. code:: r plan("sequential") options(future.globals.maxSize = 1000 * 1024^2*4) plan("multiprocess", workers = 14) transfer.anchors <- FindTransferAnchors(reference = heca1, query = immune.combined, dims = 1:30, features=goi) plan("sequential") .. parsed-literal:: Warning message: “207 features of the features specified were not present in both the reference query assays. Continuing with remaining 3047 features.” Performing PCA on the provided reference using 3047 features as input. Projecting cell embeddings Finding neighborhoods Finding anchors Found 5294 anchors Filtering anchors Retained 1927 anchors .. code:: r predictions <- TransferData(anchorset = transfer.anchors, refdata = heca1$ann, dims = 1:30) immune.combined <- AddMetaData(immune.combined, metadata = predictions) .. parsed-literal:: Finding integration vectors Finding integration vector weights Predicting cell labels .. code:: r o(10,8) DimPlot(immune.combined, group.by='predicted.id', label=T, repel=T, cols=heca.color.table, order='Neutrophilic granunocyte') DimPlot(immune.combined, group.by='persample_ann', label=T, repel=T, cols=covid.color.table) .. image:: output_19_0.png :width: 600px :height: 480px .. image:: output_19_1.png :width: 600px :height: 480px .. code:: r o(10,8) DimPlot(immune.combined, group.by='predicted.id', label=T, repel=T, cols=heca.color.table) DimPlot(immune.combined, group.by='persample_ann', label=T, repel=T, cols=covid.color.table) .. image:: output_20_0.png :width: 600px :height: 480px .. image:: output_20_1.png :width: 600px :height: 480px .. code:: r matched <- c( paste('B' , 'B cell', sep="_"), paste('Plasma' , 'Plasma B cell', sep="_"), paste('Plasma' , 'B cell', sep="_"), paste('T' , 'CD8 T cell', sep="_"), paste('T' , 'T cell', sep="_"), paste('T CD4' , 'T cell', sep="_"), paste('T CD8' , 'T cell', sep="_"), paste('T CD8' , 'CD8 T cell', sep="_"), paste('CD8 cytotoxic T' , 'T cell', sep="_"), paste('CD8 cytotoxic T' , 'CD8 T cell', sep="_"), paste('T cycling' , 'T cell', sep="_"), paste('T cycling' , 'CD8 T cell', sep="_"), paste('Treg' , 'T cell', sep="_"), paste('gdT' , 'T cell', sep="_"), paste('NKT' , 'T cell', sep="_"), paste('NK' , 'NK cell', sep="_"), paste('Neu' , 'Neutrophilic granulocyte', sep="_"), paste('macrophage' , 'Macrophage', sep="_"), paste('macrophage' , 'Monocyte', sep="_"), paste('macrophage' , 'Myeloid cell', sep="_"), paste('Mast' , 'Mast cell', sep="_"), paste('cDC1' , 'Dendritic cell', sep="_"), paste('cDC1' , 'Myeloid cell', sep="_"), paste('cDC2' , 'Dendritic cell', sep="_"), paste('cDC2' , 'Myeloid cell', sep="_"), paste('moDC' , 'Dendritic cell', sep="_"), paste('pDC' , 'Dendritic cell', sep="_"), paste('DC' , 'Dendritic cell', sep="_"), paste('Mye' , 'Macrophage', sep="_"), paste('Mye' , 'Mast cell', sep="_"), paste('Mye' , 'Megakaryocyte', sep="_"), paste('Mye' , 'Dendritic cell', sep="_"), paste('Mye' , 'Monocyte', sep="_"), paste('Mye' , 'Neutrophilic granulocyte', sep="_"), paste('Mye' , 'Myeloid cell', sep="_") ) .. code:: r # calculate accuracy meta=immune.combined@meta.data meta = meta[, c('persample_ann','predicted.id')] meta$joint <- paste(meta$persample_ann, meta$predicted.id, sep='_') meta$isMatched <- (meta$joint %in% matched) | (meta$persample_ann == meta$predicted.id) acc = (meta %>% filter(isMatched==T) %>% nrow) / (meta %>% nrow) print( acc ) .. parsed-literal:: [1] 0.8283611 .. code:: r # draw matrix ## create an empty adj matrix rows = as.character( unique(immune.combined@meta.data$persample_ann) ) # from:covid ground-truth cols = as.character( unique(heca.full@meta.data$cell_type) ) # to:predicted to mat <- matrix(0, length(rows), length(cols)) rownames(mat) <- rows colnames(mat) <- cols # get an edge list edge.list = meta %>% count(persample_ann, predicted.id, sort = TRUE) # fill in the edge list for(i in 1:NROW(edge.list)) mat[ edge.list[i,1], edge.list[i,2] ] <- edge.list[i,3] # SEE UPDATE row.order= c('Plasma','B', 'Treg','T cycling','T CD4','T','NKT','gdT','CD8 cytotoxic T', 'T CD8','NK', 'Neu','DC','cDC2','cDC1','macrophage','moDC','pDC','Mast' ) col.order=c('Plasma B cell','B cell', 'T cell','CD8 T cell','NK cell', 'Neutrophilic granulocyte', 'Macrophage','Dendritic cell','Myeloid cell','Mast cell','Monocyte','Megakaryocyte') # row-sum scaling mat = mat/rowSums(mat) mat = mat[row.order,col.order] library(pheatmap) o(5,5) pheatmap(mat, cluster_rows = F, cluster_cols = F,main='hECA batch1 reference') .. image:: output_23_0.png :width: 300px :height: 300px .. code:: r library('mclust') adjustedRandIndex(meta$persample_ann, meta$predicted.id) .. parsed-literal:: Package 'mclust' version 5.4.9 Type 'citation("mclust")' for citing this R package in publications. .. raw:: html 0.305493881788284