.. _T_cell_analysis_label: T cell analysis with in-data cell sorting ========================================================= load ECAUGT package =================== .. code:: ipython3 # load the ECAUGT package as well as other related packages import ECAUGT import sys,time,multiprocessing import scanpy as sc import numpy as np, pandas as pd .. code:: ipython3 # set parameters endpoint = "https://HCAd-Datasets.cn-beijing.ots.aliyuncs.com" access_id = "LTAI5t7t216W9amUD1crMVos" #enter your id and keys access_key = "ZJPlUbpLCij5qUPjbsU8GnQHm97IxJ" instance_name = "HCAd-Datasets" table_name = 'HCA_d' .. code:: ipython3 # # setup client ECAUGT.Setup_Client(endpoint, access_id, access_key, instance_name, table_name) .. parsed-literal:: Connected to the server, find the table. HCA_d TableName: HCA_d PrimaryKey: [('cid', 'INTEGER')] Reserved read throughput: 0 Reserved write throughput: 0 Last increase throughput time: 1605795297 Last decrease throughput time: None table options's time to live: -1 table options's max version: 1 table options's max_time_deviation: 86400 .. parsed-literal:: 0 sorting cells from uGT ====================== sort with labels ---------------- .. code:: ipython3 query_language = "cell_type == T cell" cid_label = ECAUGT.query_cells(metadata_conditions=query_language, include_children=True) .. parsed-literal:: 51588 cells found sort with expressional conditions --------------------------------- .. code:: ipython3 query_language = "PTPRC>1.5 && (CD3D>1.5 || CD3E>1.5)" gene_condition = ECAUGT.seq2filter(query_language) df_result_tcell = ECAUGT.get_columnsbycell_para( rows_to_get = None, # make sure condition associated columns listed here cols_to_get=['organ','cell_type','CD3D','CD3E','PTPRC'], col_filter=gene_condition, do_transfer = True, thread_num = 24 ) .. parsed-literal:: 1093299 cells found .. code:: ipython3 df_result_tcell .. raw:: html
CD3D CD3E PTPRC cell_type organ
cid
35 2.910235 2.910235 3.575773 T cell Spleen
40 2.403368 0.000000 3.050255 Neutrophilic granulocyte Spleen
50 3.820847 0.000000 3.149373 T cell Spleen
126 0.000000 4.292039 3.220413 T cell Spleen
167 1.589888 0.000000 1.589888 Plasma B cell Spleen
... ... ... ... ... ...
4085793 0.000000 3.241421 3.241421 Zona fasciculata cell Adrenal gland
4089833 2.782014 0.000000 2.782014 Neutrophilic granulocyte Adrenal gland
4092102 0.000000 3.204398 3.204398 Neutrophilic granulocyte Adrenal gland
4092673 2.709732 0.000000 2.709732 Neutrophilic granulocyte Adrenal gland
4093924 2.256228 0.000000 2.256228 Zona fasciculata cell Adrenal gland

14710 rows × 5 columns

.. code:: ipython3 cid_expression = [[('cid',i)] for i in df_result_tcell.index] merge the cid lists from both sources ------------------------------------- .. code:: ipython3 # merge and remove duplicated cids cid_list = set() for i in range(len(cid_expression)): cid= cid_expression[i][0][1] cid_list.add(cid) for i in range(len(cid_label)): cid= cid_label[i][0][1] cid_list.add(cid) cid_list = list(cid_list) # print number of obtained cids print(len(cid_list)) # build rows_to_get variable to download data rows_to_get = [[('cid',i)] for i in cid_list] .. parsed-literal:: 56540 .. code:: ipython3 import pickle pickle.HIGHEST_PROTOCOL .. parsed-literal:: 4 .. code:: ipython3 with open('rows_to_get.pickle', 'wb') as f: pickle.dump(rows_to_get, f, protocol=4) .. code:: ipython3 import pickle with open('rows_to_get.pickle', 'rb') as f: rows_to_get=pickle.load(f ) .. code:: ipython3 from tqdm import tqdm import pickle # we suggest downloading cells in small batches in case of network issues for chunk in tqdm(range(int(1+len(rows_to_get)/500)),ncols=80): # split batches lb, rb = chunk*500, (chunk+1)*500 rows = rows_to_get[lb:rb] if len(rows)<=0:break # download rows from the unified Giant Table (uGT) result = ECAUGT.get_columnsbycell_para(rows_to_get = rows, cols_to_get = None, # download all columns col_filter = None, do_transfer = True, thread_num = 24) result.to_pickle("__temp_%d_%d.pk"%(lb,rb)) #print("downloading %d~%d"%(lb, rb)) #print(len(rows)) .. parsed-literal:: 5%|██▏ | 6/114 [09:44<2:55:11, 97.33s/it]OTS request failed, API: GetRow, HTTPStatus: 503, ErrorCode: OTSTimeout, ErrorMessage: Operation timeout., RequestID: 0005d044-f8df-f27c-613f-020a872a9e27. 100%|███████████████████████████████████████| 114/114 [3:04:25<00:00, 97.06s/it] .. 
code:: ipython3 # load split batches giant_table_list = [] for chunk in tqdm(range(int(1+len(rows_to_get)/500)),ncols=80): lb, rb = chunk*500, (chunk+1)*500 fname = "__temp_%d_%d.pk" % (lb, rb) with open(fname,'rb') as f: df=pickle.load(f) giant_table_list.append(df) .. parsed-literal:: 100%|█████████████████████████████████████████| 114/114 [00:17<00:00, 6.59it/s] .. code:: ipython3 giant_table= giant_table_list[0] for i in range(1, len(giant_table_list)): giant_table = pd.concat([ giant_table, giant_table_list[i] ]) .. code:: ipython3 # remove intermediate results del giant_table_list import gc gc.collect() giant_table.to_pickle("sorted_tcells_raw.pk") .. code:: ipython3 genes = giant_table.columns[:43878] metaCols = giant_table.columns[43878:43878+18] .. code:: ipython3 expr = giant_table.loc[:,genes] meta = giant_table.loc[:,metaCols] meta.reset_index(inplace=True) expr.reset_index(inplace=True) expr=expr.drop(['cid'], axis=1) print(expr.shape) print(meta.shape) .. parsed-literal:: (56540, 43878) (56540, 19) .. code:: ipython3 # check the sample-by-gene expression matrix expr .. raw:: html
A12M1 A12M2 A12M3 A12M4 A1BG A1BG-AS1 A1CF A2M A2M-AS1 A2ML1 ... ZXDA ZXDB ZXDC ZYG11A ZYG11AP1 ZYG11B ZYX ZYXP1 ZZEF1 ZZZ3
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
56535 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
56536 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 2.165166 0.0
56537 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 2.696737 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
56538 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
56539 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.658875 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0

56540 rows × 43878 columns

.. code:: ipython3 # check the metadata matrix meta .. raw:: html
cid cell_id cell_type cl_name donor_age donor_gender donor_id hcad_name marker_gene organ original_name region sample_status seq_tech study_id subregion tissue_type uHAF_name user_id
0 2016314 ACTGATGCAGCGTTCG-1-HCATisStab7587208 T cell T cell NA NA 356C Lung-Connective tissue-T cell-CD3D IL32 CD3D IL32 Lung T_CD4 Left lung Healthy 10X 10.1186/s13059-019-1906-x Inferior lobular Connective tissue Lung-Connective tissue-T cell-CD3D IL32 2
1 3100139 FetalMaleGonad_2.TCACTTGGACATGAGATC NK T cell NA GW11 Male Donor9 Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB MT-ATP6 MT-CYB Testis Fetal fibroblast NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Muscle tissue Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB 3
2 4063239 AdultGallbladder_2.ACAATAAATAAAGGGCGA3 T cell T cell 58yr Male AdultGallbladder2 Gallbladder-Connective tissue-T cell-IL32 IL32 Gallbladder T cell_CCL5 high NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Connective tissue Gallbladder-Connective tissue-T cell-IL32 4
3 3100141 FetalMaleGonad_2.TGATCAGTCCCGAGATGG NK T cell NA GW11 Male Donor9 Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB MT-ATP6 MT-CYB Testis Fetal fibroblast NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Muscle tissue Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB 3
4 4063252 AdultGallbladder_2.ACAATACATGATCGCACC3 Epithelial cell epithelial cell 58yr Male AdultGallbladder2 Gallbladder-Epithelial tissue-Epithelial cell-... TM4SF4 Gallbladder Mucous epithelial cell NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Epithelial tissue Gallbladder-Epithelial tissue-Epithelial cell-... 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
56535 4063198 AdultGallbladder_2.AATAAAAGCGAGAGGGTC3 T cell T cell 58yr Male AdultGallbladder2 Gallbladder-Connective tissue-T cell-IL32 IL32 Gallbladder T cell NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Connective tissue Gallbladder-Connective tissue-T cell-IL32 4
56536 1172316 FetalMuscle_1.CAACAACCGCTAGGCTGC Proliferating T cell NA GW12 Male NA Muscle-Connective tissue-Proliferating T cell-... UBE2C Muscle Proliferating cell_UBE2C high NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Connective tissue Muscle-Connective tissue-Proliferating T cell-... 1
56537 3100133 FetalMaleGonad_2.TAGCATAACCTACAAAGT NK T cell NA GW11 Male Donor9 Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB MT-ATP6 MT-CYB Testis Fetal fibroblast NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Muscle tissue Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB 3
56538 3100135 FetalMaleGonad_2.TTTAGGGTGGTACCATCT NK T cell NA GW11 Male Donor9 Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB MT-ATP6 MT-CYB Testis Fetal fibroblast NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Muscle tissue Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB 3
56539 3100136 FetalMaleGonad_2.CCATCTGCGTCCTGTGCG NK T cell NA GW11 Male Donor9 Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB MT-ATP6 MT-CYB Testis Fetal fibroblast NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Muscle tissue Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB 3

56540 rows × 19 columns

create single-cell analysis objects =================================== .. code:: ipython3 # create scanpy object from the matrices adata = sc.AnnData(X = expr, obs = meta) adata.var_names_make_unique() sc.pp.filter_genes(adata, min_counts=5) sc.pp.filter_genes(adata, min_cells=3) .. code:: ipython3 # post-processing steps from scipy.sparse import csc_matrix adata.X = csc_matrix(adata.X, dtype=np.float32) adata.obs['donor_id']=adata.obs['donor_id'].astype(str) .. code:: ipython3 adata.write_h5ad("sorted_tcells_raw.h5ad") .. parsed-literal:: /home/chensijie/software/anaconda3/envs/r411py37/lib/python3.7/site-packages/anndata/_core/anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object. c.reorder_categories(natsorted(c.categories), inplace=True) ... storing 'donor_id' as categorical filter cells for downstream analysis ==================================== .. 
code:: ipython3 # remove fibroblast sel = adata[:,"LUM"].X==0 adata = adata[sel].copy() sel = adata[:,"SERPING1"].X==0 adata = adata[sel].copy() sel = adata[:,"COL1A1"].X==0 adata = adata[sel].copy() sel = adata[:,"COL1A2"].X==0 adata = adata[sel].copy() # remove vascular endothelial cells sel = adata[:,"INMT"].X==0 adata = adata[sel].copy() # remove muscle cells sel = adata[:,"ACTA2"].X==0 adata = adata[sel].copy() # remove granulocytes sel = adata[:,"S100A8"].X==0 adata = adata[sel].copy() sel = adata[:,"S100A9"].X==0 adata = adata[sel].copy() sel = adata[:,"SIGLEC8"].X==0 adata = adata[sel].copy() # remove myeloid cells sel = adata[:,"C1QA"].X==0 adata = adata[sel].copy() sel = adata[:,"C1QB"].X==0 adata = adata[sel].copy() sel = adata[:,"C1QC"].X==0 adata = adata[sel].copy() sel = adata[:,"ITGAM"].X==0 adata = adata[sel].copy() sel = adata[:,"ITGAX"].X==0 adata = adata[sel].copy() # remove B cells sel = adata[:,"CD79A"].X==0 adata = adata[sel].copy() sel = adata[:,"CD79B"].X==0 adata = adata[sel].copy() sel = adata[:,"CD19"].X==0 adata = adata[sel].copy() sel = adata[:,"MS4A1"].X==0 adata = adata[sel].copy() # remove cells from nerve tissues sel = adata[:,"FSCN1"].X==0 adata = adata[sel].copy() .. code:: ipython3 adata .. parsed-literal:: AnnData object with n_obs × n_vars = 20553 × 20491 obs: 'cid', 'cell_id', 'cell_type', 'cl_name', 'donor_age', 'donor_gender', 'donor_id', 'hcad_name', 'marker_gene', 'organ', 'original_name', 'region', 'sample_status', 'seq_tech', 'study_id', 'subregion', 'tissue_type', 'uHAF_name', 'user_id' var: 'n_counts', 'n_cells' .. code:: ipython3 sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5) .. code:: ipython3 sc.pl.highly_variable_genes(adata) .. image:: T_cell_analysis_with_in-data_cell_sorting_files/output_35_0.png .. code:: ipython3 sc.tl.pca(adata, svd_solver='arpack') .. code:: ipython3 sc.set_figure_params(figsize=[5,5]) sc.pl.pca(adata,color="organ") .. 
image:: T_cell_analysis_with_in-data_cell_sorting_files/output_37_0.png :width: 739px :height: 357px .. code:: ipython3 sc.set_figure_params(figsize=[5,5]) sc.pl.pca(adata,color="cell_type") .. image:: T_cell_analysis_with_in-data_cell_sorting_files/output_38_0.png :width: 1341px :height: 357px .. code:: ipython3 sc.set_figure_params(figsize=[5,5]) sc.pl.pca(adata,color=["PTPRC","CD3D","CD3E","CD3G", "CD8A","CD4","CD69","CCR7"],sort_order=True) .. image:: T_cell_analysis_with_in-data_cell_sorting_files/output_39_0.png :width: 1543px :height: 709px .. code:: ipython3 adata.write_h5ad("sorted_tcells_filtered.h5ad") Now we have obtained a collection of T cells using in-data sorting. You can perform whatever analysis you like with these sorted cells. We have done some downstream metabolic pathway analysis on these cells using GSVA, which is available at https://github.com/XuegongLab/hECA/tree/main/examples