.. _T_cell_analysis_label:

T cell analysis with in-data cell sorting
=========================================================

load ECAUGT package
===================

.. code:: ipython3

    # load the ECAUGT package as well as other related packages
    import ECAUGT
    import sys,time,multiprocessing
    import scanpy as sc
    import numpy as np, pandas as pd

.. code:: ipython3

    # set parameters
    endpoint = "https://HCAd-Datasets.cn-beijing.ots.aliyuncs.com"
    # NOTE(review): these credentials are hard-coded in published documentation;
    # confirm they are meant to be shared, otherwise replace them with your own.
    access_id = "LTAI5t7t216W9amUD1crMVos" #enter your id and keys
    access_key = "ZJPlUbpLCij5qUPjbsU8GnQHm97IxJ"
    instance_name = "HCAd-Datasets"
    table_name = 'HCA_d'

.. code:: ipython3

    # setup client: connects to the table named above using the parameters
    # defined in the previous cell
    ECAUGT.Setup_Client(endpoint, access_id, access_key, instance_name, table_name)

.. parsed-literal::

    Connected to the server, find the table.
    HCA_d
    TableName: HCA_d
    PrimaryKey: [('cid', 'INTEGER')]
    Reserved read throughput: 0
    Reserved write throughput: 0
    Last increase throughput time: 1605795297
    Last decrease throughput time: None
    table options's time to live: -1
    table options's max version: 1
    table options's max_time_deviation: 86400

.. parsed-literal::

    0

sorting cells from uGT
======================

sort with labels
----------------

.. code:: ipython3

    # select cells by their metadata label
    # (include_children presumably also matches child cell types -- confirm
    # against the ECAUGT documentation)
    query_language = "cell_type == T cell"
    cid_label = ECAUGT.query_cells(metadata_conditions=query_language, include_children=True)

.. parsed-literal::

    51588 cells found

sort with expressional conditions
---------------------------------

.. code:: ipython3

    # select cells with PTPRC above 1.5 and at least one of CD3D / CD3E above 1.5
    query_language = "PTPRC>1.5 && (CD3D>1.5 || CD3E>1.5)"
    gene_condition = ECAUGT.seq2filter(query_language)
    df_result_tcell = ECAUGT.get_columnsbycell_para(
        rows_to_get = None,
        # make sure condition associated columns listed here
        cols_to_get=['organ','cell_type','CD3D','CD3E','PTPRC'],
        col_filter=gene_condition,
        do_transfer = True,
        thread_num = 24
    )

.. parsed-literal::

    1093299 cells found

.. code:: ipython3

    df_result_tcell

.. raw:: html

	CD3D	CD3E	PTPRC	cell_type	organ
cid
35	2.910235	2.910235	3.575773	T cell	Spleen
40	2.403368	0.000000	3.050255	Neutrophilic granulocyte	Spleen
50	3.820847	0.000000	3.149373	T cell	Spleen
126	0.000000	4.292039	3.220413	T cell	Spleen
167	1.589888	0.000000	1.589888	Plasma B cell	Spleen
...	...	...	...	...	...
4085793	0.000000	3.241421	3.241421	Zona fasciculata cell	Adrenal gland
4089833	2.782014	0.000000	2.782014	Neutrophilic granulocyte	Adrenal gland
4092102	0.000000	3.204398	3.204398	Neutrophilic granulocyte	Adrenal gland
4092673	2.709732	0.000000	2.709732	Neutrophilic granulocyte	Adrenal gland
4093924	2.256228	0.000000	2.256228	Zona fasciculata cell	Adrenal gland

14710 rows × 5 columns

.. code:: ipython3

    # wrap every cid returned by the expression query as [('cid', <value>)]
    cid_expression = [[('cid',i)] for i in df_result_tcell.index]

merge two cid obtained from origins
-----------------------------------

.. code:: ipython3

    # merge and remove duplicated cids
    # (the cid value is read from position [0][1] of every entry in both lists)
    cid_list = {row[0][1] for row in cid_expression}
    cid_list.update(row[0][1] for row in cid_label)
    cid_list = list(cid_list)

    # print number of obtained cids
    print(len(cid_list))

    # build rows_to_get variable to download data
    rows_to_get = [[('cid',i)] for i in cid_list]

.. parsed-literal::

    56540

.. code:: ipython3

    import pickle
    pickle.HIGHEST_PROTOCOL

.. parsed-literal::

    4

.. code:: ipython3

    # save the row keys so the download can be resumed in a later session
    with open('rows_to_get.pickle', 'wb') as f:
        pickle.dump(rows_to_get, f, protocol=4)

.. code:: ipython3

    import pickle
    with open('rows_to_get.pickle', 'rb') as f:
        rows_to_get = pickle.load(f)

.. code:: ipython3

    from tqdm import tqdm

    batch_size = 500

    # we suggest downloading cells in small batches in case of network issues
    # stepping by batch_size never yields an empty batch, so exactly one file
    # is written per iteration
    for lb in tqdm(range(0, len(rows_to_get), batch_size), ncols=80):
        # split batches
        rb = lb + batch_size
        rows = rows_to_get[lb:rb]

        # download rows from the unified Giant Table (uGT)
        result = ECAUGT.get_columnsbycell_para(rows_to_get = rows,
                                               cols_to_get = None,  # download all columns
                                               col_filter = None,
                                               do_transfer = True,
                                               thread_num = 24)
        result.to_pickle("__temp_%d_%d.pk"%(lb,rb))

.. parsed-literal::

    5%\|██▏ \| 6/114 [09:44<2:55:11, 97.33s/it]OTS request failed, API: GetRow, HTTPStatus: 503, ErrorCode: OTSTimeout, ErrorMessage: Operation timeout., RequestID: 0005d044-f8df-f27c-613f-020a872a9e27.
    100%\|███████████████████████████████████████\| 114/114 [3:04:25<00:00, 97.06s/it]

.. code:: ipython3

    # load split batches
    # batch_size must match the value used for downloading, because it
    # determines the batch file names
    batch_size = 500
    giant_table_list = []
    for lb in tqdm(range(0, len(rows_to_get), batch_size), ncols=80):
        rb = lb + batch_size
        fname = "__temp_%d_%d.pk" % (lb, rb)
        giant_table_list.append(pd.read_pickle(fname))

.. parsed-literal::

    100%\|█████████████████████████████████████████\| 114/114 [00:17<00:00, 6.59it/s]

.. code:: ipython3

    # concatenate all batches in one call; concatenating pairwise in a loop
    # copies the accumulated table again on every iteration
    giant_table = pd.concat(giant_table_list)

.. code:: ipython3

    # remove intermediate results
    del giant_table_list
    import gc
    gc.collect()
    giant_table.to_pickle("sorted_tcells_raw.pk")

.. code:: ipython3

    # split the columns by position: the first n_gene_cols columns are used as
    # the expression matrix and the following n_meta_cols columns as metadata
    # NOTE(review): the counts are hard-coded -- verify them against the
    # columns of the downloaded table
    n_gene_cols = 43878
    n_meta_cols = 18
    genes = giant_table.columns[:n_gene_cols]
    metaCols = giant_table.columns[n_gene_cols:n_gene_cols+n_meta_cols]

.. code:: ipython3

    expr = giant_table.loc[:,genes]
    meta = giant_table.loc[:,metaCols]
    # move the cid index into a regular column; it is kept in meta and
    # dropped from expr
    meta.reset_index(inplace=True)
    expr.reset_index(inplace=True)
    expr=expr.drop(['cid'], axis=1)
    print(expr.shape)
    print(meta.shape)

.. parsed-literal::

    (56540, 43878)
    (56540, 19)

.. code:: ipython3

    # check the sample-by-gene expression matrix
    expr

.. raw:: html

	A12M1	A12M2	A12M3	A12M4	A1BG	A1BG-AS1	A1CF	A2M	A2M-AS1	A2ML1	...	ZXDA	ZXDB	ZXDC	ZYG11A	ZYG11AP1	ZYG11B	ZYX	ZYXP1	ZZEF1	ZZZ3
0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0
1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0
3	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0
4	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
56535	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0
56536	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	2.165166	0.0
56537	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.0	2.696737	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0
56538	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0
56539	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	2.658875	...	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	0.0

56540 rows × 43878 columns

.. code:: ipython3

    # check the metadata matrix
    # (one row per cell: the cid column followed by the metadata columns)
    meta

.. raw:: html

	cid	cell_id	cell_type	cl_name	donor_age	donor_gender	donor_id	hcad_name	marker_gene	organ	original_name	region	sample_status	seq_tech	study_id	subregion	tissue_type	uHAF_name	user_id
0	2016314	ACTGATGCAGCGTTCG-1-HCATisStab7587208	T cell	T cell	NA	NA	356C	Lung-Connective tissue-T cell-CD3D IL32	CD3D IL32	Lung	T_CD4	Left lung	Healthy	10X	10.1186/s13059-019-1906-x	Inferior lobular	Connective tissue	Lung-Connective tissue-T cell-CD3D IL32	2
1	3100139	FetalMaleGonad_2.TCACTTGGACATGAGATC	NK T cell	NA	GW11	Male	Donor9	Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB	MT-ATP6 MT-CYB	Testis	Fetal fibroblast	NA	Healthy	Microwell-seq	10.1038/s41586-020-2157-4	NA	Muscle tissue	Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB	3
2	4063239	AdultGallbladder_2.ACAATAAATAAAGGGCGA3	T cell	T cell	58yr	Male	AdultGallbladder2	Gallbladder-Connective tissue-T cell-IL32	IL32	Gallbladder	T cell_CCL5 high	NA	Healthy	Microwell-seq	10.1038/s41586-020-2157-4	NA	Connective tissue	Gallbladder-Connective tissue-T cell-IL32	4
3	3100141	FetalMaleGonad_2.TGATCAGTCCCGAGATGG	NK T cell	NA	GW11	Male	Donor9	Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB	MT-ATP6 MT-CYB	Testis	Fetal fibroblast	NA	Healthy	Microwell-seq	10.1038/s41586-020-2157-4	NA	Muscle tissue	Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB	3
4	4063252	AdultGallbladder_2.ACAATACATGATCGCACC3	Epithelial cell	epithelial cell	58yr	Male	AdultGallbladder2	Gallbladder-Epithelial tissue-Epithelial cell-...	TM4SF4	Gallbladder	Mucous epithelial cell	NA	Healthy	Microwell-seq	10.1038/s41586-020-2157-4	NA	Epithelial tissue	Gallbladder-Epithelial tissue-Epithelial cell-...	4
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
56535	4063198	AdultGallbladder_2.AATAAAAGCGAGAGGGTC3	T cell	T cell	58yr	Male	AdultGallbladder2	Gallbladder-Connective tissue-T cell-IL32	IL32	Gallbladder	T cell	NA	Healthy	Microwell-seq	10.1038/s41586-020-2157-4	NA	Connective tissue	Gallbladder-Connective tissue-T cell-IL32	4
56536	1172316	FetalMuscle_1.CAACAACCGCTAGGCTGC	Proliferating T cell	NA	GW12	Male	NA	Muscle-Connective tissue-Proliferating T cell-...	UBE2C	Muscle	Proliferating cell_UBE2C high	NA	Healthy	Microwell-seq	10.1038/s41586-020-2157-4	NA	Connective tissue	Muscle-Connective tissue-Proliferating T cell-...	1
56537	3100133	FetalMaleGonad_2.TAGCATAACCTACAAAGT	NK T cell	NA	GW11	Male	Donor9	Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB	MT-ATP6 MT-CYB	Testis	Fetal fibroblast	NA	Healthy	Microwell-seq	10.1038/s41586-020-2157-4	NA	Muscle tissue	Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB	3
56538	3100135	FetalMaleGonad_2.TTTAGGGTGGTACCATCT	NK T cell	NA	GW11	Male	Donor9	Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB	MT-ATP6 MT-CYB	Testis	Fetal fibroblast	NA	Healthy	Microwell-seq	10.1038/s41586-020-2157-4	NA	Muscle tissue	Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB	3
56539	3100136	FetalMaleGonad_2.CCATCTGCGTCCTGTGCG	NK T cell	NA	GW11	Male	Donor9	Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB	MT-ATP6 MT-CYB	Testis	Fetal fibroblast	NA	Healthy	Microwell-seq	10.1038/s41586-020-2157-4	NA	Muscle tissue	Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB	3

56540 rows × 19 columns

create single-cell analysis objects
===================================

.. code:: ipython3

    # create scanpy object from the matrices
    adata = sc.AnnData(X = expr, obs = meta)
    adata.var_names_make_unique()
    sc.pp.filter_genes(adata, min_counts=5)
    sc.pp.filter_genes(adata, min_cells=3)

.. code:: ipython3

    # post-processing steps
    from scipy.sparse import csc_matrix
    # store the expression matrix as a float32 sparse matrix
    adata.X = csc_matrix(adata.X, dtype=np.float32)
    # cast donor ids to strings so the column has a single type
    adata.obs['donor_id']=adata.obs['donor_id'].astype(str)

.. code:: ipython3

    adata.write_h5ad("sorted_tcells_raw.h5ad")

.. parsed-literal::

    /home/chensijie/software/anaconda3/envs/r411py37/lib/python3.7/site-packages/anndata/_core/anndata.py:1220: FutureWarning: The `inplace` parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
      c.reorder_categories(natsorted(c.categories), inplace=True)
    ... storing 'donor_id' as categorical

filter cells for downstream analysis
====================================

.. code:: ipython3

    # marker genes of the populations to remove; only cells with zero
    # expression of every listed gene are kept
    exclusion_markers = [
        # remove fibroblast
        "LUM", "SERPING1", "COL1A1", "COL1A2",
        # remove vascular endothelial cells
        "INMT",
        # remove muscle cells
        "ACTA2",
        # remove granulocytes
        "S100A8", "S100A9", "SIGLEC8",
        # remove myeloid cells
        "C1QA", "C1QB", "C1QC", "ITGAM", "ITGAX",
        # remove B cells
        "CD79A", "CD79B", "CD19", "MS4A1",
        # remove cells from nerve tissues
        "FSCN1",
    ]

    # apply the filters one gene at a time, in the order listed above
    for gene in exclusion_markers:
        sel = adata[:,gene].X==0
        adata = adata[sel].copy()

.. code:: ipython3

    adata

.. parsed-literal::

    AnnData object with n_obs × n_vars = 20553 × 20491
        obs: 'cid', 'cell_id', 'cell_type', 'cl_name', 'donor_age', 'donor_gender', 'donor_id', 'hcad_name', 'marker_gene', 'organ', 'original_name', 'region', 'sample_status', 'seq_tech', 'study_id', 'subregion', 'tissue_type', 'uHAF_name', 'user_id'
        var: 'n_counts', 'n_cells'

.. code:: ipython3

    sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

.. code:: ipython3

    sc.pl.highly_variable_genes(adata)

.. image:: T_cell_analysis_with_in-data_cell_sorting_files/output_35_0.png
   :alt: Highly variable genes plot of the filtered cells

.. code:: ipython3

    sc.tl.pca(adata, svd_solver='arpack')

.. code:: ipython3

    sc.set_figure_params(figsize=[5,5])
    sc.pl.pca(adata,color="organ")

.. image:: T_cell_analysis_with_in-data_cell_sorting_files/output_37_0.png
   :alt: PCA plot of the filtered cells colored by organ
   :width: 739px
   :height: 357px

.. code:: ipython3

    sc.set_figure_params(figsize=[5,5])
    sc.pl.pca(adata,color="cell_type")

.. image:: T_cell_analysis_with_in-data_cell_sorting_files/output_38_0.png
   :alt: PCA plot of the filtered cells colored by cell type
   :width: 1341px
   :height: 357px

.. code:: ipython3

    sc.set_figure_params(figsize=[5,5])
    sc.pl.pca(adata,color=["PTPRC","CD3D","CD3E","CD3G",
                           "CD8A","CD4","CD69","CCR7"],sort_order=True)

.. image:: T_cell_analysis_with_in-data_cell_sorting_files/output_39_0.png
   :alt: PCA plots of the filtered cells colored by expression of PTPRC, CD3D, CD3E, CD3G, CD8A, CD4, CD69 and CCR7
   :width: 1543px
   :height: 709px

.. code:: ipython3

    adata.write_h5ad("sorted_tcells_filtered.h5ad")

Now we have obtained a collection of T cells using in-data cell sorting. You
can run whatever analysis you like on these sorted cells. We have done some
downstream metabolic pathway analysis on these cells using GSVA, which is
available in the
`hECA examples <https://github.com/XuegongLab/hECA/tree/main/examples>`__.