T cell analysis with in-data cell sorting

load ECAUGT package

# load the ECAUGT package as well as other related packages
import ECAUGT
import sys,time,multiprocessing
import scanpy as sc
import numpy as np, pandas as pd
# set parameters
endpoint = "https://HCAd-Datasets.cn-beijing.ots.aliyuncs.com"
access_id = "LTAI5t7t216W9amUD1crMVos" #enter your id and keys
access_key = "ZJPlUbpLCij5qUPjbsU8GnQHm97IxJ"
instance_name = "HCAd-Datasets"
table_name = 'HCA_d'
# # setup client
ECAUGT.Setup_Client(endpoint, access_id, access_key, instance_name, table_name)
Connected to the server, find the table.
HCA_d
TableName: HCA_d
PrimaryKey: [('cid', 'INTEGER')]
Reserved read throughput: 0
Reserved write throughput: 0
Last increase throughput time: 1605795297
Last decrease throughput time: None
table options's time to live: -1
table options's max version: 1
table options's max_time_deviation: 86400
0

sorting cells from uGT

sort with labels

query_language = "cell_type == T cell"
cid_label = ECAUGT.query_cells(metadata_conditions=query_language, include_children=True)
51588 cells found

sort with expressional conditions

query_language  = "PTPRC>1.5 && (CD3D>1.5 || CD3E>1.5)"
gene_condition  = ECAUGT.seq2filter(query_language)
df_result_tcell = ECAUGT.get_columnsbycell_para(
                                rows_to_get = None,
                                # make sure condition associated columns listed here
                                cols_to_get=['organ','cell_type','CD3D','CD3E','PTPRC'],
                                col_filter=gene_condition,
                                do_transfer = True,
                                thread_num = 24
                         )
1093299 cells found
df_result_tcell
CD3D CD3E PTPRC cell_type organ
cid
35 2.910235 2.910235 3.575773 T cell Spleen
40 2.403368 0.000000 3.050255 Neutrophilic granulocyte Spleen
50 3.820847 0.000000 3.149373 T cell Spleen
126 0.000000 4.292039 3.220413 T cell Spleen
167 1.589888 0.000000 1.589888 Plasma B cell Spleen
... ... ... ... ... ...
4085793 0.000000 3.241421 3.241421 Zona fasciculata cell Adrenal gland
4089833 2.782014 0.000000 2.782014 Neutrophilic granulocyte Adrenal gland
4092102 0.000000 3.204398 3.204398 Neutrophilic granulocyte Adrenal gland
4092673 2.709732 0.000000 2.709732 Neutrophilic granulocyte Adrenal gland
4093924 2.256228 0.000000 2.256228 Zona fasciculata cell Adrenal gland

14710 rows × 5 columns

cid_expression = [[('cid',i)] for i in df_result_tcell.index]

merge two cid obtained from origins

# merge and remove duplicated cids
cid_list = set()
for i in range(len(cid_expression)):
    cid= cid_expression[i][0][1]
    cid_list.add(cid)

for i in range(len(cid_label)):
    cid= cid_label[i][0][1]
    cid_list.add(cid)

cid_list = list(cid_list)

# print number of obtained cids
print(len(cid_list))

# build rows_to_get variable to download data
rows_to_get = [[('cid',i)] for i in cid_list]
56540
import pickle
pickle.HIGHEST_PROTOCOL
4
with open('rows_to_get.pickle', 'wb') as f:
    pickle.dump(rows_to_get, f, protocol=4)
import pickle
with open('rows_to_get.pickle', 'rb') as f:
    rows_to_get=pickle.load(f )
from tqdm import tqdm
import pickle

# we suggest downloading cells in small batches in case of network issues
for chunk in tqdm(range(int(1+len(rows_to_get)/500)),ncols=80):
    # split batches
    lb, rb = chunk*500, (chunk+1)*500
    rows = rows_to_get[lb:rb]
    if len(rows)<=0:break

    # download rows from the unified Giant Table (uGT)
    result = ECAUGT.get_columnsbycell_para(rows_to_get = rows,
                                           cols_to_get = None, # download all columns
                                           col_filter  = None,
                                           do_transfer = True,
                                           thread_num  = 24)
    result.to_pickle("__temp_%d_%d.pk"%(lb,rb))
    #print("downloading %d~%d"%(lb, rb))
    #print(len(rows))
  5%|██▏                                      | 6/114 [09:44<2:55:11, 97.33s/it]OTS request failed, API: GetRow, HTTPStatus: 503, ErrorCode: OTSTimeout, ErrorMessage: Operation timeout., RequestID: 0005d044-f8df-f27c-613f-020a872a9e27.
100%|███████████████████████████████████████| 114/114 [3:04:25<00:00, 97.06s/it]
# load split batches
giant_table_list = []
for chunk in tqdm(range(int(1+len(rows_to_get)/500)),ncols=80):
    lb, rb = chunk*500, (chunk+1)*500
    fname = "__temp_%d_%d.pk" % (lb, rb)
    with open(fname,'rb') as f:
        df=pickle.load(f)
        giant_table_list.append(df)
100%|█████████████████████████████████████████| 114/114 [00:17<00:00,  6.59it/s]
giant_table= giant_table_list[0]

for i in range(1, len(giant_table_list)):
    giant_table = pd.concat([ giant_table, giant_table_list[i] ])
# remove intermediate results
del giant_table_list
import gc
gc.collect()


giant_table.to_pickle("sorted_tcells_raw.pk")
genes    = giant_table.columns[:43878]
metaCols = giant_table.columns[43878:43878+18]
expr = giant_table.loc[:,genes]
meta = giant_table.loc[:,metaCols]

meta.reset_index(inplace=True)
expr.reset_index(inplace=True)
expr=expr.drop(['cid'], axis=1)

print(expr.shape)
print(meta.shape)
(56540, 43878)
(56540, 19)
# check the sample-by-gene expression matrix
expr
A12M1 A12M2 A12M3 A12M4 A1BG A1BG-AS1 A1CF A2M A2M-AS1 A2ML1 ... ZXDA ZXDB ZXDC ZYG11A ZYG11AP1 ZYG11B ZYX ZYXP1 ZZEF1 ZZZ3
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
56535 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
56536 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 2.165166 0.0
56537 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 2.696737 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
56538 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0
56539 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.658875 ... 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0

56540 rows × 43878 columns

# check the metadata matrix
meta
cid cell_id cell_type cl_name donor_age donor_gender donor_id hcad_name marker_gene organ original_name region sample_status seq_tech study_id subregion tissue_type uHAF_name user_id
0 2016314 ACTGATGCAGCGTTCG-1-HCATisStab7587208 T cell T cell NA NA 356C Lung-Connective tissue-T cell-CD3D IL32 CD3D IL32 Lung T_CD4 Left lung Healthy 10X 10.1186/s13059-019-1906-x Inferior lobular Connective tissue Lung-Connective tissue-T cell-CD3D IL32 2
1 3100139 FetalMaleGonad_2.TCACTTGGACATGAGATC NK T cell NA GW11 Male Donor9 Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB MT-ATP6 MT-CYB Testis Fetal fibroblast NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Muscle tissue Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB 3
2 4063239 AdultGallbladder_2.ACAATAAATAAAGGGCGA3 T cell T cell 58yr Male AdultGallbladder2 Gallbladder-Connective tissue-T cell-IL32 IL32 Gallbladder T cell_CCL5 high NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Connective tissue Gallbladder-Connective tissue-T cell-IL32 4
3 3100141 FetalMaleGonad_2.TGATCAGTCCCGAGATGG NK T cell NA GW11 Male Donor9 Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB MT-ATP6 MT-CYB Testis Fetal fibroblast NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Muscle tissue Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB 3
4 4063252 AdultGallbladder_2.ACAATACATGATCGCACC3 Epithelial cell epithelial cell 58yr Male AdultGallbladder2 Gallbladder-Epithelial tissue-Epithelial cell-... TM4SF4 Gallbladder Mucous epithelial cell NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Epithelial tissue Gallbladder-Epithelial tissue-Epithelial cell-... 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
56535 4063198 AdultGallbladder_2.AATAAAAGCGAGAGGGTC3 T cell T cell 58yr Male AdultGallbladder2 Gallbladder-Connective tissue-T cell-IL32 IL32 Gallbladder T cell NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Connective tissue Gallbladder-Connective tissue-T cell-IL32 4
56536 1172316 FetalMuscle_1.CAACAACCGCTAGGCTGC Proliferating T cell NA GW12 Male NA Muscle-Connective tissue-Proliferating T cell-... UBE2C Muscle Proliferating cell_UBE2C high NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Connective tissue Muscle-Connective tissue-Proliferating T cell-... 1
56537 3100133 FetalMaleGonad_2.TAGCATAACCTACAAAGT NK T cell NA GW11 Male Donor9 Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB MT-ATP6 MT-CYB Testis Fetal fibroblast NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Muscle tissue Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB 3
56538 3100135 FetalMaleGonad_2.TTTAGGGTGGTACCATCT NK T cell NA GW11 Male Donor9 Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB MT-ATP6 MT-CYB Testis Fetal fibroblast NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Muscle tissue Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB 3
56539 3100136 FetalMaleGonad_2.CCATCTGCGTCCTGTGCG NK T cell NA GW11 Male Donor9 Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB MT-ATP6 MT-CYB Testis Fetal fibroblast NA Healthy Microwell-seq 10.1038/s41586-020-2157-4 NA Muscle tissue Testis-Connective tissue-NK T cell-MT-ATP6 MT-CYB 3

56540 rows × 19 columns

create single-cell analysis objects

# create scanpy object from the matrices
adata = sc.AnnData(X = expr, obs = meta)
adata.var_names_make_unique()

sc.pp.filter_genes(adata, min_counts=5)
sc.pp.filter_genes(adata, min_cells=3)
# post-processing steps
from scipy.sparse import csc_matrix
adata.X = csc_matrix(adata.X, dtype=np.float32)

adata.obs['donor_id']=adata.obs['donor_id'].astype(str)
adata.write_h5ad("sorted_tcells_raw.h5ad")
/home/chensijie/software/anaconda3/envs/r411py37/lib/python3.7/site-packages/anndata/_core/anndata.py:1220: FutureWarning: The inplace parameter in pandas.Categorical.reorder_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'donor_id' as categorical

filter cells for downstream analysis

# remove fibroblast
sel = adata[:,"LUM"].X==0
adata = adata[sel].copy()

sel = adata[:,"SERPING1"].X==0
adata = adata[sel].copy()

sel = adata[:,"COL1A1"].X==0
adata = adata[sel].copy()

sel = adata[:,"COL1A2"].X==0
adata = adata[sel].copy()

# remove vascular endothelial cells
sel = adata[:,"INMT"].X==0
adata = adata[sel].copy()

# remove muscle cells
sel = adata[:,"ACTA2"].X==0
adata = adata[sel].copy()

# remove granulocytes
sel = adata[:,"S100A8"].X==0
adata = adata[sel].copy()

sel = adata[:,"S100A9"].X==0
adata = adata[sel].copy()

sel = adata[:,"SIGLEC8"].X==0
adata = adata[sel].copy()


# remove myeloid cells
sel = adata[:,"C1QA"].X==0
adata = adata[sel].copy()

sel = adata[:,"C1QB"].X==0
adata = adata[sel].copy()

sel = adata[:,"C1QC"].X==0
adata = adata[sel].copy()

sel = adata[:,"ITGAM"].X==0
adata = adata[sel].copy()

sel = adata[:,"ITGAX"].X==0
adata = adata[sel].copy()

# remove B cells
sel = adata[:,"CD79A"].X==0
adata = adata[sel].copy()

sel = adata[:,"CD79B"].X==0
adata = adata[sel].copy()

sel = adata[:,"CD19"].X==0
adata = adata[sel].copy()

sel = adata[:,"MS4A1"].X==0
adata = adata[sel].copy()

# remove cells from nerve tissues
sel = adata[:,"FSCN1"].X==0
adata = adata[sel].copy()
adata
AnnData object with n_obs × n_vars = 20553 × 20491
    obs: 'cid', 'cell_id', 'cell_type', 'cl_name', 'donor_age', 'donor_gender', 'donor_id', 'hcad_name', 'marker_gene', 'organ', 'original_name', 'region', 'sample_status', 'seq_tech', 'study_id', 'subregion', 'tissue_type', 'uHAF_name', 'user_id'
    var: 'n_counts', 'n_cells'
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata)
_images/output_35_0.png
sc.tl.pca(adata, svd_solver='arpack')
sc.set_figure_params(figsize=[5,5])
sc.pl.pca(adata,color="organ")
_images/output_37_0.png
sc.set_figure_params(figsize=[5,5])
sc.pl.pca(adata,color="cell_type")
_images/output_38_0.png
sc.set_figure_params(figsize=[5,5])
sc.pl.pca(adata,color=["PTPRC","CD3D","CD3E","CD3G",
                       "CD8A","CD4","CD69","CCR7"],sort_order=True)
_images/output_39_0.png
adata.write_h5ad("sorted_tcells_filtered.h5ad")

Now we have obtained a collection of T cell using in data sorting. You can do what ever analysis you like with these sorted cells.

We have done some downstream metabolic pathway analysis on these cells using GSVA, which is available at https://github.com/XuegongLab/hECA/tree/main/examples