ECAUGT CD19¶
import packages and setup¶
import sys
import pandas as pd
import ECAUGT
import time
import multiprocessing
import numpy as np
# set parameters
import os  # local import: only needed for the optional credential overrides below

endpoint = "https://HCAd-Datasets.cn-beijing.ots.aliyuncs.com"
# Enter your own id and key here, or supply them through the environment
# variables ECAUGT_ACCESS_ID / ECAUGT_ACCESS_KEY so the notebook itself does not
# have to be edited. When the variables are unset the values below are used.
access_id = os.environ.get("ECAUGT_ACCESS_ID", "LTAI5t7t216W9amUD1crMVos")
access_key = os.environ.get("ECAUGT_ACCESS_KEY", "ZJPlUbpLCij5qUPjbsU8GnQHm97IxJ")
instance_name = "HCAd-Datasets"
table_name = 'HCA_d'

# set up the client: connect to the server and look up the table
ECAUGT.Setup_Client(endpoint, access_id, access_key, instance_name, table_name)
Connected to the server, find the table.
HCA_d
TableName: HCA_d
PrimaryKey: [('cid', 'INTEGER')]
Reserved read throughput: 0
Reserved write throughput: 0
Last increase throughput time: 1605795297
Last decrease throughput time: None
table options's time to live: -1
table options's max version: 1
table options's max_time_deviation: 86400
0
Get data¶
filter by gene expression¶
# Build the column filter that selects cells whose CD19 expression exceeds 0.1.
gene_condition = ECAUGT.seq2filter("CD19 > 0.1")

# Leave one core free for the rest of the system, but never request fewer than
# one worker: cpu_count() - 1 evaluates to 0 on a single-core machine.
n_threads = max(1, multiprocessing.cpu_count() - 1)

# Query all rows (rows_to_get=None) and fetch only the CD19 value plus the
# organ and cell_type annotations for the cells that pass the filter.
df_result_cd19 = ECAUGT.get_columnsbycell_para(rows_to_get=None,
                                               cols_to_get=['CD19', 'organ', 'cell_type'],
                                               col_filter=gene_condition,
                                               do_transfer=True,
                                               thread_num=n_threads)
1093299 cells found
filter by gene expression in web server¶
Open the hECA website and set the filters to select CD19+ cells.
This step takes about 5 minutes.
data:image/s3,"s3://crabby-images/79c7f/79c7f4defc853fdf0da12d3aad1087414ec5ee9e" alt="img"
img¶
Click “Download Data”; a file containing the IDs of the selected cells will be downloaded.
Get gene expression data and metadata¶
This step takes about 10 minutes on a server with 24 CPU cores.
# Alternative: if the cells were selected on the website, load the downloaded
# cid list like this instead of using df_result_cd19:
# rows_to_get = pd.read_csv('./cellSelectionRawData_1625643023446.txt')
# rows_to_get = [[('cid',i)] for i in rows_to_get['cid']]

# One primary-key spec ([('cid', <id>)]) per cell selected by the CD19 filter.
rows_to_get = [[('cid', cid)] for cid in df_result_cd19.index]

# Leave one core free, but never request fewer than one worker:
# cpu_count() - 1 evaluates to 0 on a single-core machine.
n_threads = max(1, multiprocessing.cpu_count() - 1)

# cols_to_get=None: no column restriction is passed for the selected cells.
result = ECAUGT.get_columnsbycell_para(rows_to_get=rows_to_get,
                                       cols_to_get=None,
                                       col_filter=gene_condition,
                                       do_transfer=True,
                                       thread_num=n_threads)

# Column layout relied on below: the first N_GENES columns are taken as the
# expression matrix and the next N_META columns as per-cell metadata.
# NOTE(review): these counts are tied to the current table schema; confirm
# them if the table changes.
N_GENES = 43878
N_META = 18
genes = result.columns[:N_GENES]
metaCols = result.columns[N_GENES:N_GENES + N_META]
expr = result.loc[:, genes]
meta = result.loc[:, metaCols]
print(expr.shape)
print(meta.shape)
(2566, 43878)
(2566, 18)
# show the names of the metadata columns
meta.columns
Index(['cell_id', 'cell_type', 'cl_name', 'donor_age', 'donor_gender',
'donor_id', 'hcad_name', 'marker_gene', 'organ', 'original_name',
'region', 'sample_status', 'seq_tech', 'study_id', 'subregion',
'tissue_type', 'uHAF_name', 'user_id'],
dtype='object')
Analyze CD19+ data with scanpy¶
# scanpy is imported here because it is only needed for the analysis part
import scanpy as sc
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
# print the versions of scanpy and its dependencies for reproducibility
sc.logging.print_versions()
# figure defaults used by the plots below
sc.settings.set_figure_params(dpi=80)
scanpy==1.4.4.post1 anndata==0.6.22.post1 umap==0.3.10 numpy==1.17.3 scipy==1.3.2 pandas==0.25.3 scikit-learn==0.21.3 statsmodels==0.10.2 python-igraph==0.7.1 louvain==0.6.1
# wrap the expression table (cells x genes) and the per-cell metadata in an AnnData object
adata = sc.AnnData(X = expr, obs = meta)
Transforming to str index.
# make the variable (gene) names unique, then display a summary of the object
adata.var_names_make_unique()
adata
AnnData object with n_obs × n_vars = 2566 × 43878
obs: 'cell_id', 'cell_type', 'cl_name', 'donor_age', 'donor_gender', 'donor_id', 'hcad_name', 'marker_gene', 'organ', 'original_name', 'region', 'sample_status', 'seq_tech', 'study_id', 'subregion', 'tissue_type', 'uHAF_name', 'user_id'
# Keep only the genes (columns of X) that have no NaN value in any cell.
# .copy() turns the subset into a standalone AnnData instead of a view, so the
# in-place preprocessing steps that follow do not need to copy it implicitly.
has_nan = np.isnan(adata.X).any(axis=0)
adata = adata[:, ~has_nan].copy()
adata
View of AnnData object with n_obs × n_vars = 2566 × 43871
obs: 'cell_id', 'cell_type', 'cl_name', 'donor_age', 'donor_gender', 'donor_id', 'hcad_name', 'marker_gene', 'organ', 'original_name', 'region', 'sample_status', 'seq_tech', 'study_id', 'subregion', 'tissue_type', 'uHAF_name', 'user_id'
# normalize counts per cell, using a target sum of 1e4
sc.pp.normalize_total(adata, target_sum=1e4)
Normalizing counts per cell.
finished (0:00:00):normalized adata.X
# apply the log1p transform to the normalized data in place
sc.pp.log1p(adata)
/home/ghx/anaconda3/envs/Seurat/lib/python3.7/site-packages/scanpy/preprocessing/_simple.py:285: UserWarning: Revieved a view of an AnnData. Making a copy.
view_to_actual(data)
# flag highly variable genes (adds 'highly_variable', 'means', 'dispersions',
# 'dispersions_norm' to adata.var) and plot the result
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata)
extracting highly variable genes
finished (0:00:02)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
data:image/s3,"s3://crabby-images/094d1/094d17c28254a7c575e73ecf3b6963f18b0a7227" alt="_images/ECAUGT_CD19_25_1.png"
# Snapshot the current (normalized, log-transformed) data in .raw before the
# gene set is reduced.
adata.raw = adata

# Restrict to the genes flagged as highly variable. .copy() makes the subset a
# standalone AnnData rather than a view, so the in-place scaling below does not
# trigger an implicit copy.
adata = adata[:, adata.var.highly_variable].copy()

# Scale the data (max_value=10), compute the PCA, then build the neighborhood
# graph from the first 40 principal components with 10 neighbors per cell.
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
/home/ghx/anaconda3/envs/Seurat/lib/python3.7/site-packages/scanpy/preprocessing/_simple.py:869: UserWarning: Revieved a view of an AnnData. Making a copy.
view_to_actual(adata)
computing PCA with n_comps = 50 computing PCA on highly variable genes finished (0:00:00) computing neighbors using 'X_pca' with n_pcs = 40 finished: added to .uns['neighbors'] 'distances', distances for each pair of neighbors 'connectivities', weighted adjacency matrix (0:00:01)
# compute the UMAP embedding (stored as 'X_umap' in adata.obsm)
sc.tl.umap(adata)
computing UMAP
/home/ghx/anaconda3/envs/Seurat/lib/python3.7/site-packages/umap/spectral.py:229: UserWarning: Embedding a total of 3 separate connected components using meta-embedding (experimental)
n_components
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:00:06)
# plot the UMAP colored by the expression of CD19, CD79A and CD248
sc.pl.umap(adata, color=['CD19', 'CD79A', 'CD248'])
... storing 'cell_type' as categorical
... storing 'cl_name' as categorical
... storing 'donor_age' as categorical
... storing 'donor_gender' as categorical
... storing 'donor_id' as categorical
... storing 'hcad_name' as categorical
... storing 'marker_gene' as categorical
... storing 'organ' as categorical
... storing 'original_name' as categorical
... storing 'region' as categorical
... storing 'sample_status' as categorical
... storing 'seq_tech' as categorical
... storing 'study_id' as categorical
... storing 'subregion' as categorical
... storing 'tissue_type' as categorical
... storing 'uHAF_name' as categorical
data:image/s3,"s3://crabby-images/4bfd3/4bfd3fc91b3730da7000e2e4fb3fe5c61b02e5d0" alt="_images/ECAUGT_CD19_29_1.png"
# plot the UMAP colored by the 'organ' annotation
sc.pl.umap(adata, color=['organ'])
data:image/s3,"s3://crabby-images/7ea40/7ea40ce6de64941437f9daf5774a7727a58da113" alt="ECAUGT_CD19_files/ECAUGT_CD19_30_0.png"
# plot the UMAP colored by the 'cell_type' annotation
sc.pl.umap(adata, color=['cell_type'])
data:image/s3,"s3://crabby-images/54db1/54db12acb34e7cbf6f8b660ef64a93f3cfa89861" alt="_images/ECAUGT_CD19_31_0.png"
# Count cells per (organ, cell_type) pair: one row per organ, one column per cell type.
tb = pd.crosstab(index=meta['organ'], columns=meta['cell_type'])

# Display only the organs (rows) and cell types (columns) whose total count
# across the whole table is greater than 30.
row_mask = tb.sum(axis=1) > 30
col_mask = tb.sum(axis=0) > 30
tb.loc[row_mask, col_mask]
cell_type | B cell | Cardiomyocyte cell | Dendritic cell | Endothelial cell | Enterocyte | Excitatory neuron | Fibrocyte | Macrophage | Microglia | Monocyte | NK cell | Plasma B cell | T cell | Unclassified |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
organ | ||||||||||||||
Blood | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 12 | 0 | 26 | 38 | 2 |
Bone marrow | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 18 | 1 | 0 |
Brain | 0 | 0 | 0 | 62 | 0 | 36 | 0 | 0 | 108 | 0 | 0 | 0 | 0 | 20 |
Bronchi | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 9 |
Eye | 12 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 |
Heart | 0 | 66 | 0 | 6 | 0 | 0 | 20 | 14 | 0 | 0 | 0 | 0 | 0 | 0 |
Kidney | 2 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
Liver | 35 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 0 |
Lung | 469 | 0 | 9 | 1 | 0 | 0 | 25 | 25 | 0 | 20 | 20 | 4 | 5 | 0 |
Oesophagus | 241 | 0 | 10 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
Pancreas | 157 | 0 | 10 | 7 | 0 | 0 | 0 | 1 | 0 | 8 | 13 | 4 | 49 | 0 |
Rectum | 64 | 0 | 0 | 0 | 55 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 6 | 1 |
Spleen | 0 | 0 | 0 | 9 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 310 | 2 | 0 |
Thyroid | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 5 | 0 | 0 |
# expr.to_csv('./expr_CD19.csv')
# meta.to_csv('./meta_CD19.csv')