forked from leeoesterreich/scRNA-CellLines
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalcCNV.py
163 lines (130 loc) · 6.14 KB
/
calcCNV.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
from utils import *
from scipy import linalg
import itertools
from sklearn.mixture import GaussianMixture
def grep_genes(chrom:str, start_pos:int, stop_pos:int,
gene_pos:pd.DataFrame):
tmp = gene_pos[gene_pos['chrom'] == chrom]
tmp = tmp[(tmp['txStart']>start_pos) & (tmp['txEnd']<stop_pos)].drop_duplicates(subset='symbol')
genes = tmp['symbol'].values
return genes # np.array
def grep_chrArm(gene:str,
gene_pos:pd.DataFrame, chr_pos:pd.DataFrame):
chrom, start, end = gene_pos[gene_pos['symbol']==gene][['chrom','txStart','txEnd']].values.flatten()
idx = [x for x in chr_pos.index if str(chr_pos.loc[x,'chrom'])==chrom and start > chr_pos.loc[x,'chromStart'] and end < chr_pos.loc[x,'chromEnd']]
if len(idx) == 1:
idx = idx[0]
else:
idx = None
return idx
def pred_model(X:np.array): # n cells X 1 feature
# find optimal number of clusters
lowest_bic = np.infty
bic = []
n_components_range = range(1, 7)
# cv_types = ['spherical', 'tied', 'diag', 'full']
cv_types = ['full']
for cv_type in cv_types:
for n_components in n_components_range:
# Fit a Gaussian mixture with EM
gmm = GaussianMixture(n_components=n_components,
covariance_type=cv_type)
gmm.fit(X)
bic.append(gmm.bic(X))
if bic[-1] < lowest_bic:
lowest_bic = bic[-1]
best_gmm = gmm
return best_gmm
def calcMovingAvg(cts:pd.DataFrame, gene_pos:pd.DataFrame, win=100, inputForm='rawcounts'):
'''
cts: gene X cells: raw counts, not Log or Normed
win: number of genes to average in one window
'''
order = ['chr%d'%(i) for i in range(1,23)] + ['chrX', 'chrY']
for n, chrom in enumerate(order):
tmp = gene_pos[gene_pos['chrom']==chrom].sort_values(by=['txStart'])
if n == 0:
res = tmp
elif n > 0:
res = pd.concat((res, tmp), axis=0)
cts = cts.loc[[g for g in res['symbol'] if g in cts.index],:]
if inputForm == 'rawcounts':
X = cts.values
E = np.log2(1+1e4*X/np.sum(X, axis=0))
elif inputForm == 'log':
E = cts.values
ER = E - np.mean(E, axis=1, keepdims=True) # each gene among cells, sum=0
ER_MvAvg = np.apply_along_axis(lambda x:np.convolve(x, np.ones((win,))/win, mode='same'), 0, ER)
genes_s, cells = list(cts.index), cts.columns
return genes_s, cells, ER_MvAvg
if __name__ == "__main__":
wkdir = '/bgfs/alee/chelsea/projects/10X/AL1/codes'
os.chdir(wkdir)
# Prep files
chr_pos = pd.read_csv('../data/hg38_chromPos.csv', index_col=0)
gene_pos = pd.read_csv('../data/CNV/gene_pos.csv')
adata = sc.read('../data/scVelo/AL1_ref_raw.h5ad')
CellTypes = ['MCF7','T47D WT','T47D KO', 'MM134','SUM44','BCK4', 'MCF10A','HEK293']
CNVDict = {}
min_genes_per_arm = 100
min_cl_cells = 20
min_cl_prob = 0.95
for cell in CellTypes:
CNVDict[cell] = {}
cnv = None
n = -1
chrs = []
tmp = adata_ref_raw[adata_ref_raw.obs['CellType']==cell]
X = np.array(tmp.X.todense()).T # genes X cell
cts_raw = pd.DataFrame(X, index=tmp.var.index, columns=tmp.obs.index) # gene X cell
genes, cells, ER_MvAvg = calcMovingAvg(cts_raw, gene_pos, win=100) # gene X cell
cts = pd.DataFrame(ER_MvAvg, index=genes, columns=cells)
CNVDict[cell]['WindowCts'] = cts.T # cell X genes
CNVDict[cell]['gmm'] = {}
for arm in chr_pos.index:
chrom, start_pos, stop_pos = chr_pos.loc[arm,:]
genes = grep_genes(chrom,start_pos,stop_pos,gene_pos)
genes_s = [g for g in genes if g in cts.index]
if len(genes_s) < min_genes_per_arm:
print('Low cov: {} {}'.format(cell, chrom))
if len(genes_s) > min_genes_per_arm:
x = np.mean(cts.loc[genes_s,:], axis=0).values
X = np.reshape(x, (len(x),1))
best_gmm = pred_model(X)
if best_gmm.n_components > 1:
predicts = best_gmm.predict(X)
probs = best_gmm.predict_proba(X)
for i in range(best_gmm.n_components):
cells_cl = np.where(predicts==i)[0]
probs_cl = probs[cells_cl,i].flatten()
if sum(probs_cl > min_cl_prob) < min_cl_cells:
break
elif sum(probs_cl > min_cl_prob) > min_cl_cells:
if i < best_gmm.n_components-1:
continue
if i==(best_gmm.n_components-1):
print('\n',cell, arm, len(genes_s), best_gmm.n_components, Counter(best_gmm.predict(X)))
n += 1
chrs.append(arm)
if n == 0:
cnv = x
elif n > 0:
cnv = np.vstack((cnv, x))
CNVDict[cell]['gmm'][arm] = best_gmm
if cnv is None or len(cnv.shape) < 2:
continue
cnv_df = pd.DataFrame(cnv, index=chrs, columns=cts.columns) # Arm X cells
CNVDict[cell]['ArmCts'] = cnv_df.T # cell X Arms
# heatmap (cell X Arm)
width = len(chrs)*0.7
g = sns.clustermap(cnv_df.T, method='ward', metric='euclidean',
cmap='coolwarm', figsize=(width, 5))
g.ax_heatmap.set_yticklabels('')
g.ax_heatmap.set_xlabel(cell)
xticklabels = cnv_df.T.columns[np.array(g.dendrogram_col.reordered_ind)]
g.ax_heatmap.set_xticks([x+0.5 for x in range(len(xticklabels))])
g.ax_heatmap.set_xticklabels(xticklabels, rotation=60)
cbar = g.cax
x0,y0,x1,y1 = np.array(g.ax_heatmap.get_position()).flatten()
cbar.set_position([1.3, y0, 0.2, y1-y0])
CNVDict[cell]['ArmCtsHeatmap'] = g