Skip to content

Commit 1bbc636

Browse files
authored
Add files via upload
1 parent a70cd14 commit 1bbc636

File tree

1 file changed

+394
-0
lines changed

1 file changed

+394
-0
lines changed

others/Filtering.ipynb

Lines changed: 394 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,394 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 2,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"from rdkit import Chem\n",
10+
"from rdkit.Chem import AllChem\n",
11+
"from rdkit.Chem.rdmolops import GetFormalCharge\n",
12+
"from rdkit.Chem.rdMolDescriptors import CalcMolFormula, CalcExactMolWt\n",
13+
"from pandas import Series,DataFrame\n",
14+
"import pandas as pd"
15+
]
16+
},
17+
{
18+
"cell_type": "code",
19+
"execution_count": 3,
20+
"metadata": {},
21+
"outputs": [
22+
{
23+
"data": {
24+
"text/html": [
25+
"<div>\n",
26+
"<style scoped>\n",
27+
" .dataframe tbody tr th:only-of-type {\n",
28+
" vertical-align: middle;\n",
29+
" }\n",
30+
"\n",
31+
" .dataframe tbody tr th {\n",
32+
" vertical-align: top;\n",
33+
" }\n",
34+
"\n",
35+
" .dataframe thead th {\n",
36+
" text-align: right;\n",
37+
" }\n",
38+
"</style>\n",
39+
"<table border=\"1\" class=\"dataframe\">\n",
40+
" <thead>\n",
41+
" <tr style=\"text-align: right;\">\n",
42+
" <th></th>\n",
43+
" <th>Pubchem ID</th>\n",
44+
" <th>Inchi</th>\n",
45+
" <th>Exac Min</th>\n",
46+
" <th>Exac Max</th>\n",
47+
" <th>CCS Min</th>\n",
48+
" <th>CCS Max</th>\n",
49+
" <th>Exac</th>\n",
50+
" <th>CCS</th>\n",
51+
" <th>Adduct</th>\n",
52+
" <th>ID</th>\n",
53+
" </tr>\n",
54+
" </thead>\n",
55+
" <tbody>\n",
56+
" <tr>\n",
57+
" <th>0</th>\n",
58+
" <td>5280637</td>\n",
59+
" <td>InChI=1S/C21H20O11/c22-7-16-18(27)19(28)20(29)...</td>\n",
60+
" <td>448.088695</td>\n",
61+
" <td>448.111100</td>\n",
62+
" <td>201.21</td>\n",
63+
" <td>216.81</td>\n",
64+
" <td>448.099898</td>\n",
65+
" <td>209.006823</td>\n",
66+
" <td>[M+H]+</td>\n",
67+
" <td>Y1</td>\n",
68+
" </tr>\n",
69+
" <tr>\n",
70+
" <th>1</th>\n",
71+
" <td>442439</td>\n",
72+
" <td>InChI=1S/C28H34O15/c1-10-21(33)23(35)25(37)27(...</td>\n",
73+
" <td>610.173852</td>\n",
74+
" <td>610.204362</td>\n",
75+
" <td>220.40</td>\n",
76+
" <td>238.60</td>\n",
77+
" <td>610.189107</td>\n",
78+
" <td>229.497183</td>\n",
79+
" <td>[M-H]-</td>\n",
80+
" <td>Y2</td>\n",
81+
" </tr>\n",
82+
" <tr>\n",
83+
" <th>2</th>\n",
84+
" <td>30231</td>\n",
85+
" <td>InChI=1S/C28H36O15/c1-11-21(34)23(36)25(38)27(...</td>\n",
86+
" <td>612.189452</td>\n",
87+
" <td>612.220062</td>\n",
88+
" <td>225.22</td>\n",
89+
" <td>242.69</td>\n",
90+
" <td>612.204757</td>\n",
91+
" <td>233.956908</td>\n",
92+
" <td>[M+H]+</td>\n",
93+
" <td>Y3</td>\n",
94+
" </tr>\n",
95+
" <tr>\n",
96+
" <th>3</th>\n",
97+
" <td>73659</td>\n",
98+
" <td>InChI=1S/C30H48O4/c1-25(2)12-14-30(24(33)34)15...</td>\n",
99+
" <td>472.342788</td>\n",
100+
" <td>472.366405</td>\n",
101+
" <td>213.71</td>\n",
102+
" <td>231.36</td>\n",
103+
" <td>472.354597</td>\n",
104+
" <td>222.535275</td>\n",
105+
" <td>[M-H]-</td>\n",
106+
" <td>Y4</td>\n",
107+
" </tr>\n",
108+
" <tr>\n",
109+
" <th>4</th>\n",
110+
" <td>9809542</td>\n",
111+
" <td>InChI=1S/C36H62O9/c1-19(2)10-9-13-36(8,45-31-2...</td>\n",
112+
" <td>638.422759</td>\n",
113+
" <td>638.454681</td>\n",
114+
" <td>248.90</td>\n",
115+
" <td>269.46</td>\n",
116+
" <td>638.438720</td>\n",
117+
" <td>259.180746</td>\n",
118+
" <td>[M-H]-</td>\n",
119+
" <td>Y5</td>\n",
120+
" </tr>\n",
121+
" </tbody>\n",
122+
"</table>\n",
123+
"</div>"
124+
],
125+
"text/plain": [
126+
" Pubchem ID Inchi Exac Min \\\n",
127+
"0 5280637 InChI=1S/C21H20O11/c22-7-16-18(27)19(28)20(29)... 448.088695 \n",
128+
"1 442439 InChI=1S/C28H34O15/c1-10-21(33)23(35)25(37)27(... 610.173852 \n",
129+
"2 30231 InChI=1S/C28H36O15/c1-11-21(34)23(36)25(38)27(... 612.189452 \n",
130+
"3 73659 InChI=1S/C30H48O4/c1-25(2)12-14-30(24(33)34)15... 472.342788 \n",
131+
"4 9809542 InChI=1S/C36H62O9/c1-19(2)10-9-13-36(8,45-31-2... 638.422759 \n",
132+
"\n",
133+
" Exac Max CCS Min CCS Max Exac CCS Adduct ID \n",
134+
"0 448.111100 201.21 216.81 448.099898 209.006823 [M+H]+ Y1 \n",
135+
"1 610.204362 220.40 238.60 610.189107 229.497183 [M-H]- Y2 \n",
136+
"2 612.220062 225.22 242.69 612.204757 233.956908 [M+H]+ Y3 \n",
137+
"3 472.366405 213.71 231.36 472.354597 222.535275 [M-H]- Y4 \n",
138+
"4 638.454681 248.90 269.46 638.438720 259.180746 [M-H]- Y5 "
139+
]
140+
},
141+
"execution_count": 3,
142+
"metadata": {},
143+
"output_type": "execute_result"
144+
}
145+
],
146+
"source": [
147+
"D = pd.read_csv('data.csv')\n",
148+
"adduct_type = list(D['Adduct'])\n",
149+
"pubchem_id = list(D['Pubchem ID'])\n",
150+
"exact_min = list(D['Exac Min'])\n",
151+
"exact_max = list(D['Exac Max'])\n",
152+
"ccs_min = list(D['CCS Min'])\n",
153+
"ccs_max = list(D['CCS Max'])\n",
154+
"ccs_true = list(D['CCS'])\n",
155+
"exact_true = list(D['Exac'])\n",
156+
"ID = list(D['ID'])\n",
157+
"D"
158+
]
159+
},
160+
{
161+
"cell_type": "code",
162+
"execution_count": 1,
163+
"metadata": {},
164+
"outputs": [],
165+
"source": [
166+
"fenshu = []\n",
167+
"excat_num = []\n",
168+
"ccs_num = []\n",
169+
"for index in range(len(adduct_type)):\n",
170+
" \n",
171+
" data_exm = []\n",
172+
" data_ccs = []\n",
173+
"\n",
174+
" for i in range(1,156000000,500000):\n",
175+
" filename = str(i).zfill(9) + '_' + str(i+500000-1).zfill(9)\n",
176+
" \n",
177+
" data = pd.read_csv('Compound_'+filename+'.csv')\n",
178+
" data = data[data['EXACT MASS'] >= exact_min[index]]\n",
179+
" data = data[data['EXACT MASS'] <= exact_max[index]]\n",
180+
" data_exm.append(data)\n",
181+
" \n",
182+
" data = data[data[adduct_type[index]] >= ccs_min[index]]\n",
183+
" data = data[data[adduct_type[index]] <= ccs_max[index]]\n",
184+
" data_ccs.append(data)\n",
185+
" \n",
186+
" data_exm_s = pd.concat(data_exm, axis=0, ignore_index=True)\n",
187+
" data_ccs_s = pd.concat(data_ccs, axis=0, ignore_index=True)\n",
188+
" \n",
189+
" excat_num.append(len(list(data_exm_s['Pubchem ID'])))\n",
190+
" ccs_num.append(len(list(data_ccs_s['Pubchem ID'])))\n",
191+
" \n",
192+
" inchis = []\n",
193+
" data_ccs_s_smiles = list(data_ccs_s['ISO SMILES'])\n",
194+
" for smi in data_ccs_s_smiles:\n",
195+
" mol = Chem.MolFromSmiles(smi)\n",
196+
" inchi = Chem.inchi.MolToInchi(mol)\n",
197+
" inchis.append(inchi)\n",
198+
" data_ccs_s['INCHI'] = inchis\n",
199+
" \n",
200+
" RE = []\n",
201+
" ccs_pred = list(data_ccs_s[adduct_type[index]])\n",
202+
" for ccs in ccs_pred:\n",
203+
" RE.append((ccs_true[index]-ccs)/ccs_true[index]*100)\n",
204+
" \n",
205+
" try:\n",
206+
" this_ccs_pred = list(data_ccs_s[data_ccs_s['Pubchem ID'].isin([pubchem_id[index]])][adduct_type[index]])[0]\n",
207+
" this_ccs_re = (ccs_true[index]-this_ccs_pred)/ccs_true[index]*100\n",
208+
" if this_ccs_re < 0:\n",
209+
" fenshu.append((1-this_ccs_re/min(RE))*100)\n",
210+
" else:\n",
211+
" fenshu.append((1-this_ccs_re/max(RE))*100)\n",
212+
" except:\n",
213+
" fenshu.append('null')\n",
214+
" \n",
215+
" data_ccs_s.to_csv(str(ID[index])+'.csv',index=False)"
216+
]
217+
},
218+
{
219+
"cell_type": "code",
220+
"execution_count": 5,
221+
"metadata": {},
222+
"outputs": [
223+
{
224+
"data": {
225+
"text/html": [
226+
"<div>\n",
227+
"<style scoped>\n",
228+
" .dataframe tbody tr th:only-of-type {\n",
229+
" vertical-align: middle;\n",
230+
" }\n",
231+
"\n",
232+
" .dataframe tbody tr th {\n",
233+
" vertical-align: top;\n",
234+
" }\n",
235+
"\n",
236+
" .dataframe thead th {\n",
237+
" text-align: right;\n",
238+
" }\n",
239+
"</style>\n",
240+
"<table border=\"1\" class=\"dataframe\">\n",
241+
" <thead>\n",
242+
" <tr style=\"text-align: right;\">\n",
243+
" <th></th>\n",
244+
" <th>Pubchem ID</th>\n",
245+
" <th>Inchi</th>\n",
246+
" <th>Exac Min</th>\n",
247+
" <th>Exac Max</th>\n",
248+
" <th>CCS Min</th>\n",
249+
" <th>CCS Max</th>\n",
250+
" <th>Exac</th>\n",
251+
" <th>CCS</th>\n",
252+
" <th>Adduct</th>\n",
253+
" <th>ID</th>\n",
254+
" <th>excat_num</th>\n",
255+
" <th>ccs_num</th>\n",
256+
" </tr>\n",
257+
" </thead>\n",
258+
" <tbody>\n",
259+
" <tr>\n",
260+
" <th>0</th>\n",
261+
" <td>5280637</td>\n",
262+
" <td>InChI=1S/C21H20O11/c22-7-16-18(27)19(28)20(29)...</td>\n",
263+
" <td>448.088695</td>\n",
264+
" <td>448.111100</td>\n",
265+
" <td>201.21</td>\n",
266+
" <td>216.81</td>\n",
267+
" <td>448.099898</td>\n",
268+
" <td>209.006823</td>\n",
269+
" <td>[M+H]+</td>\n",
270+
" <td>Y1</td>\n",
271+
" <td>14629</td>\n",
272+
" <td>9472</td>\n",
273+
" </tr>\n",
274+
" <tr>\n",
275+
" <th>1</th>\n",
276+
" <td>442439</td>\n",
277+
" <td>InChI=1S/C28H34O15/c1-10-21(33)23(35)25(37)27(...</td>\n",
278+
" <td>610.173852</td>\n",
279+
" <td>610.204362</td>\n",
280+
" <td>220.40</td>\n",
281+
" <td>238.60</td>\n",
282+
" <td>610.189107</td>\n",
283+
" <td>229.497183</td>\n",
284+
" <td>[M-H]-</td>\n",
285+
" <td>Y2</td>\n",
286+
" <td>3827</td>\n",
287+
" <td>1099</td>\n",
288+
" </tr>\n",
289+
" <tr>\n",
290+
" <th>2</th>\n",
291+
" <td>30231</td>\n",
292+
" <td>InChI=1S/C28H36O15/c1-11-21(34)23(36)25(38)27(...</td>\n",
293+
" <td>612.189452</td>\n",
294+
" <td>612.220062</td>\n",
295+
" <td>225.22</td>\n",
296+
" <td>242.69</td>\n",
297+
" <td>612.204757</td>\n",
298+
" <td>233.956908</td>\n",
299+
" <td>[M+H]+</td>\n",
300+
" <td>Y3</td>\n",
301+
" <td>3944</td>\n",
302+
" <td>1542</td>\n",
303+
" </tr>\n",
304+
" <tr>\n",
305+
" <th>3</th>\n",
306+
" <td>73659</td>\n",
307+
" <td>InChI=1S/C30H48O4/c1-25(2)12-14-30(24(33)34)15...</td>\n",
308+
" <td>472.342788</td>\n",
309+
" <td>472.366405</td>\n",
310+
" <td>213.71</td>\n",
311+
" <td>231.36</td>\n",
312+
" <td>472.354597</td>\n",
313+
" <td>222.535275</td>\n",
314+
" <td>[M-H]-</td>\n",
315+
" <td>Y4</td>\n",
316+
" <td>2254</td>\n",
317+
" <td>1807</td>\n",
318+
" </tr>\n",
319+
" <tr>\n",
320+
" <th>4</th>\n",
321+
" <td>9809542</td>\n",
322+
" <td>InChI=1S/C36H62O9/c1-19(2)10-9-13-36(8,45-31-2...</td>\n",
323+
" <td>638.422759</td>\n",
324+
" <td>638.454681</td>\n",
325+
" <td>248.90</td>\n",
326+
" <td>269.46</td>\n",
327+
" <td>638.438720</td>\n",
328+
" <td>259.180746</td>\n",
329+
" <td>[M-H]-</td>\n",
330+
" <td>Y5</td>\n",
331+
" <td>733</td>\n",
332+
" <td>505</td>\n",
333+
" </tr>\n",
334+
" </tbody>\n",
335+
"</table>\n",
336+
"</div>"
337+
],
338+
"text/plain": [
339+
" Pubchem ID Inchi Exac Min \\\n",
340+
"0 5280637 InChI=1S/C21H20O11/c22-7-16-18(27)19(28)20(29)... 448.088695 \n",
341+
"1 442439 InChI=1S/C28H34O15/c1-10-21(33)23(35)25(37)27(... 610.173852 \n",
342+
"2 30231 InChI=1S/C28H36O15/c1-11-21(34)23(36)25(38)27(... 612.189452 \n",
343+
"3 73659 InChI=1S/C30H48O4/c1-25(2)12-14-30(24(33)34)15... 472.342788 \n",
344+
"4 9809542 InChI=1S/C36H62O9/c1-19(2)10-9-13-36(8,45-31-2... 638.422759 \n",
345+
"\n",
346+
" Exac Max CCS Min CCS Max Exac CCS Adduct ID \\\n",
347+
"0 448.111100 201.21 216.81 448.099898 209.006823 [M+H]+ Y1 \n",
348+
"1 610.204362 220.40 238.60 610.189107 229.497183 [M-H]- Y2 \n",
349+
"2 612.220062 225.22 242.69 612.204757 233.956908 [M+H]+ Y3 \n",
350+
"3 472.366405 213.71 231.36 472.354597 222.535275 [M-H]- Y4 \n",
351+
"4 638.454681 248.90 269.46 638.438720 259.180746 [M-H]- Y5 \n",
352+
"\n",
353+
" excat_num ccs_num \n",
354+
"0 14629 9472 \n",
355+
"1 3827 1099 \n",
356+
"2 3944 1542 \n",
357+
"3 2254 1807 \n",
358+
"4 733 505 "
359+
]
360+
},
361+
"execution_count": 5,
362+
"metadata": {},
363+
"output_type": "execute_result"
364+
}
365+
],
366+
"source": [
367+
"D['excat_num'] = excat_num\n",
368+
"D['ccs_num'] = ccs_num\n",
369+
"D"
370+
]
371+
}
372+
],
373+
"metadata": {
374+
"kernelspec": {
375+
"display_name": "Python 3",
376+
"language": "python",
377+
"name": "python3"
378+
},
379+
"language_info": {
380+
"codemirror_mode": {
381+
"name": "ipython",
382+
"version": 3
383+
},
384+
"file_extension": ".py",
385+
"mimetype": "text/x-python",
386+
"name": "python",
387+
"nbconvert_exporter": "python",
388+
"pygments_lexer": "ipython3",
389+
"version": "3.8.5"
390+
}
391+
},
392+
"nbformat": 4,
393+
"nbformat_minor": 5
394+
}

0 commit comments

Comments
 (0)