-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild_df.py
86 lines (62 loc) · 2.54 KB
/
build_df.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
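'''
```
$ wc -l input/*.csv
85004 input/dipole_moments.csv dipole moment per molecule
85004 input/potential_energy.csv potential energy per molecule
1533538 input/magnetic_shielding_tensors.csv magnetic shielding (chemical shift) tensor per atom
1533538 input/mulliken_charges.csv Mulliken charge per atom (the original note read "Mulliken electronegativity (kJ/mol)")
2358658 input/structures.csv structure data per atom (130775 molecules in total)
4658148 input/scalar_coupling_contributions.csv type, fc, sd, pso, dso per atom pair
4658148 input/train.csv scalar coupling constant per atom pair
2505543 input/test.csv atom pairs whose scalar coupling constant must be predicted
```
* Number of test molecules: 45772
* Number of test atoms: (not filled in)
* Number of training molecules: 85003
* Number of training atoms: 1533538
* Number of test atom pairs: 2505542
* Number of training atom pairs: 4658147
'''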
def build_molecule_df(input_dir='./input'):
    '''Build the per-molecule table.

    Reads ``dipole_moments.csv`` and ``potential_energy.csv`` from
    ``input_dir`` and joins them on ``molecule_name``. ``pd.merge`` is
    called without ``how``, so its default inner join applies: only
    molecules present in both files are kept.

    Args:
        input_dir: Directory containing the input CSV files. Defaults to
            ``'./input'``, the location that was previously hard-coded.

    Returns:
        The merged ``pandas.DataFrame``, one row per matched molecule.
    '''
    mole_dipole = pd.read_csv(os.path.join(input_dir, 'dipole_moments.csv'))
    mole_energy = pd.read_csv(os.path.join(input_dir, 'potential_energy.csv'))
    molecule = pd.merge(mole_dipole, mole_energy, on=['molecule_name'])
    # Sanity output: row count followed by the first few rows.
    print(len(molecule), molecule.head())
    return molecule
def build_atom_df(input_dir='./input'):
    '''Build the per-atom table.

    Starts from ``structures.csv`` and left-joins
    ``magnetic_shielding_tensors.csv`` and then ``mulliken_charges.csv``
    on ``(molecule_name, atom_index)``. Because both joins are left joins,
    every row of ``structures.csv`` is kept; atoms missing from the other
    two files get NaN in the columns coming from those files.

    Args:
        input_dir: Directory containing the input CSV files. Defaults to
            ``'./input'``, the location that was previously hard-coded.

    Returns:
        The merged ``pandas.DataFrame``.
    '''
    atom_shield = pd.read_csv(
        os.path.join(input_dir, 'magnetic_shielding_tensors.csv'))
    atom_charge = pd.read_csv(os.path.join(input_dir, 'mulliken_charges.csv'))
    atom_struct = pd.read_csv(os.path.join(input_dir, 'structures.csv'))

    join_keys = ['molecule_name', 'atom_index']
    atom = pd.merge(atom_struct, atom_shield, on=join_keys, how='left')
    atom = pd.merge(atom, atom_charge, on=join_keys, how='left')
    # Sanity output: row count followed by the first few rows.
    print(len(atom), atom.head())
    return atom
def build_couple_df(input_dir='./input'):
    '''Build the per-atom-pair (coupling) table for the training set.

    Left-joins ``scalar_coupling_contributions.csv`` onto ``train.csv`` on
    ``(molecule_name, atom_index_0, atom_index_1, type)``. Every row of
    ``train.csv`` is kept; rows without a matching contribution record get
    NaN in the contribution columns. ``test.csv`` is not read here.

    Args:
        input_dir: Directory containing the input CSV files. Defaults to
            ``'./input'``, the location that was previously hard-coded.

    Returns:
        The merged ``pandas.DataFrame``.
    '''
    contributions = pd.read_csv(
        os.path.join(input_dir, 'scalar_coupling_contributions.csv'))
    train = pd.read_csv(os.path.join(input_dir, 'train.csv'))
    coupling = pd.merge(
        train, contributions,
        on=['molecule_name', 'atom_index_0', 'atom_index_1', 'type'],
        how='left')
    return coupling
def main(output_dir='./temp'):
    '''Build the molecule, atom and coupling tables and write them as CSV.

    Writes ``molecule.csv``, ``atom.csv`` and ``couping.csv`` into
    ``output_dir``. ``to_csv`` is called with its defaults, so each file
    also contains the DataFrame index as its first column.

    Args:
        output_dir: Directory that receives the CSV files. Defaults to
            ``'./temp'``, the location that was previously hard-coded.
            It is created if it does not exist.
    '''
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the target exists before doing the expensive reads and merges.
    os.makedirs(output_dir, exist_ok=True)

    mols = build_molecule_df()
    atom = build_atom_df()
    coupling = build_couple_df()

    mols.to_csv(os.path.join(output_dir, 'molecule.csv'))
    atom.to_csv(os.path.join(output_dir, 'atom.csv'))
    # NOTE(review): 'couping.csv' looks like a typo for 'coupling.csv'. The
    # name is kept unchanged because other scripts may read this exact file.
    coupling.to_csv(os.path.join(output_dir, 'couping.csv'))


if __name__ == '__main__':
    main()