|
1 | 1 | from ydata_synthetic.synthesizers.regular import CGAN |
2 | | -from ydata_synthetic.preprocessing.regular.credit_fraud import transformations |
3 | 2 | from ydata_synthetic.synthesizers import ModelParameters, TrainParameters |
4 | 3 |
|
5 | 4 | import pandas as pd |
6 | 5 | import numpy as np |
7 | 6 | from sklearn import cluster |
8 | 7 |
|
| 8 | +model = CGAN |
| 9 | + |
9 | 10 | #Read the original data (preprocessing is handled inside the synthesizer) |
10 | 11 | data = pd.read_csv('data/creditcard.csv', index_col=[0]) |
11 | 12 |
|
12 | 13 | #List of columns different from the Class column |
13 | | -data_cols = list(data.columns[ data.columns != 'Class' ]) |
14 | | -label_cols = ['Class'] |
| 14 | +num_cols = list(data.columns[ data.columns != 'Class' ]) |
| 15 | +cat_cols = [] # Condition features are not preprocessed and therefore not listed here |
15 | 16 |
|
16 | | -print('Dataset columns: {}'.format(data_cols)) |
| 17 | +print('Dataset columns: {}'.format(num_cols)) |
17 | 18 | sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class'] |
18 | | -processed_data = data[ sorted_cols ].copy() |
19 | | - |
20 | | -#Before training the GAN do not forget to apply the required data transformations |
21 | | -#To ease here we've applied a PowerTransformation |
22 | | -_, data, _ = transformations(data) |
| 19 | +data = data[ sorted_cols ].copy() |
23 | 20 |
|
24 | 21 | #For the purpose of this example we will only synthesize the minority class |
25 | 22 | train_data = data.loc[ data['Class']==1 ].copy() |
|
28 | 25 | print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1])) |
29 | 26 | algorithm = cluster.KMeans |
30 | 27 | args, kwds = (), {'n_clusters':2, 'random_state':0} |
31 | | -labels = algorithm(*args, **kwds).fit_predict(train_data[ data_cols ]) |
| 28 | +labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ]) |
32 | 29 |
|
33 | 30 | print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) ) |
34 | 31 |
|
|
51 | 48 | learning_rate = 5e-4 |
52 | 49 | models_dir = './cache' |
53 | 50 |
|
54 | | -train_sample = fraud_w_classes.copy().reset_index(drop=True) |
55 | | -train_sample = pd.get_dummies(train_sample, columns=['Class'], prefix='Class', drop_first=True) |
56 | | -label_cols = [list(train_sample.columns).index(i) for i in train_sample.columns if 'Class' in i ] |
57 | | -data_cols = [ i for i in train_sample.columns if i not in label_cols ] |
58 | | -train_sample[ data_cols ] = train_sample[ data_cols ] / 10 # scale to random noise size, one less thing to learn |
59 | | -train_no_label = train_sample[ data_cols ] |
60 | | - |
61 | 51 | #Define the model parameters for the GAN |
62 | 52 | gan_args = ModelParameters(batch_size=batch_size, |
63 | 53 | lr=learning_rate, |
64 | 54 | betas=(beta_1, beta_2), |
65 | 55 | noise_dim=noise_dim, |
66 | | - n_cols=train_sample.shape[1] - len(label_cols), # Don't count the label columns here |
67 | 56 | layers_dim=dim) |
68 | 57 |
|
69 | 58 | train_args = TrainParameters(epochs=epochs, |
|
73 | 62 | labels=(0,1)) |
74 | 63 |
|
75 | 64 | #Init the Conditional GAN providing the model parameters and the number of classes |
76 | | -synthesizer = CGAN(model_parameters=gan_args, num_classes=2) |
| 65 | +synthesizer = model(model_parameters=gan_args, num_classes=2) |
77 | 66 |
|
78 | 67 | #Training the Conditional GAN |
79 | | -synthesizer.train(data=train_sample, label="Class",train_arguments=train_args) |
| 68 | +synthesizer.train(data=fraud_w_classes, label_col="Class", train_arguments=train_args, |
| 69 | + num_cols=num_cols, cat_cols=cat_cols) |
80 | 70 |
|
81 | 71 | #Saving the synthesizer |
82 | 72 | synthesizer.save('cgan_synthtrained.pkl') |
| 73 | + |
| 74 | +#Loading the synthesizer |
| 75 | +synthesizer = model.load('cgan_synthtrained.pkl') |
| 76 | + |
| 77 | +#Sampling from the synthesizer |
| 78 | +cond_array = np.array([0]) |
| 79 | +# Synthesizer samples are returned in the original format (inverse_transform of internal processing already took place) |
| 80 | +synth_data = synthesizer.sample(cond_array, 1000)  # keep `synthesizer` bound to the loaded model; samples get their own name |
0 commit comments