Skip to content

Commit 1a57bb9

Browse files
author
Francisco Santos
committed
PATEGAN base implementation
1 parent 08d3cae commit 1a57bb9

File tree

28 files changed

+2347
-717
lines changed

28 files changed

+2347
-717
lines changed

examples/regular/adult_dragan.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
1-
from ydata_synthetic.preprocessing.regular.adult import transformations
1+
from pmlb import fetch_data
2+
23
from ydata_synthetic.synthesizers.regular import DRAGAN
34
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
45

5-
#Load and process the data
6-
data, processed_data, preprocessor = transformations()
6+
model = DRAGAN
7+
8+
#Load data and define the data processor parameters
9+
data = fetch_data('adult')
10+
num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
11+
cat_cols = ['workclass','education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
12+
'native-country', 'target']
713

8-
# WGAN_GP training
9-
#Defininf the training parameters of WGAN_GP
14+
# DRAGAN training
15+
#Defining the training parameters of DRAGAN
1016

1117
noise_dim = 128
1218
dim = 128
@@ -23,12 +29,14 @@
2329
lr=learning_rate,
2430
betas=(beta_1, beta_2),
2531
noise_dim=noise_dim,
26-
n_cols=processed_data.shape[1],
2732
layers_dim=dim)
2833

2934
train_args = TrainParameters(epochs=epochs,
3035
sample_interval=log_step)
3136

32-
synthesizer = DRAGAN(gan_args, n_discriminator=3)
33-
synthesizer.train(processed_data, train_args)
37+
synthesizer = model(gan_args, n_discriminator=3)
38+
synthesizer.train(data, train_args, num_cols, cat_cols, preprocess = True)
3439
synthesizer.save('adult_synth.pkl')
40+
41+
synthesizer = model.load('adult_synth.pkl')
42+
# Keep the generated records so the example's output can actually be inspected or used
synth_data = synthesizer.sample(1000)

examples/regular/adult_wgangp.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,21 @@
1-
from ydata_synthetic.preprocessing.regular.adult import transformations
1+
from pmlb import fetch_data
2+
23
from ydata_synthetic.synthesizers.regular import WGAN_GP
34
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
45

5-
#Load and process the data
6-
data, processed_data, preprocessor = transformations()
6+
model = WGAN_GP
7+
8+
#Load data and define the data processor parameters
9+
data = fetch_data('adult')
10+
num_cols = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
11+
cat_cols = ['workclass','education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
12+
'native-country', 'target']
713

8-
# WGAN_GP training
9-
#Defining the training parameters of WGAN_GP
14+
#Defining the training parameters
1015

11-
noise_dim = 32
16+
noise_dim = 128
1217
dim = 128
13-
batch_size = 128
18+
batch_size = 500
1419

1520
log_step = 100
1621
epochs = 300+1
@@ -23,14 +28,15 @@
2328
lr=learning_rate,
2429
betas=(beta_1, beta_2),
2530
noise_dim=noise_dim,
26-
n_cols=processed_data.shape[1],
2731
layers_dim=dim)
2832

2933
train_args = TrainParameters(epochs=epochs,
3034
sample_interval=log_step)
3135

32-
synthesizer = WGAN_GP(gan_args, n_critic=2)
33-
synthesizer.train(processed_data, train_args)
36+
synthesizer = model(gan_args, n_critic=2)
37+
synthesizer.train(data, train_args, num_cols, cat_cols, preprocess = True)
3438

35-
synth_data = synthesizer.sample(1000)
3639
synthesizer.save('test.pkl')
40+
41+
synthesizer = model.load('test.pkl')
42+
synth_data = synthesizer.sample(1000)

examples/regular/cgan_example.py

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,22 @@
11
from ydata_synthetic.synthesizers.regular import CGAN
2-
from ydata_synthetic.preprocessing.regular.credit_fraud import transformations
32
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
43

54
import pandas as pd
65
import numpy as np
76
from sklearn import cluster
87

8+
model = CGAN
9+
910
#Read the original data (no manual preprocessing here; the synthesizer processes it internally)
1011
data = pd.read_csv('data/creditcard.csv', index_col=[0])
1112

1213
#List of columns different from the Class column
13-
data_cols = list(data.columns[ data.columns != 'Class' ])
14-
label_cols = ['Class']
14+
num_cols = list(data.columns[ data.columns != 'Class' ])
15+
cat_cols = [] # Condition features are not preprocessed and therefore not listed here
1516

16-
print('Dataset columns: {}'.format(data_cols))
17+
print('Dataset columns: {}'.format(num_cols))
1718
sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
18-
processed_data = data[ sorted_cols ].copy()
19-
20-
#Before training the GAN do not forget to apply the required data transformations
21-
#To ease here we've applied a PowerTransformation
22-
_, data, _ = transformations(data)
19+
data = data[ sorted_cols ].copy()
2320

2421
#For the purpose of this example we will only synthesize the minority class
2522
train_data = data.loc[ data['Class']==1 ].copy()
@@ -28,7 +25,7 @@
2825
print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1]))
2926
algorithm = cluster.KMeans
3027
args, kwds = (), {'n_clusters':2, 'random_state':0}
31-
labels = algorithm(*args, **kwds).fit_predict(train_data[ data_cols ])
28+
labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ])
3229

3330
print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) )
3431

@@ -51,19 +48,11 @@
5148
learning_rate = 5e-4
5249
models_dir = './cache'
5350

54-
train_sample = fraud_w_classes.copy().reset_index(drop=True)
55-
train_sample = pd.get_dummies(train_sample, columns=['Class'], prefix='Class', drop_first=True)
56-
label_cols = [list(train_sample.columns).index(i) for i in train_sample.columns if 'Class' in i ]
57-
data_cols = [ i for i in train_sample.columns if i not in label_cols ]
58-
train_sample[ data_cols ] = train_sample[ data_cols ] / 10 # scale to random noise size, one less thing to learn
59-
train_no_label = train_sample[ data_cols ]
60-
6151
#Test here the new inputs
6252
gan_args = ModelParameters(batch_size=batch_size,
6353
lr=learning_rate,
6454
betas=(beta_1, beta_2),
6555
noise_dim=noise_dim,
66-
n_cols=train_sample.shape[1] - len(label_cols), # Don't count the label columns here
6756
layers_dim=dim)
6857

6958
train_args = TrainParameters(epochs=epochs,
@@ -73,10 +62,19 @@
7362
labels=(0,1))
7463

7564
#Init the Conditional GAN providing the model parameters and the number of classes as arguments
76-
synthesizer = CGAN(model_parameters=gan_args, num_classes=2)
65+
synthesizer = model(model_parameters=gan_args, num_classes=2)
7766

7867
#Training the Conditional GAN
79-
synthesizer.train(data=train_sample, label="Class",train_arguments=train_args)
68+
synthesizer.train(data=fraud_w_classes, label_col="Class", train_arguments=train_args,
69+
num_cols=num_cols, cat_cols=cat_cols)
8070

8171
#Saving the synthesizer
8272
synthesizer.save('cgan_synthtrained.pkl')
73+
74+
#Loading the synthesizer
75+
synthesizer = model.load('cgan_synthtrained.pkl')
76+
77+
#Sampling from the synthesizer
78+
cond_array = np.array([0])
79+
# Synthesizer samples are returned in the original format (inverse_transform of internal processing already took place)
80+
# Bind the samples to their own name: reusing `synthesizer` would overwrite the loaded model
synth_data = synthesizer.sample(cond_array, 1000)

examples/regular/cramergan_example.py

Lines changed: 8 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,34 +6,28 @@
66

77
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
88
from ydata_synthetic.synthesizers.regular import CRAMERGAN
9-
from ydata_synthetic.preprocessing.regular.credit_fraud import transformations
109

1110
model = CRAMERGAN
1211

1312
#Read the original data (no manual preprocessing here; column types are passed to train below)
1413
data = pd.read_csv('data/creditcard.csv', index_col=[0])
1514

16-
#Data processing and analysis
17-
data_cols = list(data.columns[ data.columns != 'Class' ])
18-
label_cols = ['Class']
15+
#List of columns different from the Class column
16+
num_cols = list(data.columns[ data.columns != 'Class' ])
17+
cat_cols = ['Class']
1918

20-
print('Dataset columns: {}'.format(data_cols))
19+
print('Dataset columns: {}'.format(num_cols))
2120
sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
22-
processed_data = data[ sorted_cols ].copy()
23-
24-
#Before training the GAN do not forget to apply the required data transformations
25-
#To ease here we've applied a PowerTransformation
26-
_, data, _ = transformations(data)
27-
21+
data = data[ sorted_cols ].copy()
2822

2923
#For the purpose of this example we will only synthesize the minority class
3024
train_data = data.loc[ data['Class']==1 ].copy()
3125

26+
#Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional GAN
3227
print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1]))
33-
3428
algorithm = cluster.KMeans
3529
args, kwds = (), {'n_clusters':2, 'random_state':0}
36-
labels = algorithm(*args, **kwds).fit_predict(train_data[ data_cols ])
30+
labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ])
3731

3832
print( pd.DataFrame( [ [np.sum(labels==i)] for i in np.unique(labels) ], columns=['count'], index=np.unique(labels) ) )
3933

@@ -53,29 +47,18 @@
5347
beta_2 = 0.9
5448
models_dir = './cache'
5549

56-
train_sample = fraud_w_classes.copy().reset_index(drop=True)
57-
train_sample = pd.get_dummies(train_sample, columns=['Class'], prefix='Class', drop_first=True)
58-
label_cols = [ i for i in train_sample.columns if 'Class' in i ]
59-
data_cols = [ i for i in train_sample.columns if i not in label_cols ]
60-
train_sample[ data_cols ] = train_sample[ data_cols ] / 10 # scale to random noise size, one less thing to learn
61-
train_no_label = train_sample[ data_cols ]
62-
6350
model_parameters = ModelParameters(batch_size=batch_size,
6451
lr=learning_rate,
6552
betas=(beta_1, beta_2),
6653
noise_dim=noise_dim,
67-
n_cols=train_sample.shape[1],
6854
layers_dim=dim)
6955

7056
train_args = TrainParameters(epochs=epochs,
7157
sample_interval=log_step)
7258

73-
test_size = 492 # number of fraud cases
74-
noise_dim = 32
75-
7659
#Training the CRAMERGAN model
7760
synthesizer = model(model_parameters, gradient_penalty_weight=10)
78-
synthesizer.train(train_sample, train_args)
61+
synthesizer.train(data=fraud_w_classes, train_arguments=train_args, num_cols = num_cols, cat_cols = cat_cols)
7962

8063
#Saving the synthesizer to later generate new events
8164
synthesizer.save(path='models/cramergan_creditcard.pkl')

0 commit comments

Comments
 (0)