Synthesize tabular data

Using CGAN to generate tabular synthetic data:

Real-world domains are often described by tabular data i.e., data that can be structured and organized in a table-like format, where features/variables are represented in columns, whereas observations correspond to the rows.

CGAN is a deep learning model that combines GANs with conditional models to generate data samples based on specific conditions:

📑 Paper: Conditonal Generative Adversarial Nets

Here’s an example of how to synthetize tabular data with CGAN using the Credit Card dataset:

"""
    CGAN architecture example file
"""
import pandas as pd
from sklearn import cluster

from ydata_synthetic.utils.cache import cache_file
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
from ydata_synthetic.synthesizers.regular import RegularSynthesizer

#Read the original data and have it preprocessed
data_path = cache_file('creditcard.csv', 'https://datahub.io/machine-learning/creditcard/r/creditcard.csv')
data = pd.read_csv(data_path, index_col=[0])

#Data processing and analysis
num_cols = list(data.columns[ data.columns != 'Class' ])
cat_cols = []

print('Dataset columns: {}'.format(num_cols))
sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19',
                'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15',
                'V9', 'V23', 'Class']
processed_data = data[ sorted_cols ].copy()

#For the purpose of this example we will only synthesize the minority class
train_data = processed_data.loc[processed_data['Class'] == 1].copy()

#Create a new class column using KMeans - This will mainly be useful if we want to leverage conditional GAN
print("Dataset info: Number of records - {} Number of variables - {}".format(train_data.shape[0], train_data.shape[1]))
algorithm = cluster.KMeans
args, kwds = (), {'n_clusters':2, 'random_state':0}
labels = algorithm(*args, **kwds).fit_predict(train_data[ num_cols ])

fraud_w_classes = train_data.copy()
fraud_w_classes['Class'] = labels

#----------------------------
#    GAN Training
#----------------------------

#Define the Conditional GAN and training parameters
noise_dim = 32
dim = 128
batch_size = 128
beta_1 = 0.5
beta_2 = 0.9

log_step = 100
epochs = 2 + 1
learning_rate = 5e-4
models_dir = '../cache'

#Test here the new inputs
gan_args = ModelParameters(batch_size=batch_size,
                           lr=learning_rate,
                           betas=(beta_1, beta_2),
                           noise_dim=noise_dim,
                           layers_dim=dim)

train_args = TrainParameters(epochs=epochs,
                             cache_prefix='',
                             sample_interval=log_step,
                             label_dim=-1,
                             labels=(0,1))

#create a bining
fraud_w_classes['Amount'] = pd.cut(fraud_w_classes['Amount'], 5).cat.codes

#Init the Conditional GAN providing the index of the label column as one of the arguments
synth = RegularSynthesizer(modelname='cgan', model_parameters=gan_args)

#Training the Conditional GAN
synth.fit(data=fraud_w_classes, label_cols=["Class"], train_arguments=train_args, num_cols=num_cols, cat_cols=cat_cols)

#Saving the synthesizer
synth.save('creditcard_cgan_model.pkl')

#Loading the synthesizer
synthesizer = RegularSynthesizer.load('creditcard_cgan_model.pkl')

#Sampling from the synthesizer
cond_array = pd.DataFrame(100*[1], columns=['Class'])
# Synthesizer samples are returned in the original format (inverse_transform of internal processing already took place)
sample = synthesizer.sample(cond_array)

print(sample)