In a previous post we discussed generative AI and the need to sometimes create your own training and testing data (see: addressing data deficit in machine learning). In this post we focus on a special kind of generative tooling: Variational Autoencoders (VAEs), and how to build your own in Python. For some background on Variational Autoencoders, take a look at this article. Below we provide an example of a Python class that first preprocesses input data from a .csv file into a standard pandas DataFrame, then uses Bayesian Optimization to find the best hyperparameters for a VAE model, and finally trains the VAE with the best parameters found and uses it to generate and save a set of synthetic data for later use.
# Copyright (c) 2025 Hans De Weme
# Licensed under the MIT License (https://opensource.org/licenses/MIT)
# Class: Generator Timeseries
# Purpose: finding the best hyperparameters for a VAE to generate synthetic time series data from an input data set
# of previously collected and preprocessed historical data
"""
- preprocess input data from a .csv file into a standard pandas dataframe
- use BayesianOptimization to find best parameters for VAE model
- use VAE model with best parameters
- generate and save synthetic data
"""
import sys
import tensorflow as tf
import numpy as np
import pandas as pd
import warnings
import logging
from keras.layers import LSTM, Dense, Lambda, TimeDistributed, RepeatVector
from keras import backend as K
from sklearn.preprocessing import StandardScaler
from bayes_opt import BayesianOptimization
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from PyQt6.QtCore import QThread, pyqtSignal
warnings.filterwarnings("ignore")
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class VAE(QThread):
    """Tune, train and sample an LSTM-based variational autoencoder (VAE).

    Constructing an instance runs the complete pipeline synchronously:

    1. load and preprocess the CSV file (``process_data_file``),
    2. standard-scale the selected feature columns,
    3. cut the series into overlapping windows of ``time_steps`` rows,
    4. run Bayesian Optimization over ``latent_dim``, ``lstm_units`` and
       ``learning_rate`` (``train_vae`` is the objective),
    5. train a final model with the best parameters,
    6. decode random latent vectors and write the result to
       ``synthetic_timeseries_data.csv``.

    NOTE(review): the class derives from ``QThread`` but defines no ``run()``
    override, so calling ``start()`` only executes ``QThread``'s default
    ``run()``; all of the work above happens inside ``__init__``.
    ``progress_signal`` is declared but never emitted by this class.
    """

    # Signal to communicate progress (string message) back to the main thread.
    progress_signal = pyqtSignal(str)

    def __init__(self, data, parent=None):
        """Run the full preprocessing / tuning / training / export pipeline.

        Args:
            data: Path of the input CSV file.
            parent: Optional Qt parent object, forwarded to ``QThread``.

        Raises:
            ValueError: If the preprocessed dataset does not contain more
                rows than ``time_steps`` (no training window can be built).
        """
        # QObject initialisation is required for pyqtSignal; forward the
        # parent so Qt ownership works when one is supplied.
        super().__init__(parent)
        self.data_path = data  # path to input data
        self.MODEL_WEIGHTS_PATH = "vae_model_weights.h5"  # where model weights are written
        self.best_params = None  # set once Bayesian Optimization has finished

        logging.info("Starting data processing...")
        df = self.process_data_file(self.data_path, strip=False)

        # Columns that are normalised and fed to the model.
        features = ['open', 'high', 'low', 'close', 'volume', 'number_of_trades']
        scaler = StandardScaler()
        df[features] = scaler.fit_transform(df[features])
        logging.info("Feature scaling complete.")

        # Convert to time-series (windowed) format.
        self.time_steps = 30
        self.feature_dim = len(features)
        self.X_series = self._build_windows(df[features].to_numpy())
        logging.info("Time-series data formatted successfully.")

        # Keep the result on the instance so callers can inspect it later.
        self.best_params = self._search_hyperparameters()
        best_params = self.best_params
        logging.info(f"Best parameters found: {best_params}")

        logging.info("Training final VAE with best parameters...")
        self.vae, self.encoder, self.decoder = self.train_vae(
            best_params['latent_dim'],
            best_params['lstm_units'],
            best_params['learning_rate'],
            return_model=True,
        )

        X_synthetic = self.generate_synthetic_samples(best_params, 2000)
        self._save_synthetic(X_synthetic, scaler, features, df.index[-1])

    def _build_windows(self, values):
        """Cut a ``(rows, features)`` array into overlapping ``time_steps``-row windows."""
        num_samples = len(values) - self.time_steps
        if num_samples <= 0:
            raise ValueError(
                f"Need more than {self.time_steps} rows to build training windows, "
                f"got {len(values)}."
            )
        # Window i covers rows [i, i + time_steps).
        return np.stack([values[i:i + self.time_steps] for i in range(num_samples)])

    def _search_hyperparameters(self):
        """Run Bayesian Optimization over the VAE hyperparameters and return the best set."""
        logging.info("Starting Bayesian Optimization...")
        optimizer = BayesianOptimization(
            f=lambda latent_dim, lstm_units, learning_rate: self.train_vae(
                latent_dim, lstm_units, learning_rate, return_model=False),
            pbounds={'latent_dim': (5, 50), 'lstm_units': (30, 150), 'learning_rate': (0.0001, 0.01)},
            random_state=42,
        )
        optimizer.maximize(init_points=3, n_iter=10)
        return optimizer.max['params']

    def _save_synthetic(self, X_synthetic, scaler, features, start_date):
        """Un-scale the generated windows, index them hourly and write them to CSV."""
        # All generated windows are concatenated end to end into one long table.
        X_synthetic_reshaped = X_synthetic.reshape(-1, self.feature_dim)
        synthetic_df = pd.DataFrame(scaler.inverse_transform(X_synthetic_reshaped), columns=features)
        # Hourly index starting at the last timestamp of the original data.
        # A Timedelta is used instead of the 'H' alias, which newer pandas
        # releases deprecate.
        synthetic_df.index = pd.date_range(
            start=start_date, periods=len(synthetic_df), freq=pd.Timedelta(hours=1))
        synthetic_df.to_csv("synthetic_timeseries_data.csv", index=True)
        logging.info("Synthetic time-series data saved as synthetic_timeseries_data.csv")

    def process_data_file(self, file_path, strip=False):
        """Load the CSV file and return a cleaned, time-indexed DataFrame.

        The twelve column names below are assigned positionally, so the file
        must have exactly twelve columns in that order.

        NOTE(review): ``pd.read_csv`` is called with its default header
        handling, i.e. the first row of the file is consumed as a header
        before the names are overwritten. Confirm the input files really
        carry a header row.

        Args:
            file_path: Path of the CSV file.
            strip: If true, keep only ``close``; otherwise keep ``open``,
                ``high``, ``low``, ``close``, ``volume`` and
                ``number_of_trades``.

        Returns:
            DataFrame indexed by ``dt`` (``open_time`` parsed as Unix
            milliseconds), with NaN rows and duplicates removed and sorted
            by index.
        """
        logging.info("Loading and preprocessing dataset...")
        columns = ['open_time', 'open', 'high', 'low', 'close', 'volume', 'close_time',
                   'quote_asset_volume', 'number_of_trades', 'taker_buy_base_asset_volume',
                   'taker_buy_quote_asset_volume', 'ignore']
        df = pd.read_csv(file_path)
        df.columns = columns
        df['dt'] = pd.to_datetime(df['open_time'], unit='ms', origin='unix')
        if strip:
            df.drop(['open_time', 'open', 'high', 'low', 'volume', 'close_time',
                     'quote_asset_volume', 'number_of_trades',
                     'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume', 'ignore'],
                    axis='columns', inplace=True)
        else:
            df.drop(['open_time', 'close_time', 'quote_asset_volume', 'taker_buy_base_asset_volume',
                     'taker_buy_quote_asset_volume', 'ignore'], axis='columns', inplace=True)
        df = df.dropna().drop_duplicates()
        df.set_index('dt', inplace=True)
        df = df.sort_index()
        logging.info("Dataset preprocessing complete.")
        return df

    def train_vae(self, latent_dim, lstm_units, learning_rate, return_model=False):
        """Build and train a VAE on ``self.X_series`` with the given hyperparameters.

        Args:
            latent_dim: Size of the latent space; truncated to ``int``.
            lstm_units: Units per LSTM layer; truncated to ``int``.
            learning_rate: Learning rate passed to the Adam optimizer.
            return_model: Selects the return value (see below).

        Returns:
            If ``return_model`` is true, the tuple ``(vae, encoder, decoder)``.
            Otherwise the negative of the lowest validation loss seen during
            training, so that a maximizing optimizer minimizes the loss.

        Side effects:
            Writes the trained weights to ``self.MODEL_WEIGHTS_PATH`` on every
            call, overwriting whatever was there.
        """
        logging.info(f"Training VAE with latent_dim={latent_dim}, lstm_units={lstm_units}, learning_rate={learning_rate}")
        # The optimizer proposes floats; layer sizes must be integers.
        latent_dim = int(latent_dim)
        lstm_units = int(lstm_units)

        def build_encoder(input_shape, latent_dim):
            # Two stacked LSTMs followed by two Dense heads (mean, log-variance).
            inputs = tf.keras.Input(shape=input_shape)
            x = LSTM(lstm_units, return_sequences=True)(inputs)
            x = LSTM(lstm_units)(x)
            z_mean = Dense(latent_dim)(x)
            z_log_var = Dense(latent_dim)(x)
            return tf.keras.Model(inputs, [z_mean, z_log_var])

        def sampling(args):
            # Reparameterization: z = mean + std * epsilon, epsilon ~ N(0, 1).
            z_mean, z_log_var = args
            batch = K.shape(z_mean)[0]
            dim = K.int_shape(z_mean)[1]
            epsilon = K.random_normal(shape=(batch, dim))
            return z_mean + K.exp(0.5 * z_log_var) * epsilon

        def build_decoder(latent_dim, output_shape):
            # Latent vector -> Dense -> repeated per time step -> LSTM -> per-step Dense.
            inputs = tf.keras.Input(shape=(latent_dim,))
            x = Dense(50)(inputs)
            x = RepeatVector(output_shape[0])(x)
            x = LSTM(lstm_units, return_sequences=True)(x)
            x = TimeDistributed(Dense(output_shape[1]))(x)
            return tf.keras.Model(inputs, x)

        input_shape = (self.time_steps, self.feature_dim)
        encoder = build_encoder(input_shape, latent_dim)
        decoder = build_decoder(latent_dim, input_shape)

        inputs = tf.keras.Input(shape=input_shape)
        z_mean, z_log_var = encoder(inputs)
        z = Lambda(sampling)([z_mean, z_log_var])
        outputs = decoder(z)
        vae = tf.keras.Model(inputs, outputs)

        # Reconstruction term: MSE over the fully flattened tensors, i.e. one
        # scalar averaged over batch, time steps and features.
        reconstruction_loss = tf.keras.losses.mean_squared_error(K.flatten(inputs), K.flatten(outputs))
        # KL term: summed over the latent dimensions, one value per sample.
        kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
        kl_loss = -0.5 * K.sum(kl_loss, axis=-1)
        # NOTE(review): the reconstruction term is a mean while the KL term is
        # a sum, so their scales differ; confirm this weighting is intended.
        vae_loss = K.mean(reconstruction_loss + kl_loss)
        vae.add_loss(vae_loss)
        vae.compile(optimizer=tf.keras.optimizers.Adam(learning_rate))

        early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
        lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)
        # No targets are passed: the loss was attached with add_loss above.
        history = vae.fit(self.X_series, epochs=50, batch_size=32, validation_split=0.2, verbose=1, callbacks=[early_stopping, lr_scheduler])

        vae.save_weights(self.MODEL_WEIGHTS_PATH)
        logging.info("VAE training complete. Weights saved.")

        if return_model:
            return vae, encoder, decoder
        return -min(history.history['val_loss'])  # negated so the optimizer can maximize

    def generate_synthetic_samples(self, best_params, num_samples=2000):
        """Decode random latent vectors into synthetic sequences.

        Args:
            best_params: Mapping with a ``'latent_dim'`` entry (truncated to
                ``int``) giving the width of the latent vectors.
            num_samples: Number of latent vectors to draw and decode.

        Returns:
            The output of ``self.decoder.predict`` for ``num_samples``
            standard-normal latent vectors.
        """
        logging.info(f"Generating {num_samples} synthetic samples...")
        z_samples = np.random.normal(size=(num_samples, int(best_params['latent_dim'])))
        synthetic_data = self.decoder.predict(z_samples)
        return synthetic_data
if __name__ == "__main__":
    # Script entry point: exactly one positional argument, the input CSV path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        sys.exit("Usage: python generator_timeseries.py datafile.csv")
    # Constructing the object runs the whole pipeline.
    VAE(cli_args[0])
User & Technical Documentation for the Time-Series Variational Autoencoder (VAE) Generator
Introduction
The Time-Series VAE Generator is a Python-based tool designed to generate synthetic time-series data using a Variational Autoencoder (VAE). This tool leverages Bayesian Optimization to determine the best hyperparameters before training the final VAE model. It is particularly useful for financial data, IoT sensor data, and anomaly detection, where high-quality synthetic data is required for model training and evaluation.
Key Features
- Preprocessing: Loads a time-series dataset from a CSV file and standardizes the features.
- Bayesian Optimization: Finds the best hyperparameters (latent_dim, lstm_units, and learning_rate) for the VAE.
- VAE Training: Uses LSTMs to encode time-series patterns and generates realistic synthetic sequences.
- Synthetic Data Generation: Generates new time-series sequences that match the statistical properties of the input data.
- File Output: Saves the generated synthetic data as a CSV file with hourly timestamps.
- PyQt6 Integration: Can be executed as a background thread in a PyQt6 application.
Functional Overview
Workflow
- Load and preprocess the input dataset
- Optimize VAE hyperparameters using Bayesian Optimization
- Train the VAE model with the best hyperparameters
- Generate synthetic time-series data
- Save the synthetic data with indexed timestamps
System Requirements
- Python 3.8+
- TensorFlow/Keras
- NumPy & Pandas
- Scikit-Learn
- Bayesian Optimization
- PyQt6 (optional for GUI integration)
How to Use
Running the Script
The script can be executed via the command line:
python generator_timeseries.py datafile.csv
where datafile.csv is the input time-series dataset.
Expected Input Data Format
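The input CSV file must contain exactly twelve columns, because the loader assigns the column names positionally in this order: open_time, open, high, low, close, volume, close_time, quote_asset_volume, number_of_trades, taker_buy_base_asset_volume, taker_buy_quote_asset_volume, ignore. Of these, the following columns are used:
| Column | Description |
|---|---|
| open_time | Timestamp of data collection |
| open | Open price |
| high | Highest price |
| low | Lowest price |
| close | Closing price |
| volume | Trading volume |
| number_of_trades | Count of executed trades |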
The script preprocesses the data by converting timestamps and normalizing numerical features using StandardScaler.
Output Data Format
The generated synthetic data is saved as synthetic_timeseries_data.csv with the same column structure and hourly timestamps.
Technical Details
Class Structure
VAE Class (QThread-based for PyQt6 integration)
| Method | Description |
|---|---|
| `__init__(self, data, parent=None)` | Initializes the VAE generator, preprocesses data, and starts Bayesian Optimization. |
| `process_data_file(self, file_path, strip=False)` | Loads and preprocesses the input CSV file. |
| `train_vae(self, latent_dim, lstm_units, learning_rate, return_model=False)` | Trains the VAE with given hyperparameters. |
| `generate_synthetic_samples(self, best_params, num_samples=2000)` | Generates synthetic time-series data based on trained VAE. |
Bayesian Optimization for Hyperparameter Tuning
- latent_dim (5 – 50): Defines the size of the latent space.
- lstm_units (30 – 150): Number of LSTM units in the encoder/decoder.
- learning_rate (0.0001 – 0.01): Optimized for faster and stable training.
The optimizer selects the best parameters based on minimizing validation loss during training.
VAE Model Architecture
Encoder (LSTM-based)
- Converts time-series input into a compressed latent representation.
- Outputs z_mean and z_log_var (parameters of the latent space distribution).
Decoder (LSTM-based)
- Takes samples from the latent space and reconstructs time-series sequences.
- Uses RepeatVector to repeat the latent representation across time-steps.
Loss Function
The VAE loss consists of:
- Reconstruction Loss (MSE between input and output sequences)
- KL Divergence Loss (encourages the latent space to follow a normal distribution)
Example Code Execution Flow
vae = VAE("XTZUSDT-totaal.csv")
vae.start() # Runs in a separate thread in PyQt6
Summary
| Feature | Details |
|---|---|
| Input | CSV file with historical time-series data |
| Preprocessing | Timestamp conversion, feature scaling |
| Hyperparameter Tuning | Bayesian Optimization (latent_dim, lstm_units, learning_rate) |
| VAE Architecture | LSTM-based Encoder/Decoder with KL Divergence loss |
| Synthetic Data | Generated with same structure as input, indexed with timestamps |
| Output File | synthetic_timeseries_data.csv |
| Integration | Works with PyQt6 as a background thread |
This tool provides an automated, optimized way to generate realistic synthetic time-series data for financial modeling, forecasting, and anomaly detection.