# Anomaly-Detection-LSTM-Autoencoder

## Import Library
```
import numpy as np
import tensorflow as tf
import pandas as pd
pd.options.mode.chained_assignment = None  # silence SettingWithCopyWarning for in-place column assignment
import seaborn as sns
from matplotlib.pylab import rcParams
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

%matplotlib inline

sns.set(style='whitegrid', palette='muted')
rcParams['figure.figsize'] = 14, 8

# Fix the random seeds for reproducible results
np.random.seed(1)
tf.random.set_seed(1)

print('TensorFlow version:', tf.__version__)
```

## Load Data
```
df = pd.read_csv('S&P_500_Index_Data.csv', parse_dates=['date'])
df.head()
df.shape
```
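
The CSV is assumed to contain at least a `date` and a `close` column, since both are used throughout the notebook. If the file is not at hand, a synthetic stand-in with the same schema (a sketch, not part of the original data) lets the rest of the code run end to end:

```
# Sketch: synthetic stand-in for S&P_500_Index_Data.csv, for a dry run only
rng = pd.date_range('2000-01-03', periods=1000, freq='B')  # business days
df = pd.DataFrame({'date': rng, 'close': 1000 + np.cumsum(np.random.randn(1000))})
```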

## Plot Data

```
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.date, y=df.close,
                    mode='lines',
                    name='close'))
fig.update_layout(showlegend=True)
fig.show()
```

## Data Preprocessing
```
train_size = int(len(df) * 0.8)
test_size = len(df) - train_size
# Copy the slices so scaled values can be assigned without chained-assignment issues
train, test = df.iloc[0:train_size].copy(), df.iloc[train_size:].copy()
print(train.shape, test.shape)

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, to avoid leaking test statistics
scaler = StandardScaler()
scaler = scaler.fit(train[['close']])

train['close'] = scaler.transform(train[['close']])
test['close'] = scaler.transform(test[['close']])
```
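
As a quick check that the scaling round-trips (a small sketch using the fitted scaler above), inverting the transform on a few rows should give back the original closing prices:

```
# Sketch: invert the scaling on the first few training rows
print(scaler.inverse_transform(train[['close']].head(3)))
```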

## Create Training and Test Sequences
```
# Build sliding windows: each sample is `time_steps` consecutive values,
# and the label is the value that follows the window
def create_dataset(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        v = X.iloc[i:(i + time_steps)].values
        Xs.append(v)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)

time_steps = 30

X_train, y_train = create_dataset(train[['close']], train.close, time_steps)
X_test, y_test = create_dataset(test[['close']], test.close, time_steps)

print(X_train.shape)
```
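
For intuition about what `create_dataset` produces, here is a minimal toy example (hypothetical data, not the S&P series):

```
# Toy example: 5 values, windows of length 2
toy = pd.DataFrame({'close': [1.0, 2.0, 3.0, 4.0, 5.0]})
Xs, ys = create_dataset(toy[['close']], toy.close, time_steps=2)
print(Xs.shape)  # (3, 2, 1): three windows, two steps each, one feature
print(ys)        # [3. 4. 5.]: the value that follows each window
```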

## Build an LSTM Autoencoder
```
timesteps = X_train.shape[1]
num_features = X_train.shape[2]

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, RepeatVector, TimeDistributed

model = Sequential([
    LSTM(128, input_shape=(timesteps, num_features)),  # encoder: compress the window into a 128-dim vector
    Dropout(0.2),
    RepeatVector(timesteps),                           # repeat the encoding once per time step
    LSTM(128, return_sequences=True),                  # decoder: unroll the encoding back into a sequence
    Dropout(0.2),
    TimeDistributed(Dense(num_features))               # one reconstructed value per time step
])

model.compile(loss='mae', optimizer='adam')
model.summary()
```
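
As a sanity check (a sketch, not part of the original notebook), even the untrained model should map a batch of windows back to tensors of the same shape, since it reconstructs its input:

```
# Sketch: reconstruction shape should match the input shape (batch, timesteps, num_features)
sample = X_train[:2]
recon = model.predict(sample)
print(sample.shape, recon.shape)  # both (2, 30, 1)
```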

## Training
```
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, mode='min')
# An autoencoder learns to reconstruct its input, so the target is X_train itself
history = model.fit(
    X_train, X_train,
    epochs=100,
    batch_size=32,
    validation_split=0.1,
    callbacks=[es],
    shuffle=False
)
```

## Plot Metrics and Evaluate the Model
```
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend();
```

```
X_train_pred = model.predict(X_train)

# Per-window reconstruction error: mean absolute error over the 30 time steps
train_mae_loss = pd.DataFrame(np.mean(np.abs(X_train_pred - X_train), axis=1), columns=['Error'])
# Reconstruction loss on the test windows (input and target are both X_test)
model.evaluate(X_test, X_test)
```

```
# distplot is deprecated in recent seaborn releases; histplot is the modern equivalent
sns.histplot(train_mae_loss, bins=50, kde=True);
```
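
Rather than eyeballing the histogram, one common heuristic (a sketch, not from the original notebook, which uses a fixed 0.65 below) is to take a high quantile of the training reconstruction error as the anomaly threshold:

```
# Sketch: set the threshold at the 99th percentile of the training error
suggested = float(train_mae_loss['Error'].quantile(0.99))
print('Suggested threshold:', suggested)
```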

```
X_test_pred = model.predict(X_test)

# Flatten from (n_samples, 1) to (n_samples,) so it can be stored as a DataFrame column later
test_mae_loss = np.mean(np.abs(X_test_pred - X_test), axis=1).flatten()
sns.histplot(test_mae_loss, bins=50, kde=True);
```

## Detect Anomalies in the S&P 500 Index Data
```
THRESHOLD = 0.65

# A test window is flagged as anomalous when its reconstruction error exceeds the threshold
test_score_df = test[time_steps:].copy()
test_score_df['loss'] = test_mae_loss
test_score_df['threshold'] = THRESHOLD
test_score_df['anomaly'] = test_score_df.loss > test_score_df.threshold

fig = go.Figure()
fig.add_trace(go.Scatter(x=test[time_steps:].date, y=test_score_df.loss,
                    mode='lines',
                    name='Test Loss'))
fig.add_trace(go.Scatter(x=test[time_steps:].date, y=test_score_df.threshold,
                    mode='lines',
                    name='Threshold'))
fig.update_layout(showlegend=True)
fig.show()
```

```
anomalies = test_score_df[test_score_df.anomaly]
anomalies.head()
```

```
# scaler.inverse_transform expects a 2-D array, so select [['close']] and flatten the result
fig = go.Figure()
fig.add_trace(go.Scatter(x=test[time_steps:].date,
                    y=scaler.inverse_transform(test[time_steps:][['close']]).flatten(),
                    mode='lines',
                    name='Close Price'))
fig.add_trace(go.Scatter(x=anomalies.date,
                    y=scaler.inverse_transform(anomalies[['close']]).flatten(),
                    mode='markers',
                    name='Anomaly'))
fig.update_layout(showlegend=True)
fig.show()
```
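
To read the detections off as dates rather than from the chart, a short follow-up (a sketch building on the variables above) lists each flagged day with its de-scaled closing price:

```
# Sketch: print each detected anomaly with its closing price in the original scale
anomaly_prices = scaler.inverse_transform(anomalies[['close']]).flatten()
for date, price in zip(anomalies.date, anomaly_prices):
    print(date.date(), round(float(price), 2))
```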