## Tuesday, January 24, 2017

### LSTM, Baseline

I ran the code shared here, which does time series prediction: based on the previous 49 values, it predicts the 50th. A simplified version is below. The MSE was 0.07.

I also created a baseline, a "predictor" that would simply take X_t to be X_{t-1}. So if energy use was 10 today, it will be 10 the next day. That is the simplest predictor there is.

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Baseline "predictor": the forecast for X_t is simply X_{t-1}.
# The frame has to be loaded before it can be filtered; the readings file is
# ';'-separated and marks missing values with '?'.
df = pd.read_csv('household_power_consumption.txt', sep=';', low_memory=False)
df = df[['Global_active_power']]
# .copy() so the column assignments below act on an independent frame
# instead of a slice of the unfiltered one.
df = df[df.Global_active_power != '?'].copy()
df = df.astype(float)
# G2 holds the previous reading, i.e. the baseline's prediction.
df['G2'] = df['Global_active_power'].shift(1)
df['err'] = (df['G2'] - df['Global_active_power']) ** 2
# mean() skips the first row (no previous reading -> NaN), so the squared
# errors are averaged over the rows that actually have a prediction.
print(df['err'].mean())

I also get an MSE of 0.07 from this. In ML it always helps to compare a model to a baseline. This is not to say the LSTM code is doing nothing; perhaps the model can be improved, etc.

import matplotlib.pyplot as plt
import numpy as np, time, csv
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
np.random.seed(1234)

# Window length used as the default `sequence_length` below: each window is
# split into its first seq - 1 values (inputs) and its last value (target).
seq = 50

def data_power_consumption(path_to_dataset,sequence_length=seq,ratio=1.0):
    """Load the power readings and cut them into train/test windows.

    Reads column index 2 of every row of the ';'-separated file, skipping
    rows whose value cannot be parsed as a float, and builds overlapping
    windows of ``sequence_length`` consecutive readings. The window array is
    shifted by its overall mean. The first 90% of the windows are shuffled
    and used for training, the remainder for testing. In both sets the last
    value of a window is the target and the values before it are the input.

    Args:
        path_to_dataset: path of the text file to read.
        sequence_length: number of readings per window (inputs + target).
        ratio: fraction of the hard-coded 2049280-reading cap to load.

    Returns:
        ``[X_train, y_train, X_test, y_test]``. The X arrays have shape
        ``(n_windows, sequence_length - 1, 1)``, the y arrays ``(n_windows,)``.

    Raises:
        ValueError: if no complete window can be built from the readings.
    """
    max_values = ratio * 2049280
    power = []
    with open(path_to_dataset) as f:
        # The rows must come from a csv reader over the open file; without
        # it the loop below has nothing to iterate over.
        data = csv.reader(f, delimiter=";")
        for line in data:
            try:
                power.append(float(line[2]))
            except (ValueError, IndexError):
                # Deliberate best-effort skip: header row, missing reading,
                # or a short/blank line.
                pass
            if len(power) >= max_values:
                break

    print("Data loaded from csv. Formatting...")

    n_windows = len(power) - sequence_length
    if n_windows <= 0:
        raise ValueError(
            "need more than %d readings to build a window, got %d"
            % (sequence_length, len(power)))

    result = np.array(
        [power[start: start + sequence_length] for start in range(n_windows)]
    )  # shape (n_windows, sequence_length)

    result_mean = result.mean()
    result -= result_mean
    print("Shift :  %s" % result_mean)
    print("Data  :  %s" % (result.shape,))

    # round() yields a float on Python 2; slice bounds must be integers.
    row = int(round(0.9 * result.shape[0]))
    train = result[:row, :]
    # Shuffles whole windows (rows) in place; the test rows are untouched.
    np.random.shuffle(train)
    X_train = train[:, :-1]
    y_train = train[:, -1]
    X_test = result[row:, :-1]
    y_test = result[row:, -1]

    # Add a trailing feature axis: (samples, timesteps, 1).
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

    return [X_train, y_train, X_test, y_test]

# Module-level load of the train/test arrays: half of the reading cap,
# windows of `seq` values.
path_to_dataset = 'household_power_consumption.txt'
ratio = 0.5
X_train, y_train, X_test, y_test = data_power_consumption(
    path_to_dataset, sequence_length=seq, ratio=ratio
)

def build_model():
    """Build and compile the recurrent regression model.

    Returns:
        A ``Sequential`` model compiled with mean-squared-error loss and the
        rmsprop optimizer, mapping a sequence with one feature per timestep
        to a single linear output.
    """
    model = Sequential()
    # A Sequential without layers cannot be fit, so the stack is assembled
    # here from the layer types this file imports.
    # NOTE(review): the layer sizes and dropout rate are defaults picked for
    # this fix, not values recoverable from this file -- tune as needed.
    # input_shape=(None, 1): any number of timesteps, one feature each.
    model.add(LSTM(50, input_shape=(None, 1), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(100, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.add(Activation("linear"))
    model.compile(loss="mse", optimizer="rmsprop")
    return model

def run_network(model=None, data=None):
global_start_time = time.time()
ratio = 0.5
sequence_length = seq
path_to_dataset = 'household_power_consumption.txt'

X_train, y_train, X_test, y_test = data_power_consumption(path_to_dataset, sequence_length, ratio)
print X_train.shape, X_test.shape
model = build_model()
model.fit(X_train, y_train,batch_size=512, nb_epoch=1, validation_split=0.05)
predicted = model.predict(X_test)
predicted = np.reshape(predicted, (predicted.size,))
fig = plt.figure()