Stock Price Analysis using a Long Short-Term Memory Neural Network

This project focused on predicting Tesla (TSLA) stock prices using deep learning techniques with LSTM neural networks. I built a time series forecasting model that uses historical closing prices and trading volumes to predict next-day stock prices, implementing robust validation through K-fold cross-validation.

💻 Tech Stack:

🧪 Data Pipeline:

📊 Code Snippets & Visualisations:

# Importing the libraries
import pandas as pd
import plotly.express as px
from copy import copy
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
import plotly.figure_factory as ff
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from tensorflow import keras

# Prepare Data for training

def individual_stock(price_df, vol_df, name):
    """Combine one ticker's date, closing price and volume into one frame.

    Args:
        price_df: DataFrame with a 'Date' column and a price column
            named ``name``.
        vol_df: DataFrame with a volume column named ``name``.
        name: Column label of the ticker to extract (e.g. 'TSLA').

    Returns:
        A new DataFrame with the columns 'Date', 'Close' and 'Volume',
        in that order.
    """
    columns = {
        'Date': price_df['Date'],
        'Close': price_df[name],
        'Volume': vol_df[name],
    }
    return pd.DataFrame(columns)

def trading_window(data, n=1):
    """Add a 'Target' column holding the close price ``n`` rows ahead.

    Each row's target is the 'Close' value found ``n`` rows later, so with
    the default ``n=1`` the target for a row is the next row's close.

    Args:
        data: DataFrame with a 'Close' column. It is modified in place.
        n: Number of rows to look ahead (default 1).

    Returns:
        The same ``data`` object with the 'Target' column added. The last
        ``n`` rows have no future value and therefore hold NaN.
    """
    # A negative shift pulls future rows up to the current row.
    data['Target'] = data['Close'].shift(-n)

    return data

# Build the combined TSLA date/close/volume table (Table 1) and display it.
# NOTE(review): stock_price_df and stock_vol_df are not defined in the code
# shown here -- presumably loaded earlier; confirm before running.
price_volume_df = individual_stock(
    price_df=stock_price_df,
    vol_df=stock_vol_df,
    name='TSLA',
)
print(price_volume_df)

# Train An LSTM Time Series Model

# Raw feature matrix: column 0 is the close price, column 1 is the volume
training_data = price_volume_df.iloc[:, 1:3].values
print("Training data shape:", training_data.shape)

# Each sample pairs one day with the following day, so there is one sample
# fewer than there are rows. The first 75% (in time order) are for training.
n_samples = max(len(training_data) - 1, 0)
split = int(0.75 * n_samples)

# Fit the scaler on the training period only, then apply it to the whole
# series. Fitting on every row would leak the test period's min/max into the
# training features. Training samples touch rows 0..split inclusive (inputs
# use rows 0..split-1, targets use rows 1..split).
# Values after the split may therefore fall outside the (0, 1) range.
sc = MinMaxScaler(feature_range=(0, 1))
sc.fit(training_data[:split + 1])
training_set_scaled = sc.transform(training_data)

# Input: the previous day's scaled close. Target: the current day's scaled
# close. Only column 0 (close) is fed to the model; the volume column is
# scaled but not used as an input feature.
X = training_set_scaled[:-1, :1]
y = training_set_scaled[1:, 0]

# Chronological split (no shuffling) so the test set lies after the train set
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Add a trailing axis: the LSTM expects (samples, timesteps, features)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
print("X_train shape:", X_train.shape, "X_test shape:", X_test.shape)

# Build the forecasting network: two sequence-returning LSTM layers, each
# followed by dropout, then a final LSTM and a single linear output unit.
inputs = keras.layers.Input(shape=X_train.shape[1:])

x = inputs
for _ in range(2):
    x = keras.layers.LSTM(150, return_sequences=True)(x)
    x = keras.layers.Dropout(0.3)(x)

# The last LSTM returns only its final state, which feeds the regressor head
x = keras.layers.LSTM(150)(x)
outputs = keras.layers.Dense(1, activation='linear')(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss="mse")
model.summary()

# Fit the model on the training window; 20% of the training samples are
# held back by Keras for validation during fitting.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.2,
)

# Predict the (scaled) close for every sample in the held-out test window
test_predicted = model.predict(X_test)

# Cross validation for the model.
# A shuffled KFold would train on observations that come after the ones it
# validates on, which overstates accuracy for a time series. TimeSeriesSplit
# is a k-fold variant in which every validation fold lies strictly after its
# training data.
from sklearn.model_selection import TimeSeriesSplit

# Define the number of folds
k = 5
kf = TimeSeriesSplit(n_splits=k)

# Initialize a list to store the validation loss for each fold
fold_losses = []

def create_lstm_model(input_shape):
    """Build and compile a fresh, untrained stacked-LSTM regressor.

    Args:
        input_shape: Shape of one input sample, passed to the Keras
            ``Input`` layer.

    Returns:
        A compiled ``keras.Model`` (Adam optimizer, MSE loss) with two
        sequence-returning LSTM+Dropout stages, a final LSTM layer and a
        single linear output unit.
    """
    net_input = keras.layers.Input(shape=input_shape)

    hidden = net_input
    for _ in range(2):
        hidden = keras.layers.LSTM(150, return_sequences=True)(hidden)
        hidden = keras.layers.Dropout(0.3)(hidden)

    hidden = keras.layers.LSTM(150)(hidden)
    prediction = keras.layers.Dense(1, activation='linear')(hidden)

    network = keras.Model(inputs=net_input, outputs=prediction)
    network.compile(optimizer='adam', loss="mse")
    return network

# Perform k-fold cross validation: train a freshly initialised model on each
# training fold and record its loss on the matching validation fold.
for fold, (train_index, val_index) in enumerate(kf.split(X)):
    print(f"Training fold {fold + 1}/{k}")

    X_train_fold, X_val_fold = X[train_index], X[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    # Add the trailing axis so the inputs are (samples, timesteps, 1)
    X_train_fold = X_train_fold[..., np.newaxis]
    X_val_fold = X_val_fold[..., np.newaxis]

    # Create a new model for this fold so no weights carry over between folds
    fold_model = create_lstm_model(X_train_fold.shape[1:])

    # Train on the training fold. The result is deliberately not bound to
    # `history`, so the main model's training history is not overwritten.
    # No validation_data is passed: with verbose=0 and the history discarded,
    # per-epoch validation output would never be read; evaluate() below
    # computes the loss that is actually reported.
    fold_model.fit(
        X_train_fold, y_train_fold,
        epochs=20,
        batch_size=32,
        verbose=0,  # Reduce output for cleaner logs
    )

    # Evaluate the model on the validation fold and store the loss
    val_loss = fold_model.evaluate(X_val_fold, y_val_fold, verbose=0)
    fold_losses.append(val_loss)
    print(f"Fold {fold + 1} validation loss: {val_loss}")

# Summarise the validation loss across all folds
average_val_loss = np.mean(fold_losses)
print("Average Validation Loss:", average_val_loss)
print("Standard deviation of validation losses:", np.std(fold_losses))

# Scaled closing prices (column 0 of the scaled matrix) for visualization
close = [row[0] for row in training_set_scaled]

# Dates of every row except the first one, as a one-column dataframe
df_predicted = price_volume_df[['Date']][1:].copy()

# Line the predictions up with the dates and the scaled closing prices
n_pred = len(test_predicted)
n_rows = len(df_predicted)
if n_pred >= n_rows:
    # At least one prediction per row: use the leading predictions
    df_predicted['Predictions'] = test_predicted[:n_rows].flatten()
    df_predicted['Close'] = close[1:n_rows + 1]
else:
    # Fewer predictions than rows: keep only the final rows of the dataframe
    df_predicted = df_predicted.tail(n_pred).copy()
    df_predicted['Predictions'] = test_predicted.flatten()
    df_predicted['Close'] = close[-n_pred:]

print("Prediction dataframe:")
print(df_predicted.head())

def interactive_plot(df, title):
    """Show an interactive line chart of 'Close' and 'Predictions' over 'Date'.

    Args:
        df: DataFrame with 'Date', 'Close' and 'Predictions' columns.
        title: Chart title.
    """
    series_to_plot = ['Close', 'Predictions']
    px.line(df, x='Date', y=series_to_plot, title=title).show()

# Draw the actual vs. predicted series for TSLA (Figure 1 & 2)
interactive_plot(df=df_predicted, title="Original Vs. Prediction (TSLA)")
						

🌟 Key Insights:

๐Ÿง—๐Ÿพ Challenge Faced:

My biggest challenge was preparing the time series data in the correct 3D format for LSTM input. Initially, I struggled with reshaping arrays from 1D to the required (samples, timesteps, features) structure. After researching LSTM input requirements and experimenting with NumPy reshape operations, I learned to properly sequence the data with sliding windows and convert to the appropriate tensor dimensions for neural network training.

View on GitHub

โ† Back to Projects