Real-Time Sign Language Detection¶

Benedek Fulop (Sn: 12290335)

This notebook implements an efficient real-time sign language recognition pipeline.

To abstract away the issues we experienced with background and lighting conditions, I decided to explore keypoint estimation models. I tried OpenPose, but settled on MediaPipe as it is actively maintained and gives more stable performance, at the cost of the underlying models not being fully open. Classifying on extracted keypoints has been reported to perform better than purely visual models on larger word-level datasets, such as in Li et al. 2020, "Word-level Deep Sign Language Recognition from Video: A New Large-scale Dataset and Methods Comparison".

Then, to address the dynamic signs present in all sign languages, I use a suite of RNN and CNN architectures to classify the keypoint time series.

In [1]:
import os
import re
import cv2
import pandas as pd
import numpy as np
import shutil
import random
import subprocess
from zipfile import ZipFile
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from tabulate import tabulate
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics
import uuid
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import tensorboard
import torch.nn.functional as F
from torchvision import datasets, models, transforms
from torchvision.transforms import v2
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import lr_scheduler
from torchinfo import summary
from tempfile import TemporaryDirectory
from sklearn.preprocessing import LabelEncoder
from torch.utils.tensorboard import SummaryWriter
import mediapipe as mp

from sklearn.model_selection import train_test_split
In [2]:
# Fix the seed to keep training and results stable. I fixed every randomiser I could think of,
# but I still see some non-deterministic behaviour, possibly due to the LSTMs.
randomseed = 42 # My favourite integer
np.random.seed(randomseed)
torch.manual_seed(randomseed)
torch.cuda.manual_seed(randomseed)
torch.backends.cudnn.deterministic = True
random.seed(randomseed)
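
If full repeatability mattered more, a few extra PyTorch switches can further reduce the remaining non-determinism. This is a minimal sketch assuming a reasonably recent PyTorch/CUDA build; the results below do not rely on it.

In [ ]:
# optional, stricter determinism settings (sketch; may slow training down)
import os
import torch

os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"         # needed for deterministic cuBLAS kernels
torch.backends.cudnn.benchmark = False                    # disable cuDNN autotuning of conv algorithms
torch.use_deterministic_algorithms(True, warn_only=True)  # warn when an op has no deterministic variant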

Data Wrangling¶

Clone the GitHub repository containing the data to the local machine

In [4]:
# get the url
repo_url = "https://github.com/marlondcu/ISL.git"
# the destination
destination_folder = os.path.join("ISL1")
# Run the git clone command
subprocess.run(["git", "clone", repo_url, destination_folder])
Out[4]:
CompletedProcess(args=['git', 'clone', 'https://github.com/marlondcu/ISL.git', 'ISL1'], returncode=0)

Unzip the video folders

In [7]:
zip_dir = os.path.join(destination_folder, "Videos") # unzip just the videos
zip_files = [f for f in os.listdir(zip_dir) if f.endswith('.zip')] 

# Go through ZIP files and extract contents
for zip_file in zip_files:
    zip_path = os.path.join(zip_dir, zip_file) 
    
    with ZipFile(zip_path, 'r') as zip_ref:
        # Extract all contents to the same directory as the ZIP file
        zip_ref.extractall(zip_dir)
    os.remove(zip_path) # delete .zip files once extracted

Remove the signer (person) subfolders, as the identity information is not needed

In [8]:
# helper function to pop subfolders
def pop_folder(folder_path): 
    contents = os.listdir(folder_path)
    
    for item in contents:
        item_path = os.path.join(folder_path, item)
        new_path = os.path.join(os.path.dirname(folder_path), str(uuid.uuid1())[:8]+"-"+item)
        
        shutil.move(item_path, new_path)

    shutil.rmtree(folder_path)
    
videos_path = os.path.join("ISL1", "Videos")
    
# Unwrap the Person subfolders so that all videos sit directly in the Videos folder
for subfolder in os.listdir(os.path.join(videos_path)):
    pop_folder(os.path.join(os.path.join(videos_path), subfolder))
    
# statically define the categories     
abcs = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

# create categorical directories 
for value in abcs:
    folder_path = os.path.join(videos_path, value)
    os.makedirs(folder_path, exist_ok=True)
    
pattern = r'-(\w)' # regular expression to extract category info from file names

file_names = os.listdir(videos_path)

# Move files to their corresponding folders
for file_name in file_names:
    match = re.search(pattern, file_name)
    if match:
        value = match.group(1)
        source_path = os.path.join(videos_path, file_name)
        destination_path = os.path.join(videos_path, value, file_name)
        
        # Ensure the destination folder exists before moving
        os.makedirs(os.path.dirname(destination_path), exist_ok=True)
        
        shutil.move(source_path, destination_path) # move videos to the corresponding subfolder

# There should now be 18 videos of varying length per class label in the directory

MediaPipe¶

In [ ]:
videos_path = os.path.join("ISL1", "Videos") # checkpoint: start from here if the data wrangling above has already been run on this machine
In [11]:
mp_hands = mp.solutions.hands # Instantiate the hands model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

# create a function to process video
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR CONVERSION RGB 2 BGR
    return image, results

# helper function to visualise the landmarks on the video feed
def draw_landmarks(image, results):
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)

# and a function to extract the results in real time to use for inference or creating training data
def extract_keypoints(results):
    if results.multi_hand_world_landmarks:
        positions = np.array([[res.x, res.y, res.z] for hand_landmarks in results.multi_hand_world_landmarks for res in hand_landmarks.landmark]).flatten()
    else:
        positions = np.zeros(21*3)
    return positions
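
As a quick sanity check of these helpers, the sketch below runs them on a single webcam frame; the camera index 0 is an assumption, and the printed shape should be (63,) whether or not a hand is visible.

In [ ]:
# sanity check: run the MediaPipe helpers on one frame (assumes a webcam at index 0)
cap = cv2.VideoCapture(0)
ret, frame = cap.read()
cap.release()
if ret:
    with mp_hands.Hands(static_image_mode=True, max_num_hands=1) as hands:
        _, results = mediapipe_detection(frame, hands)
        print(extract_keypoints(results).shape)  # (63,) = 21 landmarks x (x, y, z)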
In [14]:
# create the directories to store the extracted features
train_output_path = os.path.join("ISL1", "Extracted_features")
for digit in os.listdir(videos_path):
    # one output folder per class (the loop variable is called digit but holds a letter label)
    os.makedirs(os.path.join(train_output_path, digit), exist_ok=True)
        
start_time = time.time()
# Now to create the training data 
for digit in os.listdir(videos_path):
    #loop through each digit
    for vid in os.listdir(os.path.join(videos_path , digit)):
        #loop through each video 
        # build the full path to each video in this class folder
        video_path = os.path.join(videos_path , digit, vid) 
        cap = cv2.VideoCapture(video_path) # start cv2
        
        hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5) # start the pretrained model
        # note: max_num_hands is set to 1, so this will not extend to multi-hand detection
        
        frame_extracts = [] 
        
        while cap.isOpened():
            # Read a frame from the video
            ret, frame = cap.read()
            if not ret:
                break
                
            # Detect landmarks in the frame
            frame, results = mediapipe_detection(frame, hands)
            key_points = extract_keypoints(results)
            frame_extracts.append(key_points)

        # Release the video capture object and the MediaPipe model
        cap.release()
        hands.close()

        np.save(os.path.join(train_output_path, digit, vid), np.array(frame_extracts))

        # Close any OpenCV windows
        cv2.destroyAllWindows()
        
end_time = time.time()

timed = (end_time - start_time)/60
print(f"Extraction time: {timed} minutes")
# this takes a while to run: roughly 21 minutes on my machine
Extraction time: 21.408754924933117 minutes

Data Loading¶

In [3]:
train_output_path = os.path.join("ISL", "Extracted_features") #checkpoint if data had been loaded on the machine
In [4]:
# now load the data from the directories
X, Y = [] , []
for digit in os.listdir(train_output_path):
    #loop through each digit
    for vid in os.listdir(os.path.join(train_output_path, digit)): 
        #print(vid)
        fs = np.load(os.path.join(train_output_path, digit, vid))
        X.append(fs)
        Y.append(digit)
        
# now randomly subsample so that each series has the same length
desired_length = 60  # 60 is the lowest frame count in the dataset
# note: np.random.choice returns the sampled indices in random order, so the temporal order of frames is not preserved
X_subsample = [element[np.random.choice(element.shape[0], desired_length, replace=False)] if element.shape[0] > desired_length else element for element in X]
X_subsample = torch.tensor(np.array(X_subsample), dtype=torch.float32)

# recode labels to numeric
num_classes = len(set(Y))
abcs=sorted(list(set(Y)))
label_encoder = LabelEncoder()
Y_num = label_encoder.fit_transform(Y)
Y_tensor = torch.tensor(Y_num)

# use stratified shuffle to make balanced test and train sets
sss = StratifiedShuffleSplit(n_splits =1, test_size = 0.2, random_state = 42)
for train_index, test_index in sss.split(X_subsample, Y_num):
    X_train, X_test = X_subsample[train_index], X_subsample[test_index]
    Y_train, Y_test = Y_tensor[train_index], Y_tensor[test_index]

# Print the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)
print("X_test shape:", X_test.shape)
print("Y_test shape:", Y_test.shape)
X_train shape: torch.Size([374, 60, 63])
Y_train shape: torch.Size([374])
X_test shape: torch.Size([94, 60, 63])
Y_test shape: torch.Size([94])

The motivation for exploring this avenue is the hope that, given robust keypoint estimation, the "hand-crafted" keypoints allow for robust categorisation. The approach nonetheless has its limitations, as discussed by Hachiuma, Sato, and Sekii (2023): it relies on precise keypoint estimation from the feature-extraction model, the number of recognised keypoint groups is limited (in this case to one hand), and in practical applications the varying sequence length can pose problems for the classification model due to the constrained feature space.

These points motivate multi-stream models that average the outputs of an appearance-based (and therefore complex-feature) stream and a hand-crafted keypoint stream for more robust spatiotemporal predictions. This avenue was explored, but it is not presented here due to time constraints during model development.
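
For illustration only, here is a minimal sketch of the late-fusion idea described above, assuming a second, appearance-based classifier (called appearance_model here; it does not exist in this notebook) with the same 26-way output:

In [ ]:
# sketch of two-stream late fusion: average the class probabilities of both streams
import torch
import torch.nn.functional as F

def fuse_predictions(keypoint_model, appearance_model, keypoint_batch, frame_batch, w=0.5):
    keypoint_model.eval()
    appearance_model.eval()
    with torch.no_grad():
        p_key = F.softmax(keypoint_model(keypoint_batch), dim=1)  # (batch, 26) from the keypoint stream
        p_app = F.softmax(appearance_model(frame_batch), dim=1)   # (batch, 26) from the appearance stream
    fused = w * p_key + (1 - w) * p_app                           # weighted average of the two streams
    return fused.argmax(dim=1)                                    # predicted class indices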

The constrained feature set does mean I have considerably fewer data points to learn from: just 374 observations in the training set, each containing 60 temporal snapshots of 63 features, namely the x, y, and z coordinates of the 21 hand keypoints with the bottom of the palm as the origin.
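
To make that 63-feature layout concrete, the flat vectors can be folded back into per-landmark coordinates; a small sketch using the first training frame:

In [ ]:
# each frame is a flat vector of 21 landmarks x (x, y, z) = 63 values
frame0 = X_train[0, 0]             # first frame of the first training sequence, shape (63,)
landmarks = frame0.reshape(21, 3)  # row i holds the x, y, z coordinates of landmark i
print(landmarks.shape)             # torch.Size([21, 3])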

Modelling¶

Setup¶

In [5]:
# train on gpu if possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

writer = SummaryWriter(log_dir='Logs/keypoints2')
In [6]:
# I adopted an early-stopping training loop, as it shortens development considerably
def train_with_early_stopping(model, train_loader, val_data, optimizer, criterion, n_total_steps, num_epochs=1000, patience=10):
    early_stop = False
    best_loss = float('inf')
    counter = 0
    
    running_loss = 0.0
    running_correct = 0

    for epoch in range(num_epochs):
        for i, (batch_X, batch_Y) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_Y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) # gradient clipping 
            optimizer.step()
            
            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            running_correct += (predicted == batch_Y).sum().item()
            
            if (i+1) % 6 == 0:
                #print(f'Epoch [{epoch + 1}/{num_epochs}], Training Loss: {loss.item():.4f}',flush=True)
                writer.add_scalar('training loss', running_loss/100, epoch * n_total_steps + i)
                writer.add_scalar('accuracy', running_correct/100, epoch * n_total_steps + i)
                writer.close()
                running_loss = 0.0
                running_correct = 0
                # I found TensorBoard a little hard to work with

                
        # Validation
        model.eval()  # Set to evaluation mode
        with torch.no_grad():
            X_val, Y_val = val_data
            val_outputs = model(X_val)
            val_loss = criterion(val_outputs, Y_val)

        # print(f'Epoch [{epoch + 1}/{num_epochs}], Training Loss: {loss.item():.4f}, Validation Loss: {val_loss.item():.4f}')

        # Check for improvement on the held-out set. Note: using the test set for early stopping does leak information, as I do not have a separate validation set.
        if val_loss < best_loss:
            best_loss = val_loss
            counter = 0
        else:
            counter += 1


        if counter >= patience:
            print(f'Early stopping after {epoch + 1} epochs.')
            early_stop = True
            break

        model.train() 

    return model


def evaluate_model(model, X_test, Y_test, class_names):
    # Set to evaluation mode
    model.eval().to(device)

    # Make predictions on the test set
    with torch.no_grad():
        test_outputs = model(X_test.to(device))
        _, predicted = torch.max(test_outputs, 1)
        predicted = predicted.cpu()  # back to CPU so the comparison and NumPy conversion below work
        accuracy = (predicted == Y_test).sum().item() / Y_test.size(0)

    # Convert to NumPy arrays
    y_true = Y_test.numpy()
    y_pred = predicted.numpy()

    # Calculate confusion matrix
    conf_matrix = confusion_matrix(y_true, y_pred)

    # Plot confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Confusion Matrix')
    plt.show()
    # print accuracy and classification report
    print(f'Test Accuracy: {accuracy * 100:.2f}%')
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=class_names, zero_division=1))

Models¶

LSTMs¶
In [7]:
# The simplest LSTM I could think of: one LSTM layer feeding one linear layer
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)  # note: num_layers is accepted but not passed on, so this stays a single-layer LSTM
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        out = F.relu(out)
        return out
In [8]:
# now a deeper, optionally bidirectional LSTM, which should in principle provide better performance
class ComplexLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_rate=0.5, bidirectional=False):
        super(ComplexLSTMModel, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout_rate, bidirectional=bidirectional)
        
        # If bidirectional, adjust the output size for the fully connected layer
        fc_input_size = hidden_size * 2 if bidirectional else hidden_size
        self.fc = nn.Linear(fc_input_size, output_size)

        # Add dropout layer for some regularisation
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers * 2 if self.bidirectional else self.num_layers, x.size(0), self.hidden_size).to(x.device) # initial hidden states
        c0 = torch.zeros(self.num_layers * 2 if self.bidirectional else self.num_layers, x.size(0), self.hidden_size).to(x.device) # initial cell states

        out, _ = self.lstm(x, (h0, c0))
        
        # Use the last time step
        out = self.fc(out[:, -1, :])
        out = F.relu(out)

        # Apply dropout
        if hasattr(self, 'dropout'):
            out = self.dropout(out)

        return out
In [9]:
# attention LSTM that learns which time steps matter most for the prediction
class AttentionLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(AttentionLSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.attention = nn.Linear(hidden_size, 1)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        attention_weights = F.softmax(self.attention(out), dim=1)
        out = torch.sum(attention_weights * out, dim=1) 
        out = self.fc(out)
        return out
CNNs¶
In [10]:
class SimpleCNN(nn.Module):
    def __init__(self, input_channels, output_size):
        super(SimpleCNN, self).__init__()

        self.conv1 = nn.Conv1d(in_channels=input_channels, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)

        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)

        self.fc1 = nn.Linear(256 * (sequence_length // 8), 512)  # relies on the global sequence_length defined before instantiation
        self.fc2 = nn.Linear(512, output_size)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        
        x = F.relu(self.conv3(x))
        x = self.pool(x)

        x = x.view(x.size(0), -1)

        x = F.relu(self.fc1(x))
        x = self.fc2(x)

        return x
In [11]:
class LargerCNN(nn.Module):
    def __init__(self, input_channels, output_size, dropout_prob):
        super(LargerCNN, self).__init__()

        self.conv1 = nn.Conv1d(in_channels=input_channels, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv1d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv5 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)

        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)

        self.fc1 = nn.Linear(512 * (sequence_length // 32), 1024)  # relies on the global sequence_length defined before instantiation
        self.dropout1 = nn.Dropout(p=dropout_prob)
        self.fc2 = nn.Linear(1024, 512)
        self.dropout2 = nn.Dropout(p=dropout_prob)
        self.fc3 = nn.Linear(512, output_size)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        
        x = F.relu(self.conv3(x))
        x = self.pool(x)

        x = F.relu(self.conv4(x))
        x = self.pool(x)

        x = F.relu(self.conv5(x))
        x = self.pool(x)

        x = x.view(x.size(0), -1)

        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)

        return x
In [12]:
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm1d(out_channels)
        
        # If the input size changes, use a 1x1 convolution to match dimensions
        self.downsample = nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=stride),
            nn.BatchNorm1d(out_channels)
        ) if stride != 1 or in_channels != out_channels else None
    
    def forward(self, x):
        residual = x
        
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        
        out = self.conv2(out)
        out = self.bn2(out)
        
        if self.downsample is not None:
            residual = self.downsample(x)
        
        out += residual
        out = self.relu(out)
        
        return out

class DeepResidualCNN(nn.Module):
    def __init__(self, input_channels, output_size, num_blocks=[2, 2, 2], initial_channels=64):
        super(DeepResidualCNN, self).__init__()

        self.in_channels = initial_channels
        
        self.conv1 = nn.Conv1d(input_channels, initial_channels, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm1d(initial_channels)
        self.relu = nn.ReLU(inplace=True)
        
        # Build residual blocks
        self.layer1 = self._make_layer(ResidualBlock, initial_channels, num_blocks[0], stride=2)
        self.layer2 = self._make_layer(ResidualBlock, initial_channels * 2, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(ResidualBlock, initial_channels * 4, num_blocks[2], stride=2)
        
        # Global average pooling
        self.global_avg_pooling = nn.AdaptiveAvgPool1d(1)
        
        self.fc = nn.Linear(initial_channels * 4, output_size)

    def _make_layer(self, block, out_channels, num_blocks, stride):
        layers = []
        layers.append(block(self.in_channels, out_channels, stride))
        self.in_channels = out_channels
        for _ in range(1, num_blocks):
            layers.append(block(out_channels, out_channels, stride=1))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        
        x = self.global_avg_pooling(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        
        return x
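
torchinfo is imported at the top of the notebook but not otherwise used; as an optional check, the sketch below prints layer shapes and parameter counts for the residual CNN, assuming the same (batch, 60 time steps as channels, 63 features) input convention used in the training cells that follow.

In [ ]:
# optional: inspect the residual CNN's layers and parameter counts with torchinfo
from torchinfo import summary

check_model = DeepResidualCNN(input_channels=60, output_size=26)
summary(check_model, input_size=(2, 60, 63))  # batch of 2 dummy sequences, 60 channels x 63 features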

Training¶

In [13]:
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

n_total_steps = len(train_loader)

num_epochs = 1000
early_stop = False
patience = 10  # Number of epochs to wait for improvement
best_loss = float('inf')
counter = 0

input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2

base_model = LSTMModel(input_size, hidden_size, num_layers, output_size).to(device)

criterion = nn.CrossEntropyLoss().to(device)

optimizer = optim.Adam(base_model.parameters(), lr=0.001)

writer.add_graph(base_model, X_test.to(device))
writer.close()


base_model = train_with_early_stopping(base_model, 
                                          train_loader, 
                                          (X_test.to(device), 
                                          Y_test.to(device)), 
                                          optimizer, 
                                          criterion,
                                          n_total_steps,
                                          num_epochs=1000, 
                                          patience=100)


evaluate_model(base_model, X_test, Y_test, abcs)
Early stopping after 718 epochs.
[Confusion matrix for the test set]
Test Accuracy: 87.23%
Classification Report:
              precision    recall  f1-score   support

           A       0.50      1.00      0.67         3
           B       1.00      1.00      1.00         3
           C       0.80      1.00      0.89         4
           D       1.00      1.00      1.00         4
           E       1.00      1.00      1.00         4
           F       1.00      0.67      0.80         3
           G       0.80      1.00      0.89         4
           H       1.00      1.00      1.00         4
           I       1.00      1.00      1.00         3
           J       1.00      1.00      1.00         3
           K       1.00      1.00      1.00         4
           L       1.00      1.00      1.00         3
           M       0.60      0.75      0.67         4
           N       1.00      0.33      0.50         3
           O       1.00      1.00      1.00         4
           P       1.00      1.00      1.00         4
           Q       1.00      1.00      1.00         3
           R       1.00      1.00      1.00         4
           S       1.00      0.00      0.00         4
           T       1.00      1.00      1.00         4
           U       1.00      0.00      0.00         4
           V       1.00      1.00      1.00         4
           W       1.00      1.00      1.00         4
           X       1.00      1.00      1.00         3
           Y       1.00      1.00      1.00         3
           Z       0.44      1.00      0.62         4

    accuracy                           0.87        94
   macro avg       0.93      0.88      0.85        94
weighted avg       0.93      0.87      0.84        94

In [14]:
torch.save(base_model.state_dict(), 'trained_base_model.pth')
In [15]:
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)


num_epochs = 1000
early_stop = False
patience = 10  # Number of epochs to wait for improvement
best_loss = float('inf')
counter = 0

input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2

complex_model = ComplexLSTMModel(input_size, hidden_size, num_layers, output_size).to(device)

criterion = nn.CrossEntropyLoss().to(device)

optimizer = optim.Adam(complex_model.parameters(), lr=0.001)


complex_model = train_with_early_stopping(complex_model, 
                                          train_loader, 
                                          (X_test.to(device), 
                                          Y_test.to(device)), 
                                          optimizer, 
                                          criterion,
                                          n_total_steps,
                                          num_epochs=1000, 
                                          patience=50)

evaluate_model(complex_model, X_test, Y_test, abcs)
Early stopping after 253 epochs.
[Confusion matrix for the test set]
Test Accuracy: 63.83%
Classification Report:
              precision    recall  f1-score   support

           A       0.18      1.00      0.30         3
           B       0.43      1.00      0.60         3
           C       1.00      0.00      0.00         4
           D       0.50      1.00      0.67         4
           E       1.00      1.00      1.00         4
           F       1.00      0.00      0.00         3
           G       0.43      0.75      0.55         4
           H       1.00      0.00      0.00         4
           I       1.00      1.00      1.00         3
           J       1.00      0.00      0.00         3
           K       0.80      1.00      0.89         4
           L       1.00      0.00      0.00         3
           M       0.67      1.00      0.80         4
           N       1.00      0.00      0.00         3
           O       1.00      1.00      1.00         4
           P       1.00      1.00      1.00         4
           Q       1.00      1.00      1.00         3
           R       1.00      0.00      0.00         4
           S       1.00      0.00      0.00         4
           T       1.00      0.00      0.00         4
           U       0.50      1.00      0.67         4
           V       1.00      0.75      0.86         4
           W       1.00      1.00      1.00         4
           X       1.00      1.00      1.00         3
           Y       1.00      1.00      1.00         3
           Z       0.80      1.00      0.89         4

    accuracy                           0.64        94
   macro avg       0.86      0.63      0.55        94
weighted avg       0.86      0.64      0.55        94

In [16]:
torch.save(complex_model.state_dict(), 'trained_lstm_model.pth')
In [17]:
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
n_total_steps = len(train_loader)


num_epochs = 1000
early_stop = False
patience = 10  # Number of epochs to wait for improvement
best_loss = float('inf')
counter = 0

input_size = 63
hidden_size = 128
output_size = 26
num_layers = 4

attention_model = AttentionLSTMModel(input_size, hidden_size, num_layers, output_size).to(device)

criterion = nn.CrossEntropyLoss().to(device)

optimizer = optim.Adam(attention_model.parameters(), lr=0.001)

attention_model = train_with_early_stopping(attention_model, 
                                          train_loader, 
                                          (X_test.to(device), 
                                          Y_test.to(device)), 
                                          optimizer, 
                                          criterion,
                                          n_total_steps,
                                          num_epochs=1000, 
                                          patience=50)

evaluate_model(attention_model, X_test, Y_test, abcs)
Early stopping after 345 epochs.
[Confusion matrix for the test set]
Test Accuracy: 89.36%
Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00         3
           B       1.00      1.00      1.00         3
           C       1.00      1.00      1.00         4
           D       1.00      1.00      1.00         4
           E       1.00      0.50      0.67         4
           F       0.75      1.00      0.86         3
           G       1.00      0.75      0.86         4
           H       1.00      1.00      1.00         4
           I       1.00      1.00      1.00         3
           J       1.00      1.00      1.00         3
           K       1.00      1.00      1.00         4
           L       1.00      1.00      1.00         3
           M       0.50      0.75      0.60         4
           N       1.00      0.00      0.00         3
           O       1.00      1.00      1.00         4
           P       1.00      1.00      1.00         4
           Q       1.00      1.00      1.00         3
           R       1.00      0.50      0.67         4
           S       1.00      1.00      1.00         4
           T       0.67      1.00      0.80         4
           U       0.67      1.00      0.80         4
           V       1.00      1.00      1.00         4
           W       1.00      1.00      1.00         4
           X       0.75      1.00      0.86         3
           Y       1.00      1.00      1.00         3
           Z       0.75      0.75      0.75         4

    accuracy                           0.89        94
   macro avg       0.93      0.89      0.88        94
weighted avg       0.92      0.89      0.88        94

In [18]:
torch.save(attention_model.state_dict(), 'trained_attention_model.pth')
In [44]:
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

num_epochs = 1000
early_stop = False
patience = 10  # Number of epochs to wait for improvement
best_loss = float('inf')
counter = 0

input_channels = 60
output_size = 26
sequence_length = 60

cnn_model = SimpleCNN(input_channels=input_channels, output_size=output_size)

criterion = nn.CrossEntropyLoss().to(device)

optimizer = optim.Adam(cnn_model.parameters(), lr=0.001)

writer.add_graph(cnn_model, X_test.to(device))
writer.close()

cnn_model = train_with_early_stopping(cnn_model, 
                                          train_loader, 
                                          (X_test.to(device), 
                                          Y_test.to(device)), 
                                          optimizer, 
                                          criterion,
                                          n_total_steps,
                                          num_epochs=1000, 
                                          patience=50)



evaluate_model(cnn_model, X_test, Y_test, abcs)
[Confusion matrix for the test set]
Test Accuracy: 94.68%
Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00         3
           B       1.00      1.00      1.00         3
           C       1.00      0.75      0.86         4
           D       1.00      1.00      1.00         4
           E       1.00      1.00      1.00         4
           F       0.75      1.00      0.86         3
           G       1.00      0.75      0.86         4
           H       1.00      1.00      1.00         4
           I       1.00      1.00      1.00         3
           J       0.75      1.00      0.86         3
           K       1.00      1.00      1.00         4
           L       1.00      1.00      1.00         3
           M       1.00      0.75      0.86         4
           N       1.00      1.00      1.00         3
           O       0.80      1.00      0.89         4
           P       1.00      1.00      1.00         4
           Q       1.00      1.00      1.00         3
           R       1.00      1.00      1.00         4
           S       1.00      1.00      1.00         4
           T       1.00      1.00      1.00         4
           U       0.67      1.00      0.80         4
           V       1.00      0.50      0.67         4
           W       1.00      1.00      1.00         4
           X       1.00      1.00      1.00         3
           Y       1.00      1.00      1.00         3
           Z       1.00      1.00      1.00         4

    accuracy                           0.95        94
   macro avg       0.96      0.95      0.95        94
weighted avg       0.96      0.95      0.95        94

In [20]:
torch.save(cnn_model.state_dict(), 'trained_cnn_model.pth')
In [21]:
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

num_epochs = 1000
early_stop = False
patience = 10  # Number of epochs to wait for improvement
best_loss = float('inf')
counter = 0

input_channels = 60
output_size = 26


sequence_length = 60

cnn_model2 = LargerCNN(input_channels=input_channels, output_size=output_size, dropout_prob = 0.5)

criterion = nn.CrossEntropyLoss().to(device)

optimizer = optim.Adam(cnn_model2.parameters(), lr=0.001)

writer.add_graph(cnn_model, X_test.to(device))
writer.close()


cnn_model2 = train_with_early_stopping(cnn_model2, 
                                          train_loader, 
                                          (X_test.to(device), 
                                          Y_test.to(device)), 
                                          optimizer, 
                                          criterion,
                                          n_total_steps,
                                          num_epochs=1000, 
                                          patience=100) # I saw better performance with longer training times



evaluate_model(cnn_model2, X_test, Y_test, abcs)
Early stopping after 164 epochs.
[Confusion matrix for the test set]
Test Accuracy: 76.60%
Classification Report:
              precision    recall  f1-score   support

           A       0.75      1.00      0.86         3
           B       0.50      1.00      0.67         3
           C       1.00      0.50      0.67         4
           D       1.00      0.75      0.86         4
           E       1.00      0.75      0.86         4
           F       1.00      1.00      1.00         3
           G       1.00      1.00      1.00         4
           H       0.80      1.00      0.89         4
           I       1.00      1.00      1.00         3
           J       1.00      0.67      0.80         3
           K       1.00      0.75      0.86         4
           L       0.75      1.00      0.86         3
           M       0.50      0.50      0.50         4
           N       0.40      0.67      0.50         3
           O       0.80      1.00      0.89         4
           P       0.25      0.25      0.25         4
           Q       1.00      0.33      0.50         3
           R       0.57      1.00      0.73         4
           S       0.75      0.75      0.75         4
           T       0.80      1.00      0.89         4
           U       1.00      0.25      0.40         4
           V       0.67      1.00      0.80         4
           W       1.00      0.25      0.40         4
           X       1.00      0.67      0.80         3
           Y       1.00      1.00      1.00         3
           Z       1.00      1.00      1.00         4

    accuracy                           0.77        94
   macro avg       0.83      0.77      0.76        94
weighted avg       0.83      0.77      0.75        94

In [22]:
torch.save(cnn_model2.state_dict(), 'trained_cnn2_model.pth')
In [34]:
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

deep_residual_cnn_model = DeepResidualCNN(input_channels=60, output_size=26)

criterion = nn.CrossEntropyLoss().to(device)

optimizer = optim.Adam(deep_residual_cnn_model.parameters(), lr=0.001)


deep_residual_cnn_model = train_with_early_stopping(deep_residual_cnn_model, 
                                          train_loader, 
                                          (X_test.to(device), 
                                          Y_test.to(device)), 
                                          optimizer, 
                                          criterion,
                                          n_total_steps,
                                          num_epochs=1000, 
                                          patience=50)


evaluate_model(deep_residual_cnn_model, X_test, Y_test, abcs)
Early stopping after 108 epochs.
[Confusion matrix for the test set]
Test Accuracy: 88.30%
Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00         3
           B       1.00      1.00      1.00         3
           C       1.00      0.75      0.86         4
           D       1.00      1.00      1.00         4
           E       1.00      1.00      1.00         4
           F       0.75      1.00      0.86         3
           G       0.75      0.75      0.75         4
           H       1.00      1.00      1.00         4
           I       1.00      1.00      1.00         3
           J       1.00      1.00      1.00         3
           K       1.00      1.00      1.00         4
           L       1.00      0.67      0.80         3
           M       0.60      0.75      0.67         4
           N       0.50      0.33      0.40         3
           O       0.80      1.00      0.89         4
           P       0.80      1.00      0.89         4
           Q       1.00      1.00      1.00         3
           R       0.75      0.75      0.75         4
           S       1.00      1.00      1.00         4
           T       1.00      1.00      1.00         4
           U       0.50      0.75      0.60         4
           V       1.00      0.50      0.67         4
           W       1.00      0.75      0.86         4
           X       1.00      1.00      1.00         3
           Y       1.00      1.00      1.00         3
           Z       1.00      1.00      1.00         4

    accuracy                           0.88        94
   macro avg       0.90      0.88      0.88        94
weighted avg       0.90      0.88      0.88        94

In [24]:
torch.save(deep_residual_cnn_model.state_dict(), 'trained_deep_residual_cnn_model.pth')
Load trained models¶
In [27]:
input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2

base_model = LSTMModel(input_size, hidden_size, num_layers, output_size)

base_model.load_state_dict(torch.load('trained_base_model.pth'))

base_model.eval()
Out[27]:
LSTMModel(
  (lstm): LSTM(63, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=26, bias=True)
)
In [42]:
input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2

model = AttentionLSTMModel(input_size, hidden_size, num_layers, output_size)

model.load_state_dict(torch.load('attention_model2000.pth'))

model.eval()
Out[42]:
AttentionLSTMModel(
  (lstm): LSTM(63, 128, num_layers=2, batch_first=True)
  (attention): Linear(in_features=128, out_features=1, bias=True)
  (fc): Linear(in_features=128, out_features=26, bias=True)
)
In [28]:
input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2

attention_model = AttentionLSTMModel(input_size, hidden_size, num_layers, output_size)

attention_model.load_state_dict(torch.load('trained_attention_model.pth'))

attention_model.eval()
Out[28]:
AttentionLSTMModel(
  (lstm): LSTM(63, 128, num_layers=2, batch_first=True)
  (attention): Linear(in_features=128, out_features=1, bias=True)
  (fc): Linear(in_features=128, out_features=26, bias=True)
)
In [30]:
input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2

complex_model = ComplexLSTMModel(input_size, hidden_size, num_layers, output_size)

complex_model.load_state_dict(torch.load('trained_lstm_model.pth'))

complex_model.eval()
Out[30]:
ComplexLSTMModel(
  (lstm): LSTM(63, 128, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=128, out_features=26, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
In [35]:
input_channels = 60
output_size = 26
sequence_length = 60

cnn_model = SimpleCNN(input_channels=input_channels, output_size=output_size)

cnn_model.load_state_dict(torch.load('trained_cnn_model.pth'))

cnn_model.eval()
Out[35]:
SimpleCNN(
  (conv1): Conv1d(60, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=1792, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=26, bias=True)
)
In [37]:
input_channels = 60
output_size = 26
sequence_length = 60

cnn_model2 = LargerCNN(input_channels=input_channels, output_size=output_size, dropout_prob =0.5)

cnn_model2.load_state_dict(torch.load('trained_cnn2_model.pth'))

cnn_model2.eval()
Out[37]:
LargerCNN(
  (conv1): Conv1d(60, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv4): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv5): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=512, out_features=1024, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=1024, out_features=512, bias=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc3): Linear(in_features=512, out_features=26, bias=True)
)
In [36]:
input_channels = 60
output_size = 26
sequence_length = 60

deep_residual_cnn_model = DeepResidualCNN(input_channels=60, output_size=26)

deep_residual_cnn_model.load_state_dict(torch.load('trained_deep_residual_cnn_model.pth'))

deep_residual_cnn_model.eval()
Out[36]:
DeepResidualCNN(
  (conv1): Conv1d(60, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (layer1): Sequential(
    (0): ResidualBlock(
      (conv1): Conv1d(64, 64, kernel_size=(3,), stride=(2,), padding=(1,))
      (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv1d(64, 64, kernel_size=(1,), stride=(2,))
        (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): ResidualBlock(
      (conv1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer2): Sequential(
    (0): ResidualBlock(
      (conv1): Conv1d(64, 128, kernel_size=(3,), stride=(2,), padding=(1,))
      (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv1d(64, 128, kernel_size=(1,), stride=(2,))
        (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): ResidualBlock(
      (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer3): Sequential(
    (0): ResidualBlock(
      (conv1): Conv1d(128, 256, kernel_size=(3,), stride=(2,), padding=(1,))
      (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv1d(128, 256, kernel_size=(1,), stride=(2,))
        (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): ResidualBlock(
      (conv1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (global_avg_pooling): AdaptiveAvgPool1d(output_size=1)
  (fc): Linear(in_features=256, out_features=26, bias=True)
)
In [38]:
model_names = ['base_model', 'attention_model', 'bi_directional_model', 'cnn_model', 'cnn_model2', 'deep_residual_cnn_model']

# Initialize a list to store accuracy scores
accuracy_scores = []

base_model.to(device)
attention_model.to(device)
complex_model.to(device)
cnn_model.to(device)
cnn_model2.to(device)
deep_residual_cnn_model.to(device)
# Function to evaluate a model and return accuracy
def evaluate_model2(model, X_test, Y_test, class_names):
    # Set to evaluation mode
    model.eval().to(device)

    # Make predictions on the test set
    with torch.no_grad():
        test_outputs = model(X_test.to(device))
        _, predicted = torch.max(test_outputs, 1)
        accuracy = (predicted.cpu() == Y_test).sum().item() / Y_test.size(0)
    return accuracy

# Evaluate each model and store accuracy scores
for model in [base_model, attention_model, complex_model, cnn_model, cnn_model2, deep_residual_cnn_model]:
    accuracy = evaluate_model2(model, X_test, Y_test, abcs)
    accuracy_scores.append(accuracy * 100)

# Create a table
table_data = list(zip(model_names, accuracy_scores))
table_headers = ["Model Name", "Test Accuracy %"]
table = tabulate(table_data, headers=table_headers, tablefmt="grid")

# Print the table
print(table)
+-------------------------+-------------------+
| Model Name              |   Test Accuracy % |
+=========================+===================+
| base_model              |           87.234  |
+-------------------------+-------------------+
| attention_model         |           89.3617 |
+-------------------------+-------------------+
| bi_directional_model    |           63.8298 |
+-------------------------+-------------------+
| cnn_model               |           93.617  |
+-------------------------+-------------------+
| cnn_model2              |           76.5957 |
+-------------------------+-------------------+
| deep_residual_cnn_model |           91.4894 |
+-------------------------+-------------------+

Validation¶

Using some videos I collected myself to get an idea of the generalisability of the models

Load the validation data; for the collection code, please see the Video_capture.ipynb file
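
Purely as an illustration of what the collection step involves (the real code is in Video_capture.ipynb), here is a minimal sketch that records one 60-frame clip and saves its keypoints; the label, camera index, and file name are assumptions, and it relies on the MediaPipe helpers defined in the feature-extraction section being in scope.

In [ ]:
# illustrative sketch only; see Video_capture.ipynb for the code actually used to collect the clips
label = "A"                                                       # assumed class label for this clip
out_dir = os.path.join("ISL", "Extracted_features_Validation", label)
os.makedirs(out_dir, exist_ok=True)

cap = cv2.VideoCapture(0)                                         # assumed camera index
frames = []
with mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5) as hands:
    while len(frames) < 60:                                       # 60 frames to match the training length
        ret, frame = cap.read()
        if not ret:
            break
        _, results = mediapipe_detection(frame, hands)
        frames.append(extract_keypoints(results))
cap.release()

np.save(os.path.join(out_dir, "A-validation.npy"), np.array(frames))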

In [29]:
validation_features = os.path.join("ISL", "Extracted_features_Validation")
In [30]:
# now load the data from the directories
X, Y = [] , []
for digit in os.listdir(validation_features):
    #loop through each digit
    for vid in os.listdir(os.path.join(validation_features, digit)): 
        #print(vid)
        fs = np.load(os.path.join(validation_features, digit, vid))
        X.append(fs)
        Y.append(digit)
        
# now randomly subsample so that each series has the same length
desired_length = 60  # 60 is the lowest frame count in the dataset
X_valid = [element[np.random.choice(element.shape[0], desired_length, replace=False)] if element.shape[0] > desired_length else element for element in X]
X_valid = torch.tensor(np.array(X_valid), dtype=torch.float32)

# recode labels to numeric
num_classes = len(set(Y))
abcs=sorted(list(set(Y)))
label_encoder = LabelEncoder()
Y_num = label_encoder.fit_transform(Y)
Y_valid = torch.tensor(Y_num)

print(X_valid.shape)
print(Y_valid.shape)
torch.Size([52, 60, 63])
torch.Size([52])
In [39]:
model_names = ['base_model', 'attention_model', 'bi_directional_model', 'cnn_model', 'cnn_model2', 'deep_residual_cnn_model']

# Initialize a list to store accuracy scores
accuracy_scores = []

base_model.to(device)
attention_model.to(device)
complex_model.to(device)
cnn_model.to(device)
cnn_model2.to(device)
deep_residual_cnn_model.to(device)

# Evaluate each model and store accuracy scores
for model in [base_model, attention_model, complex_model, cnn_model, cnn_model2, deep_residual_cnn_model]:
    accuracy = evaluate_model2(model, X_valid, Y_valid, abcs)
    accuracy_scores.append(accuracy * 100)

# Create a table
table_data = list(zip(model_names, accuracy_scores))
table_headers = ["Model Name", "Vaildation Accuracy"]
table = tabulate(table_data, headers=table_headers, tablefmt="grid")

# Print the table
print(table)
+-------------------------+-----------------------+
| Model Name              |   Validation Accuracy |
+=========================+=======================+
| base_model              |               50      |
+-------------------------+-----------------------+
| attention_model         |               59.6154 |
+-------------------------+-----------------------+
| bi_directional_model    |               40.3846 |
+-------------------------+-----------------------+
| cnn_model               |               65.3846 |
+-------------------------+-----------------------+
| cnn_model2              |               53.8462 |
+-------------------------+-----------------------+
| deep_residual_cnn_model |               55.7692 |
+-------------------------+-----------------------+
In [60]:
evaluate_model(cnn_model, X_valid, Y_valid, abcs)
[Confusion matrix for the validation set]
Test Accuracy: 65.38%
Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00         2
           B       1.00      1.00      1.00         2
           C       0.00      0.00      1.00         2
           D       0.00      0.00      1.00         2
           E       0.40      1.00      0.57         2
           F       1.00      0.00      0.00         2
           G       0.50      1.00      0.67         2
           H       1.00      1.00      1.00         2
           I       1.00      1.00      1.00         2
           J       1.00      1.00      1.00         2
           K       1.00      1.00      1.00         2
           L       1.00      0.00      0.00         2
           M       1.00      0.50      0.67         2
           N       0.50      1.00      0.67         2
           O       1.00      0.50      0.67         2
           P       1.00      1.00      1.00         2
           Q       1.00      1.00      1.00         2
           R       1.00      0.50      0.67         2
           S       1.00      1.00      1.00         2
           T       1.00      0.00      0.00         2
           U       1.00      0.00      0.00         2
           V       0.67      1.00      0.80         2
           W       1.00      0.00      0.00         2
           X       1.00      0.50      0.67         2
           Y       1.00      1.00      1.00         2
           Z       0.29      1.00      0.44         2

    accuracy                           0.65        52
   macro avg       0.82      0.65      0.69        52
weighted avg       0.82      0.65      0.69        52

In [32]:
evaluate_model(deep_residual_cnn_model, X_valid, Y_valid, abcs)
[Confusion matrix for the validation set]
Test Accuracy: 59.62%
Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00         2
           B       0.67      1.00      0.80         2
           C       0.00      0.00      1.00         2
           D       1.00      0.00      0.00         2
           E       1.00      1.00      1.00         2
           F       0.67      1.00      0.80         2
           G       1.00      0.50      0.67         2
           H       0.67      1.00      0.80         2
           I       1.00      1.00      1.00         2
           J       1.00      0.50      0.67         2
           K       1.00      0.50      0.67         2
           L       1.00      0.00      0.00         2
           M       1.00      0.50      0.67         2
           N       0.50      1.00      0.67         2
           O       1.00      1.00      1.00         2
           P       0.00      0.00      1.00         2
           Q       0.40      1.00      0.57         2
           R       0.50      0.50      0.50         2
           S       1.00      1.00      1.00         2
           T       1.00      0.00      0.00         2
           U       1.00      0.00      0.00         2
           V       1.00      0.00      0.00         2
           W       0.00      0.00      1.00         2
           X       1.00      1.00      1.00         2
           Y       1.00      1.00      1.00         2
           Z       0.25      1.00      0.40         2

    accuracy                           0.60        52
   macro avg       0.76      0.60      0.66        52
weighted avg       0.76      0.60      0.66        52

Given its higher F1 score and overall accuracy, the simple CNN model is chosen to demonstrate the performance of the approach in real time.

Real-time detection¶

Load the MediaPipe helpers and class labels back in

In [47]:
mp_hands = mp.solutions.hands # Instantiate the hands model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

# create a function to process video
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR CONVERSION RGB 2 BGR
    return image, results

# helper function to visualise the landmarks on the video feed
def draw_landmarks(image, results):
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)

# and a function to extract the results in real time to use for inference
# note: this version reads multi_hand_landmarks (image-space coordinates), whereas the training
# features above were extracted from multi_hand_world_landmarks
def extract_keypoints(results):
    if results.multi_hand_landmarks:
        positions = np.array([[res.x, res.y, res.z] for hand_landmarks in results.multi_hand_landmarks for res in hand_landmarks.landmark]).flatten()
    else:
        positions = np.zeros(21*3)
    return positions

abcs = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
# had to load the class labels to turn outputs to something understandable


#function to make detection easier and move between different models
def hand_gesture_recognition(model, cap):
    # 1. New detection variables
    sequence = []
    predictions = []
    threshold = 0.3

    # Set mediapipe model 
    with mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5) as hands:
        while cap.isOpened():
            # Read feed
            ret, frame = cap.read()
            if not ret:
                break

            # Make detections
            image, results = mediapipe_detection(frame, hands)

            # Draw landmarks
            draw_landmarks(image, results)

            # 2. Prediction logic
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-60:]
            # run a prediction once 60 frames have been recorded
            if len(sequence) == 60:
                sequence_tensor = torch.Tensor(np.expand_dims(sequence, axis=0)) # create a tensor of the keypoints 
                #print(sequence_tensor.shape)
                model.eval()  # Switch to evaluation mode
                with torch.no_grad():
                    res = model(sequence_tensor)
                    _, predicted_class = torch.max(res, 1)
                    predictions.append(predicted_class.item())

                    prediction_text = f"Prediction: {abcs[predicted_class.item()]}"
                    cv2.putText(image, prediction_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)


            # Show to screen
            cv2.imshow('OpenCV Feed', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break

    cap.release()
    cv2.destroyAllWindows()
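
The threshold variable defined in hand_gesture_recognition above is never actually applied; the following is a minimal sketch of how softmax confidence could gate the on-screen prediction (an optional extension, not part of the loop above).

In [ ]:
# sketch: only report a letter when the softmax confidence clears the threshold
import torch
import torch.nn.functional as F

def gated_prediction(model, sequence_tensor, threshold=0.3):
    model.eval()
    with torch.no_grad():
        probs = F.softmax(model(sequence_tensor), dim=1)    # (1, 26) class probabilities
        conf, predicted_class = torch.max(probs, 1)
    if conf.item() >= threshold:
        return abcs[predicted_class.item()], conf.item()    # confident prediction
    return None, conf.item()                                # below threshold: stay silent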
Detection code¶

Run any of the following cells for real-time detection. When you are done and the capture window is active, press "q" to quit the program.

In [55]:
cap = cv2.VideoCapture(2) # adjust the camera index to match your webcam
hand_gesture_recognition(base_model, cap)
In [52]:
cap = cv2.VideoCapture(0)
hand_gesture_recognition(attention_model, cap)
In [57]:
cap = cv2.VideoCapture(0)
hand_gesture_recognition(complex_model, cap)
In [62]:
cap = cv2.VideoCapture(0)
hand_gesture_recognition(cnn_model, cap)
In [ ]: