Real-Time Sign Language Detection¶

Benedek Fulop (Sn: 12290335)

This notebook implements an efficient real-time sign language recognition pipeline.

To abstract away the issues we experienced with background and lighting conditions, I decided to explore keypoint estimation models. I tried OpenPose, but settled on MediaPipe as it is actively maintained and gives more stable performance, at the cost of the underlying models not being fully open. Classifying on extracted keypoints has been reported to perform better than purely visual models on larger word-level datasets, such as in Li et al. 2020, "Word-level Deep Sign Language Recognition from Video: A New Large-scale Dataset and Methods Comparison".

Then, to address the dynamic signs present in all sign languages, I use a suite of RNN and CNN architectures to classify the keypoint time series.

In [1]:
import os
import re
import cv2
import pandas as pd
import numpy as np
import shutil
import random
import subprocess
from zipfile import ZipFile
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from tabulate import tabulate
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics
import uuid
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import tensorboard
import torch.nn.functional as F
from torchvision import datasets, models, transforms
from torchvision.transforms import v2
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import lr_scheduler
from torchinfo import summary
from tempfile import TemporaryDirectory
from sklearn.preprocessing import LabelEncoder
from torch.utils.tensorboard import SummaryWriter
import mediapipe as mp

from sklearn.model_selection import train_test_split
In [2]:
# Fix the seed to keep training and results stable. I fixed every randomiser I could think of,
# but I still see some non-deterministic behaviour, possibly due to the LSTMs.
randomseed = 42 # My favourite integer
np.random.seed(randomseed)
torch.manual_seed(randomseed)
torch.cuda.manual_seed(randomseed)
torch.backends.cudnn.deterministic = True
random.seed(randomseed)
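
If full repeatability mattered more, a few extra PyTorch switches can further reduce the remaining non-determinism. This is a minimal sketch assuming a reasonably recent PyTorch/CUDA build; the results below do not rely on it.

In [ ]:
# optional, stricter determinism settings (sketch; may slow training down)
import os
import torch

os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"         # needed for deterministic cuBLAS kernels
torch.backends.cudnn.benchmark = False                    # disable cuDNN autotuning of conv algorithms
torch.use_deterministic_algorithms(True, warn_only=True)  # warn when an op has no deterministic variant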

Data Wrangling¶

Clone the GitHub repository containing the data to the local machine

In [4]:
# get the url
repo_url = "https://github.com/marlondcu/ISL.git"
# the destination
destination_folder = os.path.join("ISL1")
# Run the git clone command
subprocess.run(["git", "clone", repo_url, destination_folder])
Out[4]:
CompletedProcess(args=['git', 'clone', 'https://github.com/marlondcu/ISL.git', 'ISL1'], returncode=0)

Unzip the video folders

In [7]:
zip_dir = os.path.join(destination_folder, "Videos") # unzip just the videos
zip_files = [f for f in os.listdir(zip_dir) if f.endswith('.zip')] 

# Go through ZIP files and extract contents
for zip_file in zip_files:
    zip_path = os.path.join(zip_dir, zip_file) 
    
    with ZipFile(zip_path, 'r') as zip_ref:
        # Extract all contents to the same directory as the ZIP file
        zip_ref.extractall(zip_dir)
    os.remove(zip_path) # delete .zip files once extracted

Remove the signer (person) subfolders, as the identity information is not needed

In [8]:
# helper function to pop subfolders
def pop_folder(folder_path): 
    contents = os.listdir(folder_path)
    
    for item in contents:
        item_path = os.path.join(folder_path, item)
        new_path = os.path.join(os.path.dirname(folder_path), str(uuid.uuid1())[:8]+"-"+item)
        
        shutil.move(item_path, new_path)

    shutil.rmtree(folder_path)
    
videos_path = os.path.join("ISL1", "Videos")
    
# Unwrap the Person subfolders so that all videos sit directly in the Videos folder
for subfolder in os.listdir(os.path.join(videos_path)):
    pop_folder(os.path.join(os.path.join(videos_path), subfolder))
    
# statically define the categories     
abcs = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

# create categorical directories 
for value in abcs:
    folder_path = os.path.join(videos_path, value)
    os.makedirs(folder_path, exist_ok=True)
    
pattern = r'-(\w)' # regular expression to extract category info from file names

file_names = os.listdir(videos_path)

# Move files to their corresponding folders
for file_name in file_names:
    match = re.search(pattern, file_name)
    if match:
        value = match.group(1)
        source_path = os.path.join(videos_path, file_name)
        destination_path = os.path.join(videos_path, value, file_name)
        
        # Ensure the destination folder exists before moving
        os.makedirs(os.path.dirname(destination_path), exist_ok=True)
        
        shutil.move(source_path, destination_path) # move videos to the corresponding subfolder

# There should now be 18 videos of varying length per class label in the directory

MediaPipe¶

In [ ]:
videos_path = os.path.join("ISL1", "Videos") # checkpoint: start from here if the data wrangling above has already been run on this machine
In [11]:
mp_hands = mp.solutions.hands # Instantiate the hands model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

# create a function to process video
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR CONVERSION RGB 2 BGR
    return image, results

# helper function to visualise the landmarks on the video feed
def draw_landmarks(image, results):
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)

# and a function to extract the results in real time to use for inference or creating training data
def extract_keypoints(results):
    if results.multi_hand_world_landmarks:
        positions = np.array([[res.x, res.y, res.z] for hand_landmarks in results.multi_hand_world_landmarks for res in hand_landmarks.landmark]).flatten()
    else:
        positions = np.zeros(21*3)
    return positions
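
As a quick sanity check of these helpers, the sketch below runs them on a single webcam frame; the camera index 0 is an assumption, and the printed shape should be (63,) whether or not a hand is visible.

In [ ]:
# sanity check: run the MediaPipe helpers on one frame (assumes a webcam at index 0)
cap = cv2.VideoCapture(0)
ret, frame = cap.read()
cap.release()
if ret:
    with mp_hands.Hands(static_image_mode=True, max_num_hands=1) as hands:
        _, results = mediapipe_detection(frame, hands)
        print(extract_keypoints(results).shape)  # (63,) = 21 landmarks x (x, y, z)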
In [14]:
# create the directories to store the extracted features
train_output_path = os.path.join("ISL1", "Extracted_features")
for digit in os.listdir(videos_path):
    # one output folder per class (the loop variable is called digit but holds a letter label)
    os.makedirs(os.path.join(train_output_path, digit), exist_ok=True)
        
start_time = time.time()
# Now to create the training data 
for digit in os.listdir(videos_path):
    #loop through each digit
    for vid in os.listdir(os.path.join(videos_path , digit)):
        #loop through each video 
        # build the full path to each video in this class folder
        video_path = os.path.join(videos_path , digit, vid) 
        cap = cv2.VideoCapture(video_path) # start cv2
        
        hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5) # start the pretrained model
        # note: max_num_hands is set to 1, so this will not extend to multi-hand detection
        
        frame_extracts = [] 
        
        while cap.isOpened():
            # Read a frame from the video
            ret, frame = cap.read()
            if not ret:
                break
                
            # Detect landmarks in the frame
            frame, results = mediapipe_detection(frame, hands)
            key_points = extract_keypoints(results)
            frame_extracts.append(key_points)

        # Release the video capture object and the MediaPipe model
        cap.release()
        hands.close()

        np.save(os.path.join(train_output_path, digit, vid), np.array(frame_extracts))

        # Close any OpenCV windows
        cv2.destroyAllWindows()
        
end_time = time.time()

timed = (end_time - start_time)/60
print(f"Extraction time: {timed} minutes")
# this takes a while to run: roughly 21 minutes on my machine
Extraction time: 21.408754924933117 minutes

Data Loading¶

In [3]:
train_output_path = os.path.join("ISL", "Extracted_features") #checkpoint if data had been loaded on the machine
In [4]:
# now load the data from the directories
X, Y = [] , []
for digit in os.listdir(train_output_path):
    #loop through each digit
    for vid in os.listdir(os.path.join(train_output_path, digit)): 
        #print(vid)
        fs = np.load(os.path.join(train_output_path, digit, vid))
        X.append(fs)
        Y.append(digit)
        
# now randomly subsample so that each series has the same length
desired_length = 60  # 60 is the lowest frame count in the dataset
# note: np.random.choice returns the sampled indices in random order, so the temporal order of frames is not preserved
X_subsample = [element[np.random.choice(element.shape[0], desired_length, replace=False)] if element.shape[0] > desired_length else element for element in X]
X_subsample = torch.tensor(np.array(X_subsample), dtype=torch.float32)

# recode labels to numeric
num_classes = len(set(Y))
abcs=sorted(list(set(Y)))
label_encoder = LabelEncoder()
Y_num = label_encoder.fit_transform(Y)
Y_tensor = torch.tensor(Y_num)

# use stratified shuffle to make balanced test and train sets
sss = StratifiedShuffleSplit(n_splits =1, test_size = 0.2, random_state = 42)
for train_index, test_index in sss.split(X_subsample, Y_num):
    X_train, X_test = X_subsample[train_index], X_subsample[test_index]
    Y_train, Y_test = Y_tensor[train_index], Y_tensor[test_index]

# Print the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)
print("X_test shape:", X_test.shape)
print("Y_test shape:", Y_test.shape)
X_train shape: torch.Size([374, 60, 63])
Y_train shape: torch.Size([374])
X_test shape: torch.Size([94, 60, 63])
Y_test shape: torch.Size([94])

The motivation for exploring this avenue is the hope that, given robust keypoint estimation, the "hand-crafted" keypoints allow for robust categorisation. The approach nonetheless has its limitations, as discussed by Hachiuma, Sato, and Sekii (2023): it relies on precise keypoint estimation from the feature-extraction model, the number of recognised keypoint groups is limited (in this case to one hand), and in practical applications the varying sequence length can pose problems for the classification model due to the constrained feature space.

These points motivate multi-stream models that average the outputs of an appearance-based (and therefore complex-feature) stream and a hand-crafted keypoint stream for more robust spatiotemporal predictions. This avenue was explored, but it is not presented here due to time constraints during model development.
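
For illustration only, here is a minimal sketch of the late-fusion idea described above, assuming a second, appearance-based classifier (called appearance_model here; it does not exist in this notebook) with the same 26-way output:

In [ ]:
# sketch of two-stream late fusion: average the class probabilities of both streams
import torch
import torch.nn.functional as F

def fuse_predictions(keypoint_model, appearance_model, keypoint_batch, frame_batch, w=0.5):
    keypoint_model.eval()
    appearance_model.eval()
    with torch.no_grad():
        p_key = F.softmax(keypoint_model(keypoint_batch), dim=1)  # (batch, 26) from the keypoint stream
        p_app = F.softmax(appearance_model(frame_batch), dim=1)   # (batch, 26) from the appearance stream
    fused = w * p_key + (1 - w) * p_app                           # weighted average of the two streams
    return fused.argmax(dim=1)                                    # predicted class indices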

The constrained feature set does mean I have considerably fewer data points to learn from: just 374 observations in the training set, each containing 60 temporal snapshots of 63 features, namely the x, y, and z coordinates of the 21 hand keypoints with the bottom of the palm as the origin.
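
To make that 63-feature layout concrete, the flat vectors can be folded back into per-landmark coordinates; a small sketch using the first training frame:

In [ ]:
# each frame is a flat vector of 21 landmarks x (x, y, z) = 63 values
frame0 = X_train[0, 0]             # first frame of the first training sequence, shape (63,)
landmarks = frame0.reshape(21, 3)  # row i holds the x, y, z coordinates of landmark i
print(landmarks.shape)             # torch.Size([21, 3])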

Modelling¶

Setup¶

In [5]:
# train on gpu if possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

writer = SummaryWriter(log_dir='Logs/keypoints2')
In [6]:
# I adopted an early-stopping training loop, as it shortens development considerably
def train_with_early_stopping(model, train_loader, val_data, optimizer, criterion, n_total_steps, num_epochs=1000, patience=10):
    early_stop = False
    best_loss = float('inf')
    counter = 0
    
    running_loss = 0.0
    running_correct = 0

    for epoch in range(num_epochs):
        for i, (batch_X, batch_Y) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_Y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) # gradient clipping 
            optimizer.step()
            
            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            running_correct += (predicted == batch_Y).sum().item()
            
            if (i+1) % 6 == 0:
                #print(f'Epoch [{epoch + 1}/{num_epochs}], Training Loss: {loss.item():.4f}',flush=True)
                writer.add_scalar('training loss', running_loss/100, epoch * n_total_steps + i)
                writer.add_scalar('accuracy', running_correct/100, epoch * n_total_steps + i)
                writer.close()
                running_loss = 0.0
                running_correct = 0
                # I found TensorBoard a little hard to work with

                
        # Validation
        model.eval()  # Set to evaluation mode
        with torch.no_grad():
            X_val, Y_val = val_data
            val_outputs = model(X_val)
            val_loss = criterion(val_outputs, Y_val)

        # print(f'Epoch [{epoch + 1}/{num_epochs}], Training Loss: {loss.item():.4f}, Validation Loss: {val_loss.item():.4f}')

        # Check for improvement on the held-out set. Note: using the test set for early stopping does leak information, as I do not have a separate validation set.
        if val_loss < best_loss:
            best_loss = val_loss
            counter = 0
        else:
            counter += 1


        if counter >= patience:
            print(f'Early stopping after {epoch + 1} epochs.')
            early_stop = True
            break

        model.train() 

    return model


def evaluate_model(model, X_test, Y_test, class_names):
    # Set to evaluation mode
    model.eval().to(device)

    # Make predictions on the test set
    with torch.no_grad():
        test_outputs = model(X_test.to(device))
        _, predicted = torch.max(test_outputs, 1)
        predicted = predicted.cpu()  # back to CPU so the comparison and NumPy conversion below work
        accuracy = (predicted == Y_test).sum().item() / Y_test.size(0)

    # Convert to NumPy arrays
    y_true = Y_test.numpy()
    y_pred = predicted.numpy()

    # Calculate confusion matrix
    conf_matrix = confusion_matrix(y_true, y_pred)

    # Plot confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Confusion Matrix')
    plt.show()
    # print accuracy and classification report
    print(f'Test Accuracy: {accuracy * 100:.2f}%')
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=class_names, zero_division=1))

Models¶

LSTMs¶
In [7]:
# The simplest LSTM I could think of: one LSTM layer feeding one linear layer
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)  # note: num_layers is accepted but not passed on, so this stays a single-layer LSTM
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        out = F.relu(out)
        return out
In [8]:
# now a deeper, optionally bidirectional LSTM, which should in principle provide better performance
class ComplexLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_rate=0.5, bidirectional=False):
        super(ComplexLSTMModel, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout_rate, bidirectional=bidirectional)
        
        # If bidirectional, adjust the output size for the fully connected layer
        fc_input_size = hidden_size * 2 if bidirectional else hidden_size
        self.fc = nn.Linear(fc_input_size, output_size)

        # Add dropout layer for some regularisation
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers * 2 if self.bidirectional else self.num_layers, x.size(0), self.hidden_size).to(x.device) # initial hidden states
        c0 = torch.zeros(self.num_layers * 2 if self.bidirectional else self.num_layers, x.size(0), self.hidden_size).to(x.device) # initial cell states

        out, _ = self.lstm(x, (h0, c0))
        
        # Use the last time step
        out = self.fc(out[:, -1, :])
        out = F.relu(out)

        # Apply dropout
        if hasattr(self, 'dropout'):
            out = self.dropout(out)

        return out
In [9]:
# attention LSTM that learns which time steps matter most for the prediction
class AttentionLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(AttentionLSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.attention = nn.Linear(hidden_size, 1)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        attention_weights = F.softmax(self.attention(out), dim=1)
        out = torch.sum(attention_weights * out, dim=1) 
        out = self.fc(out)
        return out
CNNs¶
In [10]:
class SimpleCNN(nn.Module):
    def __init__(self, input_channels, output_size):
        super(SimpleCNN, self).__init__()

        self.conv1 = nn.Conv1d(in_channels=input_channels, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)

        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)

        self.fc1 = nn.Linear(256 * (sequence_length // 8), 512)  # relies on the global sequence_length defined before instantiation
        self.fc2 = nn.Linear(512, output_size)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        
        x = F.relu(self.conv3(x))
        x = self.pool(x)

        x = x.view(x.size(0), -1)

        x = F.relu(self.fc1(x))
        x = self.fc2(x)

        return x
In [11]:
class LargerCNN(nn.Module):
    def __init__(self, input_channels, output_size, dropout_prob):
        super(LargerCNN, self).__init__()

        self.conv1 = nn.Conv1d(in_channels=input_channels, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv1d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1)
        self.conv5 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)

        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)

        self.fc1 = nn.Linear(512 * (sequence_length // 32), 1024)  # relies on the global sequence_length defined before instantiation
        self.dropout1 = nn.Dropout(p=dropout_prob)
        self.fc2 = nn.Linear(1024, 512)
        self.dropout2 = nn.Dropout(p=dropout_prob)
        self.fc3 = nn.Linear(512, output_size)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        
        x = F.relu(self.conv3(x))
        x = self.pool(x)

        x = F.relu(self.conv4(x))
        x = self.pool(x)

        x = F.relu(self.conv5(x))
        x = self.pool(x)

        x = x.view(x.size(0), -1)

        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)

        return x
In [12]:
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm1d(out_channels)
        
        # If the input size changes, use a 1x1 convolution to match dimensions
        self.downsample = nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=stride),
            nn.BatchNorm1d(out_channels)
        ) if stride != 1 or in_channels != out_channels else None
    
    def forward(self, x):
        residual = x
        
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        
        out = self.conv2(out)
        out = self.bn2(out)
        
        if self.downsample is not None:
            residual = self.downsample(x)
        
        out += residual
        out = self.relu(out)
        
        return out

class DeepResidualCNN(nn.Module):
    def __init__(self, input_channels, output_size, num_blocks=[2, 2, 2], initial_channels=64):
        super(DeepResidualCNN, self).__init__()

        self.in_channels = initial_channels
        
        self.conv1 = nn.Conv1d(input_channels, initial_channels, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm1d(initial_channels)
        self.relu = nn.ReLU(inplace=True)
        
        # Build residual blocks
        self.layer1 = self._make_layer(ResidualBlock, initial_channels, num_blocks[0], stride=2)
        self.layer2 = self._make_layer(ResidualBlock, initial_channels * 2, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(ResidualBlock, initial_channels * 4, num_blocks[2], stride=2)
        
        # Global average pooling
        self.global_avg_pooling = nn.AdaptiveAvgPool1d(1)
        
        self.fc = nn.Linear(initial_channels * 4, output_size)

    def _make_layer(self, block, out_channels, num_blocks, stride):
        layers = []
        layers.append(block(self.in_channels, out_channels, stride))
        self.in_channels = out_channels
        for _ in range(1, num_blocks):
            layers.append(block(out_channels, out_channels, stride=1))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        
        x = self.global_avg_pooling(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        
        return x
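
torchinfo is imported at the top of the notebook but not otherwise used; as an optional check, the sketch below prints layer shapes and parameter counts for the residual CNN, assuming the same (batch, 60 time steps as channels, 63 features) input convention used in the training cells that follow.

In [ ]:
# optional: inspect the residual CNN's layers and parameter counts with torchinfo
from torchinfo import summary

check_model = DeepResidualCNN(input_channels=60, output_size=26)
summary(check_model, input_size=(2, 60, 63))  # batch of 2 dummy sequences, 60 channels x 63 features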

Training¶

In [13]:
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

n_total_steps = len(train_loader)

num_epochs = 1000
early_stop = False
patience = 10  # Number of epochs to wait for improvement
best_loss = float('inf')
counter = 0

input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2

base_model = LSTMModel(input_size, hidden_size, num_layers, output_size).to(device)

criterion = nn.CrossEntropyLoss().to(device)

optimizer = optim.Adam(base_model.parameters(), lr=0.001)

writer.add_graph(base_model, X_test.to(device))
writer.close()


base_model = train_with_early_stopping(base_model, 
                                          train_loader, 
                                          (X_test.to(device), 
                                          Y_test.to(device)), 
                                          optimizer, 
                                          criterion,
                                          n_total_steps,
                                          num_epochs=1000, 
                                          patience=100)


evaluate_model(base_model, X_test, Y_test, abcs)
Early stopping after 718 epochs.
[Confusion matrix for the test set]
Test Accuracy: 87.23%
Classification Report:
              precision    recall  f1-score   support

           A       0.50      1.00      0.67         3
           B       1.00      1.00      1.00         3
           C       0.80      1.00      0.89         4
           D       1.00      1.00      1.00         4
           E       1.00      1.00      1.00         4
           F       1.00      0.67      0.80         3
           G       0.80      1.00      0.89         4
           H       1.00      1.00      1.00         4
           I       1.00      1.00      1.00         3
           J       1.00      1.00      1.00         3
           K       1.00      1.00      1.00         4
           L       1.00      1.00      1.00         3
           M       0.60      0.75      0.67         4
           N       1.00      0.33      0.50         3
           O       1.00      1.00      1.00         4
           P       1.00      1.00      1.00         4
           Q       1.00      1.00      1.00         3
           R       1.00      1.00      1.00         4
           S       1.00      0.00      0.00         4
           T       1.00      1.00      1.00         4
           U       1.00      0.00      0.00         4
           V       1.00      1.00      1.00         4
           W       1.00      1.00      1.00         4
           X       1.00      1.00      1.00         3
           Y       1.00      1.00      1.00         3
           Z       0.44      1.00      0.62         4

    accuracy                           0.87        94
   macro avg       0.93      0.88      0.85        94
weighted avg       0.93      0.87      0.84        94

In [14]:
torch.save(base_model.state_dict(), 'trained_base_model.pth')
In [15]:
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)


num_epochs = 1000
early_stop = False
patience = 10  # Number of epochs to wait for improvement
best_loss = float('inf')
counter = 0

input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2

complex_model = ComplexLSTMModel(input_size, hidden_size, num_layers, output_size).to(device)

criterion = nn.CrossEntropyLoss().to(device)

optimizer = optim.Adam(complex_model.parameters(), lr=0.001)


complex_model = train_with_early_stopping(complex_model, 
                                          train_loader, 
                                          (X_test.to(device), 
                                          Y_test.to(device)), 
                                          optimizer, 
                                          criterion,
                                          n_total_steps,
                                          num_epochs=1000, 
                                          patience=50)

evaluate_model(complex_model, X_test, Y_test, abcs)
Early stopping after 253 epochs.
[Confusion matrix for the test set]
Test Accuracy: 63.83%
Classification Report:
              precision    recall  f1-score   support

           A       0.18      1.00      0.30         3
           B       0.43      1.00      0.60         3
           C       1.00      0.00      0.00         4
           D       0.50      1.00      0.67         4
           E       1.00      1.00      1.00         4
           F       1.00      0.00      0.00         3
           G       0.43      0.75      0.55         4
           H       1.00      0.00      0.00         4
           I       1.00      1.00      1.00         3
           J       1.00      0.00      0.00         3
           K       0.80      1.00      0.89         4
           L       1.00      0.00      0.00         3
           M       0.67      1.00      0.80         4
           N       1.00      0.00      0.00         3
           O       1.00      1.00      1.00         4
           P       1.00      1.00      1.00         4
           Q       1.00      1.00      1.00         3
           R       1.00      0.00      0.00         4
           S       1.00      0.00      0.00         4
           T       1.00      0.00      0.00         4
           U       0.50      1.00      0.67         4
           V       1.00      0.75      0.86         4
           W       1.00      1.00      1.00         4
           X       1.00      1.00      1.00         3
           Y       1.00      1.00      1.00         3
           Z       0.80      1.00      0.89         4

    accuracy                           0.64        94
   macro avg       0.86      0.63      0.55        94
weighted avg       0.86      0.64      0.55        94

In [16]:
torch.save(complex_model.state_dict(), 'trained_lstm_model.pth')
In [17]:
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
n_total_steps = len(train_loader)


num_epochs = 1000
early_stop = False
patience = 10  # Number of epochs to wait for improvement
best_loss = float('inf')
counter = 0

input_size = 63
hidden_size = 128
output_size = 26
num_layers = 4

attention_model = AttentionLSTMModel(input_size, hidden_size, num_layers, output_size).to(device)

criterion = nn.CrossEntropyLoss().to(device)

optimizer = optim.Adam(attention_model.parameters(), lr=0.001)

attention_model = train_with_early_stopping(attention_model, 
                                          train_loader, 
                                          (X_test.to(device), 
                                          Y_test.to(device)), 
                                          optimizer, 
                                          criterion,
                                          n_total_steps,
                                          num_epochs=1000, 
                                          patience=50)

evaluate_model(attention_model, X_test, Y_test, abcs)
Early stopping after 345 epochs.
[Confusion matrix for the test set]
Test Accuracy: 89.36%
Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00         3
           B       1.00      1.00      1.00         3
           C       1.00      1.00      1.00         4
           D       1.00      1.00      1.00         4
           E       1.00      0.50      0.67         4
           F       0.75      1.00      0.86         3
           G       1.00      0.75      0.86         4
           H       1.00      1.00      1.00         4
           I       1.00      1.00      1.00         3
           J       1.00      1.00      1.00         3
           K       1.00      1.00      1.00         4
           L       1.00      1.00      1.00         3
           M       0.50      0.75      0.60         4
           N       1.00      0.00      0.00         3
           O       1.00      1.00      1.00         4
           P       1.00      1.00      1.00         4
           Q       1.00      1.00      1.00         3
           R       1.00      0.50      0.67         4
           S       1.00      1.00      1.00         4
           T       0.67      1.00      0.80         4
           U       0.67      1.00      0.80         4
           V       1.00      1.00      1.00         4
           W       1.00      1.00      1.00         4
           X       0.75      1.00      0.86         3
           Y       1.00      1.00      1.00         3
           Z       0.75      0.75      0.75         4

    accuracy                           0.89        94
   macro avg       0.93      0.89      0.88        94
weighted avg       0.92      0.89      0.88        94

In [18]:
torch.save(attention_model.state_dict(), 'trained_attention_model.pth')
In [44]:
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

num_epochs = 1000
early_stop = False
patience = 10  # Number of epochs to wait for improvement
best_loss = float('inf')
counter = 0

input_channels = 60
output_size = 26
sequence_length = 60

cnn_model = SimpleCNN(input_channels=input_channels, output_size=output_size)

criterion = nn.CrossEntropyLoss().to(device)

optimizer = optim.Adam(cnn_model.parameters(), lr=0.001)

writer.add_graph(cnn_model, X_test.to(device))
writer.close()

cnn_model = train_with_early_stopping(cnn_model, 
                                          train_loader, 
                                          (X_test.to(device), 
                                          Y_test.to(device)), 
                                          optimizer, 
                                          criterion,
                                          n_total_steps,
                                          num_epochs=1000, 
                                          patience=50)



evaluate_model(cnn_model, X_test, Y_test, abcs)
[Confusion matrix for the test set]
Test Accuracy: 94.68%
Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00         3
           B       1.00      1.00      1.00         3
           C       1.00      0.75      0.86         4
           D       1.00      1.00      1.00         4
           E       1.00      1.00      1.00         4
           F       0.75      1.00      0.86         3
           G       1.00      0.75      0.86         4
           H       1.00      1.00      1.00         4
           I       1.00      1.00      1.00         3
           J       0.75      1.00      0.86         3
           K       1.00      1.00      1.00         4
           L       1.00      1.00      1.00         3
           M       1.00      0.75      0.86         4
           N       1.00      1.00      1.00         3
           O       0.80      1.00      0.89         4
           P       1.00      1.00      1.00         4
           Q       1.00      1.00      1.00         3
           R       1.00      1.00      1.00         4
           S       1.00      1.00      1.00         4
           T       1.00      1.00      1.00         4
           U       0.67      1.00      0.80         4
           V       1.00      0.50      0.67         4
           W       1.00      1.00      1.00         4
           X       1.00      1.00      1.00         3
           Y       1.00      1.00      1.00         3
           Z       1.00      1.00      1.00         4

    accuracy                           0.95        94
   macro avg       0.96      0.95      0.95        94
weighted avg       0.96      0.95      0.95        94

In [20]:
torch.save(cnn_model.state_dict(), 'trained_cnn_model.pth')
In [21]:
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

num_epochs = 1000
early_stop = False
patience = 10  # Number of epochs to wait for improvement
best_loss = float('inf')
counter = 0

input_channels = 60
output_size = 26


sequence_length = 60

cnn_model2 = LargerCNN(input_channels=input_channels, output_size=output_size, dropout_prob = 0.5)

criterion = nn.CrossEntropyLoss().to(device)

optimizer = optim.Adam(cnn_model2.parameters(), lr=0.001)

writer.add_graph(cnn_model, X_test.to(device))
writer.close()


cnn_model2 = train_with_early_stopping(cnn_model2, 
                                          train_loader, 
                                          (X_test.to(device), 
                                          Y_test.to(device)), 
                                          optimizer, 
                                          criterion,
                                          n_total_steps,
                                          num_epochs=1000, 
                                          patience=100) # I saw better performance with longer training times



evaluate_model(cnn_model2, X_test, Y_test, abcs)
Early stopping after 164 epochs.
[Confusion matrix for the test set]
Test Accuracy: 76.60%
Classification Report:
              precision    recall  f1-score   support

           A       0.75      1.00      0.86         3
           B       0.50      1.00      0.67         3
           C       1.00      0.50      0.67         4
           D       1.00      0.75      0.86         4
           E       1.00      0.75      0.86         4
           F       1.00      1.00      1.00         3
           G       1.00      1.00      1.00         4
           H       0.80      1.00      0.89         4
           I       1.00      1.00      1.00         3
           J       1.00      0.67      0.80         3
           K       1.00      0.75      0.86         4
           L       0.75      1.00      0.86         3
           M       0.50      0.50      0.50         4
           N       0.40      0.67      0.50         3
           O       0.80      1.00      0.89         4
           P       0.25      0.25      0.25         4
           Q       1.00      0.33      0.50         3
           R       0.57      1.00      0.73         4
           S       0.75      0.75      0.75         4
           T       0.80      1.00      0.89         4
           U       1.00      0.25      0.40         4
           V       0.67      1.00      0.80         4
           W       1.00      0.25      0.40         4
           X       1.00      0.67      0.80         3
           Y       1.00      1.00      1.00         3
           Z       1.00      1.00      1.00         4

    accuracy                           0.77        94
   macro avg       0.83      0.77      0.76        94
weighted avg       0.83      0.77      0.75        94

In [22]:
torch.save(cnn_model2.state_dict(), 'trained_cnn2_model.pth')
In [34]:
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

deep_residual_cnn_model = DeepResidualCNN(input_channels=60, output_size=26)

criterion = nn.CrossEntropyLoss().to(device)

optimizer = optim.Adam(deep_residual_cnn_model.parameters(), lr=0.001)


deep_residual_cnn_model = train_with_early_stopping(deep_residual_cnn_model, 
                                          train_loader, 
                                          (X_test.to(device), 
                                          Y_test.to(device)), 
                                          optimizer, 
                                          criterion,
                                          n_total_steps,
                                          num_epochs=1000, 
                                          patience=50)


evaluate_model(deep_residual_cnn_model, X_test, Y_test, abcs)
Early stopping after 108 epochs.
[Confusion matrix for the test set]
Test Accuracy: 88.30%
Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00         3
           B       1.00      1.00      1.00         3
           C       1.00      0.75      0.86         4
           D       1.00      1.00      1.00         4
           E       1.00      1.00      1.00         4
           F       0.75      1.00      0.86         3
           G       0.75      0.75      0.75         4
           H       1.00      1.00      1.00         4
           I       1.00      1.00      1.00         3
           J       1.00      1.00      1.00         3
           K       1.00      1.00      1.00         4
           L       1.00      0.67      0.80         3
           M       0.60      0.75      0.67         4
           N       0.50      0.33      0.40         3
           O       0.80      1.00      0.89         4
           P       0.80      1.00      0.89         4
           Q       1.00      1.00      1.00         3
           R       0.75      0.75      0.75         4
           S       1.00      1.00      1.00         4
           T       1.00      1.00      1.00         4
           U       0.50      0.75      0.60         4
           V       1.00      0.50      0.67         4
           W       1.00      0.75      0.86         4
           X       1.00      1.00      1.00         3
           Y       1.00      1.00      1.00         3
           Z       1.00      1.00      1.00         4

    accuracy                           0.88        94
   macro avg       0.90      0.88      0.88        94
weighted avg       0.90      0.88      0.88        94

In [24]:
torch.save(deep_residual_cnn_model.state_dict(), 'trained_deep_residual_cnn_model.pth')
Load trained models¶
In [27]:
input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2

base_model = LSTMModel(input_size, hidden_size, num_layers, output_size)

base_model.load_state_dict(torch.load('trained_base_model.pth'))

base_model.eval()
Out[27]:
LSTMModel(
  (lstm): LSTM(63, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=26, bias=True)
)
In [42]:
input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2

model = AttentionLSTMModel(input_size, hidden_size, num_layers, output_size)

model.load_state_dict(torch.load('attention_model2000.pth'))

model.eval()
Out[42]:
AttentionLSTMModel(
  (lstm): LSTM(63, 128, num_layers=2, batch_first=True)
  (attention): Linear(in_features=128, out_features=1, bias=True)
  (fc): Linear(in_features=128, out_features=26, bias=True)
)
In [28]:
input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2

attention_model = AttentionLSTMModel(input_size, hidden_size, num_layers, output_size)

attention_model.load_state_dict(torch.load('trained_attention_model.pth'))

attention_model.eval()
Out[28]:
AttentionLSTMModel(
  (lstm): LSTM(63, 128, num_layers=2, batch_first=True)
  (attention): Linear(in_features=128, out_features=1, bias=True)
  (fc): Linear(in_features=128, out_features=26, bias=True)
)
In [30]:
input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2

complex_model = ComplexLSTMModel(input_size, hidden_size, num_layers, output_size)

complex_model.load_state_dict(torch.load('trained_lstm_model.pth'))

complex_model.eval()
Out[30]:
ComplexLSTMModel(
  (lstm): LSTM(63, 128, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=128, out_features=26, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
In [35]:
input_channels = 60
output_size = 26
sequence_length = 60

cnn_model = SimpleCNN(input_channels=input_channels, output_size=output_size)

cnn_model.load_state_dict(torch.load('trained_cnn_model.pth'))

cnn_model.eval()
Out[35]:
SimpleCNN(
  (conv1): Conv1d(60, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=1792, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=26, bias=True)
)
In [37]:
input_channels = 60
output_size = 26
sequence_length = 60

cnn_model2 = LargerCNN(input_channels=input_channels, output_size=output_size, dropout_prob =0.5)

cnn_model2.load_state_dict(torch.load('trained_cnn2_model.pth'))

cnn_model2.eval()
Out[37]:
LargerCNN(
  (conv1): Conv1d(60, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv4): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv5): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=512, out_features=1024, bias=True)
  (dropout1): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=1024, out_features=512, bias=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc3): Linear(in_features=512, out_features=26, bias=True)
)
In [36]:
input_channels = 60
output_size = 26
sequence_length = 60

deep_residual_cnn_model = DeepResidualCNN(input_channels=60, output_size=26)

deep_residual_cnn_model.load_state_dict(torch.load('trained_deep_residual_cnn_model.pth'))

deep_residual_cnn_model.eval()
Out[36]:
DeepResidualCNN(
  (conv1): Conv1d(60, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (layer1): Sequential(
    (0): ResidualBlock(
      (conv1): Conv1d(64, 64, kernel_size=(3,), stride=(2,), padding=(1,))
      (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv1d(64, 64, kernel_size=(1,), stride=(2,))
        (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): ResidualBlock(
      (conv1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer2): Sequential(
    (0): ResidualBlock(
      (conv1): Conv1d(64, 128, kernel_size=(3,), stride=(2,), padding=(1,))
      (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv1d(64, 128, kernel_size=(1,), stride=(2,))
        (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): ResidualBlock(
      (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer3): Sequential(
    (0): ResidualBlock(
      (conv1): Conv1d(128, 256, kernel_size=(3,), stride=(2,), padding=(1,))
      (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (downsample): Sequential(
        (0): Conv1d(128, 256, kernel_size=(1,), stride=(2,))
        (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): ResidualBlock(
      (conv1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (global_avg_pooling): AdaptiveAvgPool1d(output_size=1)
  (fc): Linear(in_features=256, out_features=26, bias=True)
)
In [38]:
model_names = ['base_model', 'attention_model', 'bi_directional_model', 'cnn_model', 'cnn_model2', 'deep_residual_cnn_model']

# Initialize a list to store accuracy scores
accuracy_scores = []

base_model.to(device)
attention_model.to(device)
complex_model.to(device)
cnn_model.to(device)
cnn_model2.to(device)
deep_residual_cnn_model.to(device)
# Function to evaluate a model and return accuracy
def evaluate_model2(model, X_test, Y_test, class_names):
    # Set to evaluation mode
    model.eval().to(device)

    # Make predictions on the test set
    with torch.no_grad():
        test_outputs = model(X_test.to(device))
        _, predicted = torch.max(test_outputs, 1)
        accuracy = (predicted.cpu() == Y_test).sum().item() / Y_test.size(0)
    return accuracy

# Evaluate each model and store accuracy scores
for model in [base_model, attention_model, complex_model, cnn_model, cnn_model2, deep_residual_cnn_model]:
    accuracy = evaluate_model2(model, X_test, Y_test, abcs)
    accuracy_scores.append(accuracy * 100)

# Create a table
table_data = list(zip(model_names, accuracy_scores))
table_headers = ["Model Name", "Test Accuracy %"]
table = tabulate(table_data, headers=table_headers, tablefmt="grid")

# Print the table
print(table)
+-------------------------+-------------------+
| Model Name              |   Test Accuracy % |
+=========================+===================+
| base_model              |           87.234  |
+-------------------------+-------------------+
| attention_model         |           89.3617 |
+-------------------------+-------------------+
| bi_directional_model    |           63.8298 |
+-------------------------+-------------------+
| cnn_model               |           93.617  |
+-------------------------+-------------------+
| cnn_model2              |           76.5957 |
+-------------------------+-------------------+
| deep_residual_cnn_model |           91.4894 |
+-------------------------+-------------------+

Validation¶

Using some videos I collected myself to get an idea of the generalisability of the models

Load the validation data; for the collection code, please see the Video_capture.ipynb file
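
Purely as an illustration of what the collection step involves (the real code is in Video_capture.ipynb), here is a minimal sketch that records one 60-frame clip and saves its keypoints; the label, camera index, and file name are assumptions, and it relies on the MediaPipe helpers defined in the feature-extraction section being in scope.

In [ ]:
# illustrative sketch only; see Video_capture.ipynb for the code actually used to collect the clips
label = "A"                                                       # assumed class label for this clip
out_dir = os.path.join("ISL", "Extracted_features_Validation", label)
os.makedirs(out_dir, exist_ok=True)

cap = cv2.VideoCapture(0)                                         # assumed camera index
frames = []
with mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5) as hands:
    while len(frames) < 60:                                       # 60 frames to match the training length
        ret, frame = cap.read()
        if not ret:
            break
        _, results = mediapipe_detection(frame, hands)
        frames.append(extract_keypoints(results))
cap.release()

np.save(os.path.join(out_dir, "A-validation.npy"), np.array(frames))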

In [29]:
validation_features = os.path.join("ISL", "Extracted_features_Validation")
In [30]:
# now load the data from the directories
X, Y = [] , []
for digit in os.listdir(validation_features):
    #loop through each digit
    for vid in os.listdir(os.path.join(validation_features, digit)): 
        #print(vid)
        fs = np.load(os.path.join(validation_features, digit, vid))
        X.append(fs)
        Y.append(digit)
        
# now randomly subsample so that each series has the same length
desired_length = 60  # 60 is the lowest frame count in the dataset
X_valid = [element[np.random.choice(element.shape[0], desired_length, replace=False)] if element.shape[0] > desired_length else element for element in X]
X_valid = torch.tensor(np.array(X_valid), dtype=torch.float32)

# recode labels to numeric
num_classes = len(set(Y))
abcs=sorted(list(set(Y)))
label_encoder = LabelEncoder()
Y_num = label_encoder.fit_transform(Y)
Y_valid = torch.tensor(Y_num)

print(X_valid.shape)
print(Y_valid.shape)
torch.Size([52, 60, 63])
torch.Size([52])
In [39]:
model_names = ['base_model', 'attention_model', 'bi_directional_model', 'cnn_model', 'cnn_model2', 'deep_residual_cnn_model']

# Initialize a list to store accuracy scores
accuracy_scores = []

base_model.to(device)
attention_model.to(device)
complex_model.to(device)
cnn_model.to(device)
cnn_model2.to(device)
deep_residual_cnn_model.to(device)

# Evaluate each model and store accuracy scores
for model in [base_model, attention_model, complex_model, cnn_model, cnn_model2, deep_residual_cnn_model]:
    accuracy = evaluate_model2(model, X_valid, Y_valid, abcs)
    accuracy_scores.append(accuracy * 100)

# Create a table
table_data = list(zip(model_names, accuracy_scores))
table_headers = ["Model Name", "Vaildation Accuracy"]
table = tabulate(table_data, headers=table_headers, tablefmt="grid")

# Print the table
print(table)
+-------------------------+-----------------------+
| Model Name              |   Validation Accuracy |
+=========================+=======================+
| base_model              |               50      |
+-------------------------+-----------------------+
| attention_model         |               59.6154 |
+-------------------------+-----------------------+
| bi_directional_model    |               40.3846 |
+-------------------------+-----------------------+
| cnn_model               |               65.3846 |
+-------------------------+-----------------------+
| cnn_model2              |               53.8462 |
+-------------------------+-----------------------+
| deep_residual_cnn_model |               55.7692 |
+-------------------------+-----------------------+
In [60]:
evaluate_model(cnn_model, X_valid, Y_valid, abcs)
[Confusion matrix for the validation set]
Test Accuracy: 65.38%
Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00         2
           B       1.00      1.00      1.00         2
           C       0.00      0.00      1.00         2
           D       0.00      0.00      1.00         2
           E       0.40      1.00      0.57         2
           F       1.00      0.00      0.00         2
           G       0.50      1.00      0.67         2
           H       1.00      1.00      1.00         2
           I       1.00      1.00      1.00         2
           J       1.00      1.00      1.00         2
           K       1.00      1.00      1.00         2
           L       1.00      0.00      0.00         2
           M       1.00      0.50      0.67         2
           N       0.50      1.00      0.67         2
           O       1.00      0.50      0.67         2
           P       1.00      1.00      1.00         2
           Q       1.00      1.00      1.00         2
           R       1.00      0.50      0.67         2
           S       1.00      1.00      1.00         2
           T       1.00      0.00      0.00         2
           U       1.00      0.00      0.00         2
           V       0.67      1.00      0.80         2
           W       1.00      0.00      0.00         2
           X       1.00      0.50      0.67         2
           Y       1.00      1.00      1.00         2
           Z       0.29      1.00      0.44         2

    accuracy                           0.65        52
   macro avg       0.82      0.65      0.69        52
weighted avg       0.82      0.65      0.69        52

In [32]:
evaluate_model(deep_residual_cnn_model, X_valid, Y_valid, abcs)
[Confusion matrix for the validation set]
Test Accuracy: 59.62%
Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00         2
           B       0.67      1.00      0.80         2
           C       0.00      0.00      1.00         2
           D       1.00      0.00      0.00         2
           E       1.00      1.00      1.00         2
           F       0.67      1.00      0.80         2
           G       1.00      0.50      0.67         2
           H       0.67      1.00      0.80         2
           I       1.00      1.00      1.00         2
           J       1.00      0.50      0.67         2
           K       1.00      0.50      0.67         2
           L       1.00      0.00      0.00         2
           M       1.00      0.50      0.67         2
           N       0.50      1.00      0.67         2
           O       1.00      1.00      1.00         2
           P       0.00      0.00      1.00         2
           Q       0.40      1.00      0.57         2
           R       0.50      0.50      0.50         2
           S       1.00      1.00      1.00         2
           T       1.00      0.00      0.00         2
           U       1.00      0.00      0.00         2
           V       1.00      0.00      0.00         2
           W       0.00      0.00      1.00         2
           X       1.00      1.00      1.00         2
           Y       1.00      1.00      1.00         2
           Z       0.25      1.00      0.40         2

    accuracy                           0.60        52
   macro avg       0.76      0.60      0.66        52
weighted avg       0.76      0.60      0.66        52

Given its higher F1 score and overall accuracy, the simple CNN model is chosen to demonstrate the performance of the approach in real time.

Real-time detection¶

Load the MediaPipe helpers and class labels back in

In [47]:
mp_hands = mp.solutions.hands # Instantiate the hands model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

# create a function to process video
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR CONVERSION RGB 2 BGR
    return image, results

# helper function to visualise the landmarks on the video feed
def draw_landmarks(image, results):
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)

# and a function to extract the results in real time to use for inference
# note: this version reads multi_hand_landmarks (image-space coordinates), whereas the training
# features above were extracted from multi_hand_world_landmarks
def extract_keypoints(results):
    if results.multi_hand_landmarks:
        positions = np.array([[res.x, res.y, res.z] for hand_landmarks in results.multi_hand_landmarks for res in hand_landmarks.landmark]).flatten()
    else:
        positions = np.zeros(21*3)
    return positions

abcs = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
# had to load the class labels to turn outputs to something understandable


#function to make detection easier and move between different models
def hand_gesture_recognition(model, cap):
    # 1. New detection variables
    sequence = []
    predictions = []
    threshold = 0.3

    # Set mediapipe model 
    with mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5) as hands:
        while cap.isOpened():
            # Read feed
            ret, frame = cap.read()
            if not ret:
                break

            # Make detections
            image, results = mediapipe_detection(frame, hands)

            # Draw landmarks
            draw_landmarks(image, results)

            # 2. Prediction logic
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-60:]
            # run a prediction once 60 frames have been recorded
            if len(sequence) == 60:
                sequence_tensor = torch.Tensor(np.expand_dims(sequence, axis=0)) # create a tensor of the keypoints 
                #print(sequence_tensor.shape)
                model.eval()  # Switch to evaluation mode
                with torch.no_grad():
                    res = model(sequence_tensor)
                    _, predicted_class = torch.max(res, 1)
                    predictions.append(predicted_class.item())

                    prediction_text = f"Prediction: {abcs[predicted_class.item()]}"
                    cv2.putText(image, prediction_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)


            # Show to screen
            cv2.imshow('OpenCV Feed', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break

    cap.release()
    cv2.destroyAllWindows()
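
The threshold variable defined in hand_gesture_recognition above is never actually applied; the following is a minimal sketch of how softmax confidence could gate the on-screen prediction (an optional extension, not part of the loop above).

In [ ]:
# sketch: only report a letter when the softmax confidence clears the threshold
import torch
import torch.nn.functional as F

def gated_prediction(model, sequence_tensor, threshold=0.3):
    model.eval()
    with torch.no_grad():
        probs = F.softmax(model(sequence_tensor), dim=1)    # (1, 26) class probabilities
        conf, predicted_class = torch.max(probs, 1)
    if conf.item() >= threshold:
        return abcs[predicted_class.item()], conf.item()    # confident prediction
    return None, conf.item()                                # below threshold: stay silent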
Detection code¶

Run any of the following cells for real-time detection. When you are done and the capture window is active, press "q" to quit the program.

In [55]:
cap = cv2.VideoCapture(2) # adjust the camera index to match your webcam
hand_gesture_recognition(base_model, cap)
In [52]:
cap = cv2.VideoCapture(0)
hand_gesture_recognition(attention_model, cap)
In [57]:
cap = cv2.VideoCapture(0)
hand_gesture_recognition(complex_model, cap)
In [62]:
cap = cv2.VideoCapture(0)
hand_gesture_recognition(cnn_model, cap)
In [ ]: