Real-Time Sign Language Detection¶
Benedek Fulop (Sn: 12290335)
This notebook implements efficient real-time sign language recognition.
To abstract away from the issues we experienced with background and lighting conditions, I decided to explore keypoint estimation models. I tried OpenPose but settled on MediaPipe, as it is actively maintained and provides more stable performance, at the cost of not being fully open-source. Using the extracted keypoint features has been shown to perform better than purely visual models on larger word-level datasets, such as in Li et al. 2020, "Word-level Deep Sign Language Recognition from Video: A New Large-scale Dataset and Methods Comparison".
Then, to address the dynamic signs present in all sign languages, I use a suite of RNN and CNN architectures to classify the keypoint series.
import os
import re
import cv2
import pandas as pd
import numpy as np
import shutil
import random
import subprocess
from zipfile import ZipFile
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from tabulate import tabulate
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics
import uuid
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import tensorboard
import torch.nn.functional as F
from torchvision import datasets, models, transforms
from torchvision.transforms import v2
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import lr_scheduler
from torchinfo import summary
from tempfile import TemporaryDirectory
from sklearn.preprocessing import LabelEncoder
from torch.utils.tensorboard import SummaryWriter
import mediapipe as mp
from sklearn.model_selection import train_test_split
# fixing the seed to make training and results reproducible; I fixed every source of randomness I could think of, but still see some non-deterministic behaviour, possibly from the LSTMs
randomseed = 42 # My favourite integer
np.random.seed(randomseed)
torch.manual_seed(randomseed) # I still see non-deterministic results no clue why
torch.cuda.manual_seed(randomseed)
torch.backends.cudnn.deterministic = True
random.seed(randomseed)
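# additional determinism settings (assumption: these were NOT part of the original run that produced the results below)
torch.backends.cudnn.benchmark = False  # stop cuDNN from autotuning, which can pick non-deterministic kernels
# torch.use_deterministic_algorithms(True)  # stricter option; may raise errors for ops without a deterministic implementation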
Data Wrangling¶
Clone the GitHub repository of data to my local machine
# get the url
repo_url = "https://github.com/marlondcu/ISL.git"
# the destination
destination_folder = os.path.join("ISL1")
# Run the git clone command
subprocess.run(["git", "clone", repo_url, destination_folder])
CompletedProcess(args=['git', 'clone', 'https://github.com/marlondcu/ISL.git', 'ISL1'], returncode=0)
Unzip the video folders
zip_dir = os.path.join(destination_folder, "Videos") # unzip just the videos
zip_files = [f for f in os.listdir(zip_dir) if f.endswith('.zip')]
# Go through ZIP files and extract contents
for zip_file in zip_files:
zip_path = os.path.join(zip_dir, zip_file)
with ZipFile(zip_path, 'r') as zip_ref:
# Extract all contents to the same directory as the ZIP file
zip_ref.extractall(zip_dir)
os.remove(zip_path) # delete .zip files once extracted
Get rid of the per-person folder structure, as the signer identity is not needed
# helper function to pop subfolders
def pop_folder(folder_path):
contents = os.listdir(folder_path)
for item in contents:
item_path = os.path.join(folder_path, item)
new_path = os.path.join(os.path.dirname(folder_path), str(uuid.uuid1())[:8]+"-"+item)
shutil.move(item_path, new_path)
shutil.rmtree(folder_path)
videos_path = os.path.join("ISL1", "Videos")
# Unwrapping the Person subfolders to get all the images in the train and test folders
for subfolder in os.listdir(os.path.join(videos_path)):
pop_folder(os.path.join(os.path.join(videos_path), subfolder))
# statically define the categories
abcs = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
# create categorical directories
for value in abcs:
folder_path = os.path.join(videos_path, value)
os.makedirs(folder_path, exist_ok=True)
pattern = r'-(\w)' # regular expression to extract category info from file names
file_names = os.listdir(videos_path)
# Move files to their corresponding folders
for file_name in file_names:
match = re.search(pattern, file_name)
if match:
value = match.group(1)
source_path = os.path.join(videos_path, file_name)
destination_path = os.path.join(videos_path, value, file_name)
# Ensure the destination folder exists before moving
os.makedirs(os.path.dirname(destination_path), exist_ok=True)
shutil.move(source_path, destination_path) # move videos to the corresponding subfolder
# Now there should be 18 videos of varying length per class label, organised into subdirectories
MediaPipe¶
videos_path = os.path.join("ISL1", "Videos") # checkpoint: re-set the path in case everything above has already been run on this machine
mp_hands = mp.solutions.hands # Instantiate the hands model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities
# create a function to process video
def mediapipe_detection(image, model):
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
image.flags.writeable = False # Image is no longer writeable
results = model.process(image) # Make prediction
image.flags.writeable = True # Image is now writeable
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR CONVERSION RGB 2 BGR
return image, results
# helper function to visualise the landmarks on the video feed
def draw_landmarks(image, results):
if results.multi_hand_landmarks:
for hand_landmarks in results.multi_hand_landmarks:
mp_drawing.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)
# and a function to extract the results in real time to use for inference or creating training data
def extract_keypoints(results):
if results.multi_hand_world_landmarks:
positions = np.array([[res.x, res.y, res.z] for hand_landmarks in results.multi_hand_world_landmarks for res in hand_landmarks.landmark]).flatten()
else:
positions = np.zeros(21*3)
return positions
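As a quick, purely illustrative sanity check (this cell is not part of the original pipeline), the extractor returns a flat 63-dimensional vector per frame (21 landmarks × x, y, z) and falls back to zeros when no hand is detected:
# illustrative check: with no detected hand the extractor returns a 63-dim zero vector
class _EmptyResults:
    multi_hand_world_landmarks = None
print(extract_keypoints(_EmptyResults()).shape) # (63,)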
#create the directories to store the results
train_output_path = os.path.join("ISL1", "Extracted_features")
for digit in os.listdir(videos_path):
#loop through each digit
for vid in os.listdir(os.path.join(videos_path, digit)):
try:
os.makedirs(os.path.join(train_output_path, digit))
except:
pass
start_time = time.time()
# Now to create the training data
for digit in os.listdir(videos_path):
#loop through each digit
for vid in os.listdir(os.path.join(videos_path , digit)):
#loop through each video
# get the corresponding video for each video in the train folder
video_path = os.path.join(videos_path , digit, vid)
cap = cv2.VideoCapture(video_path) # start cv2
        hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5) # start off the pretrained model
        # note: max_num_hands is set to 1, so the model will not handle multi-hand detection
frame_extracts = []
while cap.isOpened():
# Read a frame from the video
ret, frame = cap.read()
if not ret:
break
# Detect landmarks in the frame
frame, results = mediapipe_detection(frame, hands)
key_points = extract_keypoints(results)
frame_extracts.append(key_points)
frame_extracts_array = np.array(frame_extracts)
# Release the video capture object
cap.release()
np.save(os.path.join(train_output_path, digit, vid), frame_extracts_array)
# Close all OpenCV
cv2.destroyAllWindows()
end_time = time.time()
timed = (end_time - start_time)/60
print(f"Extraction time: {timed} minutes")
# this may take a while to run: about 21 minutes on my machine
Extraction time: 21.408754924933117 minutes
Data Loading¶
train_output_path = os.path.join("ISL1", "Extracted_features") # checkpoint: re-set the path in case the features have already been extracted on this machine
# now load the data from the directories
X, Y = [] , []
for digit in os.listdir(train_output_path):
#loop through each digit
for vid in os.listdir(os.path.join(train_output_path, digit)):
#print(vid)
fs = np.load(os.path.join(train_output_path, digit, vid))
X.append(fs)
Y.append(digit)
# now randomly subsample so that each series is of the same length
# (note: np.random.choice returns the chosen indices in random order, so the sampled frames lose their temporal ordering; wrapping the indices in np.sort would preserve it)
desired_length = 60 # the lowest frame count in the dataset
X_subsample = [element[np.random.choice(element.shape[0], desired_length, replace=False)] if element.shape[0] > desired_length else element for element in X]
X_subsample = torch.tensor(np.array(X_subsample), dtype=torch.float32)
# recode labels to numeric
num_classes = len(set(Y))
abcs=sorted(list(set(Y)))
label_encoder = LabelEncoder()
Y_num = label_encoder.fit_transform(Y)
Y_tensor = torch.tensor(Y_num)
# use stratified shuffle to make balanced test and train sets
sss = StratifiedShuffleSplit(n_splits =1, test_size = 0.2, random_state = 42)
for train_index, test_index in sss.split(X_subsample, Y_num):
X_train, X_test = X_subsample[train_index], X_subsample[test_index]
Y_train, Y_test = Y_tensor[train_index], Y_tensor[test_index]
# Print the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)
print("X_test shape:", X_test.shape)
print("Y_test shape:", Y_test.shape)
X_train shape: torch.Size([374, 60, 63])
Y_train shape: torch.Size([374])
X_test shape: torch.Size([94, 60, 63])
Y_test shape: torch.Size([94])
The motivation for exploring this avenue is the hope that, given robust keypoint estimation, the "hand-crafted" keypoints allow for robust categorisation. Nonetheless, this approach has its limitations. As discussed by Hachiuma, Sato, and Sekii (2023), the technique relies on precise estimation by the feature-extraction model used. The number of recognised keypoint groups is limited, in this case to one hand. In practical applications, the varying sequence length can pose problems for the classification model due to the constrained feature space.
These points motivate multi-stream models that average the results of an appearance-based (and therefore complex-feature) stream and a hand-crafted keypoint stream for more robust spatiotemporal predictions. This avenue was explored, but it is not presented here due to time constraints during model development; a minimal sketch of what such late fusion could look like is given below.
The constrained feature set does mean I have considerably fewer datapoints to learn from: just 374 observations in the train set, each containing 60 temporal snapshots of 63 features, the x, y and z coordinates of the 21 hand keypoints in a hand-centred coordinate frame.
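As an illustration only, and not the multi-stream model referred to above, late fusion could average the class probabilities of a keypoint stream and an appearance stream. Here appearance_model and appearance_batch are hypothetical placeholders for a second, appearance-based classifier and its input.
# Minimal late-fusion sketch (illustrative; assumes two trained classifiers emitting logits over the same 26 classes)
def late_fusion_predict(keypoint_model, appearance_model, keypoint_batch, appearance_batch, w=0.5):
    keypoint_model.eval()
    appearance_model.eval()
    with torch.no_grad():
        p_key = F.softmax(keypoint_model(keypoint_batch), dim=1)      # keypoint-stream class probabilities
        p_app = F.softmax(appearance_model(appearance_batch), dim=1)  # appearance-stream class probabilities
        fused = w * p_key + (1 - w) * p_app                           # weighted average of the two streams
    return fused.argmax(dim=1)                                        # fused class prediction per sample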
Modelling¶
Setup¶
# train on gpu if possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
writer = SummaryWriter(log_dir='Logs/keypoints2')
# adopted an early stopping training loop as it shortens development considerably
def train_with_early_stopping(model, train_loader, val_data, optimizer, criterion, n_total_steps, num_epochs=1000, patience=10):
early_stop = False
best_loss = float('inf')
counter = 0
running_loss = 0.0
running_correct = 0
for epoch in range(num_epochs):
for i, (batch_X, batch_Y) in enumerate(train_loader):
optimizer.zero_grad()
outputs = model(batch_X)
loss = criterion(outputs, batch_Y)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) # gradient clipping
optimizer.step()
running_loss += loss.item()
_, predicted = torch.max(outputs, 1)
running_correct += (predicted == batch_Y).sum().item()
if (i+1) % 6 == 0:
#print(f'Epoch [{epoch + 1}/{num_epochs}], Training Loss: {loss.item():.4f}',flush=True)
                writer.add_scalar('training loss', running_loss/100, epoch * n_total_steps + i )
writer.add_scalar('accuracy', running_correct/100, epoch * n_total_steps + i )
writer.close()
running_loss = 0.0
running_correct = 0
# I found tensorboard a little hard to work with
# Validation
model.eval() # Set to evaluation mode
with torch.no_grad():
X_val, Y_val = val_data
val_outputs = model(X_val)
val_loss = criterion(val_outputs, Y_val)
# print(f'Epoch [{epoch + 1}/{num_epochs}], Training Loss: {loss.item():.4f}, Validation Loss: {val_loss.item():.4f}')
        # Check for improvement on the validation data (note: the test set is used here, so there is some leakage, as I do not have a separate validation set)
if val_loss < best_loss:
best_loss = val_loss
counter = 0
else:
counter += 1
if counter >= patience:
print(f'Early stopping after {epoch + 1} epochs.')
early_stop = True
break
model.train()
return model
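The loop above halts training when the validation loss stops improving, but it returns the weights from the final epoch rather than the best-scoring ones. A common refinement, not used for the results below, is to checkpoint the best weights and restore them on exit; a minimal sketch, where train_step and validate are hypothetical caller-supplied callables:
# Sketch only: early stopping that also restores the best weights (not part of the original training runs)
import copy

def train_with_best_checkpoint(model, train_step, validate, num_epochs=1000, patience=10):
    best_loss = float('inf')
    best_state = copy.deepcopy(model.state_dict())   # snapshot of the current weights
    counter = 0
    for epoch in range(num_epochs):
        train_step(model)                            # one epoch of training (supplied by the caller)
        val_loss = validate(model)                   # validation loss (supplied by the caller)
        if val_loss < best_loss:
            best_loss = val_loss
            best_state = copy.deepcopy(model.state_dict())  # remember the best weights seen so far
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                break
    model.load_state_dict(best_state)                # restore the best weights before returning
    return model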
def evaluate_model(model, X_test, Y_test, class_names):
# Set to evaluation mode
model.eval().to(device)
# Make predictions on the test set
with torch.no_grad():
test_outputs = model(X_test.to(device))
        _, predicted = torch.max(test_outputs, 1)
        predicted = predicted.cpu() # move predictions back to the CPU so they can be compared with Y_test and converted to NumPy
    accuracy = (predicted == Y_test).sum().item() / Y_test.size(0)
    # Convert to NumPy arrays
    y_true = Y_test.numpy()
    y_pred = predicted.numpy()
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred)
# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()
# print accuracy and classification report
print(f'Test Accuracy: {accuracy * 100:.2f}%')
print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=class_names, zero_division=1))
Models¶
LSTMs¶
# The simplest LSTM I could think of: a single LSTM layer feeding one linear layer
# (the num_layers argument is accepted for interface consistency but not used; nn.LSTM defaults to one layer)
class LSTMModel(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, output_size):
super(LSTMModel, self).__init__()
self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
self.fc = nn.Linear(hidden_size, output_size)
def forward(self, x):
out, _ = self.lstm(x)
out = self.fc(out[:, -1, :])
out = F.relu(out)
return out
# now a more flexible LSTM with dropout and an optional bidirectional mode, which in principle should provide better performance
class ComplexLSTMModel(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_rate=0.5, bidirectional=False):
        super(ComplexLSTMModel, self).__init__()
        self.hidden_size = hidden_size # stored so forward() does not rely on a global variable
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout_rate, bidirectional=bidirectional)
# If bidirectional, adjust the output size for the fully connected layer
fc_input_size = hidden_size * 2 if bidirectional else hidden_size
self.fc = nn.Linear(fc_input_size, output_size)
# Add dropout layer for some regularisation
self.dropout = nn.Dropout(p=dropout_rate)
def forward(self, x):
        h0 = torch.zeros(self.num_layers * 2 if self.bidirectional else self.num_layers, x.size(0), self.hidden_size).to(x.device) # initial hidden state
        c0 = torch.zeros(self.num_layers * 2 if self.bidirectional else self.num_layers, x.size(0), self.hidden_size).to(x.device) # initial cell state
out, _ = self.lstm(x, (h0, c0))
# Use the last time step
out = self.fc(out[:, -1, :])
out = F.relu(out)
# Apply dropout
if hasattr(self, 'dropout'):
out = self.dropout(out)
return out
# attention model to focus on most crucial features
class AttentionLSTMModel(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, output_size):
super(AttentionLSTMModel, self).__init__()
self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)
self.attention = nn.Linear(hidden_size, 1)
self.fc = nn.Linear(hidden_size, output_size)
def forward(self, x):
out, _ = self.lstm(x)
attention_weights = F.softmax(self.attention(out), dim=1)
out = torch.sum(attention_weights * out, dim=1)
out = self.fc(out)
return out
CNNs¶
class SimpleCNN(nn.Module):
def __init__(self, input_channels, output_size):
super(SimpleCNN, self).__init__()
self.conv1 = nn.Conv1d(in_channels=input_channels, out_channels=64, kernel_size=3, stride=1, padding=1)
self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
self.conv3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(256 * (sequence_length // 8), 512) # relies on the module-level sequence_length variable; with sequence_length = 60 this gives 256 * 7 = 1792 input features
self.fc2 = nn.Linear(512, output_size)
def forward(self, x):
x = F.relu(self.conv1(x))
x = self.pool(x)
x = F.relu(self.conv2(x))
x = self.pool(x)
x = F.relu(self.conv3(x))
x = self.pool(x)
x = x.view(x.size(0), -1)
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
class LargerCNN(nn.Module):
def __init__(self, input_channels, output_size, dropout_prob):
super(LargerCNN, self).__init__()
self.conv1 = nn.Conv1d(in_channels=input_channels, out_channels=64, kernel_size=3, stride=1, padding=1)
self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
self.conv3 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)
self.conv4 = nn.Conv1d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1)
self.conv5 = nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1)
self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
self.fc1 = nn.Linear(512 * (sequence_length // 32), 1024)
self.dropout1 = nn.Dropout(p=dropout_prob)
self.fc2 = nn.Linear(1024, 512)
self.dropout2 = nn.Dropout(p=dropout_prob)
self.fc3 = nn.Linear(512, output_size)
def forward(self, x):
x = F.relu(self.conv1(x))
x = self.pool(x)
x = F.relu(self.conv2(x))
x = self.pool(x)
x = F.relu(self.conv3(x))
x = self.pool(x)
x = F.relu(self.conv4(x))
x = self.pool(x)
x = F.relu(self.conv5(x))
x = self.pool(x)
x = x.view(x.size(0), -1)
x = F.relu(self.fc1(x))
x = self.dropout1(x)
x = F.relu(self.fc2(x))
x = self.dropout2(x)
x = self.fc3(x)
return x
class ResidualBlock(nn.Module):
def __init__(self, in_channels, out_channels, stride=1):
super(ResidualBlock, self).__init__()
self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
self.bn1 = nn.BatchNorm1d(out_channels)
self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
self.bn2 = nn.BatchNorm1d(out_channels)
# If the input size changes, use a 1x1 convolution to match dimensions
self.downsample = nn.Sequential(
nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=stride),
nn.BatchNorm1d(out_channels)
) if stride != 1 or in_channels != out_channels else None
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class DeepResidualCNN(nn.Module):
def __init__(self, input_channels, output_size, num_blocks=[2, 2, 2], initial_channels=64):
super(DeepResidualCNN, self).__init__()
self.in_channels = initial_channels
self.conv1 = nn.Conv1d(input_channels, initial_channels, kernel_size=3, stride=1, padding=1)
self.bn1 = nn.BatchNorm1d(initial_channels)
self.relu = nn.ReLU(inplace=True)
# Build residual blocks
self.layer1 = self._make_layer(ResidualBlock, initial_channels, num_blocks[0], stride=2)
self.layer2 = self._make_layer(ResidualBlock, initial_channels * 2, num_blocks[1], stride=2)
self.layer3 = self._make_layer(ResidualBlock, initial_channels * 4, num_blocks[2], stride=2)
# Global average pooling
self.global_avg_pooling = nn.AdaptiveAvgPool1d(1)
self.fc = nn.Linear(initial_channels * 4, output_size)
def _make_layer(self, block, out_channels, num_blocks, stride):
layers = []
layers.append(block(self.in_channels, out_channels, stride))
self.in_channels = out_channels
for _ in range(1, num_blocks):
layers.append(block(out_channels, out_channels, stride=1))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.global_avg_pooling(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
Training¶
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
n_total_steps = len(train_loader)
num_epochs = 1000
early_stop = False
patience = 10 # Number of epochs to wait for improvement
best_loss = float('inf')
counter = 0
input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2
base_model = LSTMModel(input_size, hidden_size, num_layers, output_size).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(base_model.parameters(), lr=0.001)
writer.add_graph(base_model, X_test.to(device))
writer.close()
base_model = train_with_early_stopping(base_model,
train_loader,
(X_test.to(device),
Y_test.to(device)),
optimizer,
criterion,
n_total_steps,
num_epochs=1000,
patience=100)
evaluate_model(base_model, X_test, Y_test, abcs)
Early stopping after 718 epochs.
Test Accuracy: 87.23% Classification Report: precision recall f1-score support A 0.50 1.00 0.67 3 B 1.00 1.00 1.00 3 C 0.80 1.00 0.89 4 D 1.00 1.00 1.00 4 E 1.00 1.00 1.00 4 F 1.00 0.67 0.80 3 G 0.80 1.00 0.89 4 H 1.00 1.00 1.00 4 I 1.00 1.00 1.00 3 J 1.00 1.00 1.00 3 K 1.00 1.00 1.00 4 L 1.00 1.00 1.00 3 M 0.60 0.75 0.67 4 N 1.00 0.33 0.50 3 O 1.00 1.00 1.00 4 P 1.00 1.00 1.00 4 Q 1.00 1.00 1.00 3 R 1.00 1.00 1.00 4 S 1.00 0.00 0.00 4 T 1.00 1.00 1.00 4 U 1.00 0.00 0.00 4 V 1.00 1.00 1.00 4 W 1.00 1.00 1.00 4 X 1.00 1.00 1.00 3 Y 1.00 1.00 1.00 3 Z 0.44 1.00 0.62 4 accuracy 0.87 94 macro avg 0.93 0.88 0.85 94 weighted avg 0.93 0.87 0.84 94
torch.save(base_model.state_dict(), 'trained_base_model.pth')
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
num_epochs = 1000
early_stop = False
patience = 10 # Number of epochs to wait for improvement
best_loss = float('inf')
counter = 0
input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2
complex_model = ComplexLSTMModel(input_size, hidden_size, num_layers, output_size).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(complex_model.parameters(), lr=0.001)
complex_model = train_with_early_stopping(complex_model,
train_loader,
(X_test.to(device),
Y_test.to(device)),
optimizer,
criterion,
n_total_steps,
num_epochs=1000,
patience=50)
evaluate_model(complex_model, X_test, Y_test, abcs)
Early stopping after 253 epochs.
Test Accuracy: 63.83% Classification Report: precision recall f1-score support A 0.18 1.00 0.30 3 B 0.43 1.00 0.60 3 C 1.00 0.00 0.00 4 D 0.50 1.00 0.67 4 E 1.00 1.00 1.00 4 F 1.00 0.00 0.00 3 G 0.43 0.75 0.55 4 H 1.00 0.00 0.00 4 I 1.00 1.00 1.00 3 J 1.00 0.00 0.00 3 K 0.80 1.00 0.89 4 L 1.00 0.00 0.00 3 M 0.67 1.00 0.80 4 N 1.00 0.00 0.00 3 O 1.00 1.00 1.00 4 P 1.00 1.00 1.00 4 Q 1.00 1.00 1.00 3 R 1.00 0.00 0.00 4 S 1.00 0.00 0.00 4 T 1.00 0.00 0.00 4 U 0.50 1.00 0.67 4 V 1.00 0.75 0.86 4 W 1.00 1.00 1.00 4 X 1.00 1.00 1.00 3 Y 1.00 1.00 1.00 3 Z 0.80 1.00 0.89 4 accuracy 0.64 94 macro avg 0.86 0.63 0.55 94 weighted avg 0.86 0.64 0.55 94
torch.save(complex_model.state_dict(), 'trained_lstm_model.pth')
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
n_total_steps = len(train_loader)
num_epochs = 1000
early_stop = False
patience = 10 # Number of epochs to wait for improvement
best_loss = float('inf')
counter = 0
input_size = 63
hidden_size = 128
output_size = 26
num_layers = 4
attention_model = AttentionLSTMModel(input_size, hidden_size, num_layers, output_size).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(attention_model.parameters(), lr=0.001)
attention_model = train_with_early_stopping(attention_model,
train_loader,
(X_test.to(device),
Y_test.to(device)),
optimizer,
criterion,
n_total_steps,
num_epochs=1000,
patience=50)
evaluate_model(attention_model, X_test, Y_test, abcs)
Early stopping after 345 epochs.
Test Accuracy: 89.36% Classification Report: precision recall f1-score support A 1.00 1.00 1.00 3 B 1.00 1.00 1.00 3 C 1.00 1.00 1.00 4 D 1.00 1.00 1.00 4 E 1.00 0.50 0.67 4 F 0.75 1.00 0.86 3 G 1.00 0.75 0.86 4 H 1.00 1.00 1.00 4 I 1.00 1.00 1.00 3 J 1.00 1.00 1.00 3 K 1.00 1.00 1.00 4 L 1.00 1.00 1.00 3 M 0.50 0.75 0.60 4 N 1.00 0.00 0.00 3 O 1.00 1.00 1.00 4 P 1.00 1.00 1.00 4 Q 1.00 1.00 1.00 3 R 1.00 0.50 0.67 4 S 1.00 1.00 1.00 4 T 0.67 1.00 0.80 4 U 0.67 1.00 0.80 4 V 1.00 1.00 1.00 4 W 1.00 1.00 1.00 4 X 0.75 1.00 0.86 3 Y 1.00 1.00 1.00 3 Z 0.75 0.75 0.75 4 accuracy 0.89 94 macro avg 0.93 0.89 0.88 94 weighted avg 0.92 0.89 0.88 94
torch.save(attention_model.state_dict(), 'trained_attention_model.pth')
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
num_epochs = 1000
early_stop = False
patience = 10 # Number of epochs to wait for improvement
best_loss = float('inf')
counter = 0
input_channels = 60
output_size = 26
sequence_length = 60
cnn_model = SimpleCNN(input_channels=input_channels, output_size=output_size).to(device) # move to the same device as the training data
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(cnn_model.parameters(), lr=0.001)
writer.add_graph(cnn_model, X_test.to(device))
writer.close()
cnn_model = train_with_early_stopping(cnn_model,
train_loader,
(X_test.to(device),
Y_test.to(device)),
optimizer,
criterion,
n_total_steps,
num_epochs=1000,
patience=50)
evaluate_model(cnn_model, X_test, Y_test, abcs)
Test Accuracy: 94.68% Classification Report: precision recall f1-score support A 1.00 1.00 1.00 3 B 1.00 1.00 1.00 3 C 1.00 0.75 0.86 4 D 1.00 1.00 1.00 4 E 1.00 1.00 1.00 4 F 0.75 1.00 0.86 3 G 1.00 0.75 0.86 4 H 1.00 1.00 1.00 4 I 1.00 1.00 1.00 3 J 0.75 1.00 0.86 3 K 1.00 1.00 1.00 4 L 1.00 1.00 1.00 3 M 1.00 0.75 0.86 4 N 1.00 1.00 1.00 3 O 0.80 1.00 0.89 4 P 1.00 1.00 1.00 4 Q 1.00 1.00 1.00 3 R 1.00 1.00 1.00 4 S 1.00 1.00 1.00 4 T 1.00 1.00 1.00 4 U 0.67 1.00 0.80 4 V 1.00 0.50 0.67 4 W 1.00 1.00 1.00 4 X 1.00 1.00 1.00 3 Y 1.00 1.00 1.00 3 Z 1.00 1.00 1.00 4 accuracy 0.95 94 macro avg 0.96 0.95 0.95 94 weighted avg 0.96 0.95 0.95 94
torch.save(cnn_model.state_dict(), 'trained_cnn_model.pth')
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
num_epochs = 1000
early_stop = False
patience = 10 # Number of epochs to wait for improvement
best_loss = float('inf')
counter = 0
input_channels = 60
output_size = 26
sequence_length = 60
cnn_model2 = LargerCNN(input_channels=input_channels, output_size=output_size, dropout_prob=0.5).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(cnn_model2.parameters(), lr=0.001)
writer.add_graph(cnn_model2, X_test.to(device))
writer.close()
cnn_model2 = train_with_early_stopping(cnn_model2,
train_loader,
(X_test.to(device),
Y_test.to(device)),
optimizer,
criterion,
n_total_steps,
num_epochs=1000,
patience=100) # I saw better performance with longer training times
evaluate_model(cnn_model2, X_test, Y_test, abcs)
Early stopping after 164 epochs.
Test Accuracy: 76.60% Classification Report: precision recall f1-score support A 0.75 1.00 0.86 3 B 0.50 1.00 0.67 3 C 1.00 0.50 0.67 4 D 1.00 0.75 0.86 4 E 1.00 0.75 0.86 4 F 1.00 1.00 1.00 3 G 1.00 1.00 1.00 4 H 0.80 1.00 0.89 4 I 1.00 1.00 1.00 3 J 1.00 0.67 0.80 3 K 1.00 0.75 0.86 4 L 0.75 1.00 0.86 3 M 0.50 0.50 0.50 4 N 0.40 0.67 0.50 3 O 0.80 1.00 0.89 4 P 0.25 0.25 0.25 4 Q 1.00 0.33 0.50 3 R 0.57 1.00 0.73 4 S 0.75 0.75 0.75 4 T 0.80 1.00 0.89 4 U 1.00 0.25 0.40 4 V 0.67 1.00 0.80 4 W 1.00 0.25 0.40 4 X 1.00 0.67 0.80 3 Y 1.00 1.00 1.00 3 Z 1.00 1.00 1.00 4 accuracy 0.77 94 macro avg 0.83 0.77 0.76 94 weighted avg 0.83 0.77 0.75 94
torch.save(cnn_model2.state_dict(), 'trained_cnn2_model.pth')
train_dataset = TensorDataset(X_train.to(device), Y_train.to(device))
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
deep_residual_cnn_model = DeepResidualCNN(input_channels=60, output_size=26).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(deep_residual_cnn_model.parameters(), lr=0.001)
deep_residual_cnn_model = train_with_early_stopping(deep_residual_cnn_model,
train_loader,
(X_test.to(device),
Y_test.to(device)),
optimizer,
criterion,
n_total_steps,
num_epochs=1000,
patience=50)
evaluate_model(deep_residual_cnn_model, X_test, Y_test, abcs)
Early stopping after 108 epochs.
Test Accuracy: 88.30% Classification Report: precision recall f1-score support A 1.00 1.00 1.00 3 B 1.00 1.00 1.00 3 C 1.00 0.75 0.86 4 D 1.00 1.00 1.00 4 E 1.00 1.00 1.00 4 F 0.75 1.00 0.86 3 G 0.75 0.75 0.75 4 H 1.00 1.00 1.00 4 I 1.00 1.00 1.00 3 J 1.00 1.00 1.00 3 K 1.00 1.00 1.00 4 L 1.00 0.67 0.80 3 M 0.60 0.75 0.67 4 N 0.50 0.33 0.40 3 O 0.80 1.00 0.89 4 P 0.80 1.00 0.89 4 Q 1.00 1.00 1.00 3 R 0.75 0.75 0.75 4 S 1.00 1.00 1.00 4 T 1.00 1.00 1.00 4 U 0.50 0.75 0.60 4 V 1.00 0.50 0.67 4 W 1.00 0.75 0.86 4 X 1.00 1.00 1.00 3 Y 1.00 1.00 1.00 3 Z 1.00 1.00 1.00 4 accuracy 0.88 94 macro avg 0.90 0.88 0.88 94 weighted avg 0.90 0.88 0.88 94
torch.save(deep_residual_cnn_model.state_dict(), 'trained_deep_residual_cnn_model.pth')
Load trained models¶
input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2
base_model = LSTMModel(input_size, hidden_size, num_layers, output_size)
base_model.load_state_dict(torch.load('trained_base_model.pth'))
base_model.eval()
LSTMModel( (lstm): LSTM(63, 128, batch_first=True) (fc): Linear(in_features=128, out_features=26, bias=True) )
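torchinfo is imported above but never used; as an illustration only (not part of the original notebook), it can print a layer-by-layer summary of a loaded model given a dummy input shape of one 60-frame sequence:
# illustrative: layer-by-layer summary of the base LSTM for a batch of one 60-frame, 63-feature sequence
summary(base_model, input_size=(1, 60, 63))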
input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2
model = AttentionLSTMModel(input_size, hidden_size, num_layers, output_size)
model.load_state_dict(torch.load('attention_model2000.pth'))
model.eval()
AttentionLSTMModel( (lstm): LSTM(63, 128, num_layers=2, batch_first=True) (attention): Linear(in_features=128, out_features=1, bias=True) (fc): Linear(in_features=128, out_features=26, bias=True) )
input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2
attention_model = AttentionLSTMModel(input_size, hidden_size, num_layers, output_size)
attention_model.load_state_dict(torch.load('trained_attention_model.pth'))
attention_model.eval()
AttentionLSTMModel( (lstm): LSTM(63, 128, num_layers=2, batch_first=True) (attention): Linear(in_features=128, out_features=1, bias=True) (fc): Linear(in_features=128, out_features=26, bias=True) )
input_size = 63
hidden_size = 128
output_size = 26
num_layers = 2
complex_model = ComplexLSTMModel(input_size, hidden_size, num_layers, output_size)
complex_model.load_state_dict(torch.load('trained_lstm_model.pth'))
complex_model.eval()
ComplexLSTMModel( (lstm): LSTM(63, 128, num_layers=2, batch_first=True, dropout=0.5) (fc): Linear(in_features=128, out_features=26, bias=True) (dropout): Dropout(p=0.5, inplace=False) )
input_channels = 60
output_size = 26
sequence_length = 60
cnn_model = SimpleCNN(input_channels=input_channels, output_size=output_size)
cnn_model.load_state_dict(torch.load('trained_cnn_model.pth'))
cnn_model.eval()
SimpleCNN( (conv1): Conv1d(60, 64, kernel_size=(3,), stride=(1,), padding=(1,)) (conv2): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,)) (conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) (fc1): Linear(in_features=1792, out_features=512, bias=True) (fc2): Linear(in_features=512, out_features=26, bias=True) )
input_channels = 60
output_size = 26
sequence_length = 60
cnn_model2 = LargerCNN(input_channels=input_channels, output_size=output_size, dropout_prob =0.5)
cnn_model2.load_state_dict(torch.load('trained_cnn2_model.pth'))
cnn_model2.eval()
LargerCNN( (conv1): Conv1d(60, 64, kernel_size=(3,), stride=(1,), padding=(1,)) (conv2): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,)) (conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (conv4): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(1,)) (conv5): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,)) (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False) (fc1): Linear(in_features=512, out_features=1024, bias=True) (dropout1): Dropout(p=0.5, inplace=False) (fc2): Linear(in_features=1024, out_features=512, bias=True) (dropout2): Dropout(p=0.5, inplace=False) (fc3): Linear(in_features=512, out_features=26, bias=True) )
input_channels = 60
output_size = 26
sequence_length = 60
deep_residual_cnn_model = DeepResidualCNN(input_channels=60, output_size=26)
deep_residual_cnn_model.load_state_dict(torch.load('trained_deep_residual_cnn_model.pth'))
deep_residual_cnn_model.eval()
DeepResidualCNN( (conv1): Conv1d(60, 64, kernel_size=(3,), stride=(1,), padding=(1,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (layer1): Sequential( (0): ResidualBlock( (conv1): Conv1d(64, 64, kernel_size=(3,), stride=(2,), padding=(1,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (conv2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (downsample): Sequential( (0): Conv1d(64, 64, kernel_size=(1,), stride=(2,)) (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): ResidualBlock( (conv1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (conv2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (layer2): Sequential( (0): ResidualBlock( (conv1): Conv1d(64, 128, kernel_size=(3,), stride=(2,), padding=(1,)) (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (downsample): Sequential( (0): Conv1d(64, 128, kernel_size=(1,), stride=(2,)) (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): ResidualBlock( (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,)) (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (layer3): Sequential( (0): ResidualBlock( (conv1): Conv1d(128, 256, kernel_size=(3,), stride=(2,), padding=(1,)) (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (downsample): Sequential( (0): Conv1d(128, 256, kernel_size=(1,), stride=(2,)) (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): ResidualBlock( (conv1): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (relu): ReLU(inplace=True) (conv2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (global_avg_pooling): AdaptiveAvgPool1d(output_size=1) (fc): Linear(in_features=256, out_features=26, bias=True) )
model_names = ['base_model', 'attention_model', 'bi_directional_model', 'cnn_model', 'cnn_model2', 'deep_residual_cnn_model'] # 'bi_directional_model' refers to complex_model (ComplexLSTMModel, instantiated above with the default bidirectional=False)
# Initialize a list to store accuracy scores
accuracy_scores = []
base_model.to(device)
attention_model.to(device)
complex_model.to(device)
cnn_model.to(device)
cnn_model2.to(device)
deep_residual_cnn_model.to(device)
# Function to evaluate a model and return accuracy
def evaluate_model2(model, X_test, Y_test, class_names):
# Set to evaluation mode
model.eval().to(device)
# Make predictions on the test set
with torch.no_grad():
test_outputs = model(X_test.to(device))
        _, predicted = torch.max(test_outputs, 1)
        predicted = predicted.cpu() # back to the CPU for comparison with Y_test
        accuracy = (predicted == Y_test).sum().item() / Y_test.size(0)
return accuracy
# Evaluate each model and store accuracy scores
for model in [base_model, attention_model, complex_model, cnn_model, cnn_model2, deep_residual_cnn_model]:
accuracy = evaluate_model2(model, X_test, Y_test, abcs)
accuracy_scores.append(accuracy * 100)
# Create a table
table_data = list(zip(model_names, accuracy_scores))
table_headers = ["Model Name", "Test Accuracy %"]
table = tabulate(table_data, headers=table_headers, tablefmt="grid")
# Print the table
print(table)
+-------------------------+-------------------+
| Model Name              |   Test Accuracy % |
+=========================+===================+
| base_model              |            87.234 |
+-------------------------+-------------------+
| attention_model         |           89.3617 |
+-------------------------+-------------------+
| bi_directional_model    |           63.8298 |
+-------------------------+-------------------+
| cnn_model               |            93.617 |
+-------------------------+-------------------+
| cnn_model2              |           76.5957 |
+-------------------------+-------------------+
| deep_residual_cnn_model |           91.4894 |
+-------------------------+-------------------+
Validation¶
Using some videos I collected to get some idea of the generalisability of the models
Load the validation data; for the collection code please see the Video_capture.ipynb file.
validation_features = os.path.join("ISL", "Extracted_features_Validation")
# now load the data from the directories
X, Y = [] , []
for digit in os.listdir(validation_features):
#loop through each digit
for vid in os.listdir(os.path.join(validation_features, digit)):
#print(vid)
fs = np.load(os.path.join(validation_features, digit, vid))
X.append(fs)
Y.append(digit)
# now randomly subsample so that each series is of the same length
desired_length = 60 # is the lowest frame count in the dataset
X_valid = [element[np.random.choice(element.shape[0], desired_length, replace=False)] if element.shape[0] > desired_length else element for element in X]
X_valid = torch.tensor(np.array(X_valid), dtype=torch.float32)
# recode labels to numeric
num_classes = len(set(Y))
abcs=sorted(list(set(Y)))
label_encoder = LabelEncoder()
Y_num = label_encoder.fit_transform(Y)
Y_valid = torch.tensor(Y_num)
print(X_valid.shape)
print(Y_valid.shape)
torch.Size([52, 60, 63])
torch.Size([52])
model_names = ['base_model', 'attention_model', 'bi_directional_model', 'cnn_model', 'cnn_model2', 'deep_residual_cnn_model']
# Initialize a list to store accuracy scores
accuracy_scores = []
base_model.to(device)
attention_model.to(device)
complex_model.to(device)
cnn_model.to(device)
cnn_model2.to(device)
deep_residual_cnn_model.to(device)
# Evaluate each model and store accuracy scores
for model in [base_model, attention_model, complex_model, cnn_model, cnn_model2, deep_residual_cnn_model]:
accuracy = evaluate_model2(model, X_valid, Y_valid, abcs)
accuracy_scores.append(accuracy * 100)
# Create a table
table_data = list(zip(model_names, accuracy_scores))
table_headers = ["Model Name", "Vaildation Accuracy"]
table = tabulate(table_data, headers=table_headers, tablefmt="grid")
# Print the table
print(table)
+-------------------------+-----------------------+
| Model Name              | Validation Accuracy % |
+=========================+=======================+
| base_model              |                    50 |
+-------------------------+-----------------------+
| attention_model         |               59.6154 |
+-------------------------+-----------------------+
| bi_directional_model    |               40.3846 |
+-------------------------+-----------------------+
| cnn_model               |               65.3846 |
+-------------------------+-----------------------+
| cnn_model2              |               53.8462 |
+-------------------------+-----------------------+
| deep_residual_cnn_model |               55.7692 |
+-------------------------+-----------------------+
evaluate_model(cnn_model, X_valid, Y_valid, abcs)
Test Accuracy: 65.38% Classification Report: precision recall f1-score support A 1.00 1.00 1.00 2 B 1.00 1.00 1.00 2 C 0.00 0.00 1.00 2 D 0.00 0.00 1.00 2 E 0.40 1.00 0.57 2 F 1.00 0.00 0.00 2 G 0.50 1.00 0.67 2 H 1.00 1.00 1.00 2 I 1.00 1.00 1.00 2 J 1.00 1.00 1.00 2 K 1.00 1.00 1.00 2 L 1.00 0.00 0.00 2 M 1.00 0.50 0.67 2 N 0.50 1.00 0.67 2 O 1.00 0.50 0.67 2 P 1.00 1.00 1.00 2 Q 1.00 1.00 1.00 2 R 1.00 0.50 0.67 2 S 1.00 1.00 1.00 2 T 1.00 0.00 0.00 2 U 1.00 0.00 0.00 2 V 0.67 1.00 0.80 2 W 1.00 0.00 0.00 2 X 1.00 0.50 0.67 2 Y 1.00 1.00 1.00 2 Z 0.29 1.00 0.44 2 accuracy 0.65 52 macro avg 0.82 0.65 0.69 52 weighted avg 0.82 0.65 0.69 52
evaluate_model(deep_residual_cnn_model, X_valid, Y_valid, abcs)
Test Accuracy: 59.62% Classification Report: precision recall f1-score support A 1.00 1.00 1.00 2 B 0.67 1.00 0.80 2 C 0.00 0.00 1.00 2 D 1.00 0.00 0.00 2 E 1.00 1.00 1.00 2 F 0.67 1.00 0.80 2 G 1.00 0.50 0.67 2 H 0.67 1.00 0.80 2 I 1.00 1.00 1.00 2 J 1.00 0.50 0.67 2 K 1.00 0.50 0.67 2 L 1.00 0.00 0.00 2 M 1.00 0.50 0.67 2 N 0.50 1.00 0.67 2 O 1.00 1.00 1.00 2 P 0.00 0.00 1.00 2 Q 0.40 1.00 0.57 2 R 0.50 0.50 0.50 2 S 1.00 1.00 1.00 2 T 1.00 0.00 0.00 2 U 1.00 0.00 0.00 2 V 1.00 0.00 0.00 2 W 0.00 0.00 1.00 2 X 1.00 1.00 1.00 2 Y 1.00 1.00 1.00 2 Z 0.25 1.00 0.40 2 accuracy 0.60 52 macro avg 0.76 0.60 0.66 52 weighted avg 0.76 0.60 0.66 52
Given the simple CNN model's higher F1 score and overall accuracy, it is chosen to demonstrate the performance of the approach in real time.
Real time detection¶
Load back dependencies
mp_hands = mp.solutions.hands # Instantiate the hands model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities
# create a function to process video
def mediapipe_detection(image, model):
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
image.flags.writeable = False # Image is no longer writeable
results = model.process(image) # Make prediction
image.flags.writeable = True # Image is now writeable
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR CONVERSION RGB 2 BGR
return image, results
# helper function to visualise the landmarks on the video feed
def draw_landmarks(image, results):
if results.multi_hand_landmarks:
for hand_landmarks in results.multi_hand_landmarks:
mp_drawing.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS)
# and a function to extract the results in real time to use for inference
# note: this version reads results.multi_hand_landmarks (normalised image coordinates), whereas the training extraction above used multi_hand_world_landmarks; keeping the two consistent would likely improve real-time accuracy
def extract_keypoints(results):
if results.multi_hand_landmarks:
positions = np.array([[res.x, res.y, res.z] for hand_landmarks in results.multi_hand_landmarks for res in hand_landmarks.landmark]).flatten()
else:
positions = np.zeros(21*3)
return positions
abcs = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
# had to load the class labels to turn outputs into something understandable
#function to make detection easier and move between different models
def hand_gesture_recognition(model, cap):
# 1. New detection variables
sequence = []
predictions = []
    threshold = 0.3 # confidence threshold (currently unused in the loop below)
# Set mediapipe model
with mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5) as hands:
while cap.isOpened():
            # Read feed
            ret, frame = cap.read()
            if not ret:
                break
# Make detections
image, results = mediapipe_detection(frame, hands)
# Draw landmarks
draw_landmarks(image, results)
# 2. Prediction logic
keypoints = extract_keypoints(results)
sequence.append(keypoints)
sequence = sequence[-60:]
                # run a prediction once we have 60 frames recorded
if len(sequence) == 60:
sequence_tensor = torch.Tensor(np.expand_dims(sequence, axis=0)) # create a tensor of the keypoints
#print(sequence_tensor.shape)
model.eval() # Switch to evaluation mode
with torch.no_grad():
                        res = model(sequence_tensor.to(device)) # move the 60-frame window to the same device as the model
_, predicted_class = torch.max(res, 1)
predictions.append(predicted_class.item())
prediction_text = f"Prediction: {abcs[predicted_class.item()]}"
cv2.putText(image, prediction_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
# Show to screen
cv2.imshow('OpenCV Feed', image)
# Break gracefully
if cv2.waitKey(10) & 0xFF == ord('q'):
break
cap.release()
cv2.destroyAllWindows()
Detection code¶
Run any of the following cells for real-time detection; the camera index passed to cv2.VideoCapture depends on your machine (a small probe for a working index is sketched below). Once you are done with detection and the capture window is active, press "q" to quit the program.
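As a purely illustrative helper (not part of the original notebook), the available camera indices can be probed before picking one:
# try the first few camera indices and report which ones open and return a frame
for idx in range(4):
    cap = cv2.VideoCapture(idx)
    ok = cap.isOpened() and cap.read()[0]
    cap.release()
    print(f"camera index {idx}: {'working' if ok else 'not available'}")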
cap = cv2.VideoCapture(2)
hand_gesture_recognition(base_model, cap)
cap = cv2.VideoCapture(0)
hand_gesture_recognition(attention_model, cap)
cap = cv2.VideoCapture(0)
hand_gesture_recognition(complex_model, cap)
cap = cv2.VideoCapture(0)
hand_gesture_recognition(cnn_model, cap)