open source pkg v1

This commit is contained in:
Vijay Yadev
2020-08-04 19:12:31 -04:00
parent bef213dba9
commit c389fc2c47
3708 changed files with 1624220 additions and 1 deletions

View File

@@ -0,0 +1,514 @@
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2017, Carnegie Mellon University and University of Cambridge,
// all rights reserved.
//
// ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
//
// BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT.
// IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
//
// License can be found in OpenFace-license.txt
//
// * Any publications arising from the use of this software, including but
// not limited to academic journal and conference publications, technical
// reports and manuals, must cite at least one of the following works:
//
// OpenFace 2.0: Facial Behavior Analysis Toolkit
// Tadas Baltrušaitis, Amir Zadeh, Yao Chong Lim, and Louis-Philippe Morency
// in IEEE International Conference on Automatic Face and Gesture Recognition, 2018
//
// Convolutional experts constrained local model for facial landmark detection.
// A. Zadeh, T. Baltrušaitis, and Louis-Philippe Morency,
// in Computer Vision and Pattern Recognition Workshops, 2017.
//
// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation
// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling
// in IEEE International. Conference on Computer Vision (ICCV), 2015
//
// Cross-dataset learning and person-specific normalisation for automatic Action Unit detection
// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson
// in Facial Expression Recognition and Analysis Challenge,
// IEEE International Conference on Automatic Face and Gesture Recognition, 2015
//
///////////////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "CCNF_patch_expert.h"
// OpenCV includes
#include <opencv2/core/core.hpp>
#include <opencv2/imgproc.hpp>
// Local includes
#include "LandmarkDetectorUtils.h"
using namespace LandmarkDetector;
// Copy constructors of neuron and patch expert
CCNF_neuron::CCNF_neuron(const CCNF_neuron& other) : weights(other.weights.clone())
{
this->neuron_type = other.neuron_type;
this->norm_weights = other.norm_weights;
this->bias = other.bias;
this->alpha = other.alpha;
for (std::map<int, cv::Mat_<double> >::const_iterator it = other.weights_dfts.begin(); it != other.weights_dfts.end(); it++)
{
// Make sure the matrix is copied.
this->weights_dfts.insert(std::pair<int, cv::Mat>(it->first, it->second.clone()));
}
}
// Copy constructor
CCNF_patch_expert::CCNF_patch_expert(const CCNF_patch_expert& other) : neurons(other.neurons), window_sizes(other.window_sizes), betas(other.betas)
{
this->width = other.width;
this->height = other.height;
this->patch_confidence = other.patch_confidence;
this->weight_matrix = other.weight_matrix.clone();
// Copy the Sigmas in a deep way
for (std::vector<cv::Mat_<float> >::const_iterator it = other.Sigmas.begin(); it != other.Sigmas.end(); it++)
{
// Make sure the matrix is copied.
this->Sigmas.push_back(it->clone());
}
}
// Compute sigmas for all landmarks for a particular view and window size
void CCNF_patch_expert::ComputeSigmas(std::vector<cv::Mat_<float> > sigma_components, int window_size)
{
for(size_t i=0; i < window_sizes.size(); ++i)
{
if( window_sizes[i] == window_size)
return;
}
// Each of the landmarks will have the same connections, hence constant number of sigma components
int n_betas = sigma_components.size();
// calculate the sigmas based on alphas and betas
float sum_alphas = 0;
int n_alphas = this->neurons.size();
// sum the alphas first
for(int a = 0; a < n_alphas; ++a)
{
sum_alphas = sum_alphas + this->neurons[a].alpha;
}
cv::Mat_<float> q1 = sum_alphas * cv::Mat_<float>::eye(window_size*window_size, window_size*window_size);
cv::Mat_<float> q2 = cv::Mat_<float>::zeros(window_size*window_size, window_size*window_size);
for (int b=0; b < n_betas; ++b)
{
q2 = q2 + ((float)this->betas[b]) * sigma_components[b];
}
cv::Mat_<float> SigmaInv = 2 * (q1 + q2);
cv::Mat Sigma_f;
cv::invert(SigmaInv, Sigma_f, cv::DECOMP_CHOLESKY);
window_sizes.push_back(window_size);
Sigmas.push_back(Sigma_f);
}
//===========================================================================
void CCNF_neuron::Read(std::ifstream &stream)
{
// Sanity check
int read_type;
stream.read ((char*)&read_type, 4);
assert(read_type == 2);
stream.read ((char*)&neuron_type, 4);
stream.read ((char*)&norm_weights, 8);
stream.read ((char*)&bias, 8);
stream.read ((char*)&alpha, 8);
LandmarkDetector::ReadMatBin(stream, weights);
}
// Perform im2col, while at the same time doing contrast normalization and adding a bias term
void im2colContrastNormBias(const cv::Mat_<float>& input, const unsigned int width, const unsigned int height, cv::Mat_<float>& output)
{
const unsigned int m = input.rows;
const unsigned int n = input.cols;
// determine how many blocks there will be with a sliding window of width x height in the input
const unsigned int yB = m - height + 1;
const unsigned int xB = n - width + 1;
// Allocate the output size
if (output.rows != xB*yB && output.cols != width * height + 1)
{
output = cv::Mat::ones(xB*yB, width * height + 1, CV_32F);
}
// Iterate over the blocks
unsigned int rowIdx = 0;
for (unsigned int j = 0; j< xB; j++)
{
for (unsigned int i = 0; i< yB; i++)
{
float* Mo = output.ptr<float>(rowIdx);
float sum = 0;
for (unsigned int yy = 0; yy < height; ++yy)
{
const float* Mi = input.ptr<float>(i + yy);
for (unsigned int xx = 0; xx < width; ++xx)
{
unsigned int colIdx = xx*height + yy;
float in = Mi[j + xx];
sum += in;
Mo[colIdx + 1] = in;
}
}
// Working out the mean
float mean = sum / (float)(width * height);
float sum_sq = 0;
const unsigned int num_items = width*height + 1;
// Working out the sum squared and subtracting the mean
for (unsigned int x = 1; x < num_items; ++x)
{
float in = Mo[x] - mean;
Mo[x] = in;
sum_sq += in * in;
}
float norm = sqrt(sum_sq);
// Avoiding division by 0
if (norm == 0)
{
norm = 1;
}
// Flip multiplication to division for speed
norm = 1.0 / norm;
for (unsigned int x = 1; x < num_items; ++x)
{
Mo[x] *= norm;
}
rowIdx++;
}
}
}
//===========================================================================
void CCNF_neuron::Response(const cv::Mat_<float> &im, cv::Mat_<double> &im_dft, cv::Mat &integral_img, cv::Mat &integral_img_sq, cv::Mat_<float> &resp)
{
int h = im.rows - weights.rows + 1;
int w = im.cols - weights.cols + 1;
// the patch area on which we will calculate reponses
cv::Mat_<float> I;
if(neuron_type == 3)
{
// Perform normalisation across whole patch (ignoring the invalid values indicated by <= 0
cv::Scalar mean;
cv::Scalar std;
// ignore missing values
cv::Mat_<uchar> mask = im > 0;
cv::meanStdDev(im, mean, std, mask);
// if all values the same don't divide by 0
if(std[0] != 0)
{
I = (im - mean[0]) / std[0];
}
else
{
I = (im - mean[0]);
}
I.setTo(0, mask == 0);
}
else
{
if(neuron_type == 0)
{
I = im;
}
else
{
printf("ERROR(%s,%d): Unsupported patch type %d!\n", __FILE__,__LINE__,neuron_type);
abort();
}
}
if(resp.empty())
{
resp.create(h, w);
}
// The response from neuron before activation
if(neuron_type == 3)
{
// In case of depth we use per area, rather than per patch normalisation
matchTemplate_m(I, im_dft, integral_img, integral_img_sq, weights, weights_dfts, resp, cv::TM_CCOEFF); // the linear multiplication, efficient calc of response
}
else
{
matchTemplate_m(I, im_dft, integral_img, integral_img_sq, weights, weights_dfts, resp, cv::TM_CCOEFF_NORMED); // the linear multiplication, efficient calc of response
}
cv::MatIterator_<float> p = resp.begin();
cv::MatIterator_<float> q1 = resp.begin(); // respone for each pixel
cv::MatIterator_<float> q2 = resp.end();
// the logistic function (sigmoid) applied to the response
while(q1 != q2)
{
*p++ = (2 * alpha) * 1.0 /(1.0 + exp( -(*q1++ * norm_weights + bias )));
}
}
//===========================================================================
void CCNF_patch_expert::Read(std::ifstream &stream, std::vector<int> window_sizes, std::vector<std::vector<cv::Mat_<float> > > sigma_components)
{
// Sanity check
int read_type;
stream.read ((char*)&read_type, 4);
assert(read_type == 5);
// the number of neurons for this patch
int num_neurons;
stream.read ((char*)&width, 4);
stream.read ((char*)&height, 4);
stream.read ((char*)&num_neurons, 4);
if(num_neurons == 0)
{
// empty patch due to landmark being invisible at that orientation
// read an empty int (due to the way things were written out)
stream.read ((char*)&num_neurons, 4);
return;
}
neurons.resize(num_neurons);
for(int i = 0; i < num_neurons; i++)
neurons[i].Read(stream);
// Combine the neuron weights to one weight matrix for more efficient computation
weight_matrix = cv::Mat_<float>(neurons.size(), 1 + neurons[0].weights.rows * neurons[0].weights.cols);
for (size_t i = 0; i < neurons.size(); i++)
{
cv::Mat_<float> w_tmp = neurons[i].weights.t();
cv::Mat_<float> weights_flat = w_tmp.reshape(1, neurons[i].weights.rows * neurons[i].weights.cols);
weights_flat = weights_flat.t();
// Incorporate neuron weights directly
weights_flat = weights_flat * neurons[i].norm_weights;
weights_flat.copyTo(weight_matrix(cv::Rect(1, i, neurons[i].weights.rows * neurons[i].weights.cols, 1)));
// Incorporate bias as well
weight_matrix.at<float>(i, 0) = neurons[i].bias;
}
// In case we are using OpenBLAS, make sure it is not multi-threading as we are multi-threading outside of it
openblas_set_num_threads(1);
int n_sigmas = window_sizes.size();
int n_betas = 0;
if(n_sigmas > 0)
{
n_betas = sigma_components[0].size();
betas.resize(n_betas);
for (int i=0; i < n_betas; ++i)
{
stream.read ((char*)&betas[i], 8);
}
}
// Read the patch confidence
stream.read ((char*)&patch_confidence, 8);
}
//===========================================================================
void CCNF_patch_expert::Response(const cv::Mat_<float> &area_of_interest, cv::Mat_<float> &response)
{
int response_height = area_of_interest.rows - height + 1;
int response_width = area_of_interest.cols - width + 1;
if(response.rows != response_height || response.cols != response_width)
{
response.create(response_height, response_width);
}
response.setTo(0);
// the placeholder for the DFT of the image, the integral image, and squared integral image so they don't get recalculated for every response
cv::Mat_<double> area_of_interest_dft;
cv::Mat integral_image, integral_image_sq;
cv::Mat_<float> neuron_response;
// responses from the neural layers
for(size_t i = 0; i < neurons.size(); i++)
{
// Do not bother with neuron response if the alpha is tiny and will not contribute much to overall result
if(neurons[i].alpha > 1e-4)
{
neurons[i].Response(area_of_interest, area_of_interest_dft, integral_image, integral_image_sq, neuron_response);
response = response + neuron_response;
}
}
int s_to_use = -1;
// Find the matching sigma
for(size_t i=0; i < window_sizes.size(); ++i)
{
if(window_sizes[i] == response_height)
{
// Found the correct sigma
s_to_use = i;
break;
}
}
cv::Mat_<float> resp_vec_f = response.reshape(1, response_height * response_width);
cv::Mat out = Sigmas[s_to_use] * resp_vec_f;
response = out.reshape(1, response_height);
// Making sure the response does not have negative numbers
double min;
minMaxIdx(response, &min, 0);
if(min < 0)
{
response = response - min;
}
}
void CCNF_patch_expert::ResponseOpenBlas(const cv::Mat_<float> &area_of_interest, cv::Mat_<float> &response, cv::Mat_<float>& im2col_prealloc)
{
int response_height = area_of_interest.rows - height + 1;
int response_width = area_of_interest.cols - width + 1;
if (response.rows != response_height || response.cols != response_width)
{
response.create(response_height, response_width);
}
response.setTo(0);
if (neurons.size() == 0)
{
return;
}
// the placeholder for the column normalized representation of the image, don't get recalculated for every response
im2colContrastNormBias(area_of_interest, neurons[0].weights.cols, neurons[0].weights.rows, im2col_prealloc);
cv::Mat_<float> normalized_input = im2col_prealloc.t();
// the placeholder for the DFT of the image, the integral image, and squared integral image so they don't get recalculated for every response
cv::Mat_<double> area_of_interest_dft;
cv::Mat integral_image, integral_image_sq;
cv::Mat_<float> neuron_response;
int h = area_of_interest.rows - neurons[0].weights.rows + 1;
int w = area_of_interest.cols - neurons[0].weights.cols + 1;
cv::Mat_<float> neuron_resp_full(weight_matrix.rows, normalized_input.cols, 0.0f);
// Perform matrix multiplication in OpenBLAS (fortran call)
float alpha1 = 1.0;
float beta1 = 0.0;
char N[2]; N[0] = 'N';
sgemm_(N, N, &normalized_input.cols, &weight_matrix.rows, &weight_matrix.cols, &alpha1, (float*)normalized_input.data, &normalized_input.cols, (float*)weight_matrix.data, &weight_matrix.cols, &beta1, (float*)neuron_resp_full.data, &normalized_input.cols);
// Above is a faster version of this
//cv::Mat_<float> neuron_resp_full = this->weight_matrix * normalized_input;
for (size_t i = 0; i < neurons.size(); i++)
{
if (neurons[i].alpha > 1e-4)
{
cv::MatIterator_<float> p = response.begin();
cv::Mat_<float> rel_row = neuron_resp_full.row(i);
cv::MatIterator_<float> q1 = rel_row.begin(); // respone for each pixel
cv::MatIterator_<float> q2 = rel_row.end();
// the logistic function (sigmoid) applied to the response
while (q1 != q2)
{
*p++ += (2.0 * neurons[i].alpha) / (1.0 + exp(-*q1++));
}
}
}
response = response.t();
int s_to_use = -1;
// Find the matching sigma
for (size_t i = 0; i < window_sizes.size(); ++i)
{
if (window_sizes[i] == response_height)
{
// Found the correct sigma
s_to_use = i;
break;
}
}
cv::Mat_<float> resp_vec_f = response.reshape(1, response_height * response_width);
cv::Mat_<float> out(Sigmas[s_to_use].rows, resp_vec_f.cols, 0.0f);
// Perform matrix multiplication in OpenBLAS (fortran call)
alpha1 = 1.0;
beta1 = 0.0;
sgemm_(N, N, &resp_vec_f.cols, &Sigmas[s_to_use].rows, &Sigmas[s_to_use].cols, &alpha1, (float*)resp_vec_f.data, &resp_vec_f.cols, (float*)Sigmas[s_to_use].data, &Sigmas[s_to_use].cols, &beta1, (float*)out.data, &resp_vec_f.cols);
// Above is a faster version of this
//cv::Mat out = Sigmas[s_to_use] * resp_vec_f;
response = out.reshape(1, response_height);
// Making sure the response does not have negative numbers
double min;
minMaxIdx(response, &min, 0);
if (min < 0)
{
response = response - min;
}
}

View File

@@ -0,0 +1,617 @@
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2017, Carnegie Mellon University and University of Cambridge,
// all rights reserved.
//
// ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
//
// BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT.
// IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
//
// License can be found in OpenFace-license.txt
//
// * Any publications arising from the use of this software, including but
// not limited to academic journal and conference publications, technical
// reports and manuals, must cite at least one of the following works:
//
// OpenFace 2.0: Facial Behavior Analysis Toolkit
// Tadas Baltrušaitis, Amir Zadeh, Yao Chong Lim, and Louis-Philippe Morency
// in IEEE International Conference on Automatic Face and Gesture Recognition, 2018
//
// Convolutional experts constrained local model for facial landmark detection.
// A. Zadeh, T. Baltrušaitis, and Louis-Philippe Morency,
// in Computer Vision and Pattern Recognition Workshops, 2017.
//
// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation
// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling
// in IEEE International. Conference on Computer Vision (ICCV), 2015
//
// Cross-dataset learning and person-specific normalisation for automatic Action Unit detection
// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson
// in Facial Expression Recognition and Analysis Challenge,
// IEEE International Conference on Automatic Face and Gesture Recognition, 2015
//
///////////////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "CEN_patch_expert.h"
// OpenCV includes
#include <opencv2/core/core.hpp>
#include <opencv2/imgproc.hpp>
// Local includes
#include "LandmarkDetectorUtils.h"
// For exponential
#include <math.h>
using namespace LandmarkDetector;
// Copy constructor (do not perform a deep copy of data as it is very large, also there is no real need to stor the copies
CEN_patch_expert::CEN_patch_expert(const CEN_patch_expert& other) : confidence(other.confidence), width_support(other.width_support), height_support(other.height_support)
{
// Copy the layer weights in a deep way
for (size_t i = 0; i < other.weights.size(); ++i)
{
this->weights.push_back(other.weights[i]);
this->biases.push_back(other.biases[i]);
this->activation_function.push_back(other.activation_function[i]);
}
}
//===========================================================================
void CEN_patch_expert::Read(std::ifstream &stream)
{
// Setting up OpenBLAS
openblas_set_num_threads(1);
// Sanity check
int read_type;
stream.read((char*)&read_type, 4);
assert(read_type == 6);
// the number of neurons for this patch
int num_layers;
stream.read((char*)&width_support, 4);
stream.read((char*)&height_support, 4);
stream.read((char*)&num_layers, 4);
if (num_layers == 0)
{
// empty patch due to landmark being invisible at that orientation (or visible through mirroring)
stream.read((char*)&confidence, 8);
return;
}
activation_function.resize(num_layers);
weights.resize(num_layers);
biases.resize(num_layers);
for (int i = 0; i < num_layers; i++)
{
int neuron_type;
stream.read((char*)&neuron_type, 4);
activation_function[i] = neuron_type;
cv::Mat_<double> bias;
LandmarkDetector::ReadMatBin(stream, bias);
cv::Mat_<double> weight;
LandmarkDetector::ReadMatBin(stream, weight);
weights[i] = weight;
biases[i] = bias;
}
// Read the patch confidence
stream.read((char*)&confidence, 8);
}
// Contrast normalize the input for response map computation
void contrastNorm(const cv::Mat_<float>& input, cv::Mat_<float>& output)
{
const unsigned int num_cols = input.cols;
const unsigned int num_rows = input.rows;
output = input.clone();
cv::MatConstIterator_<float> p = input.begin();
// Compute row wise
for (unsigned int y = 0; y < num_rows; ++y)
{
cv::Scalar mean_s = cv::mean(input(cv::Rect(1,y,num_cols-1, 1)));
float mean = (float)mean_s[0];
p++;
float sum_sq = 0;
for (unsigned int x = 1; x < num_cols; ++x)
{
float curr = *p++;
sum_sq += (curr - mean) * (curr - mean);
}
float norm = sqrt(sum_sq);
if (norm == 0)
norm = 1;
for (unsigned int x = 1; x < num_cols; ++x)
{
output.at<float>(y, x) = (output.at<float>(y, x) - mean) / norm;
}
}
}
void im2colBias(const cv::Mat_<float>& input, const unsigned int width, const unsigned int height, cv::Mat_<float>& output)
{
const unsigned int m = input.rows;
const unsigned int n = input.cols;
// determine how many blocks there will be with a sliding window of width x height in the input
const unsigned int yB = m - height + 1;
const unsigned int xB = n - width + 1;
// Allocate the output size
if(output.rows != xB*yB && output.cols != width * height + 1)
{
output = cv::Mat::ones(xB*yB, width * height + 1, CV_32F);
}
// Iterate over the blocks
for (unsigned int j = 0; j< xB; j++)
{
for (unsigned int i = 0; i< yB; i++)
{
unsigned int rowIdx = i + j*yB;
for (unsigned int yy = 0; yy < height; ++yy)
for (unsigned int xx = 0; xx < width; ++xx)
{
unsigned int colIdx = xx*height + yy;
output.at<float>(rowIdx, colIdx + 1) = input.at<float>(i + yy, j + xx);
}
}
}
}
//===========================================================================
void CEN_patch_expert::Response(const cv::Mat_<float> &area_of_interest, cv::Mat_<float> &response)
{
int response_height = area_of_interest.rows - height_support + 1;
int response_width = area_of_interest.cols - width_support + 1;
cv::Mat_<float> input_col;
im2colBias(area_of_interest, width_support, height_support, input_col);
// Mean and standard deviation normalization
contrastNorm(input_col, response);
response = response.t();
for (size_t layer = 0; layer < activation_function.size(); ++layer)
{
// We are performing response = weights[layers] * response(t), but in OpenBLAS as that is significantly quicker than OpenCV
cv::Mat_<float> resp = response;
float* m1 = (float*)resp.data;
cv::Mat_<float> weight = weights[layer];
float* m2 = (float*)weight.data;
cv::Mat_<float> resp_blas(weight.rows, resp.cols);
float* m3 = (float*)resp_blas.data;
// Perform matrix multiplication in OpenBLAS (fortran call)
float alpha1 = 1.0;
float beta1 = 0.0;
char N[2]; N[0] = 'N';
sgemm_(N, N, &resp.cols, &weight.rows, &weight.cols, &alpha1, m1, &resp.cols, m2, &weight.cols, &beta1, m3, &resp.cols);
// The above is a faster version of this, by calling the fortran version directly
//cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, resp.cols, weight.rows, weight.cols, 1, m1, resp.cols, m2, weight.cols, 0.0, m3, resp.cols);
// Adding the bias (bit ugly, but the fastest way to do this)
response = resp_blas;
float* data = (float*)response.data;
size_t height = response.rows;
size_t width = response.cols;
float* data_b = (float*)biases[layer].data;
for (size_t y = 0; y < height; ++y)
{
float bias = data_b[y];
for (size_t x = 0; x < width; ++x)
{
float in = *data + bias;
*data++ = in;
}
}
// Perform activation and add bias at the same time
if (activation_function[layer] == 0) // Sigmoid
{
size_t resp_size = response.rows * response.cols;
// Iterate over the data directly
float* data = (float*)response.data;
for (size_t counter = 0; counter < resp_size; ++counter)
{
float in = *data;
*data++ = 1.0 / (1.0 + exp(-(in)));
}
}
else if (activation_function[layer] == 2)// ReLU
{
cv::threshold(response, response, 0, 0, cv::THRESH_TOZERO);
}
}
response = response.t();
response = response.reshape(1, response_height);
response = response.t();
}
// Perform im2col, while at the same time doing contrast normalization and adding a bias term (also skip every other region)
void im2colBiasSparseContrastNorm(const cv::Mat_<float>& input, const unsigned int width, const unsigned int height, cv::Mat_<float>& output)
{
const unsigned int m = input.rows;
const unsigned int n = input.cols;
// determine how many blocks there will be with a sliding window of width x height in the input
const unsigned int yB = m - height + 1;
const unsigned int xB = n - width + 1;
// As we will be skipping half of the outputs
const unsigned int out_size = (yB*xB - 1) / 2;
// Allocate the output size
if (output.rows != out_size && output.cols != width * height + 1)
{
output = cv::Mat::ones(out_size, width * height + 1, CV_32F);
}
// Iterate over the blocks, skipping every second block
unsigned int rowIdx = 0;
unsigned int skipCounter = 0;
for (unsigned int j = 0; j< xB; j++)
{
for (unsigned int i = 0; i< yB; i++)
{
// Skip every second row
skipCounter++;
if ((skipCounter + 1) % 2 == 0)
{
continue;
}
float* Mo = output.ptr<float>(rowIdx);
float sum = 0;
for (unsigned int yy = 0; yy < height; ++yy)
{
const float* Mi = input.ptr<float>(i + yy);
for (unsigned int xx = 0; xx < width; ++xx)
{
int colIdx = xx*height + yy;
float in = Mi[j + xx];
sum += in;
Mo[colIdx+1] = in;
}
}
// Working out the mean
float mean = sum / (float)(width * height);
float sum_sq = 0;
const unsigned int num_items = width*height + 1;
// Working out the sum squared and subtracting the mean
for (unsigned int x = 1; x < num_items; ++x)
{
float in = Mo[x] - mean;
Mo[x] = in;
sum_sq += in * in;
}
float norm = sqrt(sum_sq);
// Avoiding division by 0
if (norm == 0)
{
norm = 1;
}
// Flip multiplication to division for speed
norm = 1.0 / norm;
for (unsigned int x = 1; x < num_items; ++x)
{
Mo[x] *= norm;
}
rowIdx++;
}
}
}
void im2colBiasSparse(const cv::Mat_<float>& input, const unsigned int width, const unsigned int height, cv::Mat_<float>& output)
{
const unsigned int m = input.rows;
const unsigned int n = input.cols;
// determine how many blocks there will be with a sliding window of width x height in the input
const unsigned int yB = m - height + 1;
const unsigned int xB = n - width + 1;
// As we will be skipping half of the outputs
const unsigned int out_size = (yB*xB - 1) / 2;
// Allocate the output size
if (output.rows != out_size && output.cols != width * height + 1)
{
output = cv::Mat::ones(out_size, width * height + 1, CV_32F);
}
// Iterate over the blocks, skipping every second block
unsigned int rowIdx = 0;
unsigned int skipCounter = 0;
for (unsigned int j = 0; j< xB; j++)
{
for (unsigned int i = 0; i< yB; i++)
{
// Skip every second row
skipCounter++;
if ((skipCounter + 1) % 2 == 0)
{
continue;
}
for (unsigned int yy = 0; yy < height; ++yy)
{
for (unsigned int xx = 0; xx < width; ++xx)
{
unsigned int colIdx = xx*height + yy;
output.at<float>(rowIdx, colIdx + 1) = input.at<float>(i + yy, j + xx);
}
}
rowIdx++;
}
}
}
// As the sparse patch expert output with interpolation, this function creates an interpolation matrix
void LandmarkDetector::interpolationMatrix(cv::Mat_<float>& mapMatrix, int response_height, int response_width,
int input_width, int input_height)
{
int m = input_height;
int n = input_width;
// determine how many blocks there will be with a sliding window of width x height in the input
int yB = m - 11 + 1;
int xB = n - 11 + 1;
// As we will be skipping half of the outputs
int out_size = (yB*xB - 1) / 2;
mapMatrix.create(out_size, response_height * response_width);
mapMatrix.setTo(0.0f);
// Find a mapping from indices in the computed sparse response and the original full response
cv::Mat_<int> value_id_matrix(response_width, response_height, 0);
int ind = 0;
for (int k = 0; k < value_id_matrix.rows * value_id_matrix.cols; ++k)
{
if (k % 2 != 0)
{
value_id_matrix.at<int>(k) = ind;
ind++;
}
}
value_id_matrix = value_id_matrix.t();
int skip_counter = 0;
for (int x = 0; x < response_width; ++x)
{
for (int y = 0; y < response_height; ++y)
{
int mapping_col = x * response_height + y;
skip_counter++;
if (skip_counter % 2 == 0)
{
int val_id = value_id_matrix.at<int>(y, x);
mapMatrix.at<float>(val_id, mapping_col) = 1;
continue;
}
float num_neigh = 0.0;
std::vector<int> val_ids;
if (x - 1 >= 0)
{
num_neigh++;
val_ids.push_back(value_id_matrix.at<int>(y, x - 1));
}
if (y - 1 >= 0)
{
num_neigh++;
val_ids.push_back(value_id_matrix.at<int>(y - 1, x));
}
if (x + 1 < response_width)
{
num_neigh++;
val_ids.push_back(value_id_matrix.at<int>(y, x + 1));
}
if (y + 1 < response_height)
{
num_neigh++;
val_ids.push_back(value_id_matrix.at<int>(y + 1, x));
}
for (size_t k = 0; k < val_ids.size(); ++k)
{
mapMatrix.at<float>(val_ids[k], mapping_col) = 1.0 / num_neigh;
}
}
}
}
void CEN_patch_expert::ResponseInternal(cv::Mat_<float>& response)
{
for (size_t layer = 0; layer < activation_function.size(); ++layer)
{
// We are performing response = weights[layers] * response, but in OpenBLAS as that is significantly quicker than OpenCV
cv::Mat_<float> resp = response;
float* m1 = (float*)resp.data;
float* m2 = (float*)weights[layer].data;
cv::Mat_<float> resp_blas(weights[layer].rows, resp.cols);
float* m3 = (float*)resp_blas.data;
// Perform matrix multiplication in OpenBLAS (fortran call)
float alpha1 = 1.0;
float beta1 = 0.0;
char N[2]; N[0] = 'N';
sgemm_(N, N, &resp.cols, &weights[layer].rows, &weights[layer].cols, &alpha1, m1, &resp.cols, m2, &weights[layer].cols, &beta1, m3, &resp.cols);
// The above is a faster version of this, by calling the fortran version directly
//cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, resp.cols, weight.rows, weight.cols, 1, m1, resp.cols, m2, weight.cols, 0.0, m3, resp.cols);
response = resp_blas;
// Alternative is to multiply the responses directly using OpenCV (much slower)
//response = weights[layer] * response;
// Adding the bias (bit ugly, but the fastest way to do this), TODO can this bias be incorporated in the above?
float* data = (float*)response.data;
const unsigned height = response.rows;
const unsigned width = response.cols;
float* data_b = (float*)biases[layer].data;
for (unsigned int y = 0; y < height; ++y)
{
float bias = data_b[y];
for (unsigned int x = 0; x < width; ++x)
{
float in = *data + bias;
*data++ = in;
}
}
// Perform activation and add bias at the same time
if (activation_function[layer] == 0) // Sigmoid
{
const unsigned int resp_size = response.rows * response.cols;
// Iterate over the data directly
float* data = (float*)response.data;
for (unsigned int counter = 0; counter < resp_size; ++counter)
{
float in = *data;
*data++ = 1.0f / (1.0f + exp(-(in)));
}
}
else if (activation_function[layer] == 2)// ReLU
{
cv::threshold(response, response, 0, 0, cv::THRESH_TOZERO);
}
}
}
//===========================================================================
void CEN_patch_expert::ResponseSparse(const cv::Mat_<float> &area_of_interest_left, const cv::Mat_<float> &area_of_interest_right, cv::Mat_<float> &response_left, cv::Mat_<float> &response_right, cv::Mat_<float>& mapMatrix, cv::Mat_<float>& im2col_prealloc_left, cv::Mat_<float>& im2col_prealloc_right)
{
unsigned int response_height = 0;
const bool left_provided = !area_of_interest_left.empty();
const bool right_provided = !area_of_interest_right.empty();
if(right_provided)
{
cv::flip(area_of_interest_right, area_of_interest_right, 1);
response_height = area_of_interest_right.rows - height_support + 1;
im2colBiasSparseContrastNorm(area_of_interest_right, width_support, height_support, im2col_prealloc_right);
}
// Extract im2col but in a sparse way and contrast normalize
if(left_provided)
{
response_height = area_of_interest_left.rows - height_support + 1;
im2colBiasSparseContrastNorm(area_of_interest_left, width_support, height_support, im2col_prealloc_left);
}
cv::Mat_<float> response;
if(right_provided && left_provided)
{
cv::vconcat(im2col_prealloc_left, im2col_prealloc_right, response);
response = response.t();
}
else if (left_provided)
{
response = im2col_prealloc_left.t();
}
else if (right_provided)
{
response = im2col_prealloc_right.t();
}
ResponseInternal(response);
if(left_provided && right_provided)
{
response_left = response(cv::Rect(0, 0, response.cols / 2, 1));
response_right = response(cv::Rect(response.cols / 2, 0, response.cols / 2, 1));
}
else if (left_provided)
{
response_left = response;
}
else if (right_provided)
{
response_right = response;
}
if(left_provided)
{
// TODO This could and should be gemm'ed
response_left = response_left * mapMatrix;
response_left = response_left.t();
response_left = response_left.reshape(1, response_height);
response_left = response_left.t();
}
if(right_provided)
{
// TODO This could and should be gemm'ed
response_right = response_right * mapMatrix;
response_right = response_right.t();
response_right = response_right.reshape(1, response_height);
response_right = response_right.t();
cv::flip(response_right, response_right, 1);
}
}

View File

@@ -0,0 +1,543 @@
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2017, Tadas Baltrusaitis, all rights reserved.
//
// ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
//
// BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT.
// IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
//
// License can be found in OpenFace-license.txt
//
// * Any publications arising from the use of this software, including but
// not limited to academic journal and conference publications, technical
// reports and manuals, must cite at least one of the following works:
//
// OpenFace 2.0: Facial Behavior Analysis Toolkit
// Tadas Baltrušaitis, Amir Zadeh, Yao Chong Lim, and Louis-Philippe Morency
// in IEEE International Conference on Automatic Face and Gesture Recognition, 2018
//
// Convolutional experts constrained local model for facial landmark detection.
// A. Zadeh, T. Baltrušaitis, and Louis-Philippe Morency,
// in Computer Vision and Pattern Recognition Workshops, 2017.
//
// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation
// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling
// in IEEE International. Conference on Computer Vision (ICCV), 2015
//
// Cross-dataset learning and person-specific normalisation for automatic Action Unit detection
// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson
// in Facial Expression Recognition and Analysis Challenge,
// IEEE International Conference on Automatic Face and Gesture Recognition, 2015
//
///////////////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "CNN_utils.h"
namespace LandmarkDetector
{
// Parametric ReLU with leaky weights (separate ones per channel)
void PReLU(std::vector<cv::Mat_<float> >& input_output_maps, cv::Mat_<float> prelu_weights)
{
if (input_output_maps.size() > 1)
{
unsigned int h = input_output_maps[0].rows;
unsigned int w = input_output_maps[0].cols;
unsigned int size_in = h * w;
for (int k = 0; k < (int) input_output_maps.size(); ++k)
{
// Apply the PReLU
auto iter = input_output_maps[k].begin();
float neg_mult = prelu_weights.at<float>(k);
for (unsigned int i = 0; i < size_in; ++i)
{
float in_val = *iter;
// The prelu step
*iter++ = in_val >= 0 ? in_val : in_val * neg_mult;
}
}
}
else
{
int w = input_output_maps[0].cols;
for (int k = 0; k < prelu_weights.rows; ++k)
{
auto iter = input_output_maps[0].row(k).begin();
float neg_mult = prelu_weights.at<float>(k);
for (int i = 0; i < w; ++i)
{
float in_val = *iter;
// Apply the PReLU
*iter = in_val >= 0 ? in_val : in_val * neg_mult;
// To deal with OpenCV 3.4s debug mode not allowing to go over iteration boundaries
if(i + 1 < w)
{
iter++;
}
}
}
}
}
void fully_connected(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, cv::Mat_<float> weights, cv::Mat_<float> biases)
{
outputs.clear();
if (input_maps.size() > 1)
{
// Concatenate all the maps
cv::Size orig_size = input_maps[0].size();
cv::Mat_<float> input_concat((int)input_maps.size(), input_maps[0].cols * input_maps[0].rows);
for (int in = 0; in < (int)input_maps.size(); ++in)
{
cv::Mat_<float> add = input_maps[in];
// Reshape if all of the data will be flattened
if (input_concat.rows != weights.cols)
{
add = add.t();
}
add = add.reshape(0, 1);
add.copyTo(input_concat.row(in));
}
// Treat the input as separate feature maps
if (input_concat.rows == weights.cols)
{
input_concat = weights * input_concat;
// Add biases
for (int k = 0; k < biases.rows; ++k)
{
input_concat.row(k) = input_concat.row(k) + biases.at<float>(k);
}
outputs.clear();
// Resize and add as output
for (int k = 0; k < biases.rows; ++k)
{
cv::Mat_<float> reshaped = input_concat.row(k).clone();
reshaped = reshaped.reshape(1, orig_size.height);
outputs.push_back(reshaped);
}
}
else
{
// Flatten the input
input_concat = input_concat.reshape(0, input_concat.rows * input_concat.cols);
input_concat = weights * input_concat + biases;
outputs.clear();
outputs.push_back(input_concat);
}
}
else
{
cv::Mat out = weights * input_maps[0] + biases;
outputs.clear();
outputs.push_back(out.t());
}
}
void max_pooling(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, int stride_x, int stride_y, int kernel_size_x, int kernel_size_y)
{
std::vector<cv::Mat_<float> > outputs_sub;
// Iterate over kernel height and width, based on stride
for (size_t in = 0; in < input_maps.size(); ++in)
{
// Help with rounding up a bit, to match caffe style output
int out_x = (int)round((float)(input_maps[in].cols - kernel_size_x) / (float)stride_x) + 1;
int out_y = (int)round((float)(input_maps[in].rows - kernel_size_y) / (float)stride_y) + 1;
cv::Mat_<float> sub_out(out_y, out_x, 0.0);
cv::Mat_<float> in_map = input_maps[in];
for (int x = 0; x < input_maps[in].cols; x += stride_x)
{
int max_x = cv::min(input_maps[in].cols, x + kernel_size_x);
int x_in_out = int(x / stride_x);
if (x_in_out >= out_x)
continue;
for (int y = 0; y < input_maps[in].rows; y += stride_y)
{
int y_in_out = int(y / stride_y);
if (y_in_out >= out_y)
continue;
int max_y = cv::min(input_maps[in].rows, y + kernel_size_y);
float curr_max = -FLT_MAX;
for (int x_in = x; x_in < max_x; ++x_in)
{
for (int y_in = y; y_in < max_y; ++y_in)
{
float curr_val = in_map.at<float>(y_in, x_in);
if (curr_val > curr_max)
{
curr_max = curr_val;
}
}
}
sub_out.at<float>(y_in_out, x_in_out) = curr_max;
}
}
outputs_sub.push_back(sub_out);
}
outputs = outputs_sub;
}
void convolution_single_kern_fft(const std::vector<cv::Mat_<float> >& input_imgs, std::vector<cv::Mat_<double> >& img_dfts,
const std::vector<cv::Mat_<float> >& _templs, std::map<int, std::vector<cv::Mat_<double> > >& _templ_dfts, cv::Mat_<float>& result)
{
// Assume result is defined properly
if (result.empty())
{
cv::Size corrSize(input_imgs[0].cols - _templs[0].cols + 1, input_imgs[0].rows - _templs[0].rows + 1);
result.create(corrSize);
}
// Our model will always be under min block size so can ignore this
//const double blockScale = 4.5;
//const int minBlockSize = 256;
int maxDepth = CV_64F;
cv::Size dftsize;
dftsize.width = cv::getOptimalDFTSize(result.cols + _templs[0].cols - 1);
dftsize.height = cv::getOptimalDFTSize(result.rows + _templs[0].rows - 1);
// Compute block size
cv::Size blocksize;
blocksize.width = dftsize.width - _templs[0].cols + 1;
blocksize.width = MIN(blocksize.width, result.cols);
blocksize.height = dftsize.height - _templs[0].rows + 1;
blocksize.height = MIN(blocksize.height, result.rows);
std::vector<cv::Mat_<double>> dftTempl;
// if this has not been precomputed, precompute it, otherwise use it
if (_templ_dfts.find(dftsize.width) == _templ_dfts.end())
{
dftTempl.resize(_templs.size());
for (size_t k = 0; k < _templs.size(); ++k)
{
dftTempl[k].create(dftsize.height, dftsize.width);
cv::Mat_<float> src = _templs[k];
cv::Mat_<double> dst(dftTempl[k], cv::Rect(0, 0, dftsize.width, dftsize.height));
cv::Mat_<double> dst1(dftTempl[k], cv::Rect(0, 0, _templs[k].cols, _templs[k].rows));
if (dst1.data != src.data)
src.convertTo(dst1, dst1.depth());
if (dst.cols > _templs[k].cols)
{
cv::Mat_<double> part(dst, cv::Range(0, _templs[k].rows), cv::Range(_templs[k].cols, dst.cols));
part.setTo(0);
}
// Perform DFT of the template
dft(dst, dst, 0, _templs[k].rows);
}
_templ_dfts[dftsize.width] = dftTempl;
}
else
{
dftTempl = _templ_dfts[dftsize.width];
}
cv::Size bsz(std::min(blocksize.width, result.cols), std::min(blocksize.height, result.rows));
cv::Mat src;
cv::Mat cdst(result, cv::Rect(0, 0, bsz.width, bsz.height));
std::vector<cv::Mat_<double> > dftImgs;
dftImgs.resize(input_imgs.size());
if (img_dfts.empty())
{
for (size_t k = 0; k < input_imgs.size(); ++k)
{
dftImgs[k].create(dftsize);
dftImgs[k].setTo(0.0);
cv::Size dsz(bsz.width + _templs[k].cols - 1, bsz.height + _templs[k].rows - 1);
int x2 = std::min(input_imgs[k].cols, dsz.width);
int y2 = std::min(input_imgs[k].rows, dsz.height);
cv::Mat src0(input_imgs[k], cv::Range(0, y2), cv::Range(0, x2));
cv::Mat dst(dftImgs[k], cv::Rect(0, 0, dsz.width, dsz.height));
cv::Mat dst1(dftImgs[k], cv::Rect(0, 0, x2, y2));
src = src0;
if (dst1.data != src.data)
src.convertTo(dst1, dst1.depth());
dft(dftImgs[k], dftImgs[k], 0, dsz.height);
img_dfts.push_back(dftImgs[k].clone());
}
}
cv::Mat_<double> dft_img(img_dfts[0].rows, img_dfts[0].cols, 0.0);
for (size_t k = 0; k < input_imgs.size(); ++k)
{
cv::Mat dftTempl1(dftTempl[k], cv::Rect(0, 0, dftsize.width, dftsize.height));
if (k == 0)
{
cv::mulSpectrums(img_dfts[k], dftTempl1, dft_img, 0, true);
}
else
{
cv::mulSpectrums(img_dfts[k], dftTempl1, dftImgs[k], 0, true);
dft_img = dft_img + dftImgs[k];
}
}
cv::dft(dft_img, dft_img, cv::DFT_INVERSE + cv::DFT_SCALE, bsz.height);
src = dft_img(cv::Rect(0, 0, bsz.width, bsz.height));
src.convertTo(cdst, CV_32F);
}
void convolution_fft2(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps,
const std::vector<std::vector<cv::Mat_<float> > >& kernels, const std::vector<float >& biases,
std::vector<std::map<int, std::vector<cv::Mat_<double> > > >& precomp_dfts)
{
outputs.clear();
// Useful precomputed data placeholders for quick correlation (convolution)
std::vector<cv::Mat_<double> > input_image_dft;
for (size_t k = 0; k < kernels.size(); ++k)
{
// The convolution (with precomputation)
cv::Mat_<float> output;
convolution_single_kern_fft(input_maps, input_image_dft, kernels[k], precomp_dfts[k], output);
// Combining the maps
outputs.push_back(output + biases[k]);
}
}
void im2col_t(const cv::Mat_<float>& input, const unsigned int width, const unsigned int height, cv::Mat_<float>& output)
{
const unsigned int m = input.cols;
const unsigned int n = input.rows;
// determine how many blocks there will be with a sliding window of width x height in the input
const unsigned int yB = m - height + 1;
const unsigned int xB = n - width + 1;
// Allocate the output size
if (output.rows != width * height || output.cols != xB*yB)
{
output = cv::Mat::ones(width * height, xB*yB, CV_32F);
}
// Iterate over the whole image
for (unsigned int i = 0; i< yB; i++)
{
unsigned int rowIdx = i;
for (unsigned int j = 0; j< xB; j++)
{
//int rowIdx = i; +j*yB;
// iterate over the blocks within the image
for (unsigned int yy = 0; yy < height; ++yy)
{
// Faster iteration over the image
const float* Mi = input.ptr<float>(j + yy);
for (unsigned int xx = 0; xx < width; ++xx)
{
unsigned int colIdx = xx*height + yy;
output.at<float>(colIdx, rowIdx) = Mi[i + xx];
}
}
rowIdx += yB;
}
}
}
void im2col(const cv::Mat_<float>& input, const unsigned int width, const unsigned int height, cv::Mat_<float>& output)
{
const unsigned int m = input.rows;
const unsigned int n = input.cols;
// determine how many blocks there will be with a sliding window of width x height in the input
const unsigned int yB = m - height + 1;
const unsigned int xB = n - width + 1;
// Allocate the output size
if (output.cols != width * height || output.rows != xB*yB)
{
output = cv::Mat::ones(xB*yB, width * height, CV_32F);
}
// Iterate over the whole image
for (unsigned int i = 0; i< yB; i++)
{
unsigned int rowIdx = i*xB;
for (unsigned int j = 0; j< xB; j++)
{
float* Mo = output.ptr<float>(rowIdx);
// iterate over the blocks within the image
for (unsigned int yy = 0; yy < height; ++yy)
{
// Faster iteration over the image
const float* Mi = input.ptr<float>(i + yy);
for (unsigned int xx = 0; xx < width; ++xx)
{
unsigned int colIdx = xx*height + yy;
//output.at<float>(rowIdx, colIdx) = Mi[j + xx]; //input.at<float>(i + yy, j + xx);
Mo[colIdx] = Mi[j + xx];
}
}
rowIdx++;
}
}
}
void im2col_multimap(const std::vector<cv::Mat_<float> >& inputs, const unsigned int width, const unsigned int height,
cv::Mat_<float>& output)
{
const unsigned int m = inputs[0].rows;
const unsigned int n = inputs[0].cols;
// determine how many blocks there will be with a sliding window of width x height in the input
const unsigned int yB = m - height + 1;
const unsigned int xB = n - width + 1;
int stride = height * width;
unsigned int num_maps = (unsigned int)inputs.size();
// Allocate the output size
if (output.cols != width * height * inputs.size() + 1 || (unsigned int) output.rows < xB*yB)
{
output = cv::Mat::ones(xB*yB, width * height * num_maps + 1, CV_32F);
}
// Iterate over the whole image
for (unsigned int i = 0; i< yB; i++)
{
unsigned int rowIdx = i*xB;
for (unsigned int j = 0; j< xB; j++)
{
float* Mo = output.ptr<float>(rowIdx);
// TODO, this should be rearranged and done through mem-copy
// iterate over the blocks within the image
for (unsigned int yy = 0; yy < height; ++yy)
{
for (unsigned int in_maps = 0; in_maps < num_maps; ++in_maps)
{
// Faster iteration over the image
const float* Mi = inputs[in_maps].ptr<float>(i + yy);
for (unsigned int xx = 0; xx < width; ++xx)
{
unsigned int colIdx = xx*height + yy + in_maps * stride;
//output.at<float>(rowIdx, colIdx) = Mi[j + xx]; //input.at<float>(i + yy, j + xx);
Mo[colIdx] = Mi[j + xx];
}
}
}
rowIdx++;
}
}
}
// A fast convolution implementation, can provide a pre-allocated im2col as well, if empty, it is created
void convolution_direct_blas(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, const cv::Mat_<float>& weight_matrix, int height_k, int width_k, cv::Mat_<float>& pre_alloc_im2col)
{
outputs.clear();
int height_in = input_maps[0].rows;
int width_n = input_maps[0].cols;
// determine how many blocks there will be with a sliding window of width x height in the input
int yB = height_in - height_k + 1;
int xB = width_n - width_k + 1;
int num_rows = yB * xB;
// Instead of re-allocating data use the first rows of already allocated data and re-allocate only if not enough rows are present, this is what makes this non thread safe, as same memory would be used
im2col_multimap(input_maps, width_k, height_k, pre_alloc_im2col);
float* m1 = (float*)pre_alloc_im2col.data;
float* m2 = (float*)weight_matrix.data;
int m2_cols = weight_matrix.cols;
int m2_rows = weight_matrix.rows;
cv::Mat_<float> out(num_rows, weight_matrix.cols, 1.0);
float* m3 = (float*)out.data;
//cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, weight_t.cols, yB * xB, pre_alloc_im2col.cols, 1, m2, weight_t.cols, m1, pre_alloc_im2col.cols, 0.0, m3, weight_t.cols);
float alpha = 1.0f;
float beta = 0.0f;
// Call fortran directly (faster)
char N[2]; N[0] = 'N';
sgemm_(N, N, &m2_cols, &num_rows, &pre_alloc_im2col.cols, &alpha, m2, &m2_cols, m1, &pre_alloc_im2col.cols, &beta, m3, &m2_cols);
// Above is equivalent to out = pre_alloc_im2col * weight_matrix;
out = out.t();
// Move back to vectors and reshape accordingly
for (int k = 0; k < out.rows; ++k)
{
outputs.push_back(out.row(k).reshape(1, yB));
}
}
}

View File

@@ -0,0 +1,893 @@
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2017, Carnegie Mellon University and University of Cambridge,
// all rights reserved.
//
// ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
//
// BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT.
// IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
//
// License can be found in OpenFace-license.txt
//
// * Any publications arising from the use of this software, including but
// not limited to academic journal and conference publications, technical
// reports and manuals, must cite at least one of the following works:
//
// OpenFace 2.0: Facial Behavior Analysis Toolkit
// Tadas Baltrušaitis, Amir Zadeh, Yao Chong Lim, and Louis-Philippe Morency
// in IEEE International Conference on Automatic Face and Gesture Recognition, 2018
//
// Convolutional experts constrained local model for facial landmark detection.
// A. Zadeh, T. Baltrušaitis, and Louis-Philippe Morency,
// in Computer Vision and Pattern Recognition Workshops, 2017.
//
// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation
// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling
// in IEEE International. Conference on Computer Vision (ICCV), 2015
//
// Cross-dataset learning and person-specific normalisation for automatic Action Unit detection
// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson
// in Facial Expression Recognition and Analysis Challenge,
// IEEE International Conference on Automatic Face and Gesture Recognition, 2015
//
///////////////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "FaceDetectorMTCNN.h"
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
#include "LandmarkDetectorUtils.h"
// CNN includes
#include "CNN_utils.h"
// Instead of including cblas.h (the definitions from OpenBLAS and other BLAS libraries differ, declare the required OpenBLAS functionality here)
#ifdef __cplusplus
extern "C" {
/* Assume C declarations for C++ */
#endif /* __cplusplus */
/*Set the number of threads on runtime.*/
void openblas_set_num_threads(int num_threads);
}
using namespace LandmarkDetector;
// Constructor from model file location
FaceDetectorMTCNN::FaceDetectorMTCNN(const std::string& location)
{
this->Read(location);
}
// Copy constructor
FaceDetectorMTCNN::FaceDetectorMTCNN(const FaceDetectorMTCNN& other) : PNet(other.PNet), RNet(other.RNet), ONet(other.ONet)
{
}
CNN::CNN(const CNN& other) : cnn_layer_types(other.cnn_layer_types), cnn_max_pooling_layers(other.cnn_max_pooling_layers), cnn_convolutional_layers_bias(other.cnn_convolutional_layers_bias), conv_layer_pre_alloc_im2col(other.conv_layer_pre_alloc_im2col)
{
this->cnn_convolutional_layers_weights.resize(other.cnn_convolutional_layers_weights.size());
for (size_t l = 0; l < other.cnn_convolutional_layers_weights.size(); ++l)
{
// Make sure the matrix is copied.
this->cnn_convolutional_layers_weights[l] = other.cnn_convolutional_layers_weights[l].clone();
}
this->cnn_convolutional_layers.resize(other.cnn_convolutional_layers.size());
for (size_t l = 0; l < other.cnn_convolutional_layers.size(); ++l)
{
this->cnn_convolutional_layers[l].resize(other.cnn_convolutional_layers[l].size());
for (size_t i = 0; i < other.cnn_convolutional_layers[l].size(); ++i)
{
this->cnn_convolutional_layers[l][i].resize(other.cnn_convolutional_layers[l][i].size());
for (size_t k = 0; k < other.cnn_convolutional_layers[l][i].size(); ++k)
{
// Make sure the matrix is copied.
this->cnn_convolutional_layers[l][i][k] = other.cnn_convolutional_layers[l][i][k].clone();
}
}
}
this->cnn_fully_connected_layers_weights.resize(other.cnn_fully_connected_layers_weights.size());
for (size_t l = 0; l < other.cnn_fully_connected_layers_weights.size(); ++l)
{
// Make sure the matrix is copied.
this->cnn_fully_connected_layers_weights[l] = other.cnn_fully_connected_layers_weights[l].clone();
}
this->cnn_fully_connected_layers_biases.resize(other.cnn_fully_connected_layers_biases.size());
for (size_t l = 0; l < other.cnn_fully_connected_layers_biases.size(); ++l)
{
// Make sure the matrix is copied.
this->cnn_fully_connected_layers_biases[l] = other.cnn_fully_connected_layers_biases[l].clone();
}
this->cnn_prelu_layer_weights.resize(other.cnn_prelu_layer_weights.size());
for (size_t l = 0; l < other.cnn_prelu_layer_weights.size(); ++l)
{
// Make sure the matrix is copied.
this->cnn_prelu_layer_weights[l] = other.cnn_prelu_layer_weights[l].clone();
}
}
std::vector<cv::Mat_<float>> CNN::Inference(const cv::Mat& input_img, bool direct, bool thread_safe)
{
if (input_img.channels() == 1)
{
cv::cvtColor(input_img, input_img, cv::COLOR_GRAY2BGR);
}
int cnn_layer = 0;
int fully_connected_layer = 0;
int prelu_layer = 0;
int max_pool_layer = 0;
// Slit a BGR image into three chnels
cv::Mat channels[3];
cv::split(input_img, channels);
// Flip the BGR order to RGB
std::vector<cv::Mat_<float> > input_maps;
input_maps.push_back(channels[2]);
input_maps.push_back(channels[1]);
input_maps.push_back(channels[0]);
std::vector<cv::Mat_<float> > outputs;
for (size_t layer = 0; layer < cnn_layer_types.size(); ++layer)
{
// Determine layer type
int layer_type = cnn_layer_types[layer];
// Convolutional layer
if (layer_type == 0)
{
// Either perform direct convolution through matrix multiplication or use an FFT optimized version, which one is optimal depends on the kernel and input sizes
if (direct)
{
if(thread_safe)
{
cv::Mat_<float> pre_alloc;
convolution_direct_blas(outputs, input_maps, cnn_convolutional_layers_weights[cnn_layer], cnn_convolutional_layers[cnn_layer][0][0].rows, cnn_convolutional_layers[cnn_layer][0][0].cols, pre_alloc);
}
else
{
convolution_direct_blas(outputs, input_maps, cnn_convolutional_layers_weights[cnn_layer], cnn_convolutional_layers[cnn_layer][0][0].rows, cnn_convolutional_layers[cnn_layer][0][0].cols, conv_layer_pre_alloc_im2col[cnn_layer]);
}
}
else
{
convolution_fft2(outputs, input_maps, cnn_convolutional_layers[cnn_layer], cnn_convolutional_layers_bias[cnn_layer], cnn_convolutional_layers_dft[cnn_layer]);
}
//vector<cv::Mat_<float> > outs;
//convolution_fft(outs, input_maps, cnn_convolutional_layers[cnn_layer], cnn_convolutional_layers_bias[cnn_layer], cnn_convolutional_layers_dft[cnn_layer]);
cnn_layer++;
}
if (layer_type == 1)
{
int stride_x = std::get<2>(cnn_max_pooling_layers[max_pool_layer]);
int stride_y = std::get<3>(cnn_max_pooling_layers[max_pool_layer]);
int kernel_size_x = std::get<0>(cnn_max_pooling_layers[max_pool_layer]);
int kernel_size_y = std::get<1>(cnn_max_pooling_layers[max_pool_layer]);
max_pooling(outputs, input_maps, stride_x, stride_y, kernel_size_x, kernel_size_y);
max_pool_layer++;
}
if (layer_type == 2)
{
fully_connected(outputs, input_maps, cnn_fully_connected_layers_weights[fully_connected_layer], cnn_fully_connected_layers_biases[fully_connected_layer]);
fully_connected_layer++;
}
if (layer_type == 3) // PReLU
{
// In place prelu computation
PReLU(input_maps, cnn_prelu_layer_weights[prelu_layer]);
outputs = input_maps;
prelu_layer++;
}
if (layer_type == 4)
{
outputs.clear();
for (size_t k = 0; k < input_maps.size(); ++k)
{
// Apply the sigmoid
cv::exp(-input_maps[k], input_maps[k]);
input_maps[k] = 1.0 / (1.0 + input_maps[k]);
outputs.push_back(input_maps[k]);
}
}
// Set the outputs of this layer to inputs of the next one
input_maps = outputs;
}
return outputs;
}
void ReadMatBin(std::ifstream& stream, cv::Mat &output_mat)
{
// Read in the number of rows, columns and the data type
int row, col, type;
stream.read((char*)&row, 4);
stream.read((char*)&col, 4);
stream.read((char*)&type, 4);
output_mat = cv::Mat(row, col, type);
int size = output_mat.rows * output_mat.cols * output_mat.elemSize();
stream.read((char *)output_mat.data, size);
}
void CNN::ClearPrecomp()
{
for (size_t k1 = 0; k1 < cnn_convolutional_layers_dft.size(); ++k1)
{
for (size_t k2 = 0; k2 < cnn_convolutional_layers_dft[k1].size(); ++k2)
{
cnn_convolutional_layers_dft[k1][k2].clear();
}
}
}
void CNN::Read(const std::string& location)
{
openblas_set_num_threads(1);
std::ifstream cnn_stream(location, std::ios::in | std::ios::binary);
if (cnn_stream.is_open())
{
cnn_stream.seekg(0, std::ios::beg);
// Reading in CNNs
int network_depth;
cnn_stream.read((char*)&network_depth, 4);
cnn_layer_types.resize(network_depth);
for (int layer = 0; layer < network_depth; ++layer)
{
int layer_type;
cnn_stream.read((char*)&layer_type, 4);
cnn_layer_types[layer] = layer_type;
// convolutional
if (layer_type == 0)
{
// Read the number of input maps
int num_in_maps;
cnn_stream.read((char*)&num_in_maps, 4);
// Read the number of kernels for each input map
int num_kernels;
cnn_stream.read((char*)&num_kernels, 4);
std::vector<std::vector<cv::Mat_<float> > > kernels;
kernels.resize(num_in_maps);
std::vector<float> biases;
for (int k = 0; k < num_kernels; ++k)
{
float bias;
cnn_stream.read((char*)&bias, 4);
biases.push_back(bias);
}
cnn_convolutional_layers_bias.push_back(biases);
// For every input map
for (int in = 0; in < num_in_maps; ++in)
{
kernels[in].resize(num_kernels);
// For every kernel on that input map
for (int k = 0; k < num_kernels; ++k)
{
ReadMatBin(cnn_stream, kernels[in][k]);
}
}
// Rearrange the kernels for faster inference with FFT
std::vector<std::vector<cv::Mat_<float> > > kernels_rearr;
kernels_rearr.resize(num_kernels);
// Fill up the rearranged layer
for (int k = 0; k < num_kernels; ++k)
{
for (int in = 0; in < num_in_maps; ++in)
{
kernels_rearr[k].push_back(kernels[in][k]);
}
}
cnn_convolutional_layers.push_back(kernels_rearr);
// Place-holders for DFT precomputation
std::vector<std::map<int, std::vector<cv::Mat_<double> > > > cnn_convolutional_layers_dft_curr_layer;
cnn_convolutional_layers_dft_curr_layer.resize(num_kernels);
cnn_convolutional_layers_dft.push_back(cnn_convolutional_layers_dft_curr_layer);
// Rearrange the flattened kernels into weight matrices for direct convolution computation
cv::Mat_<float> weight_matrix(num_in_maps * kernels_rearr[0][0].rows * kernels_rearr[0][0].cols, num_kernels);
for (int k = 0; k < num_kernels; ++k)
{
for (int i = 0; i < num_in_maps; ++i)
{
// Flatten the kernel
cv::Mat_<float> k_flat = kernels_rearr[k][i].t();
k_flat = k_flat.reshape(0, 1).t();
k_flat.copyTo(weight_matrix(cv::Rect(k, i * kernels_rearr[0][0].rows * kernels_rearr[0][0].cols, 1, kernels_rearr[0][0].rows * kernels_rearr[0][0].cols)));
}
}
// Transpose the weight matrix for more convenient computation
weight_matrix = weight_matrix.t();
// Add a bias term to the weight matrix for efficiency
cv::Mat_<float> W(weight_matrix.rows, weight_matrix.cols + 1, 1.0);
for (int k = 0; k < weight_matrix.rows; ++k)
{
W.at<float>(k, weight_matrix.cols) = biases[k];
}
weight_matrix.copyTo(W(cv::Rect(0, 0, weight_matrix.cols, weight_matrix.rows)));
cnn_convolutional_layers_weights.push_back(W.t());
conv_layer_pre_alloc_im2col.push_back(cv::Mat_<float>());
}
else if (layer_type == 1)
{
int kernel_x, kernel_y, stride_x, stride_y;
cnn_stream.read((char*)&kernel_x, 4);
cnn_stream.read((char*)&kernel_y, 4);
cnn_stream.read((char*)&stride_x, 4);
cnn_stream.read((char*)&stride_y, 4);
cnn_max_pooling_layers.push_back(std::tuple<int, int, int, int>(kernel_x, kernel_y, stride_x, stride_y));
}
else if (layer_type == 2)
{
cv::Mat_<float> biases;
ReadMatBin(cnn_stream, biases);
cnn_fully_connected_layers_biases.push_back(biases);
// Fully connected layer
cv::Mat_<float> weights;
ReadMatBin(cnn_stream, weights);
cnn_fully_connected_layers_weights.push_back(weights.t());
}
else if (layer_type == 3)
{
cv::Mat_<float> weights;
ReadMatBin(cnn_stream, weights);
cnn_prelu_layer_weights.push_back(weights);
}
}
}
else
{
std::cout << "WARNING: Can't find the CNN location" << std::endl;
}
}
//===========================================================================
// Read in the MTCNN detector
void FaceDetectorMTCNN::Read(const std::string& location)
{
std::cout << "Reading the MTCNN face detector from: " << location << std::endl;
std::ifstream locations(location.c_str(), std::ios_base::in);
if (!locations.is_open())
{
std::cout << "MTCNN model file not found or can't be opened" << std::endl;
return;
}
std::string line;
// The other module locations should be defined as relative paths from the main model
fs::path root = fs::path(location).parent_path();
// The main file contains the references to other files
while (!locations.eof())
{
getline(locations, line);
std::stringstream lineStream(line);
std::string module;
std::string location;
// figure out which module is to be read from which file
lineStream >> module;
lineStream >> location;
// remove carriage return at the end for compatibility with unix systems
if (location.size() > 0 && location.at(location.size() - 1) == '\r')
{
location = location.substr(0, location.size() - 1);
}
// append to root
location = (root / location).string();
if (module.compare("PNet") == 0)
{
std::cout << "Reading the PNet module from: " << location << std::endl;
PNet.Read(location);
}
else if(module.compare("RNet") == 0)
{
std::cout << "Reading the RNet module from: " << location << std::endl;
RNet.Read(location);
}
else if (module.compare("ONet") == 0)
{
std::cout << "Reading the ONet module from: " << location << std::endl;
ONet.Read(location);
}
}
}
// Perform non maximum supression on proposal bounding boxes prioritizing boxes with high score/confidence
std::vector<int> non_maximum_supression(const std::vector<cv::Rect_<float> >& original_bb, const std::vector<float>& scores, float thresh, bool minimum)
{
// Sort the input bounding boxes by the detection score, using the nice trick of multimap always being sorted internally
std::multimap<float, size_t> idxs;
for (size_t i = 0; i < original_bb.size(); ++i)
{
idxs.insert(std::pair<float, size_t>(scores[i], i));
}
std::vector<int> output_ids;
// keep looping while some indexes still remain in the indexes list
while (idxs.size() > 0)
{
// grab the last rectangle
auto lastElem = --std::end(idxs);
size_t curr_id = lastElem->second;
const cv::Rect& rect1 = original_bb[curr_id];
idxs.erase(lastElem);
// Iterate through remaining bounding boxes and choose which ones to remove
for (auto pos = std::begin(idxs); pos != std::end(idxs); )
{
// grab the current rectangle
const cv::Rect& rect2 = original_bb[pos->second];
float intArea = (rect1 & rect2).area();
float unionArea;
if (minimum)
{
unionArea = cv::min(rect1.area(), rect2.area());
}
else
{
unionArea = rect1.area() + rect2.area() - intArea;
}
float overlap = intArea / unionArea;
// Remove the bounding boxes with less confidence but with significant overlap with the current one
if (overlap > thresh)
{
pos = idxs.erase(pos);
}
else
{
++pos;
}
}
output_ids.push_back(curr_id);
}
return output_ids;
}
// Helper function for selecting a subset of bounding boxes based on indices
void select_subset(const std::vector<int>& to_keep, std::vector<cv::Rect_<float> >& bounding_boxes, std::vector<float>& scores,
std::vector<cv::Rect_<float> >& corrections)
{
std::vector<cv::Rect_<float> > bounding_boxes_tmp;
std::vector<float> scores_tmp;
std::vector<cv::Rect_<float> > corrections_tmp;
for (size_t i = 0; i < to_keep.size(); ++i)
{
bounding_boxes_tmp.push_back(bounding_boxes[to_keep[i]]);
scores_tmp.push_back(scores[to_keep[i]]);
corrections_tmp.push_back(corrections[to_keep[i]]);
}
bounding_boxes = bounding_boxes_tmp;
scores = scores_tmp;
corrections = corrections_tmp;
}
// Use the heatmap generated by PNet to generate bounding boxes in the original image space, also generate the correction values and scores of the bounding boxes as well
void generate_bounding_boxes(std::vector<cv::Rect_<float> >& o_bounding_boxes, std::vector<float>& o_scores,
std::vector<cv::Rect_<float> >& o_corrections, const cv::Mat_<float>& heatmap, const std::vector<cv::Mat_<float> >& corrections,
float scale, float threshold, int face_support)
{
// Correction for the pooling
int stride = 2;
o_bounding_boxes.clear();
o_scores.clear();
o_corrections.clear();
int counter = 0;
for (int x = 0; x < heatmap.cols; ++x)
{
for(int y = 0; y < heatmap.rows; ++y)
{
if (heatmap.at<float>(y, x) >= threshold)
{
float min_x = int((stride * x + 1) / scale);
float max_x = int((stride * x + face_support) / scale);
float min_y = int((stride * y + 1) / scale);
float max_y = int((stride * y + face_support) / scale);
o_bounding_boxes.push_back(cv::Rect_<float>(min_x, min_y, max_x - min_x, max_y - min_y));
o_scores.push_back(heatmap.at<float>(y, x));
float corr_x = corrections[0].at<float>(y, x);
float corr_y = corrections[1].at<float>(y, x);
float corr_width = corrections[2].at<float>(y, x);
float corr_height = corrections[3].at<float>(y, x);
o_corrections.push_back(cv::Rect_<float>(corr_x, corr_y, corr_width, corr_height));
counter++;
}
}
}
}
// Converting the bounding boxes to squares
void rectify(std::vector<cv::Rect_<float> >& total_bboxes)
{
// Apply size and location offsets
for (size_t i = 0; i < total_bboxes.size(); ++i)
{
float height = total_bboxes[i].height;
float width = total_bboxes[i].width;
float max_side = std::max(width, height);
// Correct the starts based on new size
float new_min_x = total_bboxes[i].x + 0.5 * (width - max_side);
float new_min_y = total_bboxes[i].y + 0.5 * (height - max_side);
total_bboxes[i].x = (int)new_min_x;
total_bboxes[i].y = (int)new_min_y;
total_bboxes[i].width = (int)max_side;
total_bboxes[i].height = (int)max_side;
}
}
void apply_correction(std::vector<cv::Rect_<float> >& total_bboxes, const std::vector<cv::Rect_<float> > corrections, bool add1)
{
// Apply size and location offsets
for (size_t i = 0; i < total_bboxes.size(); ++i)
{
cv::Rect curr_box = total_bboxes[i];
if (add1)
{
curr_box.width++;
curr_box.height++;
}
float new_min_x = curr_box.x + corrections[i].x * curr_box.width;
float new_min_y = curr_box.y + corrections[i].y * curr_box.height;
float new_max_x = curr_box.x + curr_box.width + curr_box.width * corrections[i].width;
float new_max_y = curr_box.y + curr_box.height + curr_box.height * corrections[i].height;
total_bboxes[i] = cv::Rect_<float>(new_min_x, new_min_y, new_max_x - new_min_x, new_max_y - new_min_y);
}
}
// The actual MTCNN face detection step
bool FaceDetectorMTCNN::DetectFaces(std::vector<cv::Rect_<float> >& o_regions, const cv::Mat& img_in,
std::vector<float>& o_confidences, int min_face_size, float t1, float t2, float t3)
{
int height_orig = img_in.size().height;
int width_orig = img_in.size().width;
// Size ratio of image pyramids
double pyramid_factor = 0.709;
// Face support region is 12x12 px, so from that can work out the largest
// scale(which is 12 / min), and work down from there to smallest scale(no smaller than 12x12px)
int min_dim = std::min(height_orig, width_orig);
int face_support = 12;
int num_scales = floor(log((double)min_face_size / (double)min_dim) / log(pyramid_factor)) + 1;
cv::Mat input_img;
// Force the image to three channels
if (img_in.channels() == 1)
{
cv::cvtColor(img_in, input_img, cv::COLOR_GRAY2RGB);
}
else
{
input_img = img_in;
}
cv::Mat img_float;
input_img.convertTo(img_float, CV_32FC3);
std::vector<cv::Rect_<float> > proposal_boxes_all;
std::vector<float> scores_all;
std::vector<cv::Rect_<float> > proposal_corrections_all;
// As the scales will be done in parallel have some containers for them
std::vector<std::vector<cv::Rect_<float> > > proposal_boxes_cross_scale(num_scales);
std::vector<std::vector<float> > scores_cross_scale(num_scales);
std::vector<std::vector<cv::Rect_<float> > > proposal_corrections_cross_scale(num_scales);
for (int i = 0; i < num_scales; ++i)
{
double scale = ((double)face_support / (double)min_face_size)*cv::pow(pyramid_factor, i);
int h_pyr = ceil(height_orig * scale);
int w_pyr = ceil(width_orig * scale);
cv::Mat normalised_img;
cv::resize(img_float, normalised_img, cv::Size(w_pyr, h_pyr));
// Normalize the image
normalised_img = (normalised_img - 127.5) * 0.0078125;
// Actual PNet CNN step
std::vector<cv::Mat_<float> > pnet_out = PNet.Inference(normalised_img, true, false);
// Clear the precomputations, as the image sizes will be different
PNet.ClearPrecomp();
// Extract the probabilities from PNet response
cv::Mat_<float> prob_heatmap;
cv::exp(pnet_out[0]- pnet_out[1], prob_heatmap);
prob_heatmap = 1.0 / (1.0 + prob_heatmap);
// Extract the probabilities from PNet response
std::vector<cv::Mat_<float>> corrections_heatmap(pnet_out.begin() + 2, pnet_out.end());
// Grab the detections
std::vector<cv::Rect_<float> > proposal_boxes;
std::vector<float> scores;
std::vector<cv::Rect_<float> > proposal_corrections;
generate_bounding_boxes(proposal_boxes, scores, proposal_corrections, prob_heatmap, corrections_heatmap, scale, t1, face_support);
proposal_boxes_cross_scale[i] = proposal_boxes;
scores_cross_scale[i] = scores;
proposal_corrections_cross_scale[i] = proposal_corrections;
}
//});
// Perform non-maximum supression on proposals accross scales and combine them
for (int i = 0; i < num_scales; ++i)
{
std::vector<int> to_keep = non_maximum_supression(proposal_boxes_cross_scale[i], scores_cross_scale[i], 0.5, false);
select_subset(to_keep, proposal_boxes_cross_scale[i], scores_cross_scale[i], proposal_corrections_cross_scale[i]);
proposal_boxes_all.insert(proposal_boxes_all.end(), proposal_boxes_cross_scale[i].begin(), proposal_boxes_cross_scale[i].end());
scores_all.insert(scores_all.end(), scores_cross_scale[i].begin(), scores_cross_scale[i].end());
proposal_corrections_all.insert(proposal_corrections_all.end(), proposal_corrections_cross_scale[i].begin(), proposal_corrections_cross_scale[i].end());
}
// Preparation for RNet step
// Non maximum supression accross bounding boxes, and their offset correction
std::vector<int> to_keep = non_maximum_supression(proposal_boxes_all, scores_all, 0.7, false);
select_subset(to_keep, proposal_boxes_all, scores_all, proposal_corrections_all);
apply_correction(proposal_boxes_all, proposal_corrections_all, false);
// Convert to rectangles and round
rectify(proposal_boxes_all);
// Creating proposal images from previous step detections
std::vector<bool> above_thresh;
above_thresh.resize(proposal_boxes_all.size(), false);
for (size_t k = 0; k < proposal_boxes_all.size(); ++k)
{
float width_target = proposal_boxes_all[k].width + 1;
float height_target = proposal_boxes_all[k].height + 1;
// Work out the start and end indices in the original image
int start_x_in = cv::max((int)(proposal_boxes_all[k].x - 1), 0);
int start_y_in = cv::max((int)(proposal_boxes_all[k].y - 1), 0);
int end_x_in = cv::min((int)(proposal_boxes_all[k].x + width_target - 1), width_orig);
int end_y_in = cv::min((int)(proposal_boxes_all[k].y + height_target - 1), height_orig);
// Work out the start and end indices in the target image
int start_x_out = cv::max((int)(-proposal_boxes_all[k].x + 1), 0);
int start_y_out = cv::max((int)(-proposal_boxes_all[k].y + 1), 0);
int end_x_out = cv::min(width_target - (proposal_boxes_all[k].x + proposal_boxes_all[k].width - width_orig), width_target);
int end_y_out = cv::min(height_target - (proposal_boxes_all[k].y + proposal_boxes_all[k].height - height_orig), height_target);
cv::Mat tmp(height_target, width_target, CV_32FC3, cv::Scalar(0.0f,0.0f,0.0f));
img_float(cv::Rect(start_x_in, start_y_in, end_x_in - start_x_in, end_y_in - start_y_in)).copyTo(
tmp(cv::Rect(start_x_out, start_y_out, end_x_out - start_x_out, end_y_out - start_y_out)));
cv::Mat prop_img;
cv::resize(tmp, prop_img, cv::Size(24, 24));
prop_img = (prop_img - 127.5) * 0.0078125;
// Perform RNet on the proposal image
std::vector<cv::Mat_<float> > rnet_out = RNet.Inference(prop_img, true, false);
float prob = 1.0 / (1.0 + cv::exp(rnet_out[0].at<float>(0) - rnet_out[0].at<float>(1)));
scores_all[k] = prob;
proposal_corrections_all[k].x = rnet_out[0].at<float>(2);
proposal_corrections_all[k].y = rnet_out[0].at<float>(3);
proposal_corrections_all[k].width = rnet_out[0].at<float>(4);
proposal_corrections_all[k].height = rnet_out[0].at<float>(5);
if(prob >= t2)
{
above_thresh[k] = true;
}
else
{
above_thresh[k] = false;
}
}
//});
to_keep.clear();
for (size_t i = 0; i < above_thresh.size(); ++i)
{
if (above_thresh[i])
{
to_keep.push_back(i);
}
}
// Pick only the bounding boxes above the threshold
select_subset(to_keep, proposal_boxes_all, scores_all, proposal_corrections_all);
// Non maximum supression accross bounding boxes, and their offset correction
to_keep = non_maximum_supression(proposal_boxes_all, scores_all, 0.7, false);
select_subset(to_keep, proposal_boxes_all, scores_all, proposal_corrections_all);
apply_correction(proposal_boxes_all, proposal_corrections_all, false);
// Convert to rectangles and round
rectify(proposal_boxes_all);
// Preparing for the ONet stage
above_thresh.clear();
above_thresh.resize(proposal_boxes_all.size());
for (size_t k = 0; k < proposal_boxes_all.size(); ++k)
{
float width_target = proposal_boxes_all[k].width + 1;
float height_target = proposal_boxes_all[k].height + 1;
// Work out the start and end indices in the original image
int start_x_in = cv::max((int)(proposal_boxes_all[k].x - 1), 0);
int start_y_in = cv::max((int)(proposal_boxes_all[k].y - 1), 0);
int end_x_in = cv::min((int)(proposal_boxes_all[k].x + width_target - 1), width_orig);
int end_y_in = cv::min((int)(proposal_boxes_all[k].y + height_target - 1), height_orig);
// Work out the start and end indices in the target image
int start_x_out = cv::max((int)(-proposal_boxes_all[k].x + 1), 0);
int start_y_out = cv::max((int)(-proposal_boxes_all[k].y + 1), 0);
int end_x_out = cv::min(width_target - (proposal_boxes_all[k].x + proposal_boxes_all[k].width - width_orig), width_target);
int end_y_out = cv::min(height_target - (proposal_boxes_all[k].y + proposal_boxes_all[k].height - height_orig), height_target);
cv::Mat tmp(height_target, width_target, CV_32FC3, cv::Scalar(0.0f, 0.0f, 0.0f));
img_float(cv::Rect(start_x_in, start_y_in, end_x_in - start_x_in, end_y_in - start_y_in)).copyTo(
tmp(cv::Rect(start_x_out, start_y_out, end_x_out - start_x_out, end_y_out - start_y_out)));
cv::Mat prop_img;
cv::resize(tmp, prop_img, cv::Size(48, 48));
prop_img = (prop_img - 127.5) * 0.0078125;
// Perform RNet on the proposal image
std::vector<cv::Mat_<float> > onet_out = ONet.Inference(prop_img, true, false);
float prob = 1.0 / (1.0 + cv::exp(onet_out[0].at<float>(0) - onet_out[0].at<float>(1)));
scores_all[k] = prob;
proposal_corrections_all[k].x = onet_out[0].at<float>(2);
proposal_corrections_all[k].y = onet_out[0].at<float>(3);
proposal_corrections_all[k].width = onet_out[0].at<float>(4);
proposal_corrections_all[k].height = onet_out[0].at<float>(5);
if (prob >= t3)
{
above_thresh[k] = true;
}
else
{
above_thresh[k] = false;
}
}
//});
to_keep.clear();
for (size_t i = 0; i < above_thresh.size(); ++i)
{
if (above_thresh[i])
{
to_keep.push_back(i);
}
}
// Pick only the bounding boxes above the threshold
select_subset(to_keep, proposal_boxes_all, scores_all, proposal_corrections_all);
apply_correction(proposal_boxes_all, proposal_corrections_all, true);
// Non maximum supression accross bounding boxes, and their offset correction
to_keep = non_maximum_supression(proposal_boxes_all, scores_all, 0.7, true);
select_subset(to_keep, proposal_boxes_all, scores_all, proposal_corrections_all);
// Correct the box to expectation to be tight around facial landmarks
for (size_t k = 0; k < proposal_boxes_all.size(); ++k)
{
proposal_boxes_all[k].x = proposal_boxes_all[k].width * -0.0075 + proposal_boxes_all[k].x;
proposal_boxes_all[k].y = proposal_boxes_all[k].height * 0.2459 + proposal_boxes_all[k].y;
proposal_boxes_all[k].width = 1.0323 * proposal_boxes_all[k].width;
proposal_boxes_all[k].height = 0.7751 * proposal_boxes_all[k].height;
o_regions.push_back(cv::Rect_<float>(proposal_boxes_all[k].x, proposal_boxes_all[k].y, proposal_boxes_all[k].width, proposal_boxes_all[k].height));
o_confidences.push_back(scores_all[k]);
}
if(o_regions.size() > 0)
{
return true;
}
else
{
return false;
}
}

View File

@@ -0,0 +1,550 @@
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2017, Carnegie Mellon University and University of Cambridge,
// all rights reserved.
//
// ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
//
// BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT.
// IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
//
// License can be found in OpenFace-license.txt
//
// * Any publications arising from the use of this software, including but
// not limited to academic journal and conference publications, technical
// reports and manuals, must cite at least one of the following works:
//
// OpenFace 2.0: Facial Behavior Analysis Toolkit
// Tadas Baltrušaitis, Amir Zadeh, Yao Chong Lim, and Louis-Philippe Morency
// in IEEE International Conference on Automatic Face and Gesture Recognition, 2018
//
// Convolutional experts constrained local model for facial landmark detection.
// A. Zadeh, T. Baltrušaitis, and Louis-Philippe Morency,
// in Computer Vision and Pattern Recognition Workshops, 2017.
//
// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation
// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling
// in IEEE International. Conference on Computer Vision (ICCV), 2015
//
// Cross-dataset learning and person-specific normalisation for automatic Action Unit detection
// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson
// in Facial Expression Recognition and Analysis Challenge,
// IEEE International Conference on Automatic Face and Gesture Recognition, 2015
//
///////////////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "LandmarkDetectionValidator.h"
// OpenCV includes
#include <opencv2/core/core.hpp>
#include <opencv2/imgproc.hpp>
// System includes
#include <fstream>
// Math includes
#define _USE_MATH_DEFINES
#include <cmath>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
// Local includes
#include "LandmarkDetectorUtils.h"
#include "CNN_utils.h"
using namespace LandmarkDetector;
// Copy constructor
DetectionValidator::DetectionValidator(const DetectionValidator& other) : orientations(other.orientations), paws(other.paws),
cnn_subsampling_layers(other.cnn_subsampling_layers), cnn_layer_types(other.cnn_layer_types), cnn_convolutional_layers_im2col_precomp(other.cnn_convolutional_layers_im2col_precomp),
cnn_convolutional_layers_weights(other.cnn_convolutional_layers_weights)
{
this->cnn_convolutional_layers.resize(other.cnn_convolutional_layers.size());
for (size_t v = 0; v < other.cnn_convolutional_layers.size(); ++v)
{
this->cnn_convolutional_layers[v].resize(other.cnn_convolutional_layers[v].size());
for (size_t l = 0; l < other.cnn_convolutional_layers[v].size(); ++l)
{
this->cnn_convolutional_layers[v][l].resize(other.cnn_convolutional_layers[v][l].size());
for (size_t i = 0; i < other.cnn_convolutional_layers[v][l].size(); ++i)
{
this->cnn_convolutional_layers[v][l][i].resize(other.cnn_convolutional_layers[v][l][i].size());
for (size_t k = 0; k < other.cnn_convolutional_layers[v][l][i].size(); ++k)
{
// Make sure the matrix is copied.
this->cnn_convolutional_layers[v][l][i][k] = other.cnn_convolutional_layers[v][l][i][k].clone();
}
}
}
}
this->cnn_fully_connected_layers_weights.resize(other.cnn_fully_connected_layers_weights.size());
for (size_t v = 0; v < other.cnn_fully_connected_layers_weights.size(); ++v)
{
this->cnn_fully_connected_layers_weights[v].resize(other.cnn_fully_connected_layers_weights[v].size());
for (size_t l = 0; l < other.cnn_fully_connected_layers_weights[v].size(); ++l)
{
// Make sure the matrix is copied.
this->cnn_fully_connected_layers_weights[v][l] = other.cnn_fully_connected_layers_weights[v][l].clone();
}
}
this->cnn_fully_connected_layers_biases.resize(other.cnn_fully_connected_layers_biases.size());
for (size_t v = 0; v < other.cnn_fully_connected_layers_biases.size(); ++v)
{
this->cnn_fully_connected_layers_biases[v].resize(other.cnn_fully_connected_layers_biases[v].size());
for (size_t l = 0; l < other.cnn_fully_connected_layers_biases[v].size(); ++l)
{
// Make sure the matrix is copied.
this->cnn_fully_connected_layers_biases[v][l] = other.cnn_fully_connected_layers_biases[v][l].clone();
}
}
this->mean_images.resize(other.mean_images.size());
for (size_t i = 0; i < other.mean_images.size(); ++i)
{
// Make sure the matrix is copied.
this->mean_images[i] = other.mean_images[i].clone();
}
this->standard_deviations.resize(other.standard_deviations.size());
for (size_t i = 0; i < other.standard_deviations.size(); ++i)
{
// Make sure the matrix is copied.
this->standard_deviations[i] = other.standard_deviations[i].clone();
}
}
//===========================================================================
// Read in the landmark detection validation module
void DetectionValidator::Read(std::string location)
{
std::ifstream detection_validator_stream (location, std::ios::in | std::ios::binary);
if (detection_validator_stream.is_open())
{
detection_validator_stream.seekg (0, std::ios::beg);
// Read validator type
int validator_type;
detection_validator_stream.read ((char*)&validator_type, 4);
if (validator_type != 3)
{
std::cout << "ERROR: Using old face validator, no longer supported" << std::endl;
}
// Read the number of views (orientations) within the validator
int n;
detection_validator_stream.read ((char*)&n, 4);
orientations.resize(n);
for(int i = 0; i < n; i++)
{
cv::Mat_<double> orientation_tmp;
LandmarkDetector::ReadMatBin(detection_validator_stream, orientation_tmp);
orientations[i] = cv::Vec3d(orientation_tmp.at<double>(0), orientation_tmp.at<double>(1), orientation_tmp.at<double>(2));
// Convert from degrees to radians
orientations[i] = orientations[i] * M_PI / 180.0;
}
// Initialise the piece-wise affine warps, biases and weights
paws.resize(n);
cnn_convolutional_layers_weights.resize(n);
cnn_convolutional_layers_im2col_precomp.resize(n);
cnn_convolutional_layers.resize(n);
cnn_fully_connected_layers_weights.resize(n);
cnn_layer_types.resize(n);
cnn_fully_connected_layers_biases.resize(n);
// Initialise the normalisation terms
mean_images.resize(n);
standard_deviations.resize(n);
// Read in the validators for each of the views
for(int i = 0; i < n; i++)
{
// Read in the mean images
cv::Mat_<double> mean_img;
LandmarkDetector::ReadMatBin(detection_validator_stream, mean_img);
mean_img.convertTo(mean_images[i], CV_32F);
mean_images[i] = mean_images[i].t();
cv::Mat_<double> std_dev;
LandmarkDetector::ReadMatBin(detection_validator_stream, std_dev);
std_dev.convertTo(standard_deviations[i], CV_32F);
standard_deviations[i] = standard_deviations[i].t();
// Model specifics
if (validator_type == 3)
{
int network_depth;
detection_validator_stream.read((char*)&network_depth, 4);
cnn_layer_types[i].resize(network_depth);
for (int layer = 0; layer < network_depth; ++layer)
{
int layer_type;
detection_validator_stream.read((char*)&layer_type, 4);
cnn_layer_types[i][layer] = layer_type;
// convolutional
if (layer_type == 0)
{
// Read the number of input maps
int num_in_maps;
detection_validator_stream.read((char*)&num_in_maps, 4);
// Read the number of kernels for each input map
int num_kernels;
detection_validator_stream.read((char*)&num_kernels, 4);
std::vector<std::vector<cv::Mat_<float> > > kernels;
kernels.resize(num_in_maps);
std::vector<float> biases;
for (int k = 0; k < num_kernels; ++k)
{
float bias;
detection_validator_stream.read((char*)&bias, 4);
biases.push_back(bias);
}
// For every input map
for (int in = 0; in < num_in_maps; ++in)
{
kernels[in].resize(num_kernels);
// For every kernel on that input map
for (int k = 0; k < num_kernels; ++k)
{
ReadMatBin(detection_validator_stream, kernels[in][k]);
}
}
cnn_convolutional_layers[i].push_back(kernels);
// Rearrange the kernels for faster inference with FFT
std::vector<std::vector<cv::Mat_<float> > > kernels_rearr;
kernels_rearr.resize(num_kernels);
// Fill up the rearranged layer
for (int k = 0; k < num_kernels; ++k)
{
for (int in = 0; in < num_in_maps; ++in)
{
kernels_rearr[k].push_back(kernels[in][k]);
}
}
// Rearrange the flattened kernels into weight matrices for direct convolution computation
cv::Mat_<float> weight_matrix(num_in_maps * kernels_rearr[0][0].rows * kernels_rearr[0][0].cols, num_kernels);
for (int k = 0; k < num_kernels; ++k)
{
for (int i = 0; i < num_in_maps; ++i)
{
// Flatten the kernel
cv::Mat_<float> k_flat = kernels_rearr[k][i].t();
k_flat = k_flat.reshape(0, 1).t();
k_flat.copyTo(weight_matrix(cv::Rect(k, i * kernels_rearr[0][0].rows * kernels_rearr[0][0].cols, 1, kernels_rearr[0][0].rows * kernels_rearr[0][0].cols)));
}
}
// Transpose the weight matrix for more convenient computation
weight_matrix = weight_matrix.t();
// Add a bias term to the weight matrix for efficiency
cv::Mat_<float> W(weight_matrix.rows, weight_matrix.cols + 1, 1.0);
for (int k = 0; k < weight_matrix.rows; ++k)
{
W.at<float>(k, weight_matrix.cols) = biases[k];
}
weight_matrix.copyTo(W(cv::Rect(0, 0, weight_matrix.cols, weight_matrix.rows)));
cnn_convolutional_layers_weights[i].push_back(W.t());
cnn_convolutional_layers_im2col_precomp[i].push_back(cv::Mat_<float>());
}
else if (layer_type == 2)
{
cv::Mat_<float> biases;
ReadMatBin(detection_validator_stream, biases);
cnn_fully_connected_layers_biases[i].push_back(biases);
// Fully connected layer
cv::Mat_<float> weights;
ReadMatBin(detection_validator_stream, weights);
cnn_fully_connected_layers_weights[i].push_back(weights);
}
}
}
// Read in the piece-wise affine warps
paws[i].Read(detection_validator_stream);
}
}
else
{
std::cout << "WARNING: Can't find the Face checker location" << std::endl;
}
}
//===========================================================================
// Check if the fitting actually succeeded
float DetectionValidator::Check(const cv::Vec3d& orientation, const cv::Mat_<uchar>& intensity_img, cv::Mat_<float>& detected_landmarks)
{
int id = GetViewId(orientation);
// The warped (cropped) image, corresponding to a face lying withing the detected lanmarks
cv::Mat_<float> warped;
// First only use the ROI of the image of interest
cv::Mat_<float> detected_landmarks_local = detected_landmarks.clone();
float min_x_f, max_x_f, min_y_f, max_y_f;
ExtractBoundingBox(detected_landmarks_local, min_x_f, max_x_f, min_y_f, max_y_f);
cv::Mat_<float> xs = detected_landmarks_local(cv::Rect(0, 0, 1, detected_landmarks.rows / 2));
cv::Mat_<float> ys = detected_landmarks_local(cv::Rect(0, detected_landmarks.rows / 2, 1, detected_landmarks.rows / 2));
// Picking the ROI (some extra space for bilinear interpolation)
int min_x = (int)(min_x_f - 3.0f);
int max_x = (int)(max_x_f + 3.0f);
int min_y = (int)(min_y_f - 3.0f);
int max_y = (int)(max_y_f + 3.0f);
if (min_x < 0) min_x = 0;
if (min_y < 0) min_y = 0;
if (max_x > intensity_img.cols - 1) max_x = intensity_img.cols - 1;
if (max_y > intensity_img.rows - 1) max_y = intensity_img.rows - 1;
xs = xs - min_x;
ys = ys - min_y;
// If the ROI is non existent return failure (this could happen if all landmarks are outside of the image)
if (max_x - min_x <= 1 || max_y - min_y <= 1)
{
return 0.0f;
}
cv::Mat_<float> intensity_img_float_local;
intensity_img(cv::Rect(min_x, min_y, max_x - min_x, max_y - min_y)).convertTo(intensity_img_float_local, CV_32F);
// the piece-wise affine image warping
paws[id].Warp(intensity_img_float_local, warped, detected_landmarks_local);
// The actual validation step
double dec = CheckCNN(warped, id);
// Convert it to a more interpretable signal (0 low confidence, 1 high confidence)
dec = 0.5 * (1.0 - dec);
return (float)dec;
}
double DetectionValidator::CheckCNN(const cv::Mat_<float>& warped_img, int view_id)
{
cv::Mat_<float> feature_vec;
NormaliseWarpedToVector(warped_img, feature_vec, view_id);
// Create a normalised image from the crop vector
cv::Mat_<float> img(warped_img.size(), 0.0);
img = img.t();
cv::Mat mask = paws[view_id].pixel_mask.t();
cv::MatIterator_<uchar> mask_it = mask.begin<uchar>();
cv::MatIterator_<float> feature_it = feature_vec.begin();
cv::MatIterator_<float> img_it = img.begin();
int wInt = img.cols;
int hInt = img.rows;
for (int i = 0; i < wInt; ++i)
{
for (int j = 0; j < hInt; ++j, ++mask_it, ++img_it)
{
// if is within mask
if (*mask_it)
{
// assign the feature to image if it is within the mask
*img_it = (float)*feature_it++;
}
}
}
img = img.t();
int cnn_layer = 0;
int fully_connected_layer = 0;
std::vector<cv::Mat_<float> > input_maps;
input_maps.push_back(img);
std::vector<cv::Mat_<float> > outputs;
for (size_t layer = 0; layer < cnn_layer_types[view_id].size(); ++layer)
{
// Determine layer type
int layer_type = cnn_layer_types[view_id][layer];
// Convolutional layer
if (layer_type == 0)
{
convolution_direct_blas(outputs, input_maps, cnn_convolutional_layers_weights[view_id][cnn_layer], cnn_convolutional_layers[view_id][cnn_layer][0][0].rows, cnn_convolutional_layers[view_id][cnn_layer][0][0].cols, cnn_convolutional_layers_im2col_precomp[view_id][cnn_layer]);
cnn_layer++;
}
if (layer_type == 1)
{
max_pooling(outputs, input_maps, 2, 2, 2, 2);
}
if (layer_type == 2)
{
fully_connected(outputs, input_maps, cnn_fully_connected_layers_weights[view_id][fully_connected_layer].t(), cnn_fully_connected_layers_biases[view_id][fully_connected_layer]);
fully_connected_layer++;
}
if (layer_type == 3) // ReLU
{
outputs.clear();
for (size_t k = 0; k < input_maps.size(); ++k)
{
// Apply the ReLU
cv::threshold(input_maps[k], input_maps[k], 0, 0, cv::THRESH_TOZERO);
outputs.push_back(input_maps[k]);
}
}
if (layer_type == 4)
{
outputs.clear();
for (size_t k = 0; k < input_maps.size(); ++k)
{
// Apply the sigmoid
cv::exp(-input_maps[k], input_maps[k]);
input_maps[k] = 1.0 / (1.0 + input_maps[k]);
outputs.push_back(input_maps[k]);
}
}
// Set the outputs of this layer to inputs of the next
input_maps = outputs;
}
// Convert the class label to a continuous value
double max_val = 0;
cv::Point max_loc;
cv::minMaxLoc(outputs[0].t(), 0, &max_val, 0, &max_loc);
int max_idx = max_loc.y;
double max = 1;
double min = -1;
double bins = (double)outputs[0].cols;
// Unquantizing the softmax layer to continuous value
double step_size = (max - min) / bins; // This should be saved somewhere
double unquantized = min + step_size / 2.0 + max_idx * step_size;
return unquantized;
}
void DetectionValidator::NormaliseWarpedToVector(const cv::Mat_<float>& warped_img, cv::Mat_<float>& feature_vec, int view_id)
{
cv::Mat_<float> warped_t = warped_img.t();
// the vector to be filled with paw values
cv::MatIterator_<float> vp;
cv::MatIterator_<float> cp;
cv::Mat_<float> vec(paws[view_id].number_of_pixels,1);
vp = vec.begin();
cp = warped_t.begin();
int wInt = warped_img.cols;
int hInt = warped_img.rows;
// the mask indicating if point is within or outside the face region
cv::Mat maskT = paws[view_id].pixel_mask.t();
cv::MatIterator_<uchar> mp = maskT.begin<uchar>();
for(int i=0; i < wInt; ++i)
{
for(int j=0; j < hInt; ++j, ++mp, ++cp)
{
// if is within mask
if(*mp)
{
*vp++ = *cp;
}
}
}
// Local normalisation
cv::Scalar mean;
cv::Scalar std;
cv::meanStdDev(vec, mean, std);
// subtract the mean image
vec -= mean[0];
// Normalise the image
if(std[0] == 0)
{
std[0] = 1;
}
vec /= std[0];
// Global normalisation
feature_vec = (vec - mean_images[view_id]) / standard_deviations[view_id];
}
// Getting the closest view center based on orientation
int DetectionValidator::GetViewId(const cv::Vec3d& orientation) const
{
int id = 0;
double dbest = -1.0;
for(size_t i = 0; i < this->orientations.size(); i++)
{
// Distance to current view
double d = cv::norm(orientation, this->orientations[i]);
if(i == 0 || d < dbest)
{
dbest = d;
id = i;
}
}
return id;
}

View File

@@ -0,0 +1,768 @@
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2017, Carnegie Mellon University and University of Cambridge,
// all rights reserved.
//
// ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
//
// BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT.
// IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
//
// License can be found in OpenFace-license.txt
//
// * Any publications arising from the use of this software, including but
// not limited to academic journal and conference publications, technical
// reports and manuals, must cite at least one of the following works:
//
// OpenFace 2.0: Facial Behavior Analysis Toolkit
// Tadas Baltrušaitis, Amir Zadeh, Yao Chong Lim, and Louis-Philippe Morency
// in IEEE International Conference on Automatic Face and Gesture Recognition, 2018
//
// Convolutional experts constrained local model for facial landmark detection.
// A. Zadeh, T. Baltrušaitis, and Louis-Philippe Morency,
// in Computer Vision and Pattern Recognition Workshops, 2017.
//
// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation
// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling
// in IEEE International. Conference on Computer Vision (ICCV), 2015
//
// Cross-dataset learning and person-specific normalisation for automatic Action Unit detection
// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson
// in Facial Expression Recognition and Analysis Challenge,
// IEEE International Conference on Automatic Face and Gesture Recognition, 2015
//
///////////////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "LandmarkDetectorFunc.h"
#include "RotationHelpers.h"
#include "ImageManipulationHelpers.h"
// OpenCV includes
#include <opencv2/core/core.hpp>
#include <opencv2/calib3d.hpp>
#include <opencv2/imgproc.hpp>
// System includes
#include <vector>
#include <numeric>
using namespace LandmarkDetector;
// Getting a head pose estimate from the currently detected landmarks, with appropriate correction due to the PDM assuming an orthographic camera
// which is only correct close to the centre of the image
// This method returns a corrected pose estimate with respect to world coordinates with camera at origin (0,0,0)
// The format returned is [Tx, Ty, Tz, Eul_x, Eul_y, Eul_z]
cv::Vec6f LandmarkDetector::GetPose(const CLNF& clnf_model, float fx, float fy, float cx, float cy)
{
if (!clnf_model.detected_landmarks.empty() && clnf_model.params_global[0] != 0)
{
// This is used as an initial estimate for the iterative PnP algorithm
float Z = fx / clnf_model.params_global[0];
float X = ((clnf_model.params_global[4] - cx) * (1.0 / fx)) * Z;
float Y = ((clnf_model.params_global[5] - cy) * (1.0 / fy)) * Z;
// Correction for orientation
// 2D points
cv::Mat_<float> landmarks_2D = clnf_model.detected_landmarks;
landmarks_2D = landmarks_2D.reshape(1, 2).t();
// 3D points
cv::Mat_<float> landmarks_3D;
clnf_model.pdm.CalcShape3D(landmarks_3D, clnf_model.params_local);
landmarks_3D = landmarks_3D.reshape(1, 3).t();
// Solving the PNP model
// The camera matrix
cv::Matx33f camera_matrix(fx, 0, cx, 0, fy, cy, 0, 0, 1);
cv::Vec3f vec_trans(X, Y, Z);
cv::Vec3f vec_rot(clnf_model.params_global[1], clnf_model.params_global[2], clnf_model.params_global[3]);
cv::solvePnP(landmarks_3D, landmarks_2D, camera_matrix, cv::Mat(), vec_rot, vec_trans, true);
cv::Vec3f euler = Utilities::AxisAngle2Euler(vec_rot);
return cv::Vec6f(vec_trans[0], vec_trans[1], vec_trans[2], euler[0], euler[1], euler[2]);
}
else
{
return cv::Vec6f(0, 0, 0, 0, 0, 0);
}
}
// Getting a head pose estimate from the currently detected landmarks, with appropriate correction due to perspective projection
// This method returns a corrected pose estimate with respect to a point camera (NOTE not the world coordinates), which is useful to find out if the person is looking at a camera
// The format returned is [Tx, Ty, Tz, Eul_x, Eul_y, Eul_z]
cv::Vec6f LandmarkDetector::GetPoseWRTCamera(const CLNF& clnf_model, float fx, float fy, float cx, float cy)
{
if (!clnf_model.detected_landmarks.empty() && clnf_model.params_global[0] != 0)
{
float Z = fx / clnf_model.params_global[0];
float X = ((clnf_model.params_global[4] - cx) * (1.0 / fx)) * Z;
float Y = ((clnf_model.params_global[5] - cy) * (1.0 / fy)) * Z;
// Correction for orientation
// 3D points
cv::Mat_<float> landmarks_3D;
clnf_model.pdm.CalcShape3D(landmarks_3D, clnf_model.params_local);
landmarks_3D = landmarks_3D.reshape(1, 3).t();
// 2D points
cv::Mat_<float> landmarks_2D = clnf_model.detected_landmarks;
landmarks_2D = landmarks_2D.reshape(1, 2).t();
// Solving the PNP model
// The camera matrix
cv::Matx33f camera_matrix(fx, 0, cx, 0, fy, cy, 0, 0, 1);
cv::Vec3f vec_trans(X, Y, Z);
cv::Vec3f vec_rot(clnf_model.params_global[1], clnf_model.params_global[2], clnf_model.params_global[3]);
cv::solvePnP(landmarks_3D, landmarks_2D, camera_matrix, cv::Mat(), vec_rot, vec_trans, true);
// Here we correct for the camera orientation, for this need to determine the angle the camera makes with the head pose
float z_x = cv::sqrt(vec_trans[0] * vec_trans[0] + vec_trans[2] * vec_trans[2]);
float eul_x = atan2(vec_trans[1], z_x);
float z_y = cv::sqrt(vec_trans[1] * vec_trans[1] + vec_trans[2] * vec_trans[2]);
float eul_y = -atan2(vec_trans[0], z_y);
cv::Matx33f camera_rotation = Utilities::Euler2RotationMatrix(cv::Vec3f(eul_x, eul_y, 0));
cv::Matx33f head_rotation = Utilities::AxisAngle2RotationMatrix(vec_rot);
cv::Matx33f corrected_rotation = camera_rotation * head_rotation;
cv::Vec3f euler_corrected = Utilities::RotationMatrix2Euler(corrected_rotation);
return cv::Vec6f(vec_trans[0], vec_trans[1], vec_trans[2], euler_corrected[0], euler_corrected[1], euler_corrected[2]);
}
else
{
return cv::Vec6f(0, 0, 0, 0, 0, 0);
}
}
// If landmark detection in video succeeded create a template for use in simple tracking
void UpdateTemplate(const cv::Mat_<uchar> &grayscale_image, CLNF& clnf_model)
{
cv::Rect_<float> bounding_box;
clnf_model.pdm.CalcBoundingBox(bounding_box, clnf_model.params_global, clnf_model.params_local);
// Make sure the box is not out of bounds
cv::Rect_<int> bbox_tmp((int)bounding_box.x, (int)bounding_box.y, (int)bounding_box.width, (int)bounding_box.height);
bounding_box = bbox_tmp & cv::Rect(0, 0, grayscale_image.cols, grayscale_image.rows);
clnf_model.face_template = grayscale_image(bounding_box).clone();
}
// This method uses basic template matching in order to allow for better tracking of fast moving faces
void CorrectGlobalParametersVideo(const cv::Mat_<uchar> &grayscale_image, CLNF& clnf_model, const FaceModelParameters& params)
{
cv::Rect_<float> init_box;
clnf_model.pdm.CalcBoundingBox(init_box, clnf_model.params_global, clnf_model.params_local);
cv::Rect roi(init_box.x - init_box.width/2, init_box.y - init_box.height/2, init_box.width * 2, init_box.height * 2);
roi = roi & cv::Rect(0, 0, grayscale_image.cols, grayscale_image.rows);
int off_x = roi.x;
int off_y = roi.y;
float scaling = params.face_template_scale / clnf_model.params_global[0];
cv::Mat_<uchar> image;
if(scaling < 1)
{
cv::resize(clnf_model.face_template, clnf_model.face_template, cv::Size(), scaling, scaling);
cv::resize(grayscale_image(roi), image, cv::Size(), scaling, scaling);
}
else
{
scaling = 1;
image = grayscale_image(roi).clone();
}
// Resizing the template
cv::Mat corr_out;
cv::matchTemplate(image, clnf_model.face_template, corr_out, cv::TM_CCOEFF_NORMED);
// Actually matching it
//double min, max;
int max_loc[2];
cv::minMaxIdx(corr_out, NULL, NULL, NULL, max_loc);
cv::Rect_<float> out_bbox(max_loc[1]/scaling + off_x, max_loc[0]/scaling + off_y, clnf_model.face_template.rows / scaling, clnf_model.face_template.cols / scaling);
float shift_x = out_bbox.x - init_box.x;
float shift_y = out_bbox.y - init_box.y;
clnf_model.params_global[4] = clnf_model.params_global[4] + shift_x;
clnf_model.params_global[5] = clnf_model.params_global[5] + shift_y;
}
bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat &rgb_image, CLNF& clnf_model, FaceModelParameters& params, cv::Mat& grayscale_image)
{
// First need to decide if the landmarks should be "detected" or "tracked"
// Detected means running face detection and a larger search area, tracked means initialising from previous step
// and using a smaller search area
if(grayscale_image.empty())
{
Utilities::ConvertToGrayscale_8bit(rgb_image, grayscale_image);
}
// Indicating that this is a first detection in video sequence or after restart
bool initial_detection = !clnf_model.tracking_initialised;
// Only do it if there was a face detection at all
if(clnf_model.tracking_initialised)
{
// The area of interest search size will depend if the previous track was successful
if(!clnf_model.detection_success)
{
params.window_sizes_current = params.window_sizes_init;
}
else
{
params.window_sizes_current = params.window_sizes_small;
}
// Before the expensive landmark detection step apply a quick template tracking approach
if(params.use_face_template && !clnf_model.face_template.empty() && clnf_model.detection_success)
{
CorrectGlobalParametersVideo(grayscale_image, clnf_model, params);
}
bool track_success = clnf_model.DetectLandmarks(grayscale_image, params);
if(!track_success)
{
// Make a record that tracking failed
clnf_model.failures_in_a_row++;
}
else
{
// indicate that tracking is a success
clnf_model.failures_in_a_row = -1;
if(params.use_face_template)
{
UpdateTemplate(grayscale_image, clnf_model);
}
}
}
// This is used for both detection (if it the tracking has not been initialised yet) or if the tracking failed (however we do this every n frames, for speed)
// This also has the effect of an attempt to reinitialise just after the tracking has failed, which is useful during large motions
if((!clnf_model.tracking_initialised && (clnf_model.failures_in_a_row + 1) % (params.reinit_video_every * 6) == 0)
|| (clnf_model.tracking_initialised && !clnf_model.detection_success && params.reinit_video_every > 0 && clnf_model.failures_in_a_row % params.reinit_video_every == 0))
{
cv::Rect_<float> bounding_box;
// If the face detector has not been initialised and we're using it, then read it in
if(clnf_model.face_detector_HAAR.empty() && params.curr_face_detector == params.HAAR_DETECTOR)
{
clnf_model.face_detector_HAAR.load(params.haar_face_detector_location);
clnf_model.haar_face_detector_location = params.haar_face_detector_location;
}
if (clnf_model.face_detector_MTCNN.empty() && params.curr_face_detector == params.MTCNN_DETECTOR)
{
clnf_model.face_detector_MTCNN.Read(params.mtcnn_face_detector_location);
clnf_model.mtcnn_face_detector_location = params.mtcnn_face_detector_location;
// If the model is still empty default to HOG
if (clnf_model.face_detector_MTCNN.empty())
{
std::cout << "INFO: defaulting to HOG-SVM face detector" << std::endl;
params.curr_face_detector = LandmarkDetector::FaceModelParameters::HOG_SVM_DETECTOR;
}
}
cv::Point preference_det(-1, -1);
if(clnf_model.preference_det.x != -1 && clnf_model.preference_det.y != -1)
{
preference_det.x = clnf_model.preference_det.x * grayscale_image.cols;
preference_det.y = clnf_model.preference_det.y * grayscale_image.rows;
clnf_model.preference_det = cv::Point(-1, -1);
}
bool face_detection_success;
if(params.curr_face_detector == FaceModelParameters::HOG_SVM_DETECTOR)
{
float confidence;
face_detection_success = LandmarkDetector::DetectSingleFaceHOG(bounding_box, grayscale_image, clnf_model.face_detector_HOG, confidence, preference_det);
}
else if(params.curr_face_detector == FaceModelParameters::HAAR_DETECTOR)
{
face_detection_success = LandmarkDetector::DetectSingleFace(bounding_box, grayscale_image, clnf_model.face_detector_HAAR, preference_det);
}
else if (params.curr_face_detector == FaceModelParameters::MTCNN_DETECTOR)
{
float confidence;
face_detection_success = LandmarkDetector::DetectSingleFaceMTCNN(bounding_box, rgb_image, clnf_model.face_detector_MTCNN, confidence, preference_det);
}
// Attempt to detect landmarks using the detected face (if unseccessful the detection will be ignored)
if(face_detection_success)
{
// Indicate that tracking has started as a face was detected
clnf_model.tracking_initialised = true;
// Keep track of old model values so that they can be restored if redetection fails
cv::Vec6f params_global_init = clnf_model.params_global;
cv::Mat_<float> params_local_init = clnf_model.params_local.clone();
float likelihood_init = clnf_model.model_likelihood;
cv::Mat_<float> detected_landmarks_init = clnf_model.detected_landmarks.clone();
cv::Mat_<float> landmark_likelihoods_init = clnf_model.landmark_likelihoods.clone();
// Use the detected bounding box and empty local parameters
clnf_model.params_local.setTo(0);
clnf_model.pdm.CalcParams(clnf_model.params_global, bounding_box, clnf_model.params_local);
// Make sure the search size is large
params.window_sizes_current = params.window_sizes_init;
// TODO rem (should the multi-hyp version be only for CEN and not CLNF?), otherwise poss too slow, and poss not accurate
//bool landmark_detection_success = clnf_model.DetectLandmarks(grayscale_image, params);
// Do the actual landmark detection (and keep it only if successful)
// Perform multi-hypothesis detection here (as face detector can pick up multiple of them)
params.multi_view = true;
bool landmark_detection_success = DetectLandmarksInImage(rgb_image, bounding_box, clnf_model, params, grayscale_image);
params.multi_view = false;
// If landmark reinitialisation unsucessful continue from previous estimates
// if it's initial detection however, do not care if it was successful as the validator might be wrong, so continue trackig
// regardless
if(!initial_detection && !landmark_detection_success)
{
// Restore previous estimates
clnf_model.params_global = params_global_init;
clnf_model.params_local = params_local_init.clone();
clnf_model.pdm.CalcShape2D(clnf_model.detected_landmarks, clnf_model.params_local, clnf_model.params_global);
clnf_model.model_likelihood = likelihood_init;
clnf_model.detected_landmarks = detected_landmarks_init.clone();
clnf_model.landmark_likelihoods = landmark_likelihoods_init.clone();
return false;
}
else
{
clnf_model.failures_in_a_row = -1;
if(params.use_face_template)
{
UpdateTemplate(grayscale_image, clnf_model);
}
return true;
}
}
}
// if the model has not been initialised yet class it as a failure
if(!clnf_model.tracking_initialised)
{
clnf_model.failures_in_a_row++;
}
// un-initialise the tracking
if( clnf_model.failures_in_a_row > 100)
{
clnf_model.tracking_initialised = false;
}
return clnf_model.detection_success;
}
bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat &rgb_image, const cv::Rect_<double> bounding_box, CLNF& clnf_model, FaceModelParameters& params, cv::Mat &grayscale_image)
{
if(bounding_box.width > 0)
{
// calculate the local and global parameters from the generated 2D shape (mapping from the 2D to 3D because camera params are unknown)
clnf_model.params_local.setTo(0);
clnf_model.pdm.CalcParams(clnf_model.params_global, bounding_box, clnf_model.params_local);
// indicate that face was detected so initialisation is not necessary
clnf_model.tracking_initialised = true;
}
return DetectLandmarksInVideo(rgb_image, clnf_model, params, grayscale_image);
}
//================================================================================================================
// Landmark detection in image, need to provide an image and optionally CLNF model together with parameters (default values work well)
// Optionally can provide a bounding box in which detection is performed (this is useful if multiple faces are to be detected in images)
//================================================================================================================
bool DetectLandmarksInImageMultiHypBasic(const cv::Mat_<uchar> &grayscale_image, std::vector<cv::Vec3d> rotation_hypotheses,
const cv::Rect_<double> bounding_box, CLNF& clnf_model, FaceModelParameters& params)
{
// Use the initialisation size for the landmark detection
params.window_sizes_current = params.window_sizes_init;
// Store the current best estimate
float best_likelihood;
float best_detection_certainty;
cv::Vec6f best_global_parameters;
cv::Mat_<float> best_local_parameters;
cv::Mat_<float> best_detected_landmarks;
cv::Mat_<float> best_landmark_likelihoods;
bool best_success;
// The hierarchical model parameters
std::vector<float> best_likelihood_h(clnf_model.hierarchical_models.size());
std::vector<cv::Vec6f> best_global_parameters_h(clnf_model.hierarchical_models.size());
std::vector<cv::Mat_<float>> best_local_parameters_h(clnf_model.hierarchical_models.size());
std::vector<cv::Mat_<float>> best_detected_landmarks_h(clnf_model.hierarchical_models.size());
std::vector<cv::Mat_<float>> best_landmark_likelihoods_h(clnf_model.hierarchical_models.size());
for (size_t hypothesis = 0; hypothesis < rotation_hypotheses.size(); ++hypothesis)
{
// Reset the potentially set clnf_model parameters
clnf_model.params_local.setTo(0.0);
for (size_t part = 0; part < clnf_model.hierarchical_models.size(); ++part)
{
clnf_model.hierarchical_models[part].params_local.setTo(0.0);
}
// calculate the local and global parameters from the generated 2D shape (mapping from the 2D to 3D because camera params are unknown)
clnf_model.pdm.CalcParams(clnf_model.params_global, bounding_box, clnf_model.params_local, rotation_hypotheses[hypothesis]);
bool success = clnf_model.DetectLandmarks(grayscale_image, params);
if (hypothesis == 0 || best_likelihood < clnf_model.model_likelihood)
{
best_likelihood = clnf_model.model_likelihood;
best_global_parameters = clnf_model.params_global;
best_local_parameters = clnf_model.params_local.clone();
best_detected_landmarks = clnf_model.detected_landmarks.clone();
best_landmark_likelihoods = clnf_model.landmark_likelihoods.clone();
best_detection_certainty = clnf_model.detection_certainty;
best_success = success;
for (size_t part = 0; part < clnf_model.hierarchical_models.size(); ++part)
{
best_likelihood_h[part] = clnf_model.hierarchical_models[part].model_likelihood;
best_global_parameters_h[part] = clnf_model.hierarchical_models[part].params_global;
best_local_parameters_h[part] = clnf_model.hierarchical_models[part].params_local.clone();
best_detected_landmarks_h[part] = clnf_model.hierarchical_models[part].detected_landmarks.clone();
best_landmark_likelihoods_h[part] = clnf_model.hierarchical_models[part].landmark_likelihoods.clone();
}
}
}
// Store the best estimates in the clnf_model
clnf_model.model_likelihood = best_likelihood;
clnf_model.params_global = best_global_parameters;
clnf_model.params_local = best_local_parameters.clone();
clnf_model.detected_landmarks = best_detected_landmarks.clone();
clnf_model.detection_success = best_success;
clnf_model.landmark_likelihoods = best_landmark_likelihoods.clone();
clnf_model.detection_certainty = best_detection_certainty;
for (size_t part = 0; part < clnf_model.hierarchical_models.size(); ++part)
{
clnf_model.hierarchical_models[part].params_global = best_global_parameters_h[part];
clnf_model.hierarchical_models[part].params_local = best_local_parameters_h[part].clone();
clnf_model.hierarchical_models[part].detected_landmarks = best_detected_landmarks_h[part].clone();
clnf_model.hierarchical_models[part].landmark_likelihoods = best_landmark_likelihoods_h[part].clone();
}
return best_success;
}
// Helper index sorting function
template <typename T> std::vector<size_t> sort_indexes(const std::vector<T> &v) {
// initialize original index locations
std::vector<size_t> idx(v.size());
std::iota(idx.begin(), idx.end(), 0);
// sort indexes based on comparing values in v
std::sort(idx.begin(), idx.end(),
[&v](size_t i1, size_t i2) {return v[i1] > v[i2]; });
return idx;
}
bool DetectLandmarksInImageMultiHypEarlyTerm(const cv::Mat_<uchar> &grayscale_image, std::vector<cv::Vec3d> rotation_hypotheses,
const cv::Rect_<double> bounding_box, CLNF& clnf_model, FaceModelParameters& params)
{
FaceModelParameters old_params(params);
// Use the initialisation size for the landmark detection
params.window_sizes_current = params.window_sizes_init;
bool early_term = false;
// Setup the parameters accordingly
// Only do the first iteration
for (size_t i = 1; i < params.window_sizes_current.size(); ++i)
{
params.window_sizes_current[i] = 0;
}
params.refine_hierarchical = false;
params.validate_detections = false;
bool success = false;
// Keeping track of converges
std::vector<float> likelihoods;
std::vector<cv::Vec6f> global_parameters;
std::vector<cv::Mat_<float>> local_parameters;
for (size_t hypothesis = 0; hypothesis < rotation_hypotheses.size(); ++hypothesis)
{
// Reset the potentially set clnf_model parameters
clnf_model.params_local.setTo(0.0);
for (size_t part = 0; part < clnf_model.hierarchical_models.size(); ++part)
{
clnf_model.hierarchical_models[part].params_local.setTo(0.0);
}
// calculate the local and global parameters from the generated 2D shape (mapping from the 2D to 3D because camera params are unknown)
clnf_model.pdm.CalcParams(clnf_model.params_global, bounding_box, clnf_model.params_local, rotation_hypotheses[hypothesis]);
// Perform landmark detection in first scale
clnf_model.DetectLandmarks(grayscale_image, params);
float lhood = clnf_model.model_likelihood * clnf_model.patch_experts.early_term_weights[clnf_model.view_used] + clnf_model.patch_experts.early_term_biases[clnf_model.view_used];
// If likelihood higher than cutoff continue on this model
if (lhood > clnf_model.patch_experts.early_term_cutoffs[clnf_model.view_used])
{
params.refine_hierarchical = old_params.refine_hierarchical;
params.window_sizes_current = params.window_sizes_init;
params.window_sizes_current[0] = 0;
params.validate_detections = old_params.validate_detections;
success = clnf_model.DetectLandmarks(grayscale_image, params);
early_term = true;
break;
}
else
{
likelihoods.push_back(lhood);
global_parameters.push_back(clnf_model.params_global);
local_parameters.push_back(clnf_model.params_local);
}
}
if (!early_term)
{
// Store the current best estimate
float best_likelihood;
cv::Vec6f best_global_parameters;
cv::Mat_<float> best_local_parameters;
cv::Mat_<float> best_detected_landmarks;
cv::Mat_<float> best_landmark_likelihoods;
bool best_success;
// The hierarchical model parameters
std::vector<float> best_likelihood_h(clnf_model.hierarchical_models.size());
std::vector<cv::Vec6f> best_global_parameters_h(clnf_model.hierarchical_models.size());
std::vector<cv::Mat_<float>> best_local_parameters_h(clnf_model.hierarchical_models.size());
std::vector<cv::Mat_<float>> best_detected_landmarks_h(clnf_model.hierarchical_models.size());
std::vector<cv::Mat_<float>> best_landmark_likelihoods_h(clnf_model.hierarchical_models.size());
// Sort the likelihoods and pick the best top 3 models
std::vector<size_t> indices = sort_indexes(likelihoods);
// Pick 3 best hypotheses and complete them
size_t max = indices.size() >= 3 ? 3 : indices.size();
params.refine_hierarchical = old_params.refine_hierarchical;
params.window_sizes_current = params.window_sizes_init;
params.window_sizes_current[0] = 0;
params.validate_detections = old_params.validate_detections;
for (size_t i = 0; i < max; ++i)
{
// Reset the potentially set clnf_model parameters
clnf_model.params_local = local_parameters[indices[i]];
clnf_model.params_global = global_parameters[indices[i]];
for (size_t part = 0; part < clnf_model.hierarchical_models.size(); ++part)
{
clnf_model.hierarchical_models[part].params_local.setTo(0.0);
}
// Perform landmark detection in first scale
success = clnf_model.DetectLandmarks(grayscale_image, params);
if (i == 0 || best_likelihood < clnf_model.model_likelihood)
{
best_likelihood = clnf_model.model_likelihood;
best_global_parameters = clnf_model.params_global;
best_local_parameters = clnf_model.params_local.clone();
best_detected_landmarks = clnf_model.detected_landmarks.clone();
best_landmark_likelihoods = clnf_model.landmark_likelihoods.clone();
best_success = success;
for (size_t part = 0; part < clnf_model.hierarchical_models.size(); ++part)
{
best_likelihood_h[part] = clnf_model.hierarchical_models[part].model_likelihood;
best_global_parameters_h[part] = clnf_model.hierarchical_models[part].params_global;
best_local_parameters_h[part] = clnf_model.hierarchical_models[part].params_local.clone();
best_detected_landmarks_h[part] = clnf_model.hierarchical_models[part].detected_landmarks.clone();
best_landmark_likelihoods_h[part] = clnf_model.hierarchical_models[part].landmark_likelihoods.clone();
}
}
}
// Store the best estimates in the clnf_model
clnf_model.model_likelihood = best_likelihood;
clnf_model.params_global = best_global_parameters;
clnf_model.params_local = best_local_parameters.clone();
clnf_model.detected_landmarks = best_detected_landmarks.clone();
clnf_model.detection_success = best_success;
clnf_model.landmark_likelihoods = best_landmark_likelihoods.clone();
for (size_t part = 0; part < clnf_model.hierarchical_models.size(); ++part)
{
clnf_model.hierarchical_models[part].params_global = best_global_parameters_h[part];
clnf_model.hierarchical_models[part].params_local = best_local_parameters_h[part].clone();
clnf_model.hierarchical_models[part].detected_landmarks = best_detected_landmarks_h[part].clone();
clnf_model.hierarchical_models[part].landmark_likelihoods = best_landmark_likelihoods_h[part].clone();
}
}
params = old_params;
return success;
}
// This is the one where the actual work gets done, other DetectLandmarksInImage calls lead to this one
bool LandmarkDetector::DetectLandmarksInImage(const cv::Mat &rgb_image, const cv::Rect_<double> bounding_box, CLNF& clnf_model, FaceModelParameters& params, cv::Mat &grayscale_image)
{
if (grayscale_image.empty())
{
Utilities::ConvertToGrayscale_8bit(rgb_image, grayscale_image);
}
// Can have multiple hypotheses
std::vector<cv::Vec3d> rotation_hypotheses;
if(params.multi_view)
{
// Try out different orientation initialisations
// It is possible to add other orientation hypotheses easilly by just pushing to this vector
rotation_hypotheses.push_back(cv::Vec3d(0,0,0));
rotation_hypotheses.push_back(cv::Vec3d(0, -0.5236, 0));
rotation_hypotheses.push_back(cv::Vec3d(0, 0.5236,0));
rotation_hypotheses.push_back(cv::Vec3d(0, -0.96, 0));
rotation_hypotheses.push_back(cv::Vec3d(0, 0.96, 0));
rotation_hypotheses.push_back(cv::Vec3d(0, 0, 0.5236));
rotation_hypotheses.push_back(cv::Vec3d(0, 0, -0.5236));
rotation_hypotheses.push_back(cv::Vec3d(0, -1.57, 0));
rotation_hypotheses.push_back(cv::Vec3d(0, 1.57, 0));
rotation_hypotheses.push_back(cv::Vec3d(0, -1.22, 0.698));
rotation_hypotheses.push_back(cv::Vec3d(0, 1.22, -0.698));
}
else
{
// Assume the face is close to frontal
rotation_hypotheses.push_back(cv::Vec3d(0,0,0));
}
bool success;
// Either use basic multi-hypothesis testing or clever testing if early termination parameters are present
if(clnf_model.patch_experts.early_term_biases.size() == 0)
{
success = DetectLandmarksInImageMultiHypBasic(grayscale_image, rotation_hypotheses, bounding_box, clnf_model, params);
}
else
{
success = DetectLandmarksInImageMultiHypEarlyTerm(grayscale_image, rotation_hypotheses, bounding_box, clnf_model, params);
}
return success;
}
bool LandmarkDetector::DetectLandmarksInImage(const cv::Mat &rgb_image, CLNF& clnf_model, FaceModelParameters& params, cv::Mat &grayscale_image)
{
if (grayscale_image.empty())
{
Utilities::ConvertToGrayscale_8bit(rgb_image, grayscale_image);
}
cv::Rect_<float> bounding_box;
// If the face detector has not been initialised read it in
if(clnf_model.face_detector_HAAR.empty() && params.curr_face_detector == FaceModelParameters::HAAR_DETECTOR)
{
clnf_model.face_detector_HAAR.load(params.haar_face_detector_location);
clnf_model.haar_face_detector_location = params.haar_face_detector_location;
}
if (clnf_model.face_detector_MTCNN.empty() && params.curr_face_detector == FaceModelParameters::MTCNN_DETECTOR)
{
clnf_model.face_detector_MTCNN.Read(params.mtcnn_face_detector_location);
clnf_model.mtcnn_face_detector_location = params.mtcnn_face_detector_location;
// If the model is still empty default to HOG
if (clnf_model.face_detector_MTCNN.empty())
{
std::cout << "INFO: defaulting to HOG-SVM face detector" << std::endl;
params.curr_face_detector = LandmarkDetector::FaceModelParameters::HOG_SVM_DETECTOR;
}
}
// Detect the face first
if(params.curr_face_detector == FaceModelParameters::HOG_SVM_DETECTOR)
{
float confidence;
LandmarkDetector::DetectSingleFaceHOG(bounding_box, grayscale_image, clnf_model.face_detector_HOG, confidence);
}
else if(params.curr_face_detector == FaceModelParameters::HAAR_DETECTOR)
{
LandmarkDetector::DetectSingleFace(bounding_box, rgb_image, clnf_model.face_detector_HAAR);
}
else if (params.curr_face_detector == FaceModelParameters::MTCNN_DETECTOR)
{
float confidence;
LandmarkDetector::DetectSingleFaceMTCNN(bounding_box, rgb_image, clnf_model.face_detector_MTCNN, confidence);
}
if(bounding_box.width == 0)
{
return false;
}
else
{
return DetectLandmarksInImage(rgb_image, bounding_box, clnf_model, params, grayscale_image);
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,340 @@
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2017, Carnegie Mellon University and University of Cambridge,
// all rights reserved.
//
// ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
//
// BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT.
// IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
//
// License can be found in OpenFace-license.txt
//
// * Any publications arising from the use of this software, including but
// not limited to academic journal and conference publications, technical
// reports and manuals, must cite at least one of the following works:
//
// OpenFace 2.0: Facial Behavior Analysis Toolkit
// Tadas Baltrušaitis, Amir Zadeh, Yao Chong Lim, and Louis-Philippe Morency
// in IEEE International Conference on Automatic Face and Gesture Recognition, 2018
//
// Convolutional experts constrained local model for facial landmark detection.
// A. Zadeh, T. Baltrušaitis, and Louis-Philippe Morency,
// in Computer Vision and Pattern Recognition Workshops, 2017.
//
// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation
// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling
// in IEEE International. Conference on Computer Vision (ICCV), 2015
//
// Cross-dataset learning and person-specific normalisation for automatic Action Unit detection
// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson
// in Facial Expression Recognition and Analysis Challenge,
// IEEE International Conference on Automatic Face and Gesture Recognition, 2015
//
///////////////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "LandmarkDetectorParameters.h"
// System includes
#include <sstream>
#include <iostream>
#include <cstdlib>
#ifndef CONFIG_DIR
#define CONFIG_DIR "~"
#endif
using namespace LandmarkDetector;
FaceModelParameters::FaceModelParameters()
{
// initialise the default values
init();
check_model_path();
}
FaceModelParameters::FaceModelParameters(std::vector<std::string> &arguments)
{
// initialise the default values
init();
// First element is reserved for the executable location (useful for finding relative model locs)
fs::path root = fs::path(arguments[0]).parent_path();
bool* valid = new bool[arguments.size()];
valid[0] = true;
for (size_t i = 1; i < arguments.size(); ++i)
{
valid[i] = true;
if (arguments[i].compare("-mloc") == 0)
{
std::string model_loc = arguments[i + 1];
model_location = model_loc;
valid[i] = false;
valid[i + 1] = false;
i++;
}
if (arguments[i].compare("-fdloc") ==0)
{
std::string face_detector_loc = arguments[i + 1];
haar_face_detector_location = face_detector_loc;
curr_face_detector = HAAR_DETECTOR;
valid[i] = false;
valid[i + 1] = false;
i++;
}
if (arguments[i].compare("-sigma") == 0)
{
std::stringstream data(arguments[i + 1]);
data >> sigma;
valid[i] = false;
valid[i + 1] = false;
i++;
}
else if (arguments[i].compare("-w_reg") == 0)
{
std::stringstream data(arguments[i + 1]);
data >> weight_factor;
valid[i] = false;
valid[i + 1] = false;
i++;
}
else if (arguments[i].compare("-reg") == 0)
{
std::stringstream data(arguments[i + 1]);
data >> reg_factor;
valid[i] = false;
valid[i + 1] = false;
i++;
}
else if (arguments[i].compare("-multi_view") == 0)
{
std::stringstream data(arguments[i + 1]);
int m_view;
data >> m_view;
multi_view = (bool)(m_view != 0);
valid[i] = false;
valid[i + 1] = false;
i++;
}
else if (arguments[i].compare("-validate_detections") == 0)
{
std::stringstream data(arguments[i + 1]);
int v_det;
data >> v_det;
validate_detections = (bool)(v_det != 0);
valid[i] = false;
valid[i + 1] = false;
i++;
}
else if (arguments[i].compare("-n_iter") == 0)
{
std::stringstream data(arguments[i + 1]);
data >> num_optimisation_iteration;
valid[i] = false;
valid[i + 1] = false;
i++;
}
else if (arguments[i].compare("-wild") == 0)
{
// For in the wild fitting these parameters are suitable
window_sizes_init = std::vector<int>(4);
window_sizes_init[0] = 15; window_sizes_init[1] = 13; window_sizes_init[2] = 11; window_sizes_init[3] = 11;
sigma = 1.25;
reg_factor = 35;
weight_factor = 2.5;
num_optimisation_iteration = 10;
valid[i] = false;
// For in-the-wild images use an in-the wild detector
curr_face_detector = MTCNN_DETECTOR;
// Use multi-view hypotheses if in-the-wild setting
multi_view = true;
}
}
for (int i = (int)arguments.size() - 1; i >= 0; --i)
{
if (!valid[i])
{
arguments.erase(arguments.begin() + i);
}
}
// Make sure model_location is valid
// First check working directory, then the executable's directory, then the config path set by the build process.
fs::path config_path = fs::path(CONFIG_DIR);
fs::path model_path = fs::path(model_location);
if (fs::exists(model_path))
{
model_location = model_path.string();
}
else if (fs::exists(root/model_path))
{
model_location = (root/model_path).string();
}
else if (fs::exists(config_path/model_path))
{
model_location = (config_path/model_path).string();
}
else
{
std::cout << "Could not find the landmark detection model to load" << std::endl;
}
if (model_path.stem().string().compare("main_ceclm_general") == 0)
{
curr_landmark_detector = CECLM_DETECTOR;
sigma = 1.5f * sigma;
reg_factor = 0.9f * reg_factor;
}
else if (model_path.stem().string().compare("main_clnf_general") == 0)
{
curr_landmark_detector = CLNF_DETECTOR;
}
else if (model_path.stem().string().compare("main_clm_general") == 0)
{
curr_landmark_detector = CLM_DETECTOR;
}
// Make sure face detector location is valid
// First check working directory, then the executable's directory, then the config path set by the build process.
model_path = fs::path(haar_face_detector_location);
if (fs::exists(model_path))
{
haar_face_detector_location = model_path.string();
}
else if (fs::exists(root / model_path))
{
haar_face_detector_location = (root / model_path).string();
}
else if (fs::exists(config_path / model_path))
{
haar_face_detector_location = (config_path / model_path).string();
}
else
{
std::cout << "Could not find the HAAR face detector location" << std::endl;
}
// Make sure face detector location is valid
// First check working directory, then the executable's directory, then the config path set by the build process.
model_path = fs::path(mtcnn_face_detector_location);
if (fs::exists(model_path))
{
mtcnn_face_detector_location = model_path.string();
}
else if (fs::exists(root / model_path))
{
mtcnn_face_detector_location = (root / model_path).string();
}
else if (fs::exists(config_path / model_path))
{
mtcnn_face_detector_location = (config_path / model_path).string();
}
else
{
std::cout << "Could not find the MTCNN face detector location" << std::endl;
}
check_model_path(root.string());
}
void FaceModelParameters::check_model_path(const std::string& root)
{
// Make sure model_location is valid
// First check working directory, then the executable's directory, then the config path set by the build process.
fs::path config_path = fs::path(CONFIG_DIR);
fs::path model_path = fs::path(model_location);
fs::path root_path = fs::path(root);
if (fs::exists(model_path))
{
model_location = model_path.string();
}
else if (fs::exists(root_path / model_path))
{
model_location = (root_path / model_path).string();
}
else if (fs::exists(config_path / model_path))
{
model_location = (config_path / model_path).string();
}
else
{
std::cout << "Could not find the landmark detection model to load" << std::endl;
}
}
void FaceModelParameters::init()
{
// number of iterations that will be performed at each scale
num_optimisation_iteration = 5;
// using an external face checker based on SVM
validate_detections = true;
// Using hierarchical refinement by default (can be turned off)
refine_hierarchical = true;
// Refining parameters by default
refine_parameters = true;
window_sizes_small = std::vector<int>(4);
window_sizes_init = std::vector<int>(4);
// For fast tracking
window_sizes_small[0] = 0;
window_sizes_small[1] = 9;
window_sizes_small[2] = 7;
window_sizes_small[3] = 0;
// Just for initialisation
window_sizes_init.at(0) = 11;
window_sizes_init.at(1) = 9;
window_sizes_init.at(2) = 7;
window_sizes_init.at(3) = 5;
face_template_scale = 0.3f;
// Off by default (as it might lead to some slight inaccuracies in slowly moving faces)
use_face_template = false;
// For first frame use the initialisation
window_sizes_current = window_sizes_init;
model_location = "model/main_ceclm_general.txt";
curr_landmark_detector = CECLM_DETECTOR;
sigma = 1.5f;
reg_factor = 25.0f;
weight_factor = 0.0f; // By default do not use NU-RLMS for videos as it does not work as well for them
validation_boundary = 0.725f;
limit_pose = true;
multi_view = false;
reinit_video_every = 2;
// Face detection
haar_face_detector_location = "classifiers/haarcascade_frontalface_alt.xml";
mtcnn_face_detector_location = "model/mtcnn_detector/MTCNN_detector.txt";
// By default use MTCNN
curr_face_detector = MTCNN_DETECTOR;
}

View File

@@ -0,0 +1,925 @@
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2017, Carnegie Mellon University and University of Cambridge,
// all rights reserved.
//
// ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
//
// BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT.
// IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
//
// License can be found in OpenFace-license.txt
//
// * Any publications arising from the use of this software, including but
// not limited to academic journal and conference publications, technical
// reports and manuals, must cite at least one of the following works:
//
// OpenFace 2.0: Facial Behavior Analysis Toolkit
// Tadas Baltrušaitis, Amir Zadeh, Yao Chong Lim, and Louis-Philippe Morency
// in IEEE International Conference on Automatic Face and Gesture Recognition, 2018
//
// Convolutional experts constrained local model for facial landmark detection.
// A. Zadeh, T. Baltrušaitis, and Louis-Philippe Morency,
// in Computer Vision and Pattern Recognition Workshops, 2017.
//
// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation
// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling
// in IEEE International. Conference on Computer Vision (ICCV), 2015
//
// Cross-dataset learning and person-specific normalisation for automatic Action Unit detection
// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson
// in Facial Expression Recognition and Analysis Challenge,
// IEEE International Conference on Automatic Face and Gesture Recognition, 2015
//
///////////////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include <LandmarkDetectorUtils.h>
#include <RotationHelpers.h>
// OpenCV includes
#include <opencv2/core/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/calib3d.hpp>
namespace LandmarkDetector
{
//===========================================================================
// Fast patch expert response computation (linear model across a ROI) using normalised cross-correlation
//===========================================================================
void crossCorr_m(const cv::Mat_<float>& img, cv::Mat_<double>& img_dft, const cv::Mat_<float>& _templ,
std::map<int, cv::Mat_<double> >& _templ_dfts, cv::Mat_<float>& corr)
{
// Our model will always be under min block size so can ignore this
//const double blockScale = 4.5;
//const int minBlockSize = 256;
int maxDepth = CV_64F;
cv::Size dftsize;
dftsize.width = cv::getOptimalDFTSize(corr.cols + _templ.cols - 1);
dftsize.height = cv::getOptimalDFTSize(corr.rows + _templ.rows - 1);
// Compute block size
cv::Size blocksize;
blocksize.width = dftsize.width - _templ.cols + 1;
blocksize.width = MIN(blocksize.width, corr.cols);
blocksize.height = dftsize.height - _templ.rows + 1;
blocksize.height = MIN(blocksize.height, corr.rows);
cv::Mat_<double> dftTempl;
// if this has not been precomputed, precompute it, otherwise use it
if (_templ_dfts.find(dftsize.width) == _templ_dfts.end())
{
dftTempl.create(dftsize.height, dftsize.width);
cv::Mat_<float> src = _templ;
cv::Mat_<double> dst(dftTempl, cv::Rect(0, 0, dftsize.width, dftsize.height));
cv::Mat_<double> dst1(dftTempl, cv::Rect(0, 0, _templ.cols, _templ.rows));
if (dst1.data != src.data)
src.convertTo(dst1, dst1.depth());
if (dst.cols > _templ.cols)
{
cv::Mat_<double> part(dst, cv::Range(0, _templ.rows), cv::Range(_templ.cols, dst.cols));
part.setTo(0);
}
// Perform DFT of the template
dft(dst, dst, 0, _templ.rows);
_templ_dfts[dftsize.width] = dftTempl;
}
else
{
// use the precomputed version
dftTempl = _templ_dfts.find(dftsize.width)->second;
}
cv::Size bsz(std::min(blocksize.width, corr.cols), std::min(blocksize.height, corr.rows));
cv::Mat src;
cv::Mat cdst(corr, cv::Rect(0, 0, bsz.width, bsz.height));
cv::Mat_<double> dftImg;
if (img_dft.empty())
{
dftImg.create(dftsize);
dftImg.setTo(0.0);
cv::Size dsz(bsz.width + _templ.cols - 1, bsz.height + _templ.rows - 1);
int x2 = std::min(img.cols, dsz.width);
int y2 = std::min(img.rows, dsz.height);
cv::Mat src0(img, cv::Range(0, y2), cv::Range(0, x2));
cv::Mat dst(dftImg, cv::Rect(0, 0, dsz.width, dsz.height));
cv::Mat dst1(dftImg, cv::Rect(0, 0, x2, y2));
src = src0;
if (dst1.data != src.data)
src.convertTo(dst1, dst1.depth());
dft(dftImg, dftImg, 0, dsz.height);
img_dft = dftImg.clone();
}
cv::Mat dftTempl1(dftTempl, cv::Rect(0, 0, dftsize.width, dftsize.height));
cv::mulSpectrums(img_dft, dftTempl1, dftImg, 0, true);
cv::dft(dftImg, dftImg, cv::DFT_INVERSE + cv::DFT_SCALE, bsz.height);
src = dftImg(cv::Rect(0, 0, bsz.width, bsz.height));
src.convertTo(cdst, CV_32F);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
void matchTemplate_m(const cv::Mat_<float>& input_img, cv::Mat_<double>& img_dft, cv::Mat& _integral_img,
cv::Mat& _integral_img_sq, const cv::Mat_<float>& templ, std::map<int, cv::Mat_<double> >& templ_dfts,
cv::Mat_<float>& result, int method)
{
int numType = method == cv::TM_CCORR || method == cv::TM_CCORR_NORMED ? 0 :
method == cv::TM_CCOEFF || method == cv::TM_CCOEFF_NORMED ? 1 : 2;
bool isNormed = method == cv::TM_CCORR_NORMED ||
method == cv::TM_SQDIFF_NORMED ||
method == cv::TM_CCOEFF_NORMED;
// Assume result is defined properly
if (result.empty())
{
cv::Size corrSize(input_img.cols - templ.cols + 1, input_img.rows - templ.rows + 1);
result.create(corrSize);
}
LandmarkDetector::crossCorr_m(input_img, img_dft, templ, templ_dfts, result);
if (method == cv::TM_CCORR)
return;
double invArea = 1. / ((double)templ.rows * templ.cols);
cv::Mat sum, sqsum;
cv::Scalar templMean, templSdv;
double *q0 = 0, *q1 = 0, *q2 = 0, *q3 = 0;
double templNorm = 0, templSum2 = 0;
if (method == cv::TM_CCOEFF)
{
// If it has not been precomputed compute it now
if (_integral_img.empty())
{
integral(input_img, _integral_img, CV_64F);
}
sum = _integral_img;
templMean = cv::mean(templ);
}
else
{
// If it has not been precomputed compute it now
if (_integral_img.empty())
{
integral(input_img, _integral_img, _integral_img_sq, CV_64F);
}
sum = _integral_img;
sqsum = _integral_img_sq;
meanStdDev(templ, templMean, templSdv);
templNorm = templSdv[0] * templSdv[0] + templSdv[1] * templSdv[1] + templSdv[2] * templSdv[2] + templSdv[3] * templSdv[3];
if (templNorm < DBL_EPSILON && method == cv::TM_CCOEFF_NORMED)
{
result.setTo(1.0);
return;
}
templSum2 = templNorm + templMean[0] * templMean[0] + templMean[1] * templMean[1] + templMean[2] * templMean[2] + templMean[3] * templMean[3];
if (numType != 1)
{
templMean = cv::Scalar::all(0);
templNorm = templSum2;
}
templSum2 /= invArea;
templNorm = std::sqrt(templNorm);
templNorm /= std::sqrt(invArea); // care of accuracy here
q0 = (double*)sqsum.data;
q1 = q0 + templ.cols;
q2 = (double*)(sqsum.data + templ.rows*sqsum.step);
q3 = q2 + templ.cols;
}
double* p0 = (double*)sum.data;
double* p1 = p0 + templ.cols;
double* p2 = (double*)(sum.data + templ.rows*sum.step);
double* p3 = p2 + templ.cols;
int sumstep = sum.data ? (int)(sum.step / sizeof(double)) : 0;
int sqstep = sqsum.data ? (int)(sqsum.step / sizeof(double)) : 0;
int i, j;
for (i = 0; i < result.rows; i++)
{
float* rrow = result.ptr<float>(i);
int idx = i * sumstep;
int idx2 = i * sqstep;
for (j = 0; j < result.cols; j++, idx += 1, idx2 += 1)
{
double num = rrow[j], t;
double wndMean2 = 0, wndSum2 = 0;
if (numType == 1)
{
t = p0[idx] - p1[idx] - p2[idx] + p3[idx];
wndMean2 += t*t;
num -= t*templMean[0];
wndMean2 *= invArea;
}
if (isNormed || numType == 2)
{
t = q0[idx2] - q1[idx2] - q2[idx2] + q3[idx2];
wndSum2 += t;
if (numType == 2)
{
num = wndSum2 - 2 * num + templSum2;
num = MAX(num, 0.);
}
}
if (isNormed)
{
t = std::sqrt(MAX(wndSum2 - wndMean2, 0))*templNorm;
if (fabs(num) < t)
num /= t;
else if (fabs(num) < t*1.125)
num = num > 0 ? 1 : -1;
else
num = method != cv::TM_SQDIFF_NORMED ? 0 : 1;
}
rrow[j] = (float)num;
}
}
}
// Useful utility for grabing a bounding box around a set of 2D landmarks (as a 1D 2n x 1 vector of xs followed by doubles or as an n x 2 vector)
void ExtractBoundingBox(const cv::Mat_<float>& landmarks, float &min_x, float &max_x, float &min_y, float &max_y)
{
if (landmarks.cols == 1)
{
int n = landmarks.rows / 2;
cv::MatConstIterator_<float> landmarks_it = landmarks.begin();
for (int i = 0; i < n; ++i)
{
float val = *landmarks_it++;
if (i == 0 || val < min_x)
min_x = val;
if (i == 0 || val > max_x)
max_x = val;
}
for (int i = 0; i < n; ++i)
{
float val = *landmarks_it++;
if (i == 0 || val < min_y)
min_y = val;
if (i == 0 || val > max_y)
max_y = val;
}
}
else
{
int n = landmarks.rows;
for (int i = 0; i < n; ++i)
{
float val_x = landmarks.at<float>(i, 0);
float val_y = landmarks.at<float>(i, 1);
if (i == 0 || val_x < min_x)
min_x = val_x;
if (i == 0 || val_x > max_x)
max_x = val_x;
if (i == 0 || val_y < min_y)
min_y = val_y;
if (i == 0 || val_y > max_y)
max_y = val_y;
}
}
}
// Computing landmarks (to be drawn later possibly)
std::vector<cv::Point2f> CalculateVisibleLandmarks(const cv::Mat_<float>& shape2D, const cv::Mat_<int>& visibilities)
{
int n = shape2D.rows / 2;
std::vector<cv::Point2f> landmarks;
for (int i = 0; i < n; ++i)
{
if (visibilities.at<int>(i))
{
cv::Point2f featurePoint(shape2D.at<float>(i), shape2D.at<float>(i + n));
landmarks.push_back(featurePoint);
}
}
return landmarks;
}
// Computing landmarks (to be drawn later possibly)
std::vector<cv::Point2f> CalculateAllLandmarks(const cv::Mat_<float>& shape2D)
{
int n = 0;
std::vector<cv::Point2f> landmarks;
if (shape2D.cols == 2)
{
n = shape2D.rows;
}
else if (shape2D.cols == 1)
{
n = shape2D.rows / 2;
}
for (int i = 0; i < n; ++i)
{
cv::Point2f featurePoint;
if (shape2D.cols == 1)
{
featurePoint = cv::Point2f(shape2D.at<float>(i), shape2D.at<float>(i + n));
}
else
{
featurePoint = cv::Point2f(shape2D.at<float>(i, 0), shape2D.at<float>(i, 1));
}
landmarks.push_back(featurePoint);
}
return landmarks;
}
// Computing landmarks (to be drawn later possibly)
std::vector<cv::Point2f> CalculateAllLandmarks(const CLNF& clnf_model)
{
return CalculateAllLandmarks(clnf_model.detected_landmarks);
}
// Computing landmarks (to be drawn later possibly)
std::vector<cv::Point2f> CalculateVisibleLandmarks(const CLNF& clnf_model)
{
// If the detection was not successful no landmarks are visible
if (clnf_model.detection_success)
{
int idx = clnf_model.patch_experts.GetViewIdx(clnf_model.params_global, 0);
// Because we only draw visible points, need to find which points patch experts consider visible at a certain orientation
return CalculateVisibleLandmarks(clnf_model.detected_landmarks, clnf_model.patch_experts.visibilities[0][idx]);
}
else
{
return std::vector<cv::Point2f>();
}
}
// Computing eye landmarks (to be drawn later or in different interfaces)
std::vector<cv::Point2f> CalculateVisibleEyeLandmarks(const CLNF& clnf_model)
{
std::vector<cv::Point2f> to_return;
// If the model has hierarchical updates draw those too
for (size_t i = 0; i < clnf_model.hierarchical_models.size(); ++i)
{
if (clnf_model.hierarchical_model_names[i].compare("left_eye_28") == 0 ||
clnf_model.hierarchical_model_names[i].compare("right_eye_28") == 0)
{
auto lmks = CalculateVisibleLandmarks(clnf_model.hierarchical_models[i]);
for (auto lmk : lmks)
{
to_return.push_back(lmk);
}
}
}
return to_return;
}
// Computing the 3D eye landmarks
std::vector<cv::Point3f> Calculate3DEyeLandmarks(const CLNF& clnf_model, float fx, float fy, float cx, float cy)
{
std::vector<cv::Point3f> to_return;
for (size_t i = 0; i < clnf_model.hierarchical_models.size(); ++i)
{
if (clnf_model.hierarchical_model_names[i].compare("left_eye_28") == 0 ||
clnf_model.hierarchical_model_names[i].compare("right_eye_28") == 0)
{
auto lmks = clnf_model.hierarchical_models[i].GetShape(fx, fy, cx, cy);
int num_landmarks = lmks.cols;
for (int lmk = 0; lmk < num_landmarks; ++lmk)
{
cv::Point3f curr_lmk(lmks.at<float>(0, lmk), lmks.at<float>(1, lmk), lmks.at<float>(2, lmk));
to_return.push_back(curr_lmk);
}
}
}
return to_return;
}
// Computing eye landmarks (to be drawn later or in different interfaces)
std::vector<cv::Point2f> CalculateAllEyeLandmarks(const CLNF& clnf_model)
{
std::vector<cv::Point2f> to_return;
// If the model has hierarchical updates draw those too
for (size_t i = 0; i < clnf_model.hierarchical_models.size(); ++i)
{
if (clnf_model.hierarchical_model_names[i].compare("left_eye_28") == 0 ||
clnf_model.hierarchical_model_names[i].compare("right_eye_28") == 0)
{
auto lmks = CalculateAllLandmarks(clnf_model.hierarchical_models[i]);
for (auto lmk : lmks)
{
to_return.push_back(lmk);
}
}
}
return to_return;
}
//===========================================================================
//============================================================================
// Face detection helpers
//============================================================================
bool DetectFaces(std::vector<cv::Rect_<float> >& o_regions, const cv::Mat_<uchar>& intensity, float min_width, cv::Rect_<float> roi)
{
cv::CascadeClassifier classifier("./classifiers/haarcascade_frontalface_alt.xml");
if (classifier.empty())
{
std::cout << "Couldn't load the Haar cascade classifier" << std::endl;
return false;
}
else
{
return DetectFaces(o_regions, intensity, classifier, min_width, roi);
}
}
bool DetectFaces(std::vector<cv::Rect_<float> >& o_regions, const cv::Mat_<uchar>& intensity, cv::CascadeClassifier& classifier, float min_width, cv::Rect_<float> roi)
{
std::vector<cv::Rect> face_detections;
if (min_width == -1)
{
classifier.detectMultiScale(intensity, face_detections, 1.2, 2, 0, cv::Size(50, 50));
}
else
{
classifier.detectMultiScale(intensity, face_detections, 1.2, 2, 0, cv::Size(min_width, min_width));
}
// Convert from int bounding box do a double one with corrections
for (size_t face = 0; face < face_detections.size(); ++face)
{
// OpenCV is overgenerous with face size and y location is off
// CLNF detector expects the bounding box to encompass from eyebrow to chin in y, and from cheeck outline to cheeck outline in x, so we need to compensate
// The scalings were learned using the Face Detections on LFPW, Helen, AFW and iBUG datasets, using ground truth and detections from openCV
cv::Rect_<float> region;
// Correct for scale
region.width = face_detections[face].width * 0.8924f;
region.height = face_detections[face].height * 0.8676f;
// Move the face slightly to the right (as the width was made smaller)
region.x = face_detections[face].x + 0.0578f * face_detections[face].width;
// Shift face down as OpenCV Haar Cascade detects the forehead as well, and we're not interested
region.y = face_detections[face].y + face_detections[face].height * 0.2166f;
if (min_width != -1)
{
if (region.width < min_width || region.x < ((float)intensity.cols) * roi.x || region.y < ((float)intensity.cols) * roi.y || region.x + region.width >((float)intensity.cols) * (roi.x + roi.width) || region.y + region.height >((float)intensity.rows) * (roi.y + roi.height))
continue;
}
o_regions.push_back(region);
}
return o_regions.size() > 0;
}
bool DetectSingleFace(cv::Rect_<float>& o_region, const cv::Mat_<uchar>& intensity_image, cv::CascadeClassifier& classifier, cv::Point preference, float min_width, cv::Rect_<float> roi)
{
// The tracker can return multiple faces
std::vector<cv::Rect_<float> > face_detections;
bool detect_success = LandmarkDetector::DetectFaces(face_detections, intensity_image, classifier, min_width, roi);
if (detect_success)
{
bool use_preferred = (preference.x != -1) && (preference.y != -1);
if (face_detections.size() > 1)
{
// keep the closest one if preference point not set
float best = -1;
int bestIndex = -1;
for (size_t i = 0; i < face_detections.size(); ++i)
{
float dist;
bool better;
if (use_preferred)
{
dist = sqrt((preference.x) * (face_detections[i].width / 2 + face_detections[i].x) +
(preference.y) * (face_detections[i].height / 2 + face_detections[i].y));
better = dist < best;
}
else
{
dist = face_detections[i].width;
better = face_detections[i].width > best;
}
// Pick a closest face to preffered point or the biggest face
if (i == 0 || better)
{
bestIndex = i;
best = dist;
}
}
o_region = face_detections[bestIndex];
}
else
{
o_region = face_detections[0];
}
}
else
{
// if not detected
o_region = cv::Rect_<float>(0, 0, 0, 0);
}
return detect_success;
}
bool DetectFacesHOG(std::vector<cv::Rect_<float> >& o_regions, const cv::Mat_<uchar>& intensity,
std::vector<float>& confidences, float min_width, cv::Rect_<float> roi)
{
dlib::frontal_face_detector detector = dlib::get_frontal_face_detector();
return DetectFacesHOG(o_regions, intensity, detector, confidences, min_width, roi);
}
bool DetectFacesHOG(std::vector<cv::Rect_<float> >& o_regions, const cv::Mat_<uchar>& intensity,
dlib::frontal_face_detector& detector, std::vector<float>& o_confidences, float min_width, cv::Rect_<float> roi)
{
if (detector.num_detectors() == 0)
{
detector = dlib::get_frontal_face_detector();
}
cv::Mat_<uchar> upsampled_intensity;
float scaling = 1.3f;
cv::resize(intensity, upsampled_intensity, cv::Size((int)(intensity.cols * scaling), (int)(intensity.rows * scaling)));
dlib::cv_image<uchar> cv_grayscale(upsampled_intensity);
std::vector<dlib::full_detection> face_detections;
detector(cv_grayscale, face_detections, -0.2);
// Convert from int bounding box do a double one with corrections
for (size_t face = 0; face < face_detections.size(); ++face)
{
// CLNF expects the bounding box to encompass from eyebrow to chin in y, and from cheeck outline to cheeck outline in x, so we need to compensate
cv::Rect_<float> region;
// Move the face slightly to the right (as the width was made smaller)
region.x = (face_detections[face].rect.get_rect().tl_corner().x() + 0.0389f * face_detections[face].rect.get_rect().width()) / scaling;
// Shift face down as OpenCV Haar Cascade detects the forehead as well, and we're not interested
region.y = (face_detections[face].rect.get_rect().tl_corner().y() + 0.1278f * face_detections[face].rect.get_rect().height()) / scaling;
// Correct for scale
region.width = (face_detections[face].rect.get_rect().width() * 0.9611) / scaling;
region.height = (face_detections[face].rect.get_rect().height() * 0.9388) / scaling;
// The scalings were learned using the Face Detections on LFPW and Helen using ground truth and detections from the HOG detector
if (min_width != -1)
{
if (region.width < min_width || region.x < ((float)intensity.cols) * roi.x || region.y < ((float)intensity.cols) * roi.y ||
region.x + region.width >((float)intensity.cols) * (roi.x + roi.width) || region.y + region.height >((float)intensity.rows) * (roi.y + roi.height))
continue;
}
o_regions.push_back(region);
o_confidences.push_back(face_detections[face].detection_confidence);
}
return o_regions.size() > 0;
}
bool DetectSingleFaceHOG(cv::Rect_<float>& o_region, const cv::Mat_<uchar>& intensity_img, dlib::frontal_face_detector& detector, float& confidence, cv::Point preference, float min_width, cv::Rect_<float> roi)
{
if (detector.num_detectors() == 0)
{
detector = dlib::get_frontal_face_detector();
}
// The tracker can return multiple faces
std::vector<cv::Rect_<float> > face_detections;
std::vector<float> confidences;
bool detect_success = LandmarkDetector::DetectFacesHOG(face_detections, intensity_img, detector, confidences, min_width, roi);
// In case of multiple faces pick the biggest one
bool use_size = true;
if (detect_success)
{
bool use_preferred = (preference.x != -1) && (preference.y != -1);
// keep the most confident one or the one closest to preference point if set
float best_so_far;
if (use_preferred)
{
best_so_far = sqrt((preference.x - (face_detections[0].width / 2 + face_detections[0].x)) * (preference.x - (face_detections[0].width / 2 + face_detections[0].x)) +
(preference.y - (face_detections[0].height / 2 + face_detections[0].y)) * (preference.y - (face_detections[0].height / 2 + face_detections[0].y)));
}
else if (use_size)
{
best_so_far = (face_detections[0].width + face_detections[0].height) / 2.0;
}
else
{
best_so_far = confidences[0];
}
int bestIndex = 0;
for (size_t i = 1; i < face_detections.size(); ++i)
{
float dist;
bool better;
if (use_preferred)
{
dist = sqrt((preference.x - (face_detections[i].width / 2 + face_detections[i].x)) * (preference.x - (face_detections[i].width / 2 + face_detections[i].x)) +
(preference.y - (face_detections[i].height / 2 + face_detections[i].y)) * (preference.y - (face_detections[i].height / 2 + face_detections[i].y)));
better = dist < best_so_far;
}
else if (use_size)
{
dist = (face_detections[i].width + face_detections[i].height) / 2.0;
better = dist > best_so_far;
}
else
{
dist = confidences[i];
better = dist > best_so_far;
}
// Pick a closest face
if (better)
{
best_so_far = dist;
bestIndex = i;
}
}
o_region = face_detections[bestIndex];
confidence = confidences[bestIndex];
}
else
{
// if not detected
o_region = cv::Rect_<float>(0, 0, 0, 0);
// A completely unreliable detection (shouldn't really matter what is returned here)
confidence = -2;
}
return detect_success;
}
bool DetectFacesMTCNN(std::vector<cv::Rect_<float> >& o_regions, const cv::Mat& image, LandmarkDetector::FaceDetectorMTCNN& detector,
std::vector<float>& o_confidences)
{
detector.DetectFaces(o_regions, image, o_confidences);
return o_regions.size() > 0;
}
bool DetectSingleFaceMTCNN(cv::Rect_<float>& o_region, const cv::Mat& image, LandmarkDetector::FaceDetectorMTCNN& detector,
float& confidence, cv::Point preference)
{
// The tracker can return multiple faces
std::vector<cv::Rect_<float> > face_detections;
std::vector<float> confidences;
detector.DetectFaces(face_detections, image, confidences);
bool detect_success = face_detections.size() > 0;
if (detect_success)
{
bool use_preferred = (preference.x != -1) && (preference.y != -1);
// keep the most confident one or the one closest to preference point if set
float best_so_far;
if (use_preferred)
{
best_so_far = sqrt((preference.x - (face_detections[0].width / 2 + face_detections[0].x)) * (preference.x - (face_detections[0].width / 2 + face_detections[0].x)) +
(preference.y - (face_detections[0].height / 2 + face_detections[0].y)) * (preference.y - (face_detections[0].height / 2 + face_detections[0].y)));
}
else
{
best_so_far = face_detections[0].width;
}
int bestIndex = 0;
for (size_t i = 1; i < face_detections.size(); ++i)
{
float dist;
bool better;
if (use_preferred)
{
dist = sqrt((preference.x - (face_detections[i].width / 2 + face_detections[i].x)) * (preference.x - (face_detections[i].width / 2 + face_detections[i].x)) +
(preference.y - (face_detections[i].height / 2 + face_detections[i].y)) * (preference.y - (face_detections[i].height / 2 + face_detections[i].y)));
better = dist < best_so_far;
}
else
{
dist = face_detections[i].width;
better = dist > best_so_far;
}
// Pick a closest face
if (better)
{
best_so_far = dist;
bestIndex = i;
}
}
o_region = face_detections[bestIndex];
confidence = confidences[bestIndex];
}
else
{
// if not detected
o_region = cv::Rect_<float>(0, 0, 0, 0);
// A completely unreliable detection (shouldn't really matter what is returned here)
confidence = -2;
}
return detect_success;
}
//============================================================================
// Matrix reading functionality
//============================================================================
// Reading in a matrix from a stream
void ReadMat(std::ifstream& stream, cv::Mat &output_mat)
{
// Read in the number of rows, columns and the data type
int row, col, type;
stream >> row >> col >> type;
output_mat = cv::Mat(row, col, type);
switch (output_mat.type())
{
case CV_64FC1:
{
cv::MatIterator_<double> begin_it = output_mat.begin<double>();
cv::MatIterator_<double> end_it = output_mat.end<double>();
while (begin_it != end_it)
{
stream >> *begin_it++;
}
}
break;
case CV_32FC1:
{
cv::MatIterator_<float> begin_it = output_mat.begin<float>();
cv::MatIterator_<float> end_it = output_mat.end<float>();
while (begin_it != end_it)
{
stream >> *begin_it++;
}
}
break;
case CV_32SC1:
{
cv::MatIterator_<int> begin_it = output_mat.begin<int>();
cv::MatIterator_<int> end_it = output_mat.end<int>();
while (begin_it != end_it)
{
stream >> *begin_it++;
}
}
break;
case CV_8UC1:
{
cv::MatIterator_<uchar> begin_it = output_mat.begin<uchar>();
cv::MatIterator_<uchar> end_it = output_mat.end<uchar>();
while (begin_it != end_it)
{
stream >> *begin_it++;
}
}
break;
default:
printf("ERROR(%s,%d) : Unsupported Matrix type %d!\n", __FILE__, __LINE__, output_mat.type()); abort();
}
}
void ReadMatBin(std::ifstream& stream, cv::Mat &output_mat)
{
// Read in the number of rows, columns and the data type
int row, col, type;
stream.read((char*)&row, 4);
stream.read((char*)&col, 4);
stream.read((char*)&type, 4);
output_mat = cv::Mat(row, col, type);
int size = output_mat.rows * output_mat.cols * output_mat.elemSize();
stream.read((char *)output_mat.data, size);
}
// Skipping lines that start with # (together with empty lines)
void SkipComments(std::ifstream& stream)
{
while (stream.peek() == '#' || stream.peek() == '\n' || stream.peek() == ' ' || stream.peek() == '\r')
{
std::string skipped;
std::getline(stream, skipped);
}
}
}

View File

@@ -0,0 +1,515 @@
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2017, Carnegie Mellon University and University of Cambridge,
// all rights reserved.
//
// ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
//
// BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT.
// IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
//
// License can be found in OpenFace-license.txt
//
// * Any publications arising from the use of this software, including but
// not limited to academic journal and conference publications, technical
// reports and manuals, must cite at least one of the following works:
//
// OpenFace 2.0: Facial Behavior Analysis Toolkit
// Tadas Baltrušaitis, Amir Zadeh, Yao Chong Lim, and Louis-Philippe Morency
// in IEEE International Conference on Automatic Face and Gesture Recognition, 2018
//
// Convolutional experts constrained local model for facial landmark detection.
// A. Zadeh, T. Baltrušaitis, and Louis-Philippe Morency,
// in Computer Vision and Pattern Recognition Workshops, 2017.
//
// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation
// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling
// in IEEE International. Conference on Computer Vision (ICCV), 2015
//
// Cross-dataset learning and person-specific normalisation for automatic Action Unit detection
// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson
// in Facial Expression Recognition and Analysis Challenge,
// IEEE International Conference on Automatic Face and Gesture Recognition, 2015
//
///////////////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "PAW.h"
// OpenCV includes
#include <opencv2/core/core.hpp>
#include <opencv2/imgproc.hpp>
#include "LandmarkDetectorUtils.h"
using namespace LandmarkDetector;
// Copy constructor
PAW::PAW(const PAW& other) : destination_landmarks(other.destination_landmarks.clone()), source_landmarks(other.source_landmarks.clone()), triangulation(other.triangulation.clone()),
triangle_id(other.triangle_id.clone()), pixel_mask(other.pixel_mask.clone()), coefficients(other.coefficients.clone()), alpha(other.alpha.clone()), beta(other.beta.clone()), map_x(other.map_x.clone()), map_y(other.map_y.clone())
{
this->number_of_pixels = other.number_of_pixels;
this->min_x = other.min_x;
this->min_y = other.min_y;
}
// A constructor from destination shape and triangulation
PAW::PAW(const cv::Mat_<float>& destination_shape, const cv::Mat_<int>& triangulation)
{
// Initialise some variables directly
this->destination_landmarks = destination_shape;
this->triangulation = triangulation;
int num_points = destination_shape.rows / 2;
int num_tris = triangulation.rows;
// Pre-compute the rest
alpha = cv::Mat_<float>(num_tris, 3);
beta = cv::Mat_<float>(num_tris, 3);
cv::Mat_<float> xs = destination_shape(cv::Rect(0, 0, 1, num_points));
cv::Mat_<float> ys = destination_shape(cv::Rect(0, num_points, 1, num_points));
// Create a vector representation of the control points
std::vector<std::vector<float>> destination_points;
for (int tri = 0; tri < num_tris; ++tri)
{
int j = triangulation.at<int>(tri, 0);
int k = triangulation.at<int>(tri, 1);
int l = triangulation.at<int>(tri, 2);
float c1 = ys.at<float>(l) - ys.at<float>(j);
float c2 = xs.at<float>(l) - xs.at<float>(j);
float c4 = ys.at<float>(k) - ys.at<float>(j);
float c3 = xs.at<float>(k) - xs.at<float>(j);
float c5 = c3*c1 - c2*c4;
alpha.at<float>(tri, 0) = (ys.at<float>(j) * c2 - xs.at<float>(j) * c1) / c5;
alpha.at<float>(tri, 1) = c1 / c5;
alpha.at<float>(tri, 2) = -c2 / c5;
beta.at<float>(tri, 0) = (xs.at<float>(j) * c4 - ys.at<float>(j) * c3) / c5;
beta.at<float>(tri, 1) = -c4 / c5;
beta.at<float>(tri, 2) = c3 / c5;
// Add points corresponding to triangles as optimisation
std::vector<float> triangle_points(10);
triangle_points[0] = xs.at<float>(j);
triangle_points[1] = ys.at<float>(j);
triangle_points[2] = xs.at<float>(k);
triangle_points[3] = ys.at<float>(k);
triangle_points[4] = xs.at<float>(l);
triangle_points[5] = ys.at<float>(l);
cv::Vec3f xs_three(triangle_points[0], triangle_points[2], triangle_points[4]);
cv::Vec3f ys_three(triangle_points[1], triangle_points[3], triangle_points[5]);
double min_x, max_x, min_y, max_y;
cv::minMaxIdx(xs_three, &min_x, &max_x);
cv::minMaxIdx(ys_three, &min_y, &max_y);
triangle_points[6] = (float)max_x;
triangle_points[7] = (float)max_y;
triangle_points[8] = (float)min_x;
triangle_points[9] = (float)min_y;
destination_points.push_back(triangle_points);
}
double max_x;
double max_y;
double min_x_d;
double min_y_d;
minMaxLoc(xs, &min_x_d, &max_x);
minMaxLoc(ys, &min_y_d, &max_y);
min_x = min_x_d;
min_y = min_y_d;
int w = (int)(max_x - min_x + 1.5);
int h = (int)(max_y - min_y + 1.5);
// Round the min_x and min_y for simplicity?
pixel_mask = cv::Mat_<uchar>(h, w, (uchar)0);
triangle_id = cv::Mat_<int>(h, w, -1);
int curr_tri = -1;
for (int y = 0; y < pixel_mask.rows; y++)
{
for (int x = 0; x < pixel_mask.cols; x++)
{
curr_tri = findTriangle(cv::Point_<float>(x + min_x, y + min_y), destination_points, curr_tri);
// If there is a triangle at this location
if (curr_tri != -1)
{
triangle_id.at<int>(y, x) = curr_tri;
pixel_mask.at<uchar>(y, x) = 1;
}
}
}
// Preallocate maps and coefficients
coefficients.create(num_tris, 6);
map_x.create(pixel_mask.rows, pixel_mask.cols);
map_y.create(pixel_mask.rows, pixel_mask.cols);
}
// Manually define min and max values
PAW::PAW(const cv::Mat_<float>& destination_shape, const cv::Mat_<int>& triangulation, float in_min_x, float in_min_y, float in_max_x, float in_max_y)
{
// Initialise some variables directly
this->destination_landmarks = destination_shape;
this->triangulation = triangulation;
int num_points = destination_shape.rows / 2;
int num_tris = triangulation.rows;
// Pre-compute the rest
alpha = cv::Mat_<float>(num_tris, 3);
beta = cv::Mat_<float>(num_tris, 3);
cv::Mat_<float> xs = destination_shape(cv::Rect(0, 0, 1, num_points));
cv::Mat_<float> ys = destination_shape(cv::Rect(0, num_points, 1, num_points));
// Create a vector representation of the control points
std::vector<std::vector<float>> destination_points;
for (int tri = 0; tri < num_tris; ++tri)
{
int j = triangulation.at<int>(tri, 0);
int k = triangulation.at<int>(tri, 1);
int l = triangulation.at<int>(tri, 2);
float c1 = ys.at<float>(l) - ys.at<float>(j);
float c2 = xs.at<float>(l) - xs.at<float>(j);
float c4 = ys.at<float>(k) - ys.at<float>(j);
float c3 = xs.at<float>(k) - xs.at<float>(j);
float c5 = c3*c1 - c2*c4;
alpha.at<float>(tri, 0) = (ys.at<float>(j) * c2 - xs.at<float>(j) * c1) / c5;
alpha.at<float>(tri, 1) = c1 / c5;
alpha.at<float>(tri, 2) = -c2 / c5;
beta.at<float>(tri, 0) = (xs.at<float>(j) * c4 - ys.at<float>(j) * c3) / c5;
beta.at<float>(tri, 1) = -c4 / c5;
beta.at<float>(tri, 2) = c3 / c5;
// Add points corresponding to triangles as optimisation
std::vector<float> triangle_points(10);
triangle_points[0] = xs.at<float>(j);
triangle_points[1] = ys.at<float>(j);
triangle_points[2] = xs.at<float>(k);
triangle_points[3] = ys.at<float>(k);
triangle_points[4] = xs.at<float>(l);
triangle_points[5] = ys.at<float>(l);
cv::Vec3f xs_three(triangle_points[0], triangle_points[2], triangle_points[4]);
cv::Vec3f ys_three(triangle_points[1], triangle_points[3], triangle_points[5]);
double min_x, max_x, min_y, max_y;
cv::minMaxIdx(xs_three, &min_x, &max_x);
cv::minMaxIdx(ys_three, &min_y, &max_y);
triangle_points[6] = (float)max_x;
triangle_points[7] = (float)max_y;
triangle_points[8] = (float)min_x;
triangle_points[9] = (float)min_y;
destination_points.push_back(triangle_points);
}
float max_x;
float max_y;
min_x = in_min_x;
min_y = in_min_y;
max_x = in_max_x;
max_y = in_max_y;
int w = (int)(max_x - min_x + 1.5);
int h = (int)(max_y - min_y + 1.5);
// Round the min_x and min_y for simplicity?
pixel_mask = cv::Mat_<uchar>(h, w, (uchar)0);
triangle_id = cv::Mat_<int>(h, w, -1);
int curr_tri = -1;
for (int y = 0; y < pixel_mask.rows; y++)
{
for (int x = 0; x < pixel_mask.cols; x++)
{
curr_tri = findTriangle(cv::Point_<float>(x + min_x, y + min_y), destination_points, curr_tri);
// If there is a triangle at this location
if (curr_tri != -1)
{
triangle_id.at<int>(y, x) = curr_tri;
pixel_mask.at<uchar>(y, x) = 1;
}
}
}
// Preallocate maps and coefficients
coefficients.create(num_tris, 6);
map_x.create(pixel_mask.rows, pixel_mask.cols);
map_y.create(pixel_mask.rows, pixel_mask.cols);
}
//===========================================================================
void PAW::Read(std::ifstream& stream)
{
stream.read((char*)&number_of_pixels, 4);
double min_x_d, min_y_d;
stream.read((char*)&min_x_d, 8);
stream.read((char*)&min_y_d, 8);
min_x = (float)min_x_d;
min_y = (float)min_y_d;
cv::Mat_<double> destination_landmarks_d;
ReadMatBin(stream, destination_landmarks_d);
destination_landmarks_d.convertTo(destination_landmarks, CV_32F);
ReadMatBin(stream, triangulation);
ReadMatBin(stream, triangle_id);
cv::Mat tmpMask;
ReadMatBin(stream, tmpMask);
tmpMask.convertTo(pixel_mask, CV_8U);
cv::Mat_<double> alpha_d;
ReadMatBin(stream, alpha_d);
alpha_d.convertTo(alpha, CV_32F);
cv::Mat_<double> beta_d;
ReadMatBin(stream, beta_d);
beta_d.convertTo(beta, CV_32F);
map_x.create(pixel_mask.rows, pixel_mask.cols);
map_y.create(pixel_mask.rows, pixel_mask.cols);
coefficients.create(this->NumberOfTriangles(), 6);
source_landmarks = destination_landmarks;
}
//=============================================================================
// cropping from the source image to the destination image using the shape in s, used to determine if shape fitting converged successfully
void PAW::Warp(const cv::Mat& image_to_warp, cv::Mat& destination_image, const cv::Mat_<float>& landmarks_to_warp)
{
// set the current shape
source_landmarks = landmarks_to_warp.clone();
// prepare the mapping coefficients using the current shape
this->CalcCoeff();
// Do the actual mapping computation (where to warp from)
this->WarpRegion(map_x, map_y);
// Do the actual warp (with bi-linear interpolation)
remap(image_to_warp, destination_image, map_x, map_y, cv::INTER_LINEAR);
}
//=============================================================================
// Calculate the warping coefficients
void PAW::CalcCoeff()
{
int p = this->NumberOfLandmarks();
for (int l = 0; l < this->NumberOfTriangles(); l++)
{
int i = triangulation.at<int>(l, 0);
int j = triangulation.at<int>(l, 1);
int k = triangulation.at<int>(l, 2);
float c1 = source_landmarks.at<float>(i, 0);
float c2 = source_landmarks.at<float>(j, 0) - c1;
float c3 = source_landmarks.at<float>(k, 0) - c1;
float c4 = source_landmarks.at<float>(i + p, 0);
float c5 = source_landmarks.at<float>(j + p, 0) - c4;
float c6 = source_landmarks.at<float>(k + p, 0) - c4;
// Get a pointer to the coefficient we will be precomputing
float *coeff = coefficients.ptr<float>(l);
// Extract the relevant alphas and betas
float *c_alpha = alpha.ptr<float>(l);
float *c_beta = beta.ptr<float>(l);
coeff[0] = c1 + c2 * c_alpha[0] + c3 * c_beta[0];
coeff[1] = c2 * c_alpha[1] + c3 * c_beta[1];
coeff[2] = c2 * c_alpha[2] + c3 * c_beta[2];
coeff[3] = c4 + c5 * c_alpha[0] + c6 * c_beta[0];
coeff[4] = c5 * c_alpha[1] + c6 * c_beta[1];
coeff[5] = c5 * c_alpha[2] + c6 * c_beta[2];
}
}
//======================================================================
// Compute the mapping coefficients
void PAW::WarpRegion(cv::Mat_<float>& mapx, cv::Mat_<float>& mapy)
{
cv::MatIterator_<float> xp = mapx.begin();
cv::MatIterator_<float> yp = mapy.begin();
cv::MatIterator_<uchar> mp = pixel_mask.begin();
cv::MatIterator_<int> tp = triangle_id.begin();
// The coefficients corresponding to the current triangle
float * a;
// Current triangle being processed
int k = -1;
for (int y = 0; y < pixel_mask.rows; y++)
{
float yi = float(y) + min_y;
for (int x = 0; x < pixel_mask.cols; x++)
{
float xi = float(x) + min_x;
if (*mp == 0)
{
*xp = -1;
*yp = -1;
}
else
{
// triangle corresponding to the current pixel
int j = *tp;
// If it is different from the previous triangle point to new coefficients
// This will always be the case in the first iteration, hence a will not point to nothing
if (j != k)
{
// Update the coefficient pointer if a new triangle is being processed
a = coefficients.ptr<float>(j);
k = j;
}
//ap is now the pointer to the coefficients
float *ap = a;
//look at the first coefficient (and increment). first coefficient is an x offset
float xo = *ap++;
//second coefficient is an x scale as a function of x
xo += *ap++ * xi;
//third coefficient ap(2) is an x scale as a function of y
*xp = float(xo + *ap++ * yi);
//then fourth coefficient ap(3) is a y offset
float yo = *ap++;
//fifth coeff adds coeff[4]*x to y
yo += *ap++ * xi;
//final coeff adds coeff[5]*y to y
*yp = float(yo + *ap++ * yi);
}
mp++; tp++; xp++; yp++;
}
}
}
// ============================================================
// Helper functions to determine which point a triangle lies in
// ============================================================
// Is the point (x0,y0) on same side as a half-plane defined by (x1,y1), (x2, y2), and (x3, y3)
bool PAW::sameSide(float x0, float y0, float x1, float y1, float x2, float y2, float x3, float y3)
{
float x = (x3 - x2)*(y0 - y2) - (x0 - x2)*(y3 - y2);
float y = (x3 - x2)*(y1 - y2) - (x1 - x2)*(y3 - y2);
return x*y >= 0;
}
// if point (x0, y0) is on same side for all three half-planes it is in a triangle
bool PAW::pointInTriangle(float x0, float y0, float x1, float y1, float x2, float y2, float x3, float y3)
{
bool same_1 = sameSide(x0, y0, x1, y1, x2, y2, x3, y3);
bool same_2 = sameSide(x0, y0, x2, y2, x1, y1, x3, y3);
bool same_3 = sameSide(x0, y0, x3, y3, x1, y1, x2, y2);
return same_1 && same_2 && same_3;
}
// Find if a given point lies in the triangles
int PAW::findTriangle(const cv::Point_<float>& point, const std::vector<std::vector<float>>& control_points, int guess)
{
int num_tris = control_points.size();
int tri = -1;
float x0 = point.x;
float y0 = point.y;
// Allow a guess for speed (so as not to go through all triangles)
if (guess != -1)
{
bool in_triangle = pointInTriangle(x0, y0, control_points[guess][0], control_points[guess][1], control_points[guess][2], control_points[guess][3], control_points[guess][4], control_points[guess][5]);
if (in_triangle)
{
return guess;
}
}
for (int i = 0; i < num_tris; ++i)
{
float max_x = control_points[i][6];
float max_y = control_points[i][7];
float min_x = control_points[i][8];
float min_y = control_points[i][9];
// Skip the check if the point is outside the bounding box of the triangle
if (max_x < x0 || min_x > x0 || max_y < y0 || min_y > y0)
{
continue;
}
bool in_triangle = pointInTriangle(x0, y0,
control_points[i][0], control_points[i][1],
control_points[i][2], control_points[i][3],
control_points[i][4], control_points[i][5]);
if (in_triangle)
{
tri = i;
break;
}
}
return tri;
}

View File

@@ -0,0 +1,738 @@
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2017, Carnegie Mellon University and University of Cambridge,
// all rights reserved.
//
// ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
//
// BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT.
// IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
//
// License can be found in OpenFace-license.txt
//
// * Any publications arising from the use of this software, including but
// not limited to academic journal and conference publications, technical
// reports and manuals, must cite at least one of the following works:
//
// OpenFace 2.0: Facial Behavior Analysis Toolkit
// Tadas Baltrušaitis, Amir Zadeh, Yao Chong Lim, and Louis-Philippe Morency
// in IEEE International Conference on Automatic Face and Gesture Recognition, 2018
//
// Convolutional experts constrained local model for facial landmark detection.
// A. Zadeh, T. Baltrušaitis, and Louis-Philippe Morency,
// in Computer Vision and Pattern Recognition Workshops, 2017.
//
// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation
// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling
// in IEEE International. Conference on Computer Vision (ICCV), 2015
//
// Cross-dataset learning and person-specific normalisation for automatic Action Unit detection
// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson
// in Facial Expression Recognition and Analysis Challenge,
// IEEE International Conference on Automatic Face and Gesture Recognition, 2015
//
///////////////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include <PDM.h>
#include <RotationHelpers.h>
// OpenCV include
#include <opencv2/core/core.hpp>
#include <opencv2/imgproc.hpp>
// Math includes
#define _USE_MATH_DEFINES
#include <cmath>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
#include <LandmarkDetectorUtils.h>
using namespace LandmarkDetector;
//===========================================================================
//=============================================================================
// Orthonormalising the 3x3 rotation matrix
void PDM::Orthonormalise(cv::Matx33f &R)
{
cv::SVD svd(R, cv::SVD::MODIFY_A);
// get the orthogonal matrix from the initial rotation matrix
cv::Mat_ <float> X = svd.u*svd.vt;
// This makes sure that the handedness is preserved and no reflection happened
// by making sure the determinant is 1 and not -1
cv::Mat_<float> W = cv::Mat_<float>::eye(3,3);
float d = determinant(X);
W(2,2) = determinant(X);
cv::Mat Rt = svd.u*W*svd.vt;
Rt.copyTo(R);
}
// A copy constructor
PDM::PDM(const PDM& other) {
// Make sure the matrices are allocated properly
this->mean_shape = other.mean_shape.clone();
this->princ_comp = other.princ_comp.clone();
this->eigen_values = other.eigen_values.clone();
}
//===========================================================================
// Clamping the parameter values to be within 3 standard deviations
void PDM::Clamp(cv::Mat_<float>& local_params, cv::Vec6f& params_global, const FaceModelParameters& parameters)
{
float n_sigmas = 3;
cv::MatConstIterator_<float> e_it = this->eigen_values.begin();
cv::MatIterator_<float> p_it = local_params.begin();
float v;
// go over all parameters
for(; p_it != local_params.end(); ++p_it, ++e_it)
{
// Work out the maximum value
v = n_sigmas*sqrt(*e_it);
// if the values is too extreme clamp it
if(fabs(*p_it) > v)
{
// Dealing with positive and negative cases
if(*p_it > 0.0)
{
*p_it=v;
}
else
{
*p_it=-v;
}
}
}
// do not let the pose get out of hand
//if(parameters.limit_pose)
//{
// if(params_global[1] > M_PI / 2)
// params_global[1] = M_PI/2;
// if(params_global[1] < -M_PI / 2)
// params_global[1] = -M_PI/2;
// if(params_global[2] > M_PI / 2)
// params_global[2] = M_PI/2;
// if(params_global[2] < -M_PI / 2)
// params_global[2] = -M_PI/2;
// if(params_global[3] > M_PI / 2)
// params_global[3] = M_PI/2;
// if(params_global[3] < -M_PI / 2)
// params_global[3] = -M_PI/2;
//}
}
//===========================================================================
// Compute the 3D representation of shape (in object space) using the local parameters
void PDM::CalcShape3D(cv::Mat_<float>& out_shape, const cv::Mat_<float>& p_local) const
{
out_shape = mean_shape.clone();
// Perform matrix vector multiplication in OpenBLAS (fortran call)
float alpha1 = 1.0;
float beta1 = 1.0;
int p_local_cols = p_local.cols;
int princ_comp_rows = princ_comp.rows;
int princ_comp_cols = princ_comp.cols;
char N[2]; N[0] = 'N';
sgemm_(N, N, &p_local_cols, &princ_comp_rows, &princ_comp_cols, &alpha1, (float*)p_local.data, &p_local_cols, (float*)princ_comp.data, &princ_comp_cols, &beta1, (float*)out_shape.data, &p_local_cols);
// Above is a fast (but ugly) version of
// out_shape = mean_shape + princ_comp * p_local;
}
//===========================================================================
// Get the 2D shape (in image space) from global and local parameters
void PDM::CalcShape2D(cv::Mat_<float>& out_shape, const cv::Mat_<float>& params_local, const cv::Vec6f& params_global) const
{
int n = this->NumberOfPoints();
float s = params_global[0]; // scaling factor
float tx = params_global[4]; // x offset
float ty = params_global[5]; // y offset
// get the rotation matrix from the euler angles
cv::Vec3f euler(params_global[1], params_global[2], params_global[3]);
cv::Matx33f currRot = Utilities::Euler2RotationMatrix(euler);
// get the 3D shape of the object
cv::Mat_<float> Shape_3D;
this->CalcShape3D(Shape_3D, params_local);
// create the 2D shape matrix (if it has not been defined yet)
if((out_shape.rows != 2 * mean_shape.rows / 3) || (out_shape.cols != 1))
{
out_shape.create(2*n,1);
}
// for every vertex
for(int i = 0; i < n; i++)
{
// Transform this using the weak-perspective mapping to 2D from 3D
out_shape.at<float>(i ,0) = s * ( currRot(0,0) * Shape_3D.at<float>(i, 0) + currRot(0,1) * Shape_3D.at<float>(i+n ,0) + currRot(0,2) * Shape_3D.at<float>(i+n*2,0) ) + tx;
out_shape.at<float>(i+n,0) = s * ( currRot(1,0) * Shape_3D.at<float>(i, 0) + currRot(1,1) * Shape_3D.at<float>(i+n ,0) + currRot(1,2) * Shape_3D.at<float>(i+n*2,0) ) + ty;
}
}
//===========================================================================
// provided the bounding box of a face and the local parameters (with optional rotation), generates the global parameters that can generate the face with the provided bounding box
// This all assumes that the bounding box describes face from left outline to right outline of the face and chin to eyebrows
void PDM::CalcParams(cv::Vec6f& out_params_global, const cv::Rect_<float>& bounding_box, const cv::Mat_<float>& params_local, const cv::Vec3f rotation)
{
// get the shape instance based on local params
cv::Mat_<float> current_shape(mean_shape.size());
CalcShape3D(current_shape, params_local);
// rotate the shape
cv::Matx33f rotation_matrix = Utilities::Euler2RotationMatrix(rotation);
cv::Mat_<float> reshaped = current_shape.reshape(1, 3);
cv::Mat rotated_shape = (cv::Mat(rotation_matrix) * reshaped);
// Get the width of expected shape
double min_x;
double max_x;
cv::minMaxLoc(rotated_shape.row(0), &min_x, &max_x);
double min_y;
double max_y;
cv::minMaxLoc(rotated_shape.row(1), &min_y, &max_y);
float width = (float) abs(min_x - max_x);
float height = (float)abs(min_y - max_y);
float scaling = ((bounding_box.width / width) + (bounding_box.height / height)) / 2.0f;
// The estimate of face center also needs some correction
float tx = bounding_box.x + bounding_box.width / 2;
float ty = bounding_box.y + bounding_box.height / 2;
// Correct it so that the bounding box is just around the minimum and maximum point in the initialised face
tx = tx - scaling * (min_x + max_x)/2.0f;
ty = ty - scaling * (min_y + max_y)/2.0f;
out_params_global = cv::Vec6f(scaling, rotation[0], rotation[1], rotation[2], tx, ty);
}
//===========================================================================
// provided the model parameters, compute the bounding box of a face
// The bounding box describes face from left outline to right outline of the face and chin to eyebrows
void PDM::CalcBoundingBox(cv::Rect_<float>& out_bounding_box, const cv::Vec6f& params_global, const cv::Mat_<float>& params_local)
{
// get the shape instance based on local params
cv::Mat_<float> current_shape;
CalcShape2D(current_shape, params_local, params_global);
// Get the width of expected shape
float min_x, max_x, min_y, max_y;
ExtractBoundingBox(current_shape, min_x, max_x, min_y, max_y);
float width = abs(min_x - max_x);
float height = abs(min_y - max_y);
out_bounding_box = cv::Rect_<float>(min_x, min_y, width, height);
}
//===========================================================================
// Calculate the PDM's Jacobian over rigid parameters (rotation, translation and scaling), the additional input W represents trust for each of the landmarks and is part of Non-Uniform RLMS
void PDM::ComputeRigidJacobian(const cv::Mat_<float>& p_local, const cv::Vec6f& params_global, cv::Mat_<float> &Jacob, const cv::Mat_<float> W, cv::Mat_<float> &Jacob_t_w)
{
// number of verts
int n = this->NumberOfPoints();
Jacob.create(n * 2, 6);
float X,Y,Z;
float s = params_global[0];
cv::Mat_<float> shape_3D;
this->CalcShape3D(shape_3D, p_local);
// Get the rotation matrix
cv::Vec3f euler(params_global[1], params_global[2], params_global[3]);
cv::Matx33f currRot = Utilities::Euler2RotationMatrix(euler);
float r11 = currRot(0,0);
float r12 = currRot(0,1);
float r13 = currRot(0,2);
float r21 = currRot(1,0);
float r22 = currRot(1,1);
float r23 = currRot(1,2);
float r31 = currRot(2,0);
float r32 = currRot(2,1);
float r33 = currRot(2,2);
cv::MatIterator_<float> Jx = Jacob.begin();
cv::MatIterator_<float> Jy = Jx + n * 6;
for(int i = 0; i < n; i++)
{
X = shape_3D.at<float>(i, 0);
Y = shape_3D.at<float>(i + n, 0);
Z = shape_3D.at<float>(i + n * 2, 0);
// The rigid jacobian from the axis angle rotation matrix approximation using small angle assumption (R * R')
// where R' = [1, -wz, wy
// wz, 1, -wx
// -wy, wx, 1]
// And this is derived using the small angle assumption on the axis angle rotation matrix parametrisation
// scaling term
*Jx++ = (X * r11 + Y * r12 + Z * r13);
*Jy++ = (X * r21 + Y * r22 + Z * r23);
// rotation terms
*Jx++ = (s * (Y * r13 - Z * r12) );
*Jy++ = (s * (Y * r23 - Z * r22) );
*Jx++ = (-s * (X * r13 - Z * r11));
*Jy++ = (-s * (X * r23 - Z * r21));
*Jx++ = (s * (X * r12 - Y * r11) );
*Jy++ = (s * (X * r22 - Y * r21) );
// translation terms
*Jx++ = 1.0f;
*Jy++ = 0.0f;
*Jx++ = 0.0f;
*Jy++ = 1.0f;
}
cv::Mat Jacob_w = cv::Mat::zeros(Jacob.rows, Jacob.cols, Jacob.type());
Jx = Jacob.begin();
Jy = Jx + n*6;
cv::MatIterator_<float> Jx_w = Jacob_w.begin<float>();
cv::MatIterator_<float> Jy_w = Jx_w + n*6;
// Iterate over all Jacobian values and multiply them by the weight in diagonal of W
for(int i = 0; i < n; i++)
{
float w_x = W.at<float>(i, i);
float w_y = W.at<float>(i+n, i+n);
for(int j = 0; j < Jacob.cols; ++j)
{
*Jx_w++ = *Jx++ * w_x;
*Jy_w++ = *Jy++ * w_y;
}
}
Jacob_t_w = Jacob_w.t();
}
//===========================================================================
// Calculate the PDM's Jacobian over all parameters (rigid and non-rigid), the additional input W represents trust for each of the landmarks and is part of Non-Uniform RLMS
void PDM::ComputeJacobian(const cv::Mat_<float>& params_local, const cv::Vec6f& params_global, cv::Mat_<float> &Jacobian, const cv::Mat_<float> W, cv::Mat_<float> &Jacob_t_w)
{
// number of vertices
int n = this->NumberOfPoints();
// number of non-rigid parameters
int m = this->NumberOfModes();
Jacobian.create(n * 2, 6 + m);
float X,Y,Z;
float s = params_global[0];
cv::Mat_<float> shape_3D;
this->CalcShape3D(shape_3D, params_local);
cv::Vec3f euler(params_global[1], params_global[2], params_global[3]);
cv::Matx33f currRot = Utilities::Euler2RotationMatrix(euler);
float r11 = currRot(0, 0);
float r12 = currRot(0, 1);
float r13 = currRot(0, 2);
float r21 = currRot(1, 0);
float r22 = currRot(1, 1);
float r23 = currRot(1, 2);
float r31 = currRot(2, 0);
float r32 = currRot(2, 1);
float r33 = currRot(2, 2);
cv::MatIterator_<float> Jx = Jacobian.begin();
cv::MatIterator_<float> Jy = Jx + n * (6 + m);
cv::MatConstIterator_<float> Vx = this->princ_comp.begin();
cv::MatConstIterator_<float> Vy = Vx + n*m;
cv::MatConstIterator_<float> Vz = Vy + n*m;
for(int i = 0; i < n; i++)
{
X = shape_3D.at<float>(i, 0);
Y = shape_3D.at<float>(i + n, 0);
Z = shape_3D.at<float>(i + n * 2, 0);
// The rigid jacobian from the axis angle rotation matrix approximation using small angle assumption (R * R')
// where R' = [1, -wz, wy
// wz, 1, -wx
// -wy, wx, 1]
// And this is derived using the small angle assumption on the axis angle rotation matrix parametrisation
// scaling term
*Jx++ = (X * r11 + Y * r12 + Z * r13);
*Jy++ = (X * r21 + Y * r22 + Z * r23);
// rotation terms
*Jx++ = (s * (Y * r13 - Z * r12) );
*Jy++ = (s * (Y * r23 - Z * r22) );
*Jx++ = (-s * (X * r13 - Z * r11));
*Jy++ = (-s * (X * r23 - Z * r21));
*Jx++ = (s * (X * r12 - Y * r11) );
*Jy++ = (s * (X * r22 - Y * r21) );
// translation terms
*Jx++ = 1.0f;
*Jy++ = 0.0f;
*Jx++ = 0.0f;
*Jy++ = 1.0f;
for(int j = 0; j < m; j++,++Vx,++Vy,++Vz)
{
// How much the change of the non-rigid parameters (when object is rotated) affect 2D motion
*Jx++ = ( s*(r11*(*Vx) + r12*(*Vy) + r13*(*Vz)) );
*Jy++ = ( s*(r21*(*Vx) + r22*(*Vy) + r23*(*Vz)) );
}
}
// Adding the weights here
if(cv::trace(W)[0] != W.rows)
{
cv::Mat Jacob_w = Jacobian.clone();
Jx = Jacobian.begin();
Jy = Jx + n*(6+m);
cv::MatIterator_<float> Jx_w = Jacob_w.begin<float>();
cv::MatIterator_<float> Jy_w = Jx_w + n*(6+m);
// Iterate over all Jacobian values and multiply them by the weight in diagonal of W
for(int i = 0; i < n; i++)
{
float w_x = W.at<float>(i, i);
float w_y = W.at<float>(i+n, i+n);
for(int j = 0; j < Jacobian.cols; ++j)
{
*Jx_w++ = *Jx++ * w_x;
*Jy_w++ = *Jy++ * w_y;
}
}
Jacob_t_w = Jacob_w.t();
}
else
{
Jacob_t_w = Jacobian.t();
}
}
//===========================================================================
// Updating the parameters (more details in my thesis)
void PDM::UpdateModelParameters(const cv::Mat_<float>& delta_p, cv::Mat_<float>& params_local, cv::Vec6f& params_global)
{
// The scaling and translation parameters can be just added
params_global[0] += delta_p.at<float>(0,0);
params_global[4] += delta_p.at<float>(4,0);
params_global[5] += delta_p.at<float>(5,0);
// get the original rotation matrix
cv::Vec3f eulerGlobal(params_global[1], params_global[2], params_global[3]);
cv::Matx33f R1 = Utilities::Euler2RotationMatrix(eulerGlobal);
// construct R' = [1, -wz, wy
// wz, 1, -wx
// -wy, wx, 1]
cv::Matx33f R2 = cv::Matx33f::eye();
R2(1,2) = -1.0*(R2(2,1) = delta_p.at<float>(1,0));
R2(2,0) = -1.0*(R2(0,2) = delta_p.at<float>(2,0));
R2(0,1) = -1.0*(R2(1,0) = delta_p.at<float>(3,0));
// Make sure it's orthonormal
Orthonormalise(R2);
// Combine rotations
cv::Matx33f R3 = R1 *R2;
// Extract euler angle (through axis angle first to make sure it's legal)
cv::Vec3f axis_angle = Utilities::RotationMatrix2AxisAngle(R3);
cv::Vec3f euler = Utilities::AxisAngle2Euler(axis_angle);
// Temporary fix to numerical instability
if (std::isnan(euler[0]) || std::isnan(euler[1]) || std::isnan(euler[2]))
{
euler[0] = 0;
euler[1] = 0;
euler[2] = 0;
}
params_global[1] = euler[0];
params_global[2] = euler[1];
params_global[3] = euler[2];
// Local parameter update, just simple addition
if(delta_p.rows > 6)
{
params_local = params_local + delta_p(cv::Rect(0,6,1, this->NumberOfModes()));
}
}
void PDM::CalcParams(cv::Vec6f& out_params_global, cv::Mat_<float>& out_params_local, const cv::Mat_<float> & landmark_locations, const cv::Vec3f rotation)
{
int m = this->NumberOfModes();
int n = this->NumberOfPoints();
cv::Mat_<int> visi_ind_2D(n * 2, 1, 1);
cv::Mat_<int> visi_ind_3D(3 * n , 1, 1);
int visi_count = n;
for(int i = 0; i < n; ++i)
{
// If the landmark is invisible indicate this
if(landmark_locations.at<float>(i) == 0)
{
visi_ind_2D.at<int>(i) = 0;
visi_ind_2D.at<int>(i+n) = 0;
visi_ind_3D.at<int>(i) = 0;
visi_ind_3D.at<int>(i+n) = 0;
visi_ind_3D.at<int>(i+2*n) = 0;
visi_count--;
}
}
// As not all landmarks might be visible, subsample the Mean and principal component matrices
cv::Mat_<float> M(visi_count * 3, mean_shape.cols, 0.0);
cv::Mat_<float> V(visi_count * 3, princ_comp.cols, 0.0);
visi_count = 0;
for (int i = 0; i < n * 3; ++i)
{
if (visi_ind_3D.at<int>(i) == 1)
{
this->mean_shape.row(i).copyTo(M.row(visi_count));
this->princ_comp.row(i).copyTo(V.row(visi_count));
visi_count++;
}
}
cv::Mat_<float> m_old = this->mean_shape.clone();
cv::Mat_<float> v_old = this->princ_comp.clone();
this->mean_shape = M;
this->princ_comp = V;
// The new number of points
n = M.rows / 3;
// Extract the relevant landmark locations
cv::Mat_<float> landmark_locs_vis(n*2, 1, 0.0f);
int k = 0;
for(int i = 0; i < visi_ind_2D.rows; ++i)
{
if(visi_ind_2D.at<int>(i) == 1)
{
landmark_locs_vis.at<float>(k) = landmark_locations.at<float>(i);
k++;
}
}
// Compute the initial global parameters
float min_x, max_x, min_y, max_y;
ExtractBoundingBox(landmark_locs_vis, min_x, max_x, min_y, max_y);
float width = abs(min_x - max_x);
float height = abs(min_y - max_y);
cv::Rect_<float> model_bbox;
CalcBoundingBox(model_bbox, cv::Vec6f(1.0, 0.0, 0.0, 0.0, 0.0, 0.0), cv::Mat_<float>(this->NumberOfModes(), 1, 0.0));
cv::Rect_<float> bbox(min_x, min_y, width, height);
float scaling = ((width / model_bbox.width) + (height / model_bbox.height)) / 2.0f;
cv::Vec3f rotation_init = rotation;
cv::Matx33f R = Utilities::Euler2RotationMatrix(rotation_init);
cv::Vec2f translation((min_x + max_x) / 2.0f, (min_y + max_y) / 2.0f);
cv::Mat_<float> loc_params(this->NumberOfModes(),1, 0.0);
cv::Vec6f glob_params(scaling, rotation_init[0], rotation_init[1], rotation_init[2], translation[0], translation[1]);
// get the 3D shape of the object
cv::Mat_<float> shape_3D = M + V * loc_params;
cv::Mat_<float> curr_shape(2*n, 1);
// for every vertex
for(int i = 0; i < n; i++)
{
// Transform this using the weak-perspective mapping to 2D from 3D
curr_shape.at<float>(i ,0) = scaling * ( R(0,0) * shape_3D.at<float>(i, 0) + R(0,1) * shape_3D.at<float>(i+n ,0) + R(0,2) * shape_3D.at<float>(i+n*2,0) ) + translation[0];
curr_shape.at<float>(i+n,0) = scaling * ( R(1,0) * shape_3D.at<float>(i, 0) + R(1,1) * shape_3D.at<float>(i+n ,0) + R(1,2) * shape_3D.at<float>(i+n*2,0) ) + translation[1];
}
float currError = cv::norm(curr_shape - landmark_locs_vis);
cv::Mat_<float> regularisations = cv::Mat_<float>::zeros(1, 6 + m);
float reg_factor = 1;
// Setting the regularisation to the inverse of eigenvalues
cv::Mat(reg_factor / this->eigen_values).copyTo(regularisations(cv::Rect(6, 0, m, 1)));
regularisations = cv::Mat::diag(regularisations.t());
cv::Mat_<float> WeightMatrix = cv::Mat_<float>::eye(n*2, n*2);
int not_improved_in = 0;
for (size_t i = 0; i < 1000; ++i)
{
// get the 3D shape of the object
shape_3D = M + V * loc_params;
shape_3D = shape_3D.reshape(1, 3);
cv::Matx23f R_2D(R(0,0), R(0,1), R(0,2), R(1,0), R(1,1), R(1,2));
cv::Mat_<float> curr_shape_2D = scaling * shape_3D.t() * cv::Mat(R_2D).t();
curr_shape_2D.col(0) = curr_shape_2D.col(0) + translation(0);
curr_shape_2D.col(1) = curr_shape_2D.col(1) + translation(1);
curr_shape_2D = cv::Mat(curr_shape_2D.t()).reshape(1, n * 2);
cv::Mat_<float> error_resid;
cv::Mat(landmark_locs_vis - curr_shape_2D).convertTo(error_resid, CV_32F);
cv::Mat_<float> J, J_w_t;
this->ComputeJacobian(loc_params, glob_params, J, WeightMatrix, J_w_t);
// projection of the meanshifts onto the jacobians (using the weighted Jacobian, see Baltrusaitis 2013)
cv::Mat_<float> J_w_t_m = J_w_t * error_resid;
// Add the regularisation term
J_w_t_m(cv::Rect(0,6,1, m)) = J_w_t_m(cv::Rect(0,6,1, m)) - regularisations(cv::Rect(6,6, m, m)) * loc_params;
cv::Mat_<float> Hessian = regularisations.clone();
// Perform matrix multiplication in OpenBLAS (fortran call)
float alpha1 = 1.0;
float beta1 = 1.0;
char N[2]; N[0] = 'N';
sgemm_(N, N, &J.cols, &J_w_t.rows, &J_w_t.cols, &alpha1, (float*)J.data, &J.cols, (float*)J_w_t.data, &J_w_t.cols, &beta1, (float*)Hessian.data, &J.cols);
// Above is a fast (but ugly) version of
// cv::Mat_<float> Hessian2 = J_w_t * J + regularisations;
// Solve for the parameter update (from Baltrusaitis 2013 based on eq (36) Saragih 2011)
cv::Mat_<float> param_update;
cv::solve(Hessian, J_w_t_m, param_update, cv::DECOMP_CHOLESKY);
// To not overshoot, have the gradient decent rate a bit smaller
param_update = 0.75 * param_update;
UpdateModelParameters(param_update, loc_params, glob_params);
scaling = glob_params[0];
rotation_init[0] = glob_params[1];
rotation_init[1] = glob_params[2];
rotation_init[2] = glob_params[3];
translation[0] = glob_params[4];
translation[1] = glob_params[5];
R = Utilities::Euler2RotationMatrix(rotation_init);
R_2D(0,0) = R(0,0);R_2D(0,1) = R(0,1); R_2D(0,2) = R(0,2);
R_2D(1,0) = R(1,0);R_2D(1,1) = R(1,1); R_2D(1,2) = R(1,2);
curr_shape_2D = scaling * shape_3D.t() * cv::Mat(R_2D).t();
curr_shape_2D.col(0) = curr_shape_2D.col(0) + translation(0);
curr_shape_2D.col(1) = curr_shape_2D.col(1) + translation(1);
curr_shape_2D = cv::Mat(curr_shape_2D.t()).reshape(1, n * 2);
float error = cv::norm(curr_shape_2D - landmark_locs_vis);
if(0.999 * currError < error)
{
not_improved_in++;
if (not_improved_in == 3)
{
break;
}
}
currError = error;
}
out_params_global = glob_params;
out_params_local = loc_params;
this->mean_shape = m_old;
this->princ_comp = v_old;
}
bool PDM::Read(std::string location)
{
std::ifstream pdmLoc(location, std::ios_base::in);
if (!pdmLoc.is_open())
{
return false;
}
LandmarkDetector::SkipComments(pdmLoc);
// Reading mean values
cv::Mat_<double> mean_shape_d;
LandmarkDetector::ReadMat(pdmLoc, mean_shape_d);
mean_shape_d.convertTo(mean_shape, CV_32F); // Moving things to floats for speed
LandmarkDetector::SkipComments(pdmLoc);
// Reading principal components
cv::Mat_<double> princ_comp_d;
LandmarkDetector::ReadMat(pdmLoc, princ_comp_d);
princ_comp_d.convertTo(princ_comp, CV_32F);
LandmarkDetector::SkipComments(pdmLoc);
// Reading eigenvalues
cv::Mat_<double> eigen_values_d;
LandmarkDetector::ReadMat(pdmLoc, eigen_values_d);
eigen_values_d.convertTo(eigen_values, CV_32F);
return true;
}

View File

@@ -0,0 +1,696 @@
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2017, Carnegie Mellon University and University of Cambridge,
// all rights reserved.
//
// ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
//
// BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT.
// IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
//
// License can be found in OpenFace-license.txt
//
// * Any publications arising from the use of this software, including but
// not limited to academic journal and conference publications, technical
// reports and manuals, must cite at least one of the following works:
//
// OpenFace 2.0: Facial Behavior Analysis Toolkit
// Tadas Baltrušaitis, Amir Zadeh, Yao Chong Lim, and Louis-Philippe Morency
// in IEEE International Conference on Automatic Face and Gesture Recognition, 2018
//
// Convolutional experts constrained local model for facial landmark detection.
// A. Zadeh, T. Baltrušaitis, and Louis-Philippe Morency,
// in Computer Vision and Pattern Recognition Workshops, 2017.
//
// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation
// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling
// in IEEE International. Conference on Computer Vision (ICCV), 2015
//
// Cross-dataset learning and person-specific normalisation for automatic Action Unit detection
// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson
// in Facial Expression Recognition and Analysis Challenge,
// IEEE International Conference on Automatic Face and Gesture Recognition, 2015
//
///////////////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "Patch_experts.h"
#include "RotationHelpers.h"
// Math includes
#define _USE_MATH_DEFINES
#include <cmath>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
#include "LandmarkDetectorUtils.h"
using namespace LandmarkDetector;
// A copy constructor
Patch_experts::Patch_experts(const Patch_experts& other) : patch_scaling(other.patch_scaling), centers(other.centers), svr_expert_intensity(other.svr_expert_intensity),
ccnf_expert_intensity(other.ccnf_expert_intensity), cen_expert_intensity(other.cen_expert_intensity),
early_term_weights(other.early_term_weights), early_term_biases(other.early_term_biases), early_term_cutoffs(other.early_term_cutoffs),
mirror_inds(other.mirror_inds),mirror_views(other.mirror_views)
{
// Make sure the matrices are allocated properly
this->sigma_components.resize(other.sigma_components.size());
for (size_t i = 0; i < other.sigma_components.size(); ++i)
{
this->sigma_components[i].resize(other.sigma_components[i].size());
for (size_t j = 0; j < other.sigma_components[i].size(); ++j)
{
// Make sure the matrix is copied.
this->sigma_components[i][j] = other.sigma_components[i][j].clone();
}
}
// Make sure the matrices are allocated properly
this->visibilities.resize(other.visibilities.size());
for (size_t i = 0; i < other.visibilities.size(); ++i)
{
this->visibilities[i].resize(other.visibilities[i].size());
for (size_t j = 0; j < other.visibilities[i].size(); ++j)
{
// Make sure the matrix is copied.
this->visibilities[i][j] = other.visibilities[i][j].clone();
}
}
preallocated_im2col.resize(other.preallocated_im2col.size());
}
// Returns indices to landmarks that need to have patch responses computed (omits mirrored frontal landmarks for CEN as they will be computed together with their mirrored pair)
std::vector<int> Patch_experts::Collect_visible_landmarks(std::vector<std::vector<cv::Mat_<int> > > visibilities, int scale, int view_id, int n)
{
std::vector<int> vis_lmk;
for (int i = 0; i < n; i++)
{
if (visibilities[scale][view_id].rows == n)
{
if (visibilities[scale][view_id].at<int>(i, 0) != 0)
{
// For CEN patch experts and frontal views skip the mirror indices
if (!cen_expert_intensity.empty())
{
// If frontal view we can do mirrored landmarks together
if (view_id == 0)
{
// If the patch expert does not have values, means it's a mirrored version and will be done in another part of a loop
if (!cen_expert_intensity[scale][view_id][i].biases.empty())
{
vis_lmk.push_back(i);
}
}
else
{
vis_lmk.push_back(i);
}
}
else
{
vis_lmk.push_back(i);
}
}
}
}
return vis_lmk;
}
// Returns the patch expert responses given a grayscale image.
// Additionally returns the transform from the image coordinates to the response coordinates (and vice versa).
// The computation also requires the current landmark locations to compute response around, the PDM corresponding to the desired model, and the parameters describing its instance
// Also need to provide the size of the area of interest and the desired scale of analysis
void Patch_experts::Response(std::vector<cv::Mat_<float> >& patch_expert_responses, cv::Matx22f& sim_ref_to_img,
cv::Matx22f& sim_img_to_ref, const cv::Mat_<float>& grayscale_image, const PDM& pdm, const cv::Vec6f& params_global,
const cv::Mat_<float>& params_local, int window_size, int scale)
{
int view_id = GetViewIdx(params_global, scale);
int n = pdm.NumberOfPoints();
// Compute the current landmark locations (around which responses will be computed)
cv::Mat_<float> landmark_locations;
pdm.CalcShape2D(landmark_locations, params_local, params_global);
cv::Mat_<float> reference_shape;
// Initialise the reference shape on which we'll be warping
cv::Vec6f global_ref(patch_scaling[scale], 0, 0, 0, 0, 0);
// Compute the reference shape
pdm.CalcShape2D(reference_shape, params_local, global_ref);
// similarity and inverse similarity transform to and from image and reference shape
cv::Mat_<float> reference_shape_2D = (reference_shape.reshape(1, 2).t());
cv::Mat_<float> image_shape_2D = landmark_locations.reshape(1, 2).t();
sim_img_to_ref = Utilities::AlignShapesWithScale(image_shape_2D, reference_shape_2D);
sim_ref_to_img = sim_img_to_ref.inv(cv::DECOMP_LU);
float a1 = sim_ref_to_img(0, 0);
float b1 = -sim_ref_to_img(0, 1);
bool use_ccnf = !this->ccnf_expert_intensity.empty();
bool use_cen = !this->cen_expert_intensity.empty();
// If using CCNF patch experts might need to precalculate Sigmas
if (use_ccnf)
{
std::vector<cv::Mat_<float> > sigma_components;
// Retrieve the correct sigma component size
for (size_t w_size = 0; w_size < this->sigma_components.size(); ++w_size)
{
if (!this->sigma_components[w_size].empty())
{
if (window_size*window_size == this->sigma_components[w_size][0].rows)
{
sigma_components = this->sigma_components[w_size];
}
}
}
// Go through all of the landmarks and compute the Sigma for each
for (int lmark = 0; lmark < n; lmark++)
{
// Only for visible landmarks
if (visibilities[scale][view_id].at<int>(lmark, 0))
{
// Precompute sigmas if they are not computed yet
ccnf_expert_intensity[scale][view_id][lmark].ComputeSigmas(sigma_components, window_size);
}
}
}
// If using CEN precalculate interpolation matrix
cv::Mat_<float> interp_mat;
if (use_cen)
{
// Assuming the same size for all experts
int support_region = 11;
int area_of_interest_width = window_size + support_region - 1;
int area_of_interest_height = window_size + support_region - 1;
int resp_size = area_of_interest_height - support_region + 1;
interpolationMatrix(interp_mat, resp_size, resp_size, area_of_interest_width, area_of_interest_height);
}
// We do not want to create threads for invisible landmarks, so construct an index of visible ones
std::vector<int> vis_lmk = Collect_visible_landmarks(visibilities, scale, view_id, n);
// calculate the patch responses for every landmark (this is the heavy lifting of landmark detection)
parallel_for_(cv::Range(0, vis_lmk.size()), [&](const cv::Range& range) {
for (int i = range.start; i < range.end; i++)
{
// Work out how big the area of interest has to be to get a response of window size
int area_of_interest_width;
int area_of_interest_height;
int ind = vis_lmk.at(i);
if (use_cen)
{
area_of_interest_width = window_size + cen_expert_intensity[scale][view_id][ind].width_support - 1;
area_of_interest_height = window_size + cen_expert_intensity[scale][view_id][ind].height_support - 1;
}
else if (use_ccnf)
{
area_of_interest_width = window_size + ccnf_expert_intensity[scale][view_id][ind].width - 1;
area_of_interest_height = window_size + ccnf_expert_intensity[scale][view_id][ind].height - 1;
}
else
{
area_of_interest_width = window_size + svr_expert_intensity[scale][view_id][ind].width - 1;
area_of_interest_height = window_size + svr_expert_intensity[scale][view_id][ind].height - 1;
}
// scale and rotate to mean shape to reference frame
cv::Mat sim = (cv::Mat_<float>(2, 3) << a1, -b1, landmark_locations.at<float>(ind, 0) - a1 * (area_of_interest_width - 1.0f) / 2.0f + b1 * (area_of_interest_width - 1.0f) / 2.0f, b1, a1, landmark_locations.at<float>(ind + n, 0) - a1 * (area_of_interest_width - 1.0f) / 2.0f - b1 * (area_of_interest_width - 1.0f) / 2.0f);
// Extract the region of interest around the current landmark location
cv::Mat_<float> area_of_interest(area_of_interest_height, area_of_interest_width, 0.0f);
cv::warpAffine(grayscale_image, area_of_interest, sim, area_of_interest.size(), cv::WARP_INVERSE_MAP + cv::INTER_LINEAR);
// Get intensity response either from the SVR, CCNF, or CEN patch experts (prefer CEN as they are the most accurate so far)
if (!cen_expert_intensity.empty())
{
int im2col_size = (area_of_interest_width * area_of_interest_height - 1) / 2;
cv::Mat_<float> prealloc_mat = preallocated_im2col[ind][im2col_size];
// If frontal view we can do mirrored landmarks together
if (view_id == 0)
{
// If the patch expert does not have values, means it's a mirrored version and will be done in another part of a loop
if (!cen_expert_intensity[scale][view_id][ind].biases.empty())
{
// No mirrored expert, so do normally
int mirror_id = mirror_inds.at<int>(ind);
if (mirror_id == ind)
{
cv::Mat_<float> empty(0, 0, 0.0f);
cen_expert_intensity[scale][view_id][ind].ResponseSparse(area_of_interest, empty, patch_expert_responses[ind], empty, interp_mat, prealloc_mat, empty);
}
else
{
// Grab mirrored area of interest
// scale and rotate to mean shape to reference frame
cv::Mat sim_r = (cv::Mat_<float>(2, 3) << a1, -b1, landmark_locations.at<float>(mirror_id, 0) - a1 * (area_of_interest_width - 1.0f) / 2.0f + b1 * (area_of_interest_width - 1.0f) / 2.0f, b1, a1, landmark_locations.at<float>(mirror_id + n, 0) - a1 * (area_of_interest_width - 1.0f) / 2.0f - b1 * (area_of_interest_width - 1.0f) / 2.0f);
// Extract the region of interest around the current landmark location
cv::Mat_<float> area_of_interest_r(area_of_interest_height, area_of_interest_width, 0.0f);
cv::warpAffine(grayscale_image, area_of_interest_r, sim_r, area_of_interest_r.size(), cv::WARP_INVERSE_MAP + cv::INTER_LINEAR);
cv::Mat_<float> prealloc_mat_right = preallocated_im2col[mirror_id][im2col_size];
cen_expert_intensity[scale][view_id][ind].ResponseSparse(area_of_interest, area_of_interest_r, patch_expert_responses[ind], patch_expert_responses[mirror_id], interp_mat, prealloc_mat, prealloc_mat_right);
preallocated_im2col[mirror_id][im2col_size] = prealloc_mat_right;
}
}
}
else
{
// For space and memory saving use a mirrored patch expert
if (!cen_expert_intensity[scale][view_id][ind].biases.empty())
{
cv::Mat_<float> empty(0, 0, 0.0f);
cen_expert_intensity[scale][view_id][ind].ResponseSparse(area_of_interest, empty, patch_expert_responses[ind], empty, interp_mat, prealloc_mat, empty);
// A slower, but slightly more accurate version
//cen_expert_intensity[scale][view_id][ind].Response(area_of_interest, patch_expert_responses[ind]);
}
else
{
cv::Mat_<float> empty(0, 0, 0.0f);
cen_expert_intensity[scale][mirror_views.at<int>(view_id)][mirror_inds.at<int>(ind)].ResponseSparse(empty, area_of_interest, empty, patch_expert_responses[ind], interp_mat, empty, prealloc_mat);
}
}
preallocated_im2col[ind][im2col_size] = prealloc_mat;
}
else if (!ccnf_expert_intensity.empty())
{
// get the correct size response window
patch_expert_responses[ind] = cv::Mat_<float>(window_size, window_size);
int im2col_size = area_of_interest_width * area_of_interest_height;
cv::Mat_<float> prealloc_mat = preallocated_im2col[ind][im2col_size];
ccnf_expert_intensity[scale][view_id][ind].ResponseOpenBlas(area_of_interest, patch_expert_responses[ind], prealloc_mat);
preallocated_im2col[ind][im2col_size] = prealloc_mat;
// Below is an alternative way to compute the same, but that uses FFT instead of OpenBLAS
// ccnf_expert_intensity[scale][view_id][ind].Response(area_of_interest, patch_expert_responses[ind]);
}
else
{
// get the correct size response window
patch_expert_responses[ind] = cv::Mat_<float>(window_size, window_size);
svr_expert_intensity[scale][view_id][ind].Response(area_of_interest, patch_expert_responses[ind]);
}
}
});
}
//=============================================================================
// Getting the closest view center based on orientation
int Patch_experts::GetViewIdx(const cv::Vec6f& params_global, int scale) const
{
int idx = 0;
float dbest;
for(int i = 0; i < this->nViews(scale); i++)
{
float v1 = params_global[1] - centers[scale][i][0];
float v2 = params_global[2] - centers[scale][i][1];
float v3 = params_global[3] - centers[scale][i][2];
float d = v1*v1 + v2*v2 + v3*v3;
if(i == 0 || d < dbest)
{
dbest = d;
idx = i;
}
}
return idx;
}
//===========================================================================
bool Patch_experts::Read(std::vector<std::string> intensity_svr_expert_locations, std::vector<std::string> intensity_ccnf_expert_locations,
std::vector<std::string> intensity_cen_expert_locations, std::string early_term_loc)
{
// initialise the SVR intensity patch expert parameters
int num_intensity_svr = intensity_svr_expert_locations.size();
centers.resize(num_intensity_svr);
visibilities.resize(num_intensity_svr);
patch_scaling.resize(num_intensity_svr);
svr_expert_intensity.resize(num_intensity_svr);
// Reading in SVR intensity patch experts for each scales it is defined in
for(int scale = 0; scale < num_intensity_svr; ++scale)
{
std::string location = intensity_svr_expert_locations[scale];
std::cout << "Reading the intensity SVR patch experts from: " << location << "....";
bool success_read = Read_SVR_patch_experts(location, centers[scale], visibilities[scale], svr_expert_intensity[scale], patch_scaling[scale]);
if (!success_read)
{
return false;
}
}
// Initialise and read CCNF patch experts (currently only intensity based),
int num_intensity_ccnf = intensity_ccnf_expert_locations.size();
// CCNF experts override the SVR ones
if(num_intensity_ccnf > 0)
{
centers.resize(num_intensity_ccnf);
visibilities.resize(num_intensity_ccnf);
patch_scaling.resize(num_intensity_ccnf);
ccnf_expert_intensity.resize(num_intensity_ccnf);
}
for(int scale = 0; scale < num_intensity_ccnf; ++scale)
{
std::string location = intensity_ccnf_expert_locations[scale];
std::cout << "Reading the intensity CCNF patch experts from: " << location << "....";
bool success_read = Read_CCNF_patch_experts(location, centers[scale], visibilities[scale], ccnf_expert_intensity[scale], patch_scaling[scale]);
if (!success_read)
{
return false;
}
if (scale == 0)
{
preallocated_im2col.resize(ccnf_expert_intensity[0][0].size());
}
}
// Initialise and read CEN patch experts (currently only intensity based),
int num_intensity_cen = intensity_cen_expert_locations.size();
// CEN experts override the SVR and CCNF ones
if (num_intensity_cen > 0)
{
centers.resize(num_intensity_cen);
visibilities.resize(num_intensity_cen);
patch_scaling.resize(num_intensity_cen);
cen_expert_intensity.resize(num_intensity_cen);
}
for (int scale = 0; scale < num_intensity_cen; ++scale)
{
std::string location = intensity_cen_expert_locations[scale];
std::cout << "Reading the intensity CEN patch experts from: " << location << "....";
bool success_read = Read_CEN_patch_experts(location, centers[scale], visibilities[scale], cen_expert_intensity[scale], patch_scaling[scale]);
if (!success_read)
{
return false;
}
if (scale == 0)
{
preallocated_im2col.resize(cen_expert_intensity[0][0].size());
}
}
// Reading in early termination parameters
if (!early_term_loc.empty())
{
std::ifstream earlyTermFile(early_term_loc.c_str(), std::ios_base::in);
if (!earlyTermFile.is_open())
{
return false;
}
// Reading in weights/biases/cutoffs
for (size_t i = 0; i < centers[0].size(); ++i)
{
double weight;
earlyTermFile >> weight;
early_term_weights.push_back(weight);
}
for (size_t i = 0; i < centers[0].size(); ++i)
{
double bias;
earlyTermFile >> bias;
early_term_biases.push_back(bias);
}
for (size_t i = 0; i < centers[0].size(); ++i)
{
double cutoff;
earlyTermFile >> cutoff;
early_term_cutoffs.push_back(cutoff);
}
}
return true;
}
//======================= Reading the SVR patch experts =========================================//
bool Patch_experts::Read_SVR_patch_experts(std::string expert_location, std::vector<cv::Vec3d>& centers,
std::vector<cv::Mat_<int> >& visibility, std::vector<std::vector<Multi_SVR_patch_expert> >& patches, double& scale)
{
std::ifstream patchesFile(expert_location.c_str(), std::ios_base::in);
if(patchesFile.is_open())
{
LandmarkDetector::SkipComments(patchesFile);
patchesFile >> scale;
LandmarkDetector::SkipComments(patchesFile);
int numberViews;
patchesFile >> numberViews;
// read the visibility
centers.resize(numberViews);
visibility.resize(numberViews);
patches.resize(numberViews);
LandmarkDetector::SkipComments(patchesFile);
// centers of each view (which view corresponds to which orientation)
for(size_t i = 0; i < centers.size(); i++)
{
cv::Mat center;
LandmarkDetector::ReadMat(patchesFile, center);
center.copyTo(centers[i]);
centers[i] = centers[i] * M_PI / 180.0;
}
LandmarkDetector::SkipComments(patchesFile);
// the visibility of points for each of the views (which verts are visible at a specific view
for(size_t i = 0; i < visibility.size(); i++)
{
LandmarkDetector::ReadMat(patchesFile, visibility[i]);
}
int numberOfPoints = visibility[0].rows;
LandmarkDetector::SkipComments(patchesFile);
// read the patches themselves
for(size_t i = 0; i < patches.size(); i++)
{
// number of patches for each view
patches[i].resize(numberOfPoints);
// read in each patch
for(int j = 0; j < numberOfPoints; j++)
{
patches[i][j].Read(patchesFile);
}
}
std::cout << "Done" << std::endl;
return true;
}
else
{
std::cout << "Can't find/open the patches file" << std::endl;
return false;
}
}
//======================= Reading the CCNF patch experts =========================================//
bool Patch_experts::Read_CCNF_patch_experts(std::string patchesFileLocation, std::vector<cv::Vec3d>& centers,
std::vector<cv::Mat_<int> >& visibility, std::vector<std::vector<CCNF_patch_expert> >& patches, double& patchScaling)
{
std::ifstream patchesFile(patchesFileLocation.c_str(), std::ios::in | std::ios::binary);
if(patchesFile.is_open())
{
patchesFile.read ((char*)&patchScaling, 8);
int numberViews;
patchesFile.read ((char*)&numberViews, 4);
// read the visibility
centers.resize(numberViews);
visibility.resize(numberViews);
patches.resize(numberViews);
// centers of each view (which view corresponds to which orientation)
for(size_t i = 0; i < centers.size(); i++)
{
cv::Mat center;
LandmarkDetector::ReadMatBin(patchesFile, center);
center.copyTo(centers[i]);
centers[i] = centers[i] * M_PI / 180.0;
}
// the visibility of points for each of the views (which verts are visible at a specific view
for(size_t i = 0; i < visibility.size(); i++)
{
LandmarkDetector::ReadMatBin(patchesFile, visibility[i]);
}
int numberOfPoints = visibility[0].rows;
// Read the possible SigmaInvs (without beta), this will be followed by patch reading (this assumes all of them have the same type, and number of betas)
int num_win_sizes;
int num_sigma_comp;
patchesFile.read ((char*)&num_win_sizes, 4);
std::vector<int> windows;
windows.resize(num_win_sizes);
std::vector<std::vector<cv::Mat_<float> > > sigma_components;
sigma_components.resize(num_win_sizes);
for (int w=0; w < num_win_sizes; ++w)
{
patchesFile.read ((char*)&windows[w], 4);
patchesFile.read ((char*)&num_sigma_comp, 4);
sigma_components[w].resize(num_sigma_comp);
for(int s=0; s < num_sigma_comp; ++s)
{
LandmarkDetector::ReadMatBin(patchesFile, sigma_components[w][s]);
}
}
this->sigma_components = sigma_components;
// read the patches themselves
for(size_t i = 0; i < patches.size(); i++)
{
// number of patches for each view
patches[i].resize(numberOfPoints);
// read in each patch
for(int j = 0; j < numberOfPoints; j++)
{
patches[i][j].Read(patchesFile, windows, sigma_components);
}
}
std::cout << "Done" << std::endl;
return true;
}
else
{
std::cout << "Can't find/open the patches file" << std::endl;
return false;
}
}
//======================= Reading the CEN patch experts =========================================//
bool Patch_experts::Read_CEN_patch_experts(std::string expert_location, std::vector<cv::Vec3d>& centers,
std::vector<cv::Mat_<int> >& visibility, std::vector<std::vector<CEN_patch_expert> >& patches, double& scale)
{
std::ifstream patchesFile(expert_location.c_str(), std::ios::in | std::ios::binary);
if (patchesFile.is_open())
{
patchesFile.read((char*)&scale, 8);
int numberViews;
patchesFile.read((char*)&numberViews, 4);
// read the visibility
centers.resize(numberViews);
visibility.resize(numberViews);
patches.resize(numberViews);
// centers of each view (which view corresponds to which orientation)
for (size_t i = 0; i < centers.size(); i++)
{
cv::Mat center;
LandmarkDetector::ReadMatBin(patchesFile, center);
center.copyTo(centers[i]);
centers[i] = centers[i] * M_PI / 180.0;
}
// the visibility of points for each of the views (which verts are visible at a specific view
for (size_t i = 0; i < visibility.size(); i++)
{
LandmarkDetector::ReadMatBin(patchesFile, visibility[i]);
}
int numberOfPoints = visibility[0].rows;
LandmarkDetector::ReadMatBin(patchesFile, mirror_inds);
LandmarkDetector::ReadMatBin(patchesFile, mirror_views);
// read the patches themselves
for (size_t i = 0; i < patches.size(); i++)
{
// number of patches for each view
patches[i].resize(numberOfPoints);
// read in each patch
for (int j = 0; j < numberOfPoints; j++)
{
patches[i][j].Read(patchesFile);
}
}
std::cout << "Done" << std::endl;
return true;
}
else
{
std::cout << "Could not find CEN patch experts, for instructions of how to download them, see https://github.com/TadasBaltrusaitis/OpenFace/wiki/Model-download \n" << std::endl;
return false;
}
}

View File

@@ -0,0 +1,337 @@
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2017, Carnegie Mellon University and University of Cambridge,
// all rights reserved.
//
// ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
//
// BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT.
// IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
//
// License can be found in OpenFace-license.txt
//
// * Any publications arising from the use of this software, including but
// not limited to academic journal and conference publications, technical
// reports and manuals, must cite at least one of the following works:
//
// OpenFace 2.0: Facial Behavior Analysis Toolkit
// Tadas Baltrušaitis, Amir Zadeh, Yao Chong Lim, and Louis-Philippe Morency
// in IEEE International Conference on Automatic Face and Gesture Recognition, 2018
//
// Convolutional experts constrained local model for facial landmark detection.
// A. Zadeh, T. Baltrušaitis, and Louis-Philippe Morency,
// in Computer Vision and Pattern Recognition Workshops, 2017.
//
// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation
// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling
// in IEEE International. Conference on Computer Vision (ICCV), 2015
//
// Cross-dataset learning and person-specific normalisation for automatic Action Unit detection
// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson
// in Facial Expression Recognition and Analysis Challenge,
// IEEE International Conference on Automatic Face and Gesture Recognition, 2015
//
///////////////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "SVR_patch_expert.h"
// OpenCV include
#include <opencv2/core/core.hpp>
#include <opencv2/imgproc.hpp>
#include "LandmarkDetectorUtils.h"
using namespace LandmarkDetector;
//===========================================================================
// Computing the image gradient
void Grad(const cv::Mat& im, cv::Mat& grad)
{
/*float filter[3] = {1, 0, -1};
float dfilter[1] = {1};
cv::Mat filterX = cv::Mat(1,3,CV_32F, filter).clone();
cv::Mat filterY = cv::Mat(1,1,CV_32F, dfilter).clone();
cv::Mat gradX;
cv::Mat gradY;
cv::sepFilter2D(im, gradX, CV_32F, filterY, filterX, cv::Point(-1,-1), 0);
cv::sepFilter2D(im, gradY, CV_32F, filterX.t(), filterY, cv::Point(-1,-1), 0);
cv::pow(gradX,2, gradX);
cv::pow(gradY,2, gradY);
grad = gradX + gradY;
grad.row(0).setTo(0);
grad.col(0).setTo(0);
grad.col(grad.cols-1).setTo(0);
grad.row(grad.rows-1).setTo(0); */
// A quicker alternative
int x,y,h = im.rows,w = im.cols;
float vx,vy;
// Initialise the gradient
grad.create(im.size(), CV_32F);
grad.setTo(0.0f);
cv::MatIterator_<float> gp = grad.begin<float>() + w+1;
cv::MatConstIterator_<float> px1 = im.begin<float>() + w+2;
cv::MatConstIterator_<float> px2 = im.begin<float>() + w;
cv::MatConstIterator_<float> py1 = im.begin<float>() + 2*w+1;
cv::MatConstIterator_<float> py2 = im.begin<float>() + 1;
for(y = 1; y < h-1; y++)
{
for(x = 1; x < w-1; x++)
{
vx = *px1++ - *px2++;
vy = *py1++ - *py2++;
*gp++ = vx*vx + vy*vy;
}
px1 += 2;
px2 += 2;
py1 += 2;
py2 += 2;
gp += 2;
}
}
// A copy constructor
SVR_patch_expert::SVR_patch_expert(const SVR_patch_expert& other) : weights(other.weights.clone())
{
this->type = other.type;
this->scaling = other.scaling;
this->bias = other.bias;
this->confidence = other.confidence;
for (std::map<int, cv::Mat_<double> >::const_iterator it = other.weights_dfts.begin(); it != other.weights_dfts.end(); it++)
{
// Make sure the matrix is copied.
this->weights_dfts.insert(std::pair<int, cv::Mat>(it->first, it->second.clone()));
}
}
//===========================================================================
void SVR_patch_expert::Read(std::ifstream &stream)
{
// A sanity check when reading patch experts
int read_type;
stream >> read_type;
assert(read_type == 2);
stream >> type >> confidence >> scaling >> bias;
LandmarkDetector::ReadMat(stream, weights);
// OpenCV and Matlab matrix cardinality is different, hence the transpose
weights = weights.t();
}
//===========================================================================
void SVR_patch_expert::Response(const cv::Mat_<float>& area_of_interest, cv::Mat_<float>& response)
{
int response_height = area_of_interest.rows - weights.rows + 1;
int response_width = area_of_interest.cols - weights.cols + 1;
// the patch area on which we will calculate reponses
cv::Mat_<float> normalised_area_of_interest;
if(response.rows != response_height || response.cols != response_width)
{
response.create(response_height, response_width);
}
// If type is raw just normalise mean and standard deviation
if(type == 0)
{
// Perform normalisation across whole patch
cv::Scalar mean;
cv::Scalar std;
cv::meanStdDev(area_of_interest, mean, std);
// Avoid division by zero
if(std[0] == 0)
{
std[0] = 1;
}
normalised_area_of_interest = (area_of_interest - mean[0]) / std[0];
}
// If type is gradient, perform the image gradient computation
else if(type == 1)
{
Grad(area_of_interest, normalised_area_of_interest);
}
else
{
printf("ERROR(%s,%d): Unsupported patch type %d!\n", __FILE__,__LINE__, type);
abort();
}
cv::Mat_<float> svr_response;
// The empty matrix as we don't pass precomputed dft's of image
cv::Mat_<double> empty_matrix_0(0,0,0.0);
cv::Mat_<float> empty_matrix_1(0,0,0.0);
cv::Mat_<float> empty_matrix_2(0,0,0.0);
// Efficient calc of patch expert SVR response across the area of interest
matchTemplate_m(normalised_area_of_interest, empty_matrix_0, empty_matrix_1, empty_matrix_2, weights, weights_dfts, svr_response, cv::TM_CCOEFF_NORMED);
response.create(svr_response.size());
cv::MatIterator_<float> p = response.begin();
cv::MatIterator_<float> q1 = svr_response.begin(); // respone for each pixel
cv::MatIterator_<float> q2 = svr_response.end();
while(q1 != q2)
{
// the SVR response passed into logistic regressor
*p++ = 1.0/(1.0 + exp( -(*q1++ * scaling + bias )));
}
}
void SVR_patch_expert::ResponseDepth(const cv::Mat_<float>& area_of_interest, cv::Mat_<float> &response)
{
// How big the response map will be
int response_height = area_of_interest.rows - weights.rows + 1;
int response_width = area_of_interest.cols - weights.cols + 1;
// the patch area on which we will calculate reponses
cv::Mat_<float> normalised_area_of_interest;
if(response.rows != response_height || response.cols != response_width)
{
response.create(response_height, response_width);
}
if(type == 0)
{
// Perform normalisation across whole patch
cv::Scalar mean;
cv::Scalar std;
// ignore missing values
cv::Mat_<uchar> mask = area_of_interest > 0;
cv::meanStdDev(area_of_interest, mean, std, mask);
// if all values the same don't divide by 0
if(std[0] == 0)
{
std[0] = 1;
}
normalised_area_of_interest = (area_of_interest - mean[0]) / std[0];
// Set the invalid pixels to 0
normalised_area_of_interest.setTo(0, mask == 0);
}
else
{
printf("ERROR(%s,%d): Unsupported patch type %d!\n", __FILE__,__LINE__,type);
abort();
}
cv::Mat_<float> svr_response;
// The empty matrix as we don't pass precomputed dft's of image
cv::Mat_<double> empty_matrix_0(0,0,0.0);
cv::Mat_<float> empty_matrix_1(0,0,0.0);
cv::Mat_<float> empty_matrix_2(0,0,0.0);
// Efficient calc of patch expert response across the area of interest
matchTemplate_m(normalised_area_of_interest, empty_matrix_0, empty_matrix_1, empty_matrix_2, weights, weights_dfts, svr_response, cv::TM_CCOEFF);
response.create(svr_response.size());
cv::MatIterator_<float> p = response.begin();
cv::MatIterator_<float> q1 = svr_response.begin(); // respone for each pixel
cv::MatIterator_<float> q2 = svr_response.end();
while(q1 != q2)
{
// the SVR response passed through a logistic regressor
*p++ = 1.0/(1.0 + exp( -(*q1++ * scaling + bias )));
}
}
// Copy constructor
Multi_SVR_patch_expert::Multi_SVR_patch_expert(const Multi_SVR_patch_expert& other) : svr_patch_experts(other.svr_patch_experts)
{
this->width = other.width;
this->height = other.height;
}
//===========================================================================
void Multi_SVR_patch_expert::Read(std::ifstream &stream)
{
// A sanity check when reading patch experts
int type;
stream >> type;
assert(type == 3);
// The number of patch experts for this view (with different modalities)
int number_modalities;
stream >> width >> height >> number_modalities;
svr_patch_experts.resize(number_modalities);
for(int i = 0; i < number_modalities; i++)
svr_patch_experts[i].Read(stream);
}
//===========================================================================
void Multi_SVR_patch_expert::Response(const cv::Mat_<float> &area_of_interest, cv::Mat_<float> &response)
{
int response_height = area_of_interest.rows - height + 1;
int response_width = area_of_interest.cols - width + 1;
if(response.rows != response_height || response.cols != response_width)
{
response.create(response_height, response_width);
}
// For the purposes of the experiment only use the response of normal intensity, for fair comparison
if(svr_patch_experts.size() == 1)
{
svr_patch_experts[0].Response(area_of_interest, response);
}
else
{
// responses from multiple patch experts these can be gradients, LBPs etc.
response.setTo(1.0);
cv::Mat_<float> modality_resp(response_height, response_width);
for(size_t i = 0; i < svr_patch_experts.size(); i++)
{
svr_patch_experts[i].Response(area_of_interest, modality_resp);
response = response.mul(modality_resp);
}
}
}
void Multi_SVR_patch_expert::ResponseDepth(const cv::Mat_<float>& area_of_interest, cv::Mat_<float>& response)
{
int response_height = area_of_interest.rows - height + 1;
int response_width = area_of_interest.cols - width + 1;
if(response.rows != response_height || response.cols != response_width)
{
response.create(response_height, response_width);
}
// With depth patch experts only do raw data modality
svr_patch_experts[0].ResponseDepth(area_of_interest, response);
}
//===========================================================================

View File

@@ -0,0 +1,12 @@
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2017, Carnegie Mellon University and University of Cambridge,
// all rights reserved.
//
// ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
//
// BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT.
// IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
//
// License can be found in OpenFace-license.txt
//
#include "stdafx.h"