open source pkg v1

2020-08-04 19:12:31 -04:00
parent bef213dba9
commit c389fc2c47
3708 changed files with 1624220 additions and 1 deletions
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/CCRF_training_bfgs.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/CCRF_training_bfgs.m
@@ -0,0 +1,57 @@
+function [ alphas, betas, scaling, finalLikelihood] = CCRF_training_bfgs( num_seqs, thresholdX, thresholdFun, x, y, yUnnormed, alphas, betas, lambda_a, lambda_b, similarityFNs, Precalc_Bs, Precalc_Bs_flat, Precalc_yBys, varargin)
+%CCRF_TRAINING_BFGS Fits the CCRF alphas and betas with fmincon (interior-point, BFGS Hessian approximation)
+%   The negated regularised log-likelihood is minimised over params = [alphas; betas] with a lower bound of 0 on
+%   every parameter. thresholdX / thresholdFun become TolX / TolFun; yUnnormed is only used for the output scaling.
+%   Optional name/value pairs in varargin: 'PrecalcBs', 'PrecalcBsFlat' and 'Precalc_yBys' (used only when all three
+%   are present; the legacy key 'Precalc_yBy' is accepted as well), plus 'max_iter' for fmincon's MaxIter.
+%   NOTE(review): the positional Precalc_Bs, Precalc_Bs_flat and Precalc_yBys inputs are recomputed (overwritten)
+%   below whenever the three name/value pairs are not supplied - confirm that this is intended.
+
+    % The B^(k) components, their flattened form and yB^(k)y do not change through the iterations, so they can be predefined
+    yByArg = strcmp(varargin,'Precalc_yBys') | strcmp(varargin,'Precalc_yBy');
+    if(any(strcmp(varargin,'PrecalcBs')) && any(strcmp(varargin,'PrecalcBsFlat')) && any(yByArg))
+         
+        ind = find(strcmp(varargin,'PrecalcBs')) + 1;
+        Precalc_Bs = varargin{ind};
+
+        ind = find(strcmp(varargin,'PrecalcBsFlat')) + 1;
+        Precalc_Bs_flat = varargin{ind};
+        % same key set as tested in the condition above, so ind cannot be empty here
+        ind = find(yByArg, 1) + 1;
+        Precalc_yBys = varargin{ind};
+    else
+        % if these are not provided calculate them        
+        [ ~, Precalc_Bs, Precalc_Bs_flat, Precalc_yBys ] = CalculateSimilarities( num_seqs, x, similarityFNs, y);
+    end              
+    
+    params = [alphas; betas];
+    
+    objectiveFun = @(params)objectiveFunction(params, numel(alphas), lambda_a, lambda_b, Precalc_Bs, x, y, Precalc_yBys, Precalc_Bs_flat);
+
+    options = optimset('Algorithm','interior-point','GradObj','on', 'TolX', thresholdX, 'TolFun', thresholdFun, 'Hessian', 'bfgs', 'display','off', 'useParallel', 'Always');
+    
+    if(sum(strcmp(varargin,'max_iter'))) 
+        options.MaxIter = varargin{find(strcmp(varargin,'max_iter')) + 1};
+    end      
+
+    params = fmincon(objectiveFun, params, [], [],[],[], zeros(numel(params),1), Inf(numel(params), 1), [], options);
+    alphas = params(1:numel(alphas));
+    betas = params(numel(alphas)+1:end);
+
+    finalLikelihood = LogLikelihoodCCRF(y, x, alphas, betas, lambda_a, lambda_b, Precalc_Bs_flat);
+%     fprintf('Final log likelihood at iteration; logL %f, learning rate\n', finalLikelihood);
+    
+    % establish the scaling
+    scaling = getScaling2(alphas, betas, x, yUnnormed, Precalc_Bs);
+
+end
+
+function [loss, gradient] = objectiveFunction(params, numAlpha, lambda_a, lambda_b, PrecalcBs, x, y, Precalc_yBys, PrecalcBsFlat)
+    % Objective handed to the optimiser: negated log-likelihood and negated gradient at params
+    % (the first numAlpha entries of params are the alphas, the remaining ones the betas).
+    [ascentGrad, SigmaInvs, CholDecomps, Sigmas] = gradientCCRFFull(params, lambda_a, lambda_b, PrecalcBs, x, y, Precalc_yBys, PrecalcBsFlat);
+    % the per-sequence matrices returned by the gradient pass are forwarded to the likelihood call
+    logL = LogLikelihoodCCRF(y, x, params(1:numAlpha), params(numAlpha+1:end), lambda_a, lambda_b, PrecalcBsFlat, SigmaInvs, CholDecomps, Sigmas);
+    % the optimiser minimises, so the sign of both results is flipped
+    loss = -logL; gradient = -ascentGrad;
+end
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/CCRF_training_gradient_descent.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/CCRF_training_gradient_descent.m
@@ -0,0 +1,122 @@
+function [ alphas, betas, scaling, finalLikelihood] = CCRF_training_gradient_descent( nIterations, nExamples, learningRate, threshold, x, y, yUnnormed, masks, alphas, betas, lambda_a, lambda_b, similarityFNs, useIndicators, verbose)
+%CCRF_TRAINING_GRADIENT_DESCENT Performs CCRF gradient training on log(alphas)/log(betas), updating after every sequence
+%   Runs at most nIterations passes over the nExamples sequences with step size learningRate and stops early once the relative parameter change or the gradient norm drops below threshold.
+%   NOTE(review): LogLikelihoodCCRF is called below with masks and useIndicators arguments - confirm this matches the LogLikelihoodCCRF version on the path.
+
+    if(verbose)
+        logLikelihood = zeros(round(nIterations/10)+1, 1);
+        alphaTrack = zeros(nIterations, numel(alphas));
+        betaTrack = zeros(nIterations, numel(betas));
+    end
+
+    logAlphas = log(alphas);
+    logBetas = log(betas);
+
+    K = numel(similarityFNs);
+    
+    %calculate similarity measures for each of the sequences
+    Similarities = cell(nExamples, 1);
+    PrecalcQ2s = cell(nExamples,1);
+    PrecalcQ2sFlat = cell(nExamples,1);
+    
+    PrecalcYqDs = zeros(nExamples, K);
+    
+    for q = 1 : nExamples
+
+        yq = y{q};
+        xq = x{q};
+        mask = masks{q};
+        
+        n = size(yq, 1);
+        Similarities{q} = zeros([n, n, K]);
+%         PrecalcQ2s{q} = zeros([n, n, K]);
+        PrecalcQ2s{q} = cell(K,1);
+%         PrecalcQ2sFlat{q} = cell(K,1);
+        PrecalcQ2sFlat{q} = zeros((n*(n+1))/2,K);
+        % go over all of the similarity metrics and construct the
+        % similarity matrices
+        for k=1:K
+            Similarities{q}(:,:,k) = similarityFNs{k}(xq, mask);
+            S = Similarities{q}(:,:,k);
+            D =  diag(sum(S));
+            B = D - S;
+%             PrecalcQ2s{q}(:,:,k) = B;
+            PrecalcQ2s{q}{k} = B;
+%             PrecalcQ2sFlat{q}{k} = PrecalcQ2s{q}{k}(logical(tril(ones(size(S)))));
+            PrecalcQ2sFlat{q}(:,k) = B(logical(tril(ones(size(S)))));
+            PrecalcYqDs(q,k) = -yq'*B*yq;
+        end
+    end    
+    
+    %stochastic gradient descent
+    for iter = 1 : nIterations
+        prevAlphas = alphas;
+        prevBetas = betas;        
+
+        for q = 1 : nExamples
+
+            yq = y{q};
+            xq = x{q};
+            mask = masks{q};
+
+            PrecalcQ2 = PrecalcQ2s{q};
+            PrecalcQ2Flat = PrecalcQ2sFlat{q};
+            [ logGradientsAlphas, logGradientsBetas] = gradientCCRF(alphas, betas, lambda_a, lambda_b, PrecalcQ2, xq, yq, mask, PrecalcYqDs(q, :), useIndicators, PrecalcQ2Flat);
+            
+%             [logGradientAlphasAnalytical, logGradientBetasAnalytical] = gradientAnalytical(PrecalcQ2, alphas, betas, lambda, xq, yq, mask);
+%  
+%             diffInGradientsAlpha = mean(abs(logGradientsAlphas - logGradientAlphasAnalytical));
+%             diffInGradientsBeta = mean(abs(logGradientsBetas - logGradientBetasAnalytical));
+            
+            %update log alpha
+            logAlphas = logAlphas + learningRate * logGradientsAlphas;
+            alphas = exp(logAlphas);
+
+            %update log beta
+            logBetas = logBetas + learningRate * logGradientsBetas;
+            betas = exp(logBetas);
+
+            if(verbose)
+                %record alpha and beta values for each iteration for debug purposes
+                alphaTrack(iter,:) = alphas(:);
+                betaTrack(iter,:) = betas;
+            end
+        end
+
+        %check for convergence (the gradient test uses the gradients of the last processed sequence only)
+        if (norm([prevAlphas;prevBetas] - [alphas;betas])/norm([prevAlphas;prevBetas]) < threshold || norm([logGradientsAlphas;logGradientsBetas]) < threshold)
+            break;
+        end
+        
+        if(verbose)
+            if(mod(iter, 10)==0)
+                logLikelihood(iter/10 + 1) = LogLikelihoodCCRF(y, x, masks, alphas, betas, lambda_a, lambda_b, PrecalcQ2sFlat, useIndicators);
+                fprintf('Iteration %d; logL %f\n', iter, logLikelihood(iter/10 + 1));
+            end
+            
+        end
+    end
+
+    % establish the scaling
+    scaling = getScaling(alphas, betas, x, yUnnormed, masks, PrecalcQ2s, useIndicators);
+
+    if(verbose)  
+        figure
+        subplot(1,3,1)
+        plot(betaTrack(1:iter,:));
+        title('beta');
+        subplot(1,3,2)
+        plot(alphaTrack(1:iter,:))
+        title('alpha');
+        subplot(1,3,3)
+        plot(logLikelihood(1:round(iter/10),:))
+        title('log likelihood');
+        finalLikelihood = LogLikelihoodCCRF(y, x, masks, alphas, betas, lambda_a, lambda_b, PrecalcQ2sFlat, useIndicators);
+        fprintf('Final log likelihood at iteration %d; logL %f, learning rate %f\n', iter, finalLikelihood, learningRate);
+    else
+        finalLikelihood = LogLikelihoodCCRF(y, x, masks, alphas, betas, lambda_a, lambda_b, PrecalcQ2sFlat, useIndicators);
+        fprintf('Final log likelihood at iteration %d; logL %f, learning rate %f\n', iter, finalLikelihood, learningRate);
+    end
+    
+end
+
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/CalcSigmaCCRF.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/CalcSigmaCCRF.m
@@ -0,0 +1,50 @@
+function [ SigmaInv] = CalcSigmaCCRF(alphas, betas, precalcBwithoutBeta )
+%CALCSIGMACCRF Builds SigmaInv = 2 * (A + B) for a single sequence
+%   A = sum(alphas) * I;  B = sum over k of betas(k) * precalcBwithoutBeta{k} (cell array of n-by-n matrices)
+%   Returns the n-by-n matrix SigmaInv.
+ 
+    % sequence length, read off the first precomputed matrix
+    seqLen = size(precalcBwithoutBeta{1},1);
+
+    alphaTerm = sum(alphas) * eye(seqLen);
+ 
+    % the beta term is accumulated one precomputed matrix at a time,
+    % in index order
+    numBetas = numel(betas);
+
+    betaTerm = zeros([seqLen,seqLen]);
+
+    % accumulate the B term (q2 in the paper's notation)
+    for k=1:numBetas
+
+        % Equivalent to the per-iteration computation sketched below, but
+        % with the precalculated D - S taken from precalcBwithoutBeta
+%         S = Similarities(:,:,i);
+%         D =  diag(sum(S));
+%         q = betas(i) * D - betas(i) * S;
+%         q2s(:,:,i) = q;
+%         q2 = q2 + betas(i)*precalcQ2withoutBeta(:,:,i);
+        betaTerm = betaTerm + betas(k)*precalcBwithoutBeta{k};
+    end
+    % A bsxfun formulation was tried as well, it did not seem to be faster
+%     q2old = sum(bsxfun(@times, precalcQ2withoutBeta, reshape(betas,[1,1,K2])),3);
+
+%     q2 = sum(q2s, 3);
+%     % A second bsxfun based alternative, kept for reference only,
+%     as it appeared to be slower than the loop
+%     S = bsxfun(@times, Similarities, -reshape(betas,[1,1,K2]));
+% 
+%     % now need the diagonals
+%     d = sum(Similarities);
+% 
+%     I = repmat(eye(n), [1, 1, K2]);
+%     I = bsxfun(@times, I, reshape(betas,[1,1,K2]));
+%     D = bsxfun(@times, I, d);
+% 
+%     q2s = D + S;
+%     q2 = sum(q2s2,3);
+    
+    SigmaInv = 2 * (alphaTerm + betaTerm);
+
+end
+
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/CalcSigmaCCRFflat.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/CalcSigmaCCRFflat.m
@@ -0,0 +1,26 @@
+function [ SigmaInv] = CalcSigmaCCRFflat(alphas, betas, n, PrecalcB_flat)
+%CALCSIGMACCRFFLAT Builds SigmaInv = 2 * (A + B) from the flattened lower triangle of the B components
+%   A = sum(alphas) * eye(n); PrecalcB_flat holds one column per beta with the lower-triangular entries
+%   (in the order used by logical indexing with tril(true(n,n))) of the corresponding n-by-n matrix,
+%   so PrecalcB_flat * betas is the lower triangle of the beta-weighted sum B.
+ 
+    alphaTerm = sum(alphas) * eye(n);
+
+    % beta-weighted combination of the precalculated lower-triangular
+    % elements (the B matrices were stored without their beta factor)
+    lowerVals = PrecalcB_flat * betas;        
+    
+    % unpack the values into a lower-triangular matrix first
+
+    lowerTri = zeros(n,n);
+    lowerTri(tril(true(n,n))) = lowerVals;
+
+    % mirror the strictly-lower part into the upper triangle to obtain the
+    % square symmetric matrix
+    B = lowerTri + tril(lowerTri,-1)';
+    
+    % Combine A and B
+    SigmaInv = 2 * (alphaTerm + B);
+
+end
+
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/CalcbCCRF.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/CalcbCCRF.m
@@ -0,0 +1,14 @@
+function b = CalcbCCRF( alpha, x)
+%CALCBCCRF Computes the vector b = 2 * x * alpha
+%   Every row of x yields one entry of b; alpha needs one entry per column of x.
+
+%     Loop form of the same computation, kept for reference:
+%     b = zeros(size(x,1),1);
+%     for i=1:size(x,1)
+%        b(i) = 2 *  x(i,:) * alpha; 
+%     end
+    % doubling x first keeps the evaluation order (2 * x) * alpha
+    doubledX = 2 * x;
+    b = doubledX * alpha;
+end
+
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/CalculateSimilarities.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/CalculateSimilarities.m
@@ -0,0 +1,85 @@
+function [ Similarities, PrecalcQ2s, PrecalcQ2sFlat, PrecalcYqDs ] = CalculateSimilarities( n_sequences, x, similarityFNs, y)
+%CALCULATESIMILARITIES Precomputes, per sequence and per similarity function k, S_k, B_k = diag(sum(S_k)) - S_k,
+%   the lower triangle of B_k as a column (PrecalcQ2sFlat) and, when y is given, -y'*B_k*y (PrecalcYqDs, zeros otherwise).
+    haveTargets = nargin > 3;
+    K = numel(similarityFNs);
+    
+    % one cell per sequence for the matrix-valued outputs
+    Similarities = cell(n_sequences, 1);
+    PrecalcQ2s = cell(n_sequences,1);
+    PrecalcQ2sFlat = cell(n_sequences,1);
+    
+    PrecalcYqDs = zeros(n_sequences, K);
+    
+    if(iscell(x))
+        for q = 1 : n_sequences
+
+            xSeq = x{q};
+
+            n = size(xSeq, 1);
+            Similarities{q} = zeros([n, n, K]);
+            lowerMask = tril(true(n));
+            PrecalcQ2s{q} = cell(K,1);
+
+            PrecalcQ2sFlat{q} = zeros((n*(n+1))/2,K);
+            % evaluate every similarity function on this sequence and
+            % derive the B matrix from it
+
+            if(haveTargets)
+                ySeq = y{q};
+            end
+
+            for k=1:K
+                Similarities{q}(:,:,k) = similarityFNs{k}(xSeq);
+                simK = Similarities{q}(:,:,k);
+                Bk = diag(sum(simK)) - simK;
+                % full matrix form of B_k
+                PrecalcQ2s{q}{k} = Bk;
+                % flattened form: lower triangle (including the diagonal)
+                % stacked into one column
+                PrecalcQ2sFlat{q}(:,k) = Bk(lowerMask);
+                if(haveTargets)
+                    PrecalcYqDs(q,k) = -ySeq'*Bk*ySeq;
+                end
+            end
+        end
+    else
+        seqLen = size(x,2)/n_sequences;
+        for q = 1 : n_sequences
+
+            firstCol = (q-1)*seqLen + 1;
+            lastCol = q*seqLen;
+            
+            % the first row of x (bias term) is left out
+            xSeq = x(2:end, firstCol:lastCol);
+
+            Similarities{q} = zeros([seqLen, seqLen, K]);
+            lowerMask = tril(true(seqLen));
+            PrecalcQ2s{q} = cell(K,1);
+
+            PrecalcQ2sFlat{q} = zeros((seqLen*(seqLen+1))/2,K);
+            
+            % evaluate every similarity function on this block of columns
+            % and derive the B matrix from it
+
+            if(haveTargets)
+                ySeq = y(:,q);
+            end
+
+            for k=1:K
+                Similarities{q}(:,:,k) = similarityFNs{k}(xSeq);
+                simK = Similarities{q}(:,:,k);
+                Bk = diag(sum(simK)) - simK;
+                % full matrix form of B_k
+                PrecalcQ2s{q}{k} = Bk;
+                % flattened form: lower triangle (including the diagonal)
+                % stacked into one column
+                PrecalcQ2sFlat{q}(:,k) = Bk(lowerMask);
+                if(haveTargets)
+                    PrecalcYqDs(q,k) = -ySeq'*Bk*ySeq;
+                end
+            end
+        end        
+    end
+end
+
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/CalculateSimilarities_sparsity.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/CalculateSimilarities_sparsity.m
@@ -0,0 +1,173 @@
+function [ Similarities, PrecalcQ2s, PrecalcQ2sFlat, PrecalcYqDs ] = CalculateSimilarities_sparsity( n_sequences, x, similarityFNs, sparsityFNs, y, const)
+%CALCULATESIMILARITIES_SPARSITY Precomputes similarity- and sparsity-based B matrices (B = D - S resp. B = D + S, D = diag(sum(S)))
+%   Outputs: the S matrices, the B matrices, the lower triangle of each B as a column and, when y is given, -y'*B*y per sequence.
+%   const (optional, default false; only read for matrix x): when true a single shared entry is stored instead of one per sequence.
+    K = numel(similarityFNs);
+    K2 = numel(sparsityFNs);
+    if(nargin < 6), const = false; end % omitting const used to fail for matrix x; it now selects the per-sequence layout
+    %calculate similarity measures for each of the sequences
+    Similarities = cell(n_sequences, 1);
+    PrecalcQ2s = cell(n_sequences,1);
+    PrecalcQ2sFlat = cell(n_sequences,1);
+    
+    PrecalcYqDs = zeros(n_sequences, K + K2);
+    
+    if(iscell(x))
+        for q = 1 : n_sequences
+
+            xq = x{q};
+
+            n = size(xq, 1);
+            Similarities{q} = zeros([n, n, K+K2]);
+            
+            PrecalcQ2s{q} = cell(K+K2,1);
+
+            PrecalcQ2sFlat{q} = zeros((n*(n+1))/2,K+K2);
+            % go over all of the similarity metrics and construct the
+            % similarity matrices
+
+            if(nargin > 4)
+                yq = y{q};
+            end
+
+            for k=1:K
+                Similarities{q}(:,:,k) = similarityFNs{k}(xq);
+                S = Similarities{q}(:,:,k);
+                D =  diag(sum(S));
+    %             PrecalcQ2s{q}(:,:,k) = D - S;
+                PrecalcQ2s{q}{k} = D - S;
+                B = D - S;
+    %             PrecalcQ2sFlat{q}{k} = PrecalcQ2s{q}{k}(logical(tril(ones(size(S)))));
+                PrecalcQ2sFlat{q}(:,k) = B(logical(tril(ones(size(S)))));
+                if(nargin > 4)        
+                    PrecalcYqDs(q,k) = -yq'*B*yq;
+                end
+            end
+            for k=1:K2
+                Similarities{q}(:,:,K+k) = sparsityFNs{k}(xq);
+                S = Similarities{q}(:,:,K+k);
+                D =  diag(sum(S));
+    %             PrecalcQ2s{q}(:,:,k) = D - S;
+                PrecalcQ2s{q}{K+k} = D + S;
+                B = D +  S;
+    %             PrecalcQ2sFlat{q}{k} = PrecalcQ2s{q}{k}(logical(tril(ones(size(S)))));
+                PrecalcQ2sFlat{q}(:,K+k) = B(logical(tril(ones(size(S)))));
+                if(nargin > 4)        
+                    PrecalcYqDs(q,K+k) = -yq'*B*yq;
+                end
+            end            
+        end
+    elseif(~const)
+        sample_length = size(x,2)/n_sequences;
+
+        similarities = cell(K, 1);
+        sparsities = cell(K2, 1);
+
+        for q = 1 : n_sequences
+
+            beg_ind = (q-1)*sample_length + 1;
+            end_ind = q*sample_length;
+            
+            % don't take the bias term
+            xq = x(2:end, beg_ind:end_ind);
+
+            Similarities{q} = zeros([sample_length, sample_length, K+K2]);
+            
+            PrecalcQ2s{q} = cell(K+K2,1);
+
+            PrecalcQ2sFlat{q} = zeros((sample_length*(sample_length+1))/2,K+K2);
+            
+            % go over all of the similarity metrics and construct the
+            % similarity matrices
+
+            if(nargin > 4)
+                yq = y(:,q);
+            end
+
+            for k=1:K
+                if(q==1)
+                    similarities{k} = similarityFNs{k}(xq);
+                end
+                Similarities{q}(:,:,k) = similarities{k};
+                S = Similarities{q}(:,:,k);
+                D =  diag(sum(S));
+    %             PrecalcQ2s{q}(:,:,k) = D - S;
+                PrecalcQ2s{q}{k} = D - S;
+                B = D - S;
+    %             PrecalcQ2sFlat{q}{k} = PrecalcQ2s{q}{k}(logical(tril(ones(size(S)))));
+                PrecalcQ2sFlat{q}(:,k) = B(logical(tril(ones(size(S)))));
+                if(nargin > 4)        
+                    PrecalcYqDs(q,k) = -yq'*B*yq;
+                end
+            end
+            for k=1:K2
+                % this is constant so don't need to recalc
+                if(q==1)
+                   sparsities{k} = sparsityFNs{k}(xq);
+                end
+                
+                Similarities{q}(:,:,K+k) = sparsities{k};
+                S = Similarities{q}(:,:,K+k);
+                D =  diag(sum(S));
+    %             PrecalcQ2s{q}(:,:,k) = D - S;
+                PrecalcQ2s{q}{K+k} = D + S;
+                B = D +  S;
+    %             PrecalcQ2sFlat{q}{k} = PrecalcQ2s{q}{k}(logical(tril(ones(size(S)))));
+                PrecalcQ2sFlat{q}(:,K+k) = B(logical(tril(ones(size(S)))));
+                if(nargin > 4)        
+                    PrecalcYqDs(q,K+k) = -yq'*B*yq;
+                end
+            end
+
+        end
+    else
+        sample_length = size(x,2)/n_sequences;
+
+        similarities = cell(K, 1);
+        sparsities = cell(K2, 1);
+        
+        PrecalcQ2s = {cell(K+K2,1)};
+        PrecalcQ2sFlat = {zeros((sample_length*(sample_length+1))/2,K+K2)};
+        Similarities = {zeros([sample_length, sample_length, K+K2])};
+            
+        beg_ind = 1;
+        end_ind = sample_length;
+
+        % don't take the bias term
+        xq = x(2:end, beg_ind:end_ind);
+
+        % go over all of the similarity metrics and construct the
+        % similarity matrices
+        for k=1:K
+            similarities{k} = similarityFNs{k}(xq);
+
+            Similarities{1}(:,:,k) = similarities{k};
+            S = Similarities{1}(:,:,k);
+            D =  diag(sum(S));
+            PrecalcQ2s{1}{k} = D - S;
+            B = D - S;
+            % flatten the symmetric matrix to save space
+            PrecalcQ2sFlat{1}(:,k) = B(logical(tril(ones(size(S)))));
+            if(nargin > 4)
+                PrecalcYqDs(:,k) = diag(-y'*B*y);
+            end
+        end
+        for k=1:K2
+            % this is constant so don't need to recalc
+            sparsities{k} = sparsityFNs{k}(xq);
+
+            Similarities{1}(:,:,K+k) = sparsities{k};
+            S = Similarities{1}(:,:,K+k);
+            D =  diag(sum(S));
+%             PrecalcQ2s{q}(:,:,k) = D - S;
+            PrecalcQ2s{1}{K+k} = D + S;
+            B = D +  S;
+%             PrecalcQ2sFlat{q}{k} = PrecalcQ2s{q}{k}(logical(tril(ones(size(S)))));
+            PrecalcQ2sFlat{1}(:,K+k) = B(logical(tril(ones(size(S)))));
+            if(nargin > 4)        
+                PrecalcYqDs(:,K+k) = diag(-y'*B*y);
+            end
+        end   
+    end
+end
+
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/CalculateYqDs.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/CalculateYqDs.m
@@ -0,0 +1,54 @@
+function [ PrecalcYqDs ] = CalculateYqDs( n_sequences, x, similarityFNs, sparsityFNs, y)
+%CALCULATEYQDS Computes PrecalcYqDs(q,k) = -y(:,q)' * B_k * y(:,q) for every sequence q and every B_k
+%   B_k = D - S for similarity functions and D + S for sparsity functions (D = diag(sum(S))), each evaluated once on x.
+
+    numSim = numel(similarityFNs);
+    numSparse = numel(sparsityFNs);
+    numB = numSim + numSparse;
+    PrecalcYqDs = zeros(n_sequences, numB);
+    
+    % the row count of y fixes the size of every S and B matrix
+    seqLen = size(y,1);
+
+    % buffers for the raw function outputs and the derived B matrices
+    % (assigning into them also enforces the seqLen-by-seqLen size)
+
+    Similarities = zeros([seqLen, seqLen, numB]);
+
+    Bs = zeros([seqLen, seqLen, numB]);
+
+    for k=1:numSim
+        % similarity term: B = D - S
+        Similarities(:,:,k) = similarityFNs{k}(x);
+        simK = Similarities(:,:,k);
+        degK = diag(sum(simK));
+        Bs(:,:,k) = degK - simK;
+
+    end    
+    
+    for k=1:numSparse
+        % sparsity term: B = D + S, stored after the similarity slices
+        slot = numSim + k;
+
+        Similarities(:,:,slot) = sparsityFNs{k}(x);
+        sparseK = Similarities(:,:,slot);
+        degK = diag(sum(sparseK));
+
+        Bs(:,:,slot) = degK + sparseK;
+
+
+    end
+    
+    for k=1:numB
+
+        % the quadratic form is evaluated for each sequence against the
+        % k-th B matrix
+        Bk = Bs(:,:,k);
+   
+        for q = 1 : n_sequences
+            PrecalcYqDs(q,k) = -y(:,q)'*Bk*y(:,q);                       
+        end
+
+    end
+end
+
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/LogLikelihoodCCRF.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/LogLikelihoodCCRF.m
@@ -0,0 +1,48 @@
+function logL = LogLikelihoodCCRF(y_coll, x_coll, alphas, betas,...
+                                  lambda_a,lambda_b, PrecalcBsFlat,...
+                                  SigmaInvs, ChDecomps, Sigmas)
+%LOGLIKELIHOODCCRF Regularised CCRF log-likelihood summed over all sequences in y_coll / x_coll (cell arrays)
+%   SigmaInvs, ChDecomps and Sigmas are optional per-sequence caches; they are used only when all three are supplied.
+Q = numel(y_coll);
+logL = 0;
+for q=1:Q
+    
+    yq = y_coll{q};
+    xq = x_coll{q};
+    
+    n = size(xq, 1);
+      
+    b = CalcbCCRF(alphas, xq);
+            
+    % build SigmaInv here unless all three caches were passed in (10 inputs);
+    % the previous test (nargin < 11) was always true, so the caches were never used
+    if(nargin < 10)
+        [SigmaInv] = CalcSigmaCCRFflat(alphas, betas, n, PrecalcBsFlat{q});
+        L = chol(SigmaInv);        
+        mu = SigmaInv \ b;
+    else
+        SigmaInv = SigmaInvs{q};
+        L = ChDecomps{q};
+        Sigma = Sigmas{q};        
+        mu = Sigma * b;
+    end    
+
+    % normalisation = 1/((2*pi)^(n/2)*sqrt(det(Sigma)));
+    % Removing the division by pi, as it is constant
+    % normalisation = 1/(sqrt(det(sigma)));
+    % flipping around determinant of SigmaInv, as det(inv(Sigma)) = inv(det(Sigma)  
+%     normalisation = log(sqrt(det(SigmaInv)));
+
+    % normalisation 2 using Cholesky decomposition
+    normalisation2 = sum(log(diag(L))); % no times 2 here as we calculate the square root of determinant
+
+    % probq = normalisation * exp(-0.5 * (y - mu)'*SigmaInv*(y-mu));
+    % applying a logarithm to this leads to
+%     logLq = log(normalisation) + (-0.5 * (yq - mu)'*SigmaInv*(yq-mu));
+    logLq = normalisation2 + (-0.5 * (yq - mu)'*SigmaInv*(yq-mu));
+  
+    logL = logL + logLq;
+    
+end
+
+% add regularisation term
+logL = logL -lambda_b * (betas'*betas)/2 - lambda_a * (alphas'*alphas)/2;
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/evaluateCCRFmodel.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/evaluateCCRFmodel.m
@@ -0,0 +1,83 @@
+function [ correlations, rms, meanCorr, meanRMS, longCorr, longRMS, predictions, gt ] = evaluateCCRFmodel( alphas, betas, x, xOffsets, y, similarityFNs, scaling, verbose, PrecalcBsFlat)
+%EVALUATECCRFMODEL Predicts every sequence in x (y_est = (SigmaInv \ b) * scaling + xOffsets(q)) and scores it against y
+%   Returns per-sequence correlation and RMS, their means, the squared correlation and RMS over the concatenated data
+%   (longCorr, longRMS) and the concatenated predictions / ground truth. PrecalcBsFlat is optional; verbose enables plots.
+num_x_plots = 8;
+num_y_plots = 10;
+
+total_plots = num_x_plots * num_y_plots;
+
+nExamples = numel(x);
+
+if(nargin < 9 || isempty(PrecalcBsFlat))
+    [ ~, ~, PrecalcBsFlat, ~ ] = CalculateSimilarities( nExamples, x, similarityFNs);
+end
+    
+correlations = zeros(nExamples, 1);
+rms = zeros(nExamples, 1);
+
+% concatenated data for an alternative correlation
+y_predConcat = [];
+y_trueConcat = [];
+
+for q=1:nExamples
+     
+    X = x{q};
+    
+    nFrames = size(X,1);
+          
+    PrecalcBflat = PrecalcBsFlat{q};
+    
+    SigmaInv = CalcSigmaCCRFflat(alphas, betas, nFrames, PrecalcBflat);
+    b = CalcbCCRF(alphas, x{q});
+    y_est = SigmaInv \ b;
+    
+%     y_est = y_est * scaling + xOffsets(q);
+    y_est = y_est * scaling + xOffsets(q);
+
+    R = corrcoef(y_est, y{q});
+    correlations(q) = R(1,2);
+ 
+    rms(q) = sqrt( (1/nFrames) * sum((y_est - y{q}).^2) );
+    
+    y_predConcat = cat(1, y_predConcat, y_est);
+    y_trueConcat = cat(1, y_trueConcat, y{q});
+
+    if(verbose)
+
+        if(mod(q,total_plots) == 1)
+            figure;
+            remainingPlots = nExamples - q + 1; % sequence q itself still has to be drawn on this figure
+            if(remainingPlots < total_plots)
+                num_y_plots = ceil(remainingPlots / num_x_plots);            
+            end            
+        end        
+        
+        subplot(num_y_plots,num_x_plots,mod(q-1,total_plots)+1);
+        t = 1:nFrames;
+        plot(t,y{q},'g',t,y_est,'b');
+        title(sprintf('C %.2f, R %.2f', correlations(q), rms(q)));
+        set(gca, 'XTick', [], 'YTick', []);
+%         legend('y_{true}','y_{ccrf}');
+    
+    end   
+    
+end
+
+meanCorr = mean(correlations); 
+meanRMS = mean(rms);
+longCorr = corr(y_predConcat, y_trueConcat).^2;
+longRMS = sqrt( (1/numel(y_predConcat)) * sum((y_predConcat - y_trueConcat).^2) );
+
+predictions = y_predConcat;
+gt = y_trueConcat;
+
+if(verbose)
+    figure
+    plot([1:numel(y_trueConcat)],y_trueConcat,'g',[1:numel(y_trueConcat)],y_predConcat,'b');
+    title(sprintf('C %.2f, R %.2f', longCorr, longRMS));
+    set(gca, 'XTick', [], 'YTick', []);
+end
+
+end
+
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/getScaling.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/getScaling.m
@@ -0,0 +1,28 @@
+function [ scaling ] = getScaling(  alphas, betas, x, y, masks, PrecalcQ2s, useIndicator)
+%GETSCALING Mean over all sequences of std(y{q}) / std(y_est), with y_est = SigmaInv \ b
+%   NOTE(review): CalcSigmaCCRF and CalcbCCRF are called with extra mask / useIndicator arguments - confirm the versions on the path accept them.
+
+% every sequence contributes one ratio
+nExamples = numel(x);
+
+scalings = zeros(1,nExamples);
+
+for q=1:nExamples
+     
+    mask = masks{q};
+    
+    PrecalcQ2 = PrecalcQ2s{q};
+    SigmaInv = CalcSigmaCCRF(alphas, betas, PrecalcQ2, mask, useIndicator);
+
+    
+    b = CalcbCCRF(alphas, x{q}, mask, useIndicator);
+    y_est = SigmaInv \ b;
+        
+    sc = std(y{q}) / std(y_est);
+    scalings(q) = sc;
+end
+ 
+scaling = mean(scalings);
+
+end
+
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/getScaling2.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/getScaling2.m
@@ -0,0 +1,30 @@
+function [ scaling ] = getScaling2(  alphas, betas, x, y, PrecalcBs)
+%GETSCALING2 Ratio between the spread of the mean-centred targets and the spread of the CCRF estimates
+%   scaling = std(stacked y{q} - mean(y{q})) / std(stacked estimates SigmaInv \ b), stacked over every sequence in x.
+
+% every sequence contributes to the stacked vectors
+numSeqs = numel(x);
+
+centredTargets = [];
+stackedEstimates = [];
+
+for q=1:numSeqs
+     
+    % estimate for this sequence: solve SigmaInv * y_est = b
+    SigmaInvQ = CalcSigmaCCRF(alphas, betas, PrecalcBs{q});
+    bQ = CalcbCCRF(alphas, x{q});
+
+    estimateQ = SigmaInvQ \ bQ;
+    targetQ = y{q};
+        
+    centredTargets = cat(1, centredTargets, targetQ - mean(targetQ));
+    % (the estimates are stacked as they are, without centring)
+    stackedEstimates = cat(1, stackedEstimates, estimateQ);
+    
+end
+ 
+% spread ratio over the stacked vectors
+scaling = std(centredTargets) / std(stackedEstimates);
+
+end
+
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/gradientCCRF.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/gradientCCRF.m
@@ -0,0 +1,92 @@
+function [ logGradientAlphas, logGradientBetas, SigmaInv, ChDecomp ] = gradientCCRF( alphas, betas, lambda_a, lambda_b, precalcQ2withoutBeta, xq, yq, mask, precalcYQ, useIndicator, PrecalcQ2Flat)
+%GRADIENTCCRF Per-sequence gradients scaled by the parameter values (alphas(k) * dLda, betas(k) * dLdb), i.e. taken with respect to log(alpha) / log(beta)
+%   Also returns SigmaInv and its Cholesky factor (chol). NOTE(review): CalcSigmaCCRFflat and CalcbCCRF are called with mask / useIndicator arguments - confirm the versions on the path accept them.
+
+    % Calculate the Sigma inverse now
+%     [SigmaInv2] = CalcSigmaCCRF(alphas, betas, precalcQ2withoutBeta, mask);
+    
+    % This is an optimised version as it does not use the whole matrix but
+    % a lower diagonal part due to symmetry
+    numElemsInSeq = size(precalcQ2withoutBeta{1}, 1);
+    [SigmaInv] = CalcSigmaCCRFflat(alphas, betas, numElemsInSeq, PrecalcQ2Flat, mask, useIndicator);
+    
+    % Get the actual sigma from our SigmaInv
+    
+    % Sigma = inv(SigmaInv);
+    % Below is an optimised version of the above using Cholesky decomposition
+    % which decomposes a matrix into a upper triangular (R) and its
+    % conjugate transpose R'; A = R'*R for real numbers, thus
+    % inv(A) = inv(R)inv(R')
+    ChDecomp=chol(SigmaInv);
+    I=eye(size(SigmaInv));    
+        
+    % Rinv = (R\I);
+    % Sigma = Rinv*Rinv';
+    % This is a very slightly faster version of the above
+    Sigma=ChDecomp\(ChDecomp'\I);
+    
+    b = CalcbCCRF(alphas, xq, mask, useIndicator);
+
+    % mu = SigmaInv \ b = Sigma * b;
+    % as we've calculate Sigma already, this is equivalent of the above
+    mu = Sigma * b;    
+   
+    logGradientAlphas = zeros(size(alphas));
+    logGradientBetas = zeros(size(betas));
+
+    K1 = numel(alphas);
+    K2 = numel(betas);
+
+    % calculating the derivative of L with respect to alpha_k        
+    for k = 1:K1
+
+        if(useIndicator)
+            dQ1da = diag(mask(:,k));
+            dbda = xq(:,k).*mask(:,k);
+            gaussGradient = -yq'*dQ1da*yq +2*yq'*dbda -2 * dbda' * mu + mu'*dQ1da*mu;
+            zGradient = Sigma(:)'*dQ1da(:);
+        else
+            % if we don't use the masks here's a speedup
+            gaussGradient = -yq'*yq +2*yq'*xq(:,k) -2 * xq(:,k)' * mu + sum(mu.^2);
+                    
+            % simplification as trace(Sigma * I) = trace(Sigma)
+            zGradient = trace(Sigma);
+        end
+        
+        % add the Z derivative now
+        dLda = zGradient + gaussGradient;
+        
+        % add regularisation
+        dLda = dLda - lambda_a * alphas(k);
+ 
+        logGradientAlphas(k) = alphas(k) * dLda;
+
+    end
+
+    % This was done for gradient checking
+%   [alphasG, betaG] = gradientAnalytical(nFrames, S, alphas, beta, xq, yq, mask); 
+
+    % calculating the derivative of log(L) with respect to the betas
+    for k=1:K2
+
+        % Bs = Bs(:,:,k);
+        % dSdb = q2./betas(k); we precalculate this, as it does not change
+        % over the course of optimisation (dSdb - dSigma/dbeta)
+        dSdb = precalcQ2withoutBeta{k};
+
+        % -yq'*dSdb*yq can be precalculated as they don't change through
+        % iterations (precalcQ2withoutBeta is dSdb
+        % gaussGradient = -yq'*dSdb*yq + mu'*dSdb*mu;
+        % this does the above line
+        gaussGradient = precalcYQ(k) + mu'*dSdb*mu;
+        
+        % zGradient = trace(Sigma*dSdb);
+        zGradient = Sigma(:)'*dSdb(:); % equivalent but faster to the above line
+        dLdb = gaussGradient + zGradient;
+        
+        % add regularisation term
+        dLdb = dLdb - lambda_b * betas(k);
+        
+        logGradientBetas(k) = betas(k) * dLdb;
+    end
+end
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/gradientCCRFFull.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/gradientCCRFFull.m
@@ -0,0 +1,39 @@
+function [ gradientParams, SigmaInvs, CholDecomps, Sigmas ] = gradientCCRFFull( params, lambda_a, lambda_b, PrecalcBs, x, y, Precalc_yBys, PrecalcBsFlat)
+%GRADIENTCCRFFULL Regularised CCRF gradient summed over all sequences.
+%   params stacks the alphas followed by the betas; the number of betas is
+%   the column count of PrecalcBsFlat{1}. Every sequence q contributes the
+%   gradient returned by gradientCCRF_withoutReg, the contributions are
+%   summed, and lambda_a * alphas / lambda_b * betas is subtracted once.
+%
+%   gradientParams - alpha entries first, then beta entries (a column)
+%   SigmaInvs, CholDecomps, Sigmas - nExamples x 1 cells holding the
+%       per-sequence matrices produced along the way, handed back so the
+%       caller does not have to recompute them (e.g. for the likelihood)
+
+    seqCount = numel(x);
+    paramCount = numel(params);
+
+    betaCount = size(PrecalcBsFlat{1}, 2);
+    alphaCount = paramCount - betaCount;
+
+    alphaVals = params(1:alphaCount);
+    betaVals = params(alphaCount+1:end);
+
+    SigmaInvs = cell(seqCount, 1);
+    CholDecomps = cell(seqCount, 1);
+    Sigmas = cell(seqCount, 1);
+
+    % one row of un-regularised gradient per sequence
+    perSeqGrad = zeros(seqCount, paramCount);
+    for q = 1 : seqCount
+        [ gradA, gradB, SigmaInvs{q}, CholDecomps{q}, Sigmas{q} ] = ...
+            gradientCCRF_withoutReg(alphaVals, betaVals, PrecalcBs{q}, ...
+                                    x{q}, y{q}, Precalc_yBys(q, :), PrecalcBsFlat{q});
+
+        perSeqGrad(q, :) = [gradA; gradB];
+    end
+
+    % total over sequences, then a single regularisation step
+    penalty = [alphaVals * lambda_a; betaVals * lambda_b];
+    gradientParams = sum(perSeqGrad, 1)' - penalty;
+end
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/gradientCCRF_withoutReg.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/gradientCCRF_withoutReg.m
@@ -0,0 +1,76 @@
+function [ logGradientAlphas, logGradientBetas, SigmaInv, CholDecomp, Sigma ] = gradientCCRF_withoutReg( alphas, betas, precalcQ2withoutBeta, xq, yq, Precalc_yBy, PrecalcB_flat)
+%GRADIENTCCRF_WITHOUTREG Un-regularised CCRF gradient for one sequence.
+%   Inputs:
+%     alphas, betas        - current parameter vectors
+%     precalcQ2withoutBeta - cell array; entry k is the precomputed
+%                            derivative matrix (dSdb) for beta_k
+%     xq, yq               - data of this sequence; xq(:,k) is the column
+%                            paired with alpha_k, yq the target vector
+%     Precalc_yBy          - Precalc_yBy(k) replaces -yq'*dSdb*yq for beta_k
+%     PrecalcB_flat        - forwarded unchanged to CalcSigmaCCRFflat
+%   Outputs:
+%     logGradientAlphas    - gradient entry per alpha, sized like alphas
+%     logGradientBetas     - gradient entry per beta, sized like betas
+%     SigmaInv             - matrix returned by CalcSigmaCCRFflat
+%     CholDecomp           - chol(SigmaInv), i.e. R with R'*R = SigmaInv
+%     Sigma                - inverse of SigmaInv
+%
+%   No regularisation term is applied here.
+
+    % Sigma inverse; the flat variant is an optimised version that uses
+    % only the lower triangular part thanks to symmetry
+    n = size(xq, 1);
+    SigmaInv = CalcSigmaCCRFflat(alphas, betas, n, PrecalcB_flat);
+
+    % Sigma = inv(SigmaInv), computed through the Cholesky factorisation:
+    % with SigmaInv = R'*R (R upper triangular) inv(SigmaInv) equals
+    % inv(R)*inv(R'), so two triangular solves against I are enough
+    CholDecomp = chol(SigmaInv);
+    I = eye(size(SigmaInv));
+    Sigma = CholDecomp \ (CholDecomp' \ I);
+
+    b = CalcbCCRF(alphas, xq);
+
+    % mu = SigmaInv \ b; Sigma is already available, so multiply instead
+    mu = Sigma * b;
+
+    logGradientAlphas = zeros(size(alphas));
+    logGradientBetas = zeros(size(betas));
+
+    K1 = numel(alphas);
+    K2 = numel(betas);
+
+    % Pieces of the alpha gradient that are the same for every k; they are
+    % evaluated once here rather than on each pass through the loop
+    negYtY = -yq' * yq;
+    muSq = sum(mu.^2);
+    % partition function (Z) part: trace(Sigma * I) = trace(Sigma)
+    zGradientAlpha = trace(Sigma);
+
+    % derivative with respect to each alpha_k
+    for k = 1:K1
+        xk = xq(:, k);
+
+        gaussGradient = negYtY + 2*yq'*xk - 2 * xk' * mu + muSq;
+
+        % add the Z derivative to the Gaussian part
+        logGradientAlphas(k) = zGradientAlpha + gaussGradient;
+    end
+
+    % derivative with respect to each beta_k
+    for k = 1:K2
+
+        % dSdb stays fixed over the course of optimisation, hence it is
+        % passed in precomputed
+        dSdb = precalcQ2withoutBeta{k};
+
+        % same as -yq'*dSdb*yq + mu'*dSdb*mu, with the first term taken
+        % from Precalc_yBy
+        gaussGradient = Precalc_yBy(k) + mu'*dSdb*mu;
+
+        % faster equivalent of trace(Sigma*dSdb)
+        zGradient = Sigma(:)'*dSdb(:);
+
+        logGradientBetas(k) = gaussGradient + zGradient;
+    end
+end
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/randInitializeWeights.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/randInitializeWeights.m
@@ -0,0 +1,35 @@
+function W = randInitializeWeights(L_in, L_out)
+%RANDINITIALIZEWEIGHTS Random starting weights for one network layer.
+%   W = RANDINITIALIZEWEIGHTS(L_in, L_out) builds the weight matrix of a
+%   layer.
+%
+%   L_in  - number of incoming connections of the layer
+%   L_out - number of outgoing connections of the layer
+%   W     - L_out x (1 + L_in) matrix; the additional column is there for
+%           the "bias" terms
+%
+%   Each entry is rand() scaled by 2/sqrt(L_in) and shifted down by
+%   1/sqrt(L_in), so values fall between -1/sqrt(L_in) and 1/sqrt(L_in).
+%   Starting from random values breaks the symmetry between units while
+%   the neural network is trained.
+%
+%   Example:
+%       W = randInitializeWeights(4, 3);   % 3 x 5, entries within +-0.5
+%
+%   NOTE: a fixed half-width of 0.12 is an alternative that this function
+%   does not use; the 1/sqrt(L_in) rule below is the active one.
+
+% half-width of the interval the weights are drawn from
+halfWidth = 1 / sqrt(L_in);
+
+% shape of the layer: a row per outgoing unit, inputs plus bias as columns
+rowCount = L_out;
+colCount = 1 + L_in;
+
+% uniform samples stretched to a width of 2 * halfWidth ...
+W = rand(rowCount, colCount) * 2 * halfWidth;
+
+% ... and then centred on zero
+W = W - halfWidth;
+
+end
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/similarityEuclidean.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/similarityEuclidean.m
@@ -0,0 +1,8 @@
+function SimilarityMatrix = similarityEuclidean(x)
+%SIMILARITYEUCLIDEAN Pairwise similarity 1/sqrt(d + 3e-6), d = pdist(x).
+%   Square matrix with one row per row of x; the diagonal is set to 1.
+    shifted = pdist(x) + 3e-6;            % offset keeps the inverse finite
+    pairSim = sqrt(shifted) .^ (-1);      % still in condensed vector form
+    SimilarityMatrix = eye(size(x, 1)) + squareform(pairSim);
+end
+
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/similarityGauss.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/similarityGauss.m
@@ -0,0 +1,25 @@
+function SimilarityMatrix = similarityGauss(x, sigma, range, mask)
+%SIMILARITYGAUSS Similarity matrix exp(-d/sigma) from pdist distances d.
+%   x     - observations, one per row
+%   sigma - scale dividing the distance inside the exponential
+%   range - optional columns of x to use; [] or omitted means all columns
+%   mask  - optional flags indexed like x; [] or omitted skips masking.
+%           Rows whose flags over range sum to less than numel(range)
+%           get zero similarity to every other row.
+%   Result is size(x,1) square; the diagonal is always 1.
+
+if(nargin < 3), range = []; end   % trailing arguments may be omitted
+if(nargin < 4), mask = []; end
+if(numel(range) > 0)
+    x = x(:,range);
+end
+SimilarityMatrix = squareform(exp(-pdist(x)/sigma));
+
+if(numel(mask) ~= 0)
+    invalidInds = sum(mask(:,range),2) < numel(range);
+    SimilarityMatrix(invalidInds,:) = 0;
+    SimilarityMatrix(:,invalidInds) = 0;
+end
+SimilarityMatrix = SimilarityMatrix + eye(size(x, 1));
+end
+
--- a/pkg/OpenFace/model_training/CCNF/CCRF/lib/similarityNeighbor.m
+++ b/pkg/OpenFace/model_training/CCNF/CCRF/lib/similarityNeighbor.m
@@ -0,0 +1,25 @@
+function [ SimilarityMatrix ] = similarityNeighbor( x, n, range)
+%SIMILARITYNEIGHBOR Binary similarity linking rows that lie n apart.
+%   x     - data with one observation per row; only size(x,1) is used
+%   n     - index offset between rows that count as neighbours
+%   range - not used by this function
+%
+%   The result is size(x,1) square with ones on the main diagonal and on
+%   the n-th sub- and super-diagonals, zeros elsewhere. When n is at least
+%   size(x,1) no off-diagonal entry is set.
+
+    count = size(x, 1);
+    dims = [count, count];
+    SimilarityMatrix = eye(count);
+
+    % every (p, p+n) pair that fits inside the matrix, plus its mirror
+    near = 1:count-n;
+    far = near + n;
+    below = sub2ind(dims, far, near);
+    above = sub2ind(dims, near, far);
+    SimilarityMatrix(below) = 1;
+    SimilarityMatrix(above) = 1;
+
+    % force the diagonal to exactly 1 regardless of what was written above
+    SimilarityMatrix(logical(eye(count))) = 1;
+end