open source pkg v1

2020-08-04 19:12:31 -04:00
parent bef213dba9
commit c389fc2c47
3708 changed files with 1624220 additions and 1 deletions
--- a/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cpu_dlib.h
+++ b/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cpu_dlib.h
@@ -0,0 +1,505 @@
+// Copyright (C) 2015  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CPU_H_
+#define DLIB_DNN_CPU_H_
+
+// This file contains CPU implementations of the GPU based functions in cuda_dlib.h
+// and cudnn_dlibapi.h
+
+#include "tensor.h"
+#include "../geometry/rectangle.h"
+
+namespace dlib
+{
+    namespace cpu 
+    {
+
+    // -----------------------------------------------------------------------------------
+
+        void multiply (
+            bool add_to,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2
+        );
+
+        void multiply_conv (
+            bool add_to,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2
+        );
+
+        void multiply_zero_padded (
+            bool add_to,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2
+        );
+
+        void scale_channels (
+            bool add_to,
+            tensor& dest,
+            const tensor& src,
+            const tensor& scales
+        );
+
+        void add(
+            float beta,
+            tensor& dest,
+            float alpha,
+            const tensor& src
+        );
+
+        void assign_bias_gradient (
+            tensor& grad,
+            const tensor& gradient_input
+        );
+
+        void add (
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2
+        );
+
+        void assign_conv_bias_gradient (
+            tensor& grad,
+            const tensor& gradient_input
+        );
+
+    // -----------------------------------------------------------------------------------
+
+        void affine_transform(
+            tensor& dest,
+            const tensor& src,
+            const float A,
+            const float B
+        );
+
+        void affine_transform(
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2,
+            const float A,
+            const float B,
+            const float C
+        );
+
+        void affine_transform(
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2,
+            const tensor& src3,
+            const float A,
+            const float B,
+            const float C,
+            const float D
+        );
+
+        void affine_transform_range(
+            size_t begin,
+            size_t end,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2,
+            const tensor& src3,
+            const float A,
+            const float B,
+            const float C
+        );
+
+    // -----------------------------------------------------------------------------------
+
+        void affine_transform(
+            tensor& dest,
+            const tensor& src,
+            const tensor& A,
+            const tensor& B
+        );
+
+    // -----------------------------------------------------------------------------------
+
+        void affine_transform_conv(
+            tensor& dest,
+            const tensor& src,
+            const tensor& A,
+            const tensor& B
+        );
+
+    // -----------------------------------------------------------------------------------
+
+        void affine_transform(
+            const rectangle& rect,
+            tensor& dest, 
+            const tensor& src1, 
+            const tensor& src2, 
+            const tensor& src3, 
+            float A, 
+            float B,
+            float C
+        );
+
+    // -----------------------------------------------------------------------------------
+
+        void compute_adam_update (
+            size_t begin,
+            size_t end,
+            tensor& s,
+            tensor& m,
+            tensor& v,
+            const float t,
+            const float learning_rate,
+            const float weight_decay,
+            const float momentum1,
+            const float momentum2,
+            const tensor& params,
+            const tensor& params_grad
+        );
+
+    // -----------------------------------------------------------------------------------
+
+        void batch_normalize_inference (
+            const double eps,
+            resizable_tensor& dest,
+            const tensor& src,
+            const tensor& gamma, 
+            const tensor& beta,
+            const tensor& running_means,
+            const tensor& running_variances
+        );
+
+        void batch_normalize (
+            const double eps,
+            resizable_tensor& dest,
+            resizable_tensor& means,
+            resizable_tensor& invstds,
+            const double averaging_factor,
+            resizable_tensor& running_means,
+            resizable_tensor& running_variances,
+            const tensor& src,
+            const tensor& gamma, 
+            const tensor& beta 
+        );
+
+        void batch_normalize_gradient (
+            const double eps,
+            const tensor& gradient_input,
+            const tensor& means,
+            const tensor& invstds,
+            const tensor& src,
+            const tensor& gamma,
+            tensor& src_grad,
+            tensor& gamma_grad, 
+            tensor& beta_grad 
+        );
+
+        void batch_normalize_conv_inference (
+            const double eps,
+            resizable_tensor& dest,
+            const tensor& src,
+            const tensor& gamma, 
+            const tensor& beta,
+            const tensor& running_means,
+            const tensor& running_variances
+        );
+
+        void batch_normalize_conv (
+            const double eps,
+            resizable_tensor& dest,
+            resizable_tensor& means,
+            resizable_tensor& invstds,
+            const double averaging_factor,
+            resizable_tensor& running_means,
+            resizable_tensor& running_variances,
+            const tensor& src,
+            const tensor& gamma, 
+            const tensor& beta 
+        );
+
+        void batch_normalize_conv_gradient (
+            const double eps,
+            const tensor& gradient_input,
+            const tensor& means,
+            const tensor& invstds,
+            const tensor& src,
+            const tensor& gamma,
+            tensor& src_grad,
+            tensor& gamma_grad, 
+            tensor& beta_grad 
+        );
+
+    // -----------------------------------------------------------------------------------
+
+        void threshold (
+            tensor& data,
+            float thresh
+        );
+
+        void dot (
+            const tensor& a,
+            const tensor& b,
+            tensor& result,
+            size_t idx
+        );
+
+    // -----------------------------------------------------------------------------------
+
+        void softmax (
+            tensor& dest,
+            const tensor& src
+        );
+
+        void softmax_gradient (
+            tensor& grad,
+            const tensor& dest,
+            const tensor& gradient_input
+        );
+
+    // ------------------------------------------------------------------------------------
+
+        void softmax_all (
+            tensor& dest,
+            const tensor& src
+        );
+
+        void softmax_all_gradient (
+            tensor& grad,
+            const tensor& dest,
+            const tensor& gradient_input
+        );
+
+    // ------------------------------------------------------------------------------------
+
+        void sigmoid (
+            tensor& dest,
+            const tensor& src
+        );
+
+        void sigmoid_gradient (
+            tensor& grad,
+            const tensor& dest,
+            const tensor& gradient_input
+        );
+
+    // ------------------------------------------------------------------------------------
+
+        void relu (
+            tensor& dest,
+            const tensor& src
+        );
+
+        void relu_gradient (
+            tensor& grad,
+            const tensor& dest,
+            const tensor& gradient_input
+        );
+
+    // ----------------------------------------------------------------------------------------
+
+        void prelu (
+            tensor& dest,
+            const tensor& src,
+            const tensor& param
+        );
+
+        void prelu_gradient (
+            tensor& grad,
+            const tensor& src,
+            const tensor& gradient_input,
+            const tensor& param,
+            tensor& params_grad 
+        );
+
+    // ------------------------------------------------------------------------------------
+
+        void tanh (
+            tensor& dest,
+            const tensor& src
+        );
+
+        void tanh_gradient (
+            tensor& grad,
+            const tensor& dest,
+            const tensor& gradient_input
+        );
+
+    // ----------------------------------------------------------------------------------------
+
+        void resize_bilinear (
+            tensor& dest,
+            long dest_row_stride,
+            long dest_channel_stride,
+            const tensor& src,
+            long src_row_stride,
+            long src_channel_stride
+        );
+
+        void resize_bilinear_gradient (
+            tensor& grad,
+            long grad_row_stride,
+            long grad_channel_stride,
+            const tensor& gradient_input,
+            long gradient_input_row_stride,
+            long gradient_input_channel_stride
+        );
+
+        // Convenience overload: forwards to the strided resize_bilinear() above, passing
+        // nc() as the row stride and nr()*nc() as the channel stride of each tensor.
+        inline void resize_bilinear (
+            tensor& dest,
+            const tensor& src
+        )
+        {
+            resize_bilinear(
+                dest, dest.nc(), dest.nr()*dest.nc(),
+                src,  src.nc(),  src.nr()*src.nc()
+            );
+        }
+
+        // Convenience overload: forwards to the strided resize_bilinear_gradient() above,
+        // passing nc() as the row stride and nr()*nc() as the channel stride of each tensor.
+        inline void resize_bilinear_gradient (
+            tensor& grad,
+            const tensor& gradient_input
+        )
+        {
+            resize_bilinear_gradient(
+                grad,           grad.nc(),           grad.nr()*grad.nc(),
+                gradient_input, gradient_input.nc(), gradient_input.nr()*gradient_input.nc()
+            );
+        }
+
+    // -----------------------------------------------------------------------------------
+
+        // CPU pooling object.  It is configured for either max or average pooling via the
+        // setup_*() functions and then applied with operator().  All member functions
+        // except does_max_pooling() are defined out of line.
+        class pooling
+        {
+        public:
+
+            // Not copyable.
+            pooling(const pooling&) = delete;
+            pooling& operator=(const pooling&) = delete;
+
+            pooling (
+            );
+
+            void clear(
+            );
+
+            // Selects max pooling with the given window size, stride and padding.
+            void setup_max_pooling(
+                int window_height,
+                int window_width,
+                int stride_y,
+                int stride_x,
+                int padding_y,
+                int padding_x
+            );
+
+            // Selects average pooling with the given window size, stride and padding.
+            void setup_avg_pooling(
+                int window_height,
+                int window_width,
+                int stride_y,
+                int stride_x,
+                int padding_y,
+                int padding_x
+            );
+
+            // Returns the do_max_pooling flag.
+            bool does_max_pooling(
+            ) const { return do_max_pooling; }
+
+            // Applies the configured pooling to src and writes the result into dest.
+            void operator() (
+                resizable_tensor& dest,
+                const tensor& src
+            );
+
+            // NOTE(review): presumably dest/src are the tensors from the matching forward
+            // call and the result is written to grad -- confirm against cpu_dlib.cpp.
+            void get_gradient(
+                const tensor& gradient_input, 
+                const tensor& dest,
+                const tensor& src,
+                tensor& grad 
+            );
+
+        private:
+            // NOTE(review): none of these have in-class initializers; presumably the
+            // out-of-line constructor initializes them -- confirm in cpu_dlib.cpp.
+            int window_height;
+            int window_width;
+            int stride_y;
+            int stride_x;
+            int padding_y;
+            int padding_x;
+            bool do_max_pooling;
+
+        };
+
+    // -----------------------------------------------------------------------------------
+
+        // CPU convolution object.  setup() records the stride and padding, which are
+        // stored in the last_* members below; the remaining member functions are defined
+        // out of line.
+        class tensor_conv
+        {
+        public:
+            // Not copyable.
+            tensor_conv(const tensor_conv&) = delete;
+            tensor_conv& operator=(const tensor_conv&) = delete;
+
+            tensor_conv() {}
+
+            // No-op in this implementation; note that it does not reset the values
+            // recorded by setup().
+            void clear(
+            ) {}
+
+            // Validates and records the stride/padding configuration.
+            //   requires (checked with DLIB_CASSERT)
+            //     - stride_y > 0 && stride_x > 0
+            //     - 0 <= padding_y < filters.nr()
+            //     - 0 <= padding_x < filters.nc()
+            void setup(
+                const tensor& data,    /* not used but required for interface */
+                const tensor& filters, /* only used to validate the padding values */
+                int stride_y,
+                int stride_x,
+                int padding_y,
+                int padding_x
+            ) 
+            {
+                (void)data;    /* silence compiler */
+                DLIB_CASSERT(stride_y > 0 && stride_x > 0);
+                DLIB_CASSERT(0 <= padding_y && padding_y < filters.nr());
+                DLIB_CASSERT(0 <= padding_x && padding_x < filters.nc());
+                last_stride_y = stride_y;
+                last_stride_x = stride_x;
+                last_padding_y = padding_y;
+                last_padding_x = padding_x;            
+            }
+
+            // Overload taking a resizable output tensor.
+             void operator() (
+                const bool add_to_output,
+                resizable_tensor& output,
+                const tensor& data,
+                const tensor& filters
+            );
+
+            // Overload taking a plain tensor as output.
+             void operator() (
+                const bool add_to_output,
+                tensor& output,
+                const tensor& data,
+                const tensor& filters
+            );
+
+            void get_gradient_for_data (
+                const bool add_to_output,
+                const tensor& gradient_input, 
+                const tensor& filters,
+                tensor& data_gradient
+            );
+
+            void get_gradient_for_filters (
+                const bool add_to_output,
+                const tensor& gradient_input, 
+                const tensor& data,
+                tensor& filters_gradient
+            );
+
+        private:
+
+            // Values recorded by the most recent call to setup(); zero until then.
+            long last_stride_y = 0;
+            long last_stride_x = 0;
+            long last_padding_y = 0;
+            long last_padding_x = 0;
+        };
+
+    // -----------------------------------------------------------------------------------
+
+        void copy_tensor(
+            bool add_to,
+            tensor& dest,
+            size_t dest_k_offset,
+            const tensor& src,
+            size_t src_k_offset,
+            size_t count_k
+        );
+
+    // -----------------------------------------------------------------------------------
+
+    } 
+}
+
+#ifdef NO_MAKEFILE
+#include "cpu_dlib.cpp"
+#endif
+
+#endif // DLIB_DNN_CPU_H_
+
+
--- a/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cublas_dlibapi.h
+++ b/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cublas_dlibapi.h
@@ -0,0 +1,50 @@
+// Copyright (C) 2015  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuBLAS_H_
+#define DLIB_DNN_CuBLAS_H_
+
+#ifdef DLIB_USE_CUDA
+
+#include "tensor.h"
+#include "cuda_errors.h"
+
+namespace dlib
+{
+    namespace cuda 
+    {
+
+    // -----------------------------------------------------------------------------------
+
+        void gemm (
+            float beta,
+            tensor& dest,
+            float alpha,
+            const tensor& lhs,
+            bool trans_lhs,
+            const tensor& rhs,
+            bool trans_rhs
+        );
+        /*!
+            requires
+                - The dimensions of lhs and rhs must be compatible for matrix
+                  multiplication.  In particular:
+                    - Let L == trans_lhs ? trans(mat(lhs)) : mat(lhs)
+                    - Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs)
+                    - Let D == mat(dest)
+                    - D.nr() == L.nr() && D.nc() == R.nc()
+                      (i.e. dest must be preallocated and have the correct output dimensions)
+                    - L.nc() == R.nr()
+            ensures
+                - performs: dest = alpha*L*R + beta*mat(dest)
+        !*/
+
+    // ------------------------------------------------------------------------------------
+
+    }  
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuBLAS_H_
+
+
--- a/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cuda_data_ptr.h
+++ b/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cuda_data_ptr.h
@@ -0,0 +1,256 @@
+// Copyright (C) 2017  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuDA_DATA_PTR_H_
+#define DLIB_DNN_CuDA_DATA_PTR_H_
+
+#ifdef DLIB_USE_CUDA
+
+#include <memory>
+#include <vector>
+#include "../assert.h"
+
+namespace dlib
+{
+    namespace cuda
+    {
+
+    // ------------------------------------------------------------------------------------
+
+        class cuda_data_void_ptr
+        {
+            /*!
+                WHAT THIS OBJECT REPRESENTS
+                    This is a block of memory on a CUDA device.  
+
+                    Copies of this object share ownership of the same block (the
+                    pointer is held in a std::shared_ptr).
+            !*/
+        public:
+
+            cuda_data_void_ptr() = default;
+
+            cuda_data_void_ptr(size_t n); 
+            /*!
+                ensures
+                    - This object will allocate a device memory buffer of n bytes.
+                    - #size() == n
+            !*/
+
+            void* data() { return pdata.get(); }
+            const void* data() const { return pdata.get(); }
+            operator void*() { return pdata.get(); }
+            operator const void*() const { return pdata.get(); }
+
+            void reset() { pdata.reset(); num = 0; }
+            /*!
+                ensures
+                    - releases this object's reference to the buffer.
+                    - #data() == nullptr
+                    - #size() == 0
+                      (the size is cleared together with the pointer so that size()
+                      never reports bytes that data() no longer refers to)
+            !*/
+
+            size_t size() const { return num; }
+            /*!
+                ensures
+                    - returns the length of this buffer, in bytes.
+            !*/
+
+            cuda_data_void_ptr operator+ (size_t offset) const 
+            /*!
+                requires
+                    - offset < size()
+                ensures
+                    - returns a pointer that is offset by the given amount.
+                    - The returned object has size() == size()-offset and shares
+                      ownership of the underlying buffer with *this.
+            !*/
+            { 
+                DLIB_CASSERT(offset < num);
+                cuda_data_void_ptr temp;
+                temp.num = num-offset;
+                // shared_ptr aliasing constructor: temp shares ownership with pdata but
+                // stores the address offset bytes past the start.
+                temp.pdata = std::shared_ptr<void>(pdata, static_cast<char*>(pdata.get())+offset);
+                return temp;
+            }
+
+        private:
+
+            size_t num = 0;               // length of the buffer in bytes
+            std::shared_ptr<void> pdata;  // the buffer; null when empty
+        };
+
+        // Mirror of cuda_data_void_ptr::operator+ so that `offset + ptr` gives the same
+        // result as `ptr + offset`.
+        inline cuda_data_void_ptr operator+(
+            size_t offset,
+            const cuda_data_void_ptr& rhs
+        )
+        {
+            return rhs.operator+(offset);
+        }
+
+    // ------------------------------------------------------------------------------------
+
+        void memcpy(
+            void* dest,
+            const cuda_data_void_ptr& src
+        );
+        /*!
+            requires
+                - dest == a pointer to at least src.size() bytes on the host machine.
+            ensures
+                - copies the GPU data from src into dest.
+                - This routine is equivalent to performing: memcpy(dest,src,src.size())
+        !*/
+
+        void memcpy(
+            void* dest,
+            const cuda_data_void_ptr& src,
+            const size_t num
+        );
+        /*!
+            requires
+                - dest == a pointer to at least num bytes on the host machine.
+                - num <= src.size()
+            ensures
+                - copies the GPU data from src into dest.  Copies only the first num bytes
+                  of src to dest.
+        !*/
+
+    // ------------------------------------------------------------------------------------
+
+        void memcpy(
+            cuda_data_void_ptr dest, 
+            const void* src
+        );
+        /*!
+            requires
+                - src == a pointer to at least dest.size() bytes on the host machine.
+            ensures
+                - copies the host data from src to the GPU memory buffer dest.
+                - This routine is equivalent to performing: memcpy(dest,src,dest.size())
+        !*/
+
+        void memcpy(
+            cuda_data_void_ptr dest, 
+            const void* src,
+            const size_t num
+        );
+        /*!
+            requires
+                - src == a pointer to at least num bytes on the host machine.
+                - num <= dest.size()
+            ensures
+                - copies the host data from src to the GPU memory buffer dest.  Copies only
+                  the first num bytes of src to dest.
+        !*/
+
+    // ------------------------------------------------------------------------------------
+    // ------------------------------------------------------------------------------------
+    // ------------------------------------------------------------------------------------
+
+        template <typename T>
+        class cuda_data_ptr
+        {
+            /*!
+                WHAT THIS OBJECT REPRESENTS
+                    This is a block of memory on a CUDA device.   It is just a type safe
+                    version of cuda_data_void_ptr.
+            !*/
+
+        public:
+
+            static_assert(std::is_standard_layout<T>::value, "You can only create basic standard layout types on the GPU");
+
+            cuda_data_ptr() = default;
+            cuda_data_ptr(size_t n) : num(n)
+            /*!
+                ensures
+                    - This object will allocate a device memory buffer of n T objects.
+                    - #size() == n
+            !*/
+            {
+                // Nothing to allocate for an empty buffer; pdata stays null.
+                if (n == 0)
+                    return;
+
+                pdata = cuda_data_void_ptr(n*sizeof(T));
+            }
+
+            T* data() { return static_cast<T*>(pdata.data()); }
+            const T* data() const { return static_cast<const T*>(pdata.data()); }
+
+            operator T*() { return static_cast<T*>(pdata.data()); }
+            operator const T*() const { return static_cast<const T*>(pdata.data()); }
+
+            void reset() { pdata.reset(); num = 0; }
+            /*!
+                ensures
+                    - releases this object's reference to the buffer.
+                    - #data() == nullptr
+                    - #size() == 0
+                      (keeps size() in sync with data() so the memcpy() routines below
+                      never try to copy out of a released buffer)
+            !*/
+
+            size_t size() const { return num; }
+            /*!
+                ensures
+                    - returns the length of this buffer, in T objects.
+            !*/
+
+            friend void memcpy(
+                std::vector<T>& dest,
+                const cuda_data_ptr& src
+            )
+            /*!
+                ensures
+                    - copies the contents of the device buffer src into the host vector dest.
+                    - #dest.size() == src.size()
+            !*/
+            {
+                dest.resize(src.size());
+                if (src.size() != 0)
+                    memcpy(dest.data(), src.pdata);
+            }
+
+            friend void memcpy(
+                cuda_data_ptr& dest,
+                const std::vector<T>& src
+            )
+            /*!
+                ensures
+                    - copies the contents of the host vector src into the device buffer dest.
+                    - #dest.size() == src.size()
+                      (dest is reallocated if its size differs from src.size())
+            !*/
+            {
+                if (src.size() != dest.size())
+                    dest = cuda_data_ptr<T>(src.size());
+
+                if (src.size() != 0)
+                    memcpy(dest.pdata, src.data());
+            }
+
+        private:
+
+            size_t num = 0;            // number of T objects in the buffer
+            cuda_data_void_ptr pdata;  // untyped device buffer of num*sizeof(T) bytes
+        };
+
+    // ------------------------------------------------------------------------------------
+
+        class resizable_cuda_buffer
+        {
+            /*!
+                WHAT THIS OBJECT REPRESENTS
+                    A block of CUDA device memory that grows on demand: whenever a
+                    request exceeds the current allocation the block is replaced by a
+                    new one of the requested size.  It never shrinks.
+            !*/
+        public:
+            cuda_data_void_ptr get(size_t size)
+            /*!
+                ensures
+                    - returns a buffer that holds at least size bytes, i.e.
+                      buffer.size() >= size
+                    - Callers should not keep the returned cuda_data_void_ptr around for
+                      long.  Call get() again each time the buffer is needed; otherwise
+                      an old block stays alive alongside the new one after a resize.
+            !*/
+            {
+                // Fast path: the current block is already big enough.
+                if (buffer.size() >= size)
+                    return buffer;
+
+                // Drop our reference to the old block before allocating its replacement.
+                buffer.reset();
+                buffer = cuda_data_void_ptr(size);
+                return buffer;
+            }
+        private:
+            cuda_data_void_ptr buffer;
+        };
+
+    // ----------------------------------------------------------------------------------------
+
+        std::shared_ptr<resizable_cuda_buffer> device_global_buffer(
+        );
+        /*!
+            ensures
+                - Returns a pointer to a globally shared CUDA memory buffer on the
+                  currently selected CUDA device.  The buffer is also thread local.  So
+                  each host thread will get its own buffer.  You can use this global buffer
+                  as scratch space for CUDA computations that all take place on the default
+                  stream.  Using it in this way ensures that there aren't any race conditions
+                  involving the use of the buffer.
+                - The global buffer is deallocated once all references to it are
+                  destructed.  It will be reallocated as required.  So if you want to avoid
+                  these reallocations then hold a copy of the shared_ptr returned by this
+                  function.
+        !*/
+
+    // ----------------------------------------------------------------------------------------
+
+    }
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuDA_DATA_PTR_H_
+
--- a/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cuda_dlib.h
+++ b/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cuda_dlib.h
@@ -0,0 +1,530 @@
+// Copyright (C) 2015  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuDA_H_
+#define DLIB_DNN_CuDA_H_
+
+
+#include "tensor.h"
+#include "../geometry/rectangle.h"
+
+namespace dlib
+{
+    namespace cuda 
+    {
+
+    // ----------------------------------------------------------------------------------------
+
+        void set_device (
+            int dev
+        );
+
+        int get_device (
+        );
+
+        int get_num_devices (
+        );
+
+        std::string get_device_name (
+            int device
+        );
+
+        void set_current_device_blocking_sync(
+        );
+
+        bool can_access_peer (int device_id, int peer_device_id);
+        bool can_access_peer (const tensor& device, const tensor& peer_device);
+
+        void device_synchronize (int dev);
+        void device_synchronize (const tensor& dev);
+
+
+        // Scoped device switch: the constructor remembers the device returned by
+        // get_device() and selects a new one; the destructor selects the remembered
+        // device again.
+        class raii_set_device
+        {
+        public:
+            // Must be constructed with a target device and cannot be copied.
+            raii_set_device() = delete;
+            raii_set_device(const raii_set_device&) = delete;
+            raii_set_device& operator=(const raii_set_device&) = delete;
+
+            // Remember the current device, then switch to dev.
+            raii_set_device(int dev) : prev_dev(get_device())
+            {
+                set_device(dev);
+            }
+
+            // Remember the current device, then switch to the device reported by
+            // dev.device_id().
+            raii_set_device(const tensor& dev) : prev_dev(get_device())
+            {
+                set_device(dev.device_id());
+            }
+
+            // Switch to another device without changing the one that will be
+            // restored on destruction.
+            void operator() (int dev)
+            {
+                set_device(dev);
+            }
+
+            void operator() (const tensor& dev)
+            {
+                set_device(dev.device_id());
+            }
+
+            // Declared noexcept(false) so an exception from set_device() can propagate.
+            ~raii_set_device() noexcept(false)
+            {
+                set_device(prev_dev);
+            }
+
+        private:
+            int prev_dev;  // device that was selected when this object was constructed
+        };
+
+
+#ifdef DLIB_USE_CUDA
+
+        // Scoped object constructed from a (device, peer device) pair.  The int
+        // constructor and the destructor are defined out of line.
+        // NOTE(review): presumably enables peer access on construction and disables it
+        // again on destruction when call_disable is set -- confirm against the .cu/.cpp
+        // implementation.
+        class enable_peer_access
+        {
+        public:
+
+            // Must be constructed with a device pair and cannot be copied.
+            enable_peer_access() = delete;
+            enable_peer_access(const enable_peer_access&) = delete;
+            enable_peer_access& operator=(const enable_peer_access&) = delete;
+
+            enable_peer_access(
+                int device_id,
+                int peer_device_id
+            );
+
+            // Convenience overload: delegates to the int constructor using the
+            // device_id() of each tensor.
+            enable_peer_access(
+                const tensor& device,
+                const tensor& peer_device
+            ) : enable_peer_access(device.device_id(), peer_device.device_id())
+            {}
+
+            // Declared noexcept(false), so this destructor is allowed to throw.
+            ~enable_peer_access() noexcept(false);
+
+        private:
+
+            bool call_disable;
+            int device_id;
+            int peer_device_id;
+        };
+
+    // -----------------------------------------------------------------------------------
+
+        void inverse_norms (
+            resizable_tensor& invnorms,
+            const tensor& data,
+            const double eps
+        );
+
+        void dot_prods (
+            resizable_tensor& out,
+            const tensor& lhs,
+            const tensor& rhs
+        );
+
+        void dot_prods (
+            bool add_to,
+            tensor& out,
+            const tensor& lhs,
+            const tensor& rhs
+        );
+
+        void scale_columns (
+            tensor& out,
+            const tensor& m,
+            const tensor& v
+        );
+
+        void scale_rows (
+            tensor& out,
+            const tensor& m,
+            const tensor& v
+        );
+
+        void scale_rows2 (
+            float beta, 
+            tensor& out,
+            const tensor& m1,
+            const tensor& m2,
+            const tensor& v1,
+            const tensor& v2
+        );
+
+        void exp (
+            tensor& dest,
+            const tensor& src
+        );
+
+        void log (
+            tensor& dest,
+            const tensor& src
+        );
+
+        void log10 (
+            tensor& dest,
+            const tensor& src
+        );
+
+    // ------------------------------------------------------------------------------------
+
+        void set_tensor (
+            tensor& t,
+            float value
+        );
+
+        void scale_tensor (
+            tensor& t,
+            float value
+        );
+
+    // ------------------------------------------------------------------------------------
+
+        void multiply (
+            bool add_to,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2
+        );
+
+        void multiply_conv (
+            bool add_to,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2
+        );
+
+        void multiply_zero_padded (
+            bool add_to,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2
+        );
+
+        void scale_channels (
+            bool add_to,
+            tensor& dest,
+            const tensor& src,
+            const tensor& scales
+        );
+
+        void add (
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2
+        );
+
+    // -----------------------------------------------------------------------------------
+
+        void affine_transform(
+            tensor& dest,
+            const tensor& src,
+            const float A,
+            const float B
+        );
+
+        void affine_transform(
+            tensor& dest,
+            const tensor& src,
+            const float A
+        );
+
+        void affine_transform(
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2,
+            const float A,
+            const float B,
+            const float C
+        );
+
+        void affine_transform(
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2,
+            const float A,
+            const float B
+        );
+
+        void affine_transform(
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2,
+            const tensor& src3,
+            const float A,
+            const float B,
+            const float C,
+            const float D
+        );
+
+        void affine_transform_range(
+            size_t begin,
+            size_t end,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2,
+            const tensor& src3,
+            const float A,
+            const float B,
+            const float C
+        );
+
+        void affine_transform(
+            const rectangle& rect,
+            tensor& dest, 
+            const tensor& src1, 
+            const tensor& src2, 
+            const tensor& src3, 
+            float A, 
+            float B,
+            float C
+        );
+
+        // Note that this function isn't in the tt:: namespace because add_scaled() is
+        // called by cuda::add() so we don't need a tt:: version of add_scaled().  
+        void add_scaled(
+            tensor& dest,
+            const float scale,
+            const tensor& src
+        );
+
+        void add_cv_to_all_columns(
+            float beta, 
+            tensor& dest, 
+            float alpha, 
+            const tensor& src
+        );
+
+    // -----------------------------------------------------------------------------------
+
+        void affine_transform(
+            tensor& dest,
+            const tensor& src,
+            const tensor& A,
+            const tensor& B
+        );
+
+    // -----------------------------------------------------------------------------------
+
+        void affine_transform_conv(
+            tensor& dest,
+            const tensor& src,
+            const tensor& A,
+            const tensor& B
+        );
+
+    // ----------------------------------------------------------------------------------------
+
+        void compute_adam_update (
+            size_t begin,
+            size_t end,
+            tensor& s,
+            tensor& m,
+            tensor& v,
+            const float t,
+            const float learning_rate,
+            const float weight_decay,
+            const float momentum1,
+            const float momentum2,
+            const tensor& params,
+            const tensor& params_grad
+        );
+
+    // -----------------------------------------------------------------------------------
+
+        void assign_bias_gradient (
+            tensor& grad,
+            const tensor& gradient_input
+        );
+
+    // -----------------------------------------------------------------------------------
+
+        void threshold (
+            tensor& data,
+            float thresh
+        );
+
+    // ----------------------------------------------------------------------------------------
+
+        void dot (
+            const tensor& a,
+            const tensor& b,
+            tensor& result,
+            size_t idx
+        );
+
+    // ----------------------------------------------------------------------------------------
+
+        void prelu (
+            tensor& dest,
+            const tensor& src,
+            const tensor& param
+        );
+
+        void prelu_gradient (
+            tensor& grad,
+            const tensor& src,
+            const tensor& gradient_input,
+            const tensor& param,
+            tensor& params_grad 
+        );
+
+
+    // ----------------------------------------------------------------------------------------
+
+        void resize_bilinear (
+            tensor& dest,
+            long dest_row_stride,
+            long dest_channel_stride,
+            const tensor& src,
+            long src_row_stride,
+            long src_channel_stride
+        );
+
+        void resize_bilinear_gradient (
+            tensor& grad,
+            long grad_row_stride,
+            long grad_channel_stride,
+            const tensor& gradient_input,
+            long gradient_input_row_stride,
+            long gradient_input_channel_stride
+        );
+
+        // Convenience overload: forwards to the strided resize_bilinear() overload using
+        // strides taken from each tensor's own dimensions, i.e. a row stride of nc()
+        // and a channel stride of nr()*nc().
+        inline void resize_bilinear (
+            tensor& dest,
+            const tensor& src
+        )
+        {
+            const long dest_row_stride = dest.nc();
+            const long dest_channel_stride = dest.nr()*dest.nc();
+            const long src_row_stride = src.nc();
+            const long src_channel_stride = src.nr()*src.nc();
+            resize_bilinear(dest, dest_row_stride, dest_channel_stride,
+                            src, src_row_stride, src_channel_stride);
+        }
+
+        // Convenience overload: forwards to the strided resize_bilinear_gradient()
+        // overload using strides taken from each tensor's own dimensions, i.e. a row
+        // stride of nc() and a channel stride of nr()*nc().
+        inline void resize_bilinear_gradient (
+            tensor& grad,
+            const tensor& gradient_input
+        )
+        {
+            const long grad_row_stride = grad.nc();
+            const long grad_channel_stride = grad.nr()*grad.nc();
+            const long gradient_input_row_stride = gradient_input.nc();
+            const long gradient_input_channel_stride = gradient_input.nr()*gradient_input.nc();
+            resize_bilinear_gradient(grad, grad_row_stride, grad_channel_stride,
+                                     gradient_input, gradient_input_row_stride, gradient_input_channel_stride);
+        }
+
+    // ----------------------------------------------------------------------------------------
+
+        void copy_tensor(
+            bool add_to,
+            tensor& dest,
+            size_t dest_k_offset,
+            const tensor& src,
+            size_t src_k_offset,
+            size_t count_k
+        );
+
+
+    // ----------------------------------------------------------------------------------------
+
+        class compute_loss_multiclass_log_per_pixel
+        {
+            /*!
+                The point of this class is to compute the loss computed by
+                loss_multiclass_log_per_pixel, but to do so with CUDA.
+
+                operator() stages the truth label images in a CUDA work buffer, with
+                room for one float of scalar loss in front of them, and passes the
+                buffer to do_work(), which is only declared in this header.
+            !*/
+        public:
+
+            compute_loss_multiclass_log_per_pixel(
+            )
+            {
+                // Grab the buffer object that operator() requests its storage from.
+                work = device_global_buffer();
+            }
+
+            template <
+                typename const_label_iterator
+                >
+            void operator() (
+                const_label_iterator truth,
+                const tensor& subnetwork_output,
+                tensor& gradient,
+                double& loss
+            ) const
+            /*!
+                - truth is dereferenced and advanced once per sample, i.e.
+                  subnetwork_output.num_samples() times.  Each element must bind to a
+                  const matrix<uint16_t>& whose nr() and nc() equal those of
+                  subnetwork_output (checked with DLIB_ASSERT).
+                - gradient and loss are forwarded untouched to do_work(); presumably
+                  they are its outputs -- confirm against the do_work() implementation.
+            !*/
+            {
+                // Size in bytes of one truth image: one uint16_t per (row, column).
+                const size_t bytes_per_plane = subnetwork_output.nr()*subnetwork_output.nc()*sizeof(uint16_t);
+                // Allocate a cuda buffer to store all the truth images and also one float
+                // for the scalar loss output.
+                cuda_data_void_ptr buf = work->get(subnetwork_output.num_samples()*bytes_per_plane + sizeof(float));
+
+                // The leading sizeof(float) bytes are the loss slot; the truth images
+                // start immediately after it.
+                cuda_data_void_ptr loss_buf = buf;
+                buf = buf+sizeof(float);
+
+
+                // copy the truth data into a cuda buffer.
+                for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth)
+                {
+                    const matrix<uint16_t>& t = *truth;
+                    DLIB_ASSERT(t.nr() == subnetwork_output.nr());
+                    DLIB_ASSERT(t.nc() == subnetwork_output.nc());
+                    // Sample i lands at byte offset i*bytes_per_plane.  The source is read
+                    // starting at element (0,0), so this assumes the matrix stores all of
+                    // its elements contiguously.
+                    memcpy(buf + i*bytes_per_plane, &t(0,0), bytes_per_plane);
+                }
+
+                do_work(static_cast<float*>(loss_buf.data()), static_cast<uint16_t*>(buf.data()), subnetwork_output, gradient, loss);
+            }
+
+        private:
+
+            // Declared here, defined elsewhere.  Receives the loss slot and the packed
+            // truth images prepared by operator().
+            static void do_work(
+                float* loss_cuda_work_buffer,
+                const uint16_t* truth_buffer,
+                const tensor& subnetwork_output,
+                tensor& gradient,
+                double& loss
+            );
+            
+            // Buffer provider assigned in the constructor from device_global_buffer().
+            std::shared_ptr<resizable_cuda_buffer> work;
+        };
+
+    // ------------------------------------------------------------------------------------
+    // ------------------------------------------------------------------------------------
+    // ------------------------------------------------------------------------------------
+    // ------------------------------------------------------------------------------------
+
+#else // if DLIB_USE_CUDA NOT DEFINED
+
+        // CUDA-disabled stand-in for set_device(): there is no device to select, so all
+        // it does is check, via DLIB_CASSERT, that the caller asked for device 0.
+        inline void set_device (int id)
+        {
+            DLIB_CASSERT(id == 0, "dlib::cuda::set_device(id) called with an invalid device id.");
+        }
+
+        // CUDA-disabled stand-in: always reports device 0.
+        inline int get_device ()
+        {
+            return 0;
+        }
+
+        // CUDA-disabled stand-in: always reports exactly one device.
+        inline int get_num_devices ()
+        {
+            return 1;
+        }
+
+        // CUDA-disabled stand-in for get_device_name().  The only accepted device id is
+        // 0 (checked with DLIB_CASSERT), for which the placeholder name "CUDA_DISABLED"
+        // is returned.
+        inline std::string get_device_name (
+            int device
+        ) 
+        {
+            // The message names this function; it previously named set_device().
+            DLIB_CASSERT(device == 0, "dlib::cuda::get_device_name(device) called with an invalid device id.");
+            return "CUDA_DISABLED";
+        }
+
+        // CUDA-disabled stand-in: intentionally does nothing.
+        inline void set_current_device_blocking_sync ()
+        {
+        }
+
+
+        // CUDA-disabled stand-ins: both overloads ignore their arguments and report
+        // that peer access is not available.
+        inline bool can_access_peer (int, int)
+        {
+            return false;
+        }
+        inline bool can_access_peer (const tensor&, const tensor&)
+        {
+            return false;
+        }
+
+        // CUDA-disabled stand-ins: both overloads ignore their argument and do nothing.
+        inline void device_synchronize (int)
+        {
+        }
+        inline void device_synchronize (const tensor&)
+        {
+        }
+
+        // CUDA-disabled stand-in for enable_peer_access.  Both usable constructors accept
+        // and ignore their arguments; the object holds no state and cannot be default
+        // constructed, copied or assigned.
+        class enable_peer_access
+        {
+        public:
+            enable_peer_access( int, int ) {}
+            enable_peer_access( const tensor&, const tensor& ) {}
+
+            enable_peer_access() = delete;
+            enable_peer_access(const enable_peer_access&) = delete;
+            enable_peer_access& operator=(const enable_peer_access&) = delete;
+        };
+
+#endif // DLIB_USE_CUDA
+
+    } 
+}
+
+
+#endif // DLIB_DNN_CuDA_H_
+
--- a/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cuda_errors.h
+++ b/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cuda_errors.h
@@ -0,0 +1,70 @@
+// Copyright (C) 2015  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_CUDA_ERRORs_H_
+#define DLIB_CUDA_ERRORs_H_
+
+
+#include "../error.h"
+
+namespace dlib
+{
+    struct cuda_error : error
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This is the exception thrown when a call to the NVIDIA CUDA runtime
+                returns an error.  The library specific exceptions declared after
+                it in this namespace all derive from it.
+        !*/
+
+        cuda_error(const std::string& message)
+            : error(message)
+        {}
+    };
+
+
+    struct cudnn_error : cuda_error
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This is the exception thrown when a call to the NVIDIA cuDNN library
+                returns an error.
+        !*/
+
+        cudnn_error(const std::string& message)
+            : cuda_error(message)
+        {}
+    };
+
+    struct curand_error : cuda_error
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This is the exception thrown when a call to the NVIDIA cuRAND library
+                returns an error.
+        !*/
+
+        curand_error(const std::string& message)
+            : cuda_error(message)
+        {}
+    };
+
+    struct cublas_error : cuda_error
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This is the exception thrown when a call to the NVIDIA cuBLAS library
+                returns an error.
+        !*/
+
+        cublas_error(const std::string& message)
+            : cuda_error(message)
+        {}
+    };
+
+    struct cusolver_error : cuda_error
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This is the exception thrown when a call to the NVIDIA cuSolver library
+                returns an error.
+        !*/
+
+        cusolver_error(const std::string& message)
+            : cuda_error(message)
+        {}
+    };
+}
+
+
+#endif // DLIB_CUDA_ERRORs_H_
+
--- a/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cuda_utils.h
+++ b/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cuda_utils.h
@@ -0,0 +1,413 @@
+// Copyright (C) 2015  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_CUDA_UtILS_H_
+#define DLIB_CUDA_UtILS_H_
+
+#ifndef DLIB_USE_CUDA
+#error "This file shouldn't be #included unless DLIB_USE_CUDA is #defined"
+#endif
+
+#include "cuda_errors.h"
+#include "../algs.h"
+#include <cmath>
+
+#include <cuda_runtime.h>
+#include <sstream>
+#include <iostream>
+#include <memory>
+#include <vector>
+#include <type_traits>
+
+
+// Check the return value of a call to the CUDA runtime for an error condition.
+#define CHECK_CUDA(call)                                                       \
+do{                                                                              \
+    const cudaError_t error = call;                                            \
+    if (error != cudaSuccess)                                                  \
+    {                                                                          \
+        std::ostringstream sout;                                               \
+        sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
+        sout << "code: " << error << ", reason: " << cudaGetErrorString(error);\
+        throw dlib::cuda_error(sout.str());                                          \
+    }                                                                          \
+}while(false)
+
+// ----------------------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+namespace dlib
+{
+    namespace cuda
+    {
+
+    // ------------------------------------------------------------------------------------
+
+        __inline__ __device__ size_t pack_idx (
+            size_t dim_size3,
+            size_t dim_size2,
+            size_t dim_size1,
+            size_t idx4,
+            size_t idx3,
+            size_t idx2,
+            size_t idx1
+        )
+        /*!
+            ensures
+                - Returns the flat, row major offset of element [idx4][idx3][idx2][idx1]
+                  of a 4D array whose three fastest varying dimensions have sizes
+                  dim_size3, dim_size2 and dim_size1.  That is, given
+                    int ARRAY[anything][dim_size3][dim_size2][dim_size1];
+                  the expressions
+                    ARRAY[idx4][idx3][idx2][idx1]
+                  and
+                    ((int*)ARRAY)[pack_idx(dim_size3,dim_size2,dim_size1, idx4,idx3,idx2,idx1)]
+                  name the same element.
+        !*/
+        {
+            // Fold in one dimension at a time, slowest varying index first.
+            size_t packed = idx4;
+            packed = packed*dim_size3 + idx3;
+            packed = packed*dim_size2 + idx2;
+            packed = packed*dim_size1 + idx1;
+            return packed;
+        }
+
+        __inline__ __device__ void unpack_idx (
+            size_t idx,
+            size_t dim_size3,
+            size_t dim_size2,
+            size_t dim_size1,
+            size_t& idx4,
+            size_t& idx3,
+            size_t& idx2,
+            size_t& idx1
+        )
+        /*!
+            ensures
+                - This is the inverse of pack_idx().  So if
+                  PACKED == pack_idx(dim_size3,dim_size2,dim_size1, idx4,idx3,idx2,idx1)
+                  then unpack_idx(PACKED,dim_size3,dim_size2,dim_size1, IDX4,IDX3,IDX2,IDX1)
+                  leaves:
+                    - IDX1 == idx1
+                    - IDX2 == idx2
+                    - IDX3 == idx3
+                    - IDX4 == idx4
+        !*/
+        {
+            // Peel off one dimension at a time, fastest varying first: the remainder is
+            // that dimension's index and the quotient carries on to the next dimension.
+            idx1 = idx % dim_size1;
+            size_t remaining = idx / dim_size1;
+
+            idx2 = remaining % dim_size2;
+            remaining = remaining / dim_size2;
+
+            idx3 = remaining % dim_size3;
+            idx4 = remaining / dim_size3;
+        }
+
+    // ------------------------------------------------------------------------------------
+
+        // Adds up val over a warp with shuffle-down steps at offsets warpSize/2,
+        // warpSize/4, ..., 1.  The value returned to the first thread of the warp is the
+        // one warp_reduce_atomic_add() uses as the warp's total.
+        // This function is from the article:
+        // http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
+        __inline__ __device__ float warp_reduce_sum(float val) 
+        {
+            int offset = warpSize/2;
+            while (offset > 0)
+            {
+#if CUDART_VERSION >= 9000
+                // CUDA 9 and newer: use the _sync variant with every bit of the mask set.
+                val += __shfl_down_sync(0xFFFFFFFF,val, offset);
+#else
+                val += __shfl_down(val, offset);
+#endif
+                offset /= 2;
+            }
+            return val;
+        }
+
+        // True when the low bits of threadIdx.x selected by (warpSize - 1) are all zero.
+        // Only threadIdx.x is inspected.
+        __inline__ __device__ bool is_first_thread_in_warp()
+        {
+            const unsigned int lane_bits = threadIdx.x & (warpSize - 1);
+            return lane_bits == 0;
+        }
+
+        __inline__ __device__ void warp_reduce_atomic_add(
+            float& out, 
+            float val
+        ) 
+        /*!
+            ensures
+                - Atomically adds the sum of the val variables in the current warp to out.
+                  The values are first combined with warp_reduce_sum() and then a single
+                  atomicAdd() is issued by the first thread in the warp.
+                  For an extended discussion see:
+                  http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
+        !*/
+        {
+            const float warp_total = warp_reduce_sum(val);
+            // Every thread takes part in the reduction; only one performs the write.
+            if (!is_first_thread_in_warp())
+                return;
+            atomicAdd(&out, warp_total);
+        }
+
+    // ------------------------------------------------------------------------------------
+
+        // Upper bound on the amount of work handed to launch_kernel(): num_x jobs along
+        // x and num_y along y.  num_y stays 1 when only x is supplied.
+        struct max_jobs
+        {
+            int num_x;
+            int num_y = 1;
+
+            max_jobs(int x) : num_x(x) {}
+            max_jobs(int x, int y) : num_x(x), num_y(y) {}
+        };
+
+        template <typename Kernel, typename... T>
+        void launch_kernel (
+            Kernel K,
+            T ...args
+        )
+        /*!
+            ensures
+                - Launches the kernel K(args...) using the grid and block sizes that
+                  cudaOccupancyMaxPotentialBlockSize() reports for K, so callers do not
+                  have to pick launch parameters by hand.
+            throws
+                - dlib::cuda_error if the occupancy query fails (via CHECK_CUDA).
+        !*/
+        {
+            int num_blocks = 0;
+            int num_threads = 0;
+            // The text inside CHECK_CUDA() becomes part of the error message.
+            CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&num_blocks,&num_threads,K));
+
+            const dim3 grid(num_blocks);
+            const dim3 block(num_threads);
+            K<<<grid,block>>>(args...);
+        }
+
+        template <typename Kernel, typename... T>
+        void launch_kernel (
+            Kernel K,
+            max_jobs m,
+            T ...args
+        )
+        /*!
+            ensures
+                - This function is just like launch_kernel(K,args...) except that you can
+                  additionally supply a max_jobs number that tells it how many possible
+                  total threads could be used.  This is useful when launching potentially
+                  small jobs that might not need the number of threads suggested by
+                  launch_kernel().  
+                - If m.num_x == 0 or m.num_y == 0 then nothing is launched.
+                - The total job count m.num_x*m.num_y is evaluated in 64 bit arithmetic,
+                  so a large 2D job cannot overflow int while the launch is being sized.
+            throws
+                - dlib::cuda_error if the occupancy query fails (via CHECK_CUDA).
+        !*/
+        {
+            if (m.num_x == 0 || m.num_y == 0)
+                return;
+            int num_blocks, num_threads;
+            CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&num_blocks,&num_threads,K));
+            // Check if the job is really small and we don't really need to launch a kernel
+            // with this many blocks and threads.  The products are widened to long long
+            // before multiplying because int*int could overflow for big 2D jobs.
+            const long long total_jobs = static_cast<long long>(m.num_x)*m.num_y;
+            if (static_cast<long long>(num_blocks)*num_threads > total_jobs)
+                num_blocks = static_cast<int>((total_jobs+num_threads-1)/num_threads);
+
+            if (m.num_y == 1)
+            {
+                K<<<num_blocks,num_threads>>>(args...);
+            }
+            else
+            {
+                /*
+                    In general, the reason m.num_y!=1 (i.e. the reason you are in this
+                    code path) is because we are using nested grid-stride loops.  There are
+                    two important things to note about what we are doing here.  To
+                    illustrate them we will talk about this little CUDA code snippet:
+
+                        // initialize out before we begin.
+                        for (auto i : grid_stride_range_y(0, nr))
+                            for (auto j : grid_stride_range(0, 1))
+                                out[i] = 0;
+
+                        __syncthreads(); // synchronize threads in block
+
+                        // loop over some 2D thing and sum and store things into out.
+                        for (auto i : grid_stride_range_y(0, nr))
+                        {
+                            float temp = 0;
+                            for (auto j : grid_stride_range(0, nc))
+                                temp += whatever[i*nc+j];
+
+                            // store the sum into out[i]
+                            warp_reduce_atomic_add(out[i], temp);
+                        }
+                    
+                    First, we make sure the number of x threads is a multiple of 32 so that
+                    you can use warp_reduce_atomic_add() inside the y loop.  
+                    
+                    Second, we use exactly one block in the x dimension (gridDim.x == 1)
+                    so inter-block synchronization is easier.  For example, if the number
+                    of x blocks wasn't 1 the above code would have a race condition in it.
+                    This is because the execution of out[i]=0 would be done by blocks with
+                    blockIdx.x==0, but then in the second set of loops, *all* the x blocks
+                    use out[i].  Since __syncthreads() doesn't do any synchronization
+                    between blocks some of the blocks might begin before the out[i]=0
+                    statements finished and that would be super bad.
+                */
+                
+                // Try and make sure that the ratio of x to y threads is reasonable based
+                // on the respective size of our loops.  We start from 32 x threads and
+                // num_threads/32 y threads, then move a factor of `ratio` (clamped to
+                // [1, y_threads]) from y to x.
+                int x_threads = 32;
+                int y_threads = num_threads/32;
+                const int ratio = static_cast<int>(std::round(put_in_range(1, y_threads, m.num_x/static_cast<double>(m.num_y))));
+                x_threads *= ratio;
+                y_threads /= ratio;
+
+                dim3 blocks(1,num_blocks);  
+                dim3 threads(x_threads,y_threads); 
+                K<<<blocks,threads>>>(args...);
+            }
+        }
+
+    // ------------------------------------------------------------------------------------
+
+        class grid_stride_range
+        {
+            /*!
+                WHAT THIS OBJECT REPRESENTS
+                    This is a tool for making a for loop that loops over an entire block of
+                    memory inside a kernel, but doing so in a way that parallelizes
+                    appropriately across all the threads in a kernel launch.  For example,
+                    the following kernel would add the vector a to the vector b and store
+                    the output in out (assuming all vectors are of dimension n):
+                        __global__ void add_arrays(
+                            const float* a, 
+                            const float* b, 
+                            float* out, 
+                            size_t n
+                        )
+                        {
+                            for (auto i : grid_stride_range(0, n))
+                            {
+                                out[i] = a[i]+b[i];
+                            }
+                        }
+
+                    Each thread starts at ibegin plus its own x index within the launch
+                    (blockDim.x*blockIdx.x + threadIdx.x) and advances by the total number
+                    of x threads (gridDim.x*blockDim.x) until it reaches or passes iend.
+            !*/
+
+        public:
+            // Iterate over the half open range [ibegin_, iend_).
+            __device__ grid_stride_range(
+                size_t ibegin_,
+                size_t iend_
+            ) : 
+                ibegin(ibegin_),
+                iend(iend_)
+            {}
+
+            class iterator
+            {
+            public:
+                // Note: the default constructor leaves pos uninitialized.
+                __device__ iterator() {}
+                __device__ iterator(size_t pos_) : pos(pos_) {}
+
+                __device__ size_t operator*() const
+                {
+                    return pos;
+                }
+
+                __device__ iterator& operator++()
+                {
+                    // Step over the indices handled by all the other x threads.
+                    pos += gridDim.x * blockDim.x;
+                    return *this;
+                }
+
+                // Deliberately implemented as "less than" rather than inequality.  A
+                // range-based for loop tests `it != end()`, and because pos moves in
+                // strides it can jump past iend without ever being equal to it.
+                __device__ bool operator!=(const iterator& item) const
+                { return pos < item.pos; }
+
+            private:
+                size_t pos;
+            };
+
+            __device__ iterator begin() const
+            {
+                // First index for the calling thread.
+                return iterator(ibegin+blockDim.x * blockIdx.x + threadIdx.x);
+            }
+            __device__ iterator end() const
+            {
+                return iterator(iend);
+            }
+        private:
+
+            size_t ibegin;
+            size_t iend;
+        };
+
+    // ------------------------------------------------------------------------------------
+
+        class grid_stride_range_y
+        {
+            /*!
+                WHAT THIS OBJECT REPRESENTS
+                    This object is just like grid_stride_range except that it looks at
+                    CUDA's y thread index (e.g. threadIdx.y) instead of the x index.
+                    Therefore, if you launch a cuda kernel with a statement like:
+                        dim3 blocks(1,10);
+                        dim3 threads(32,32);  // You need to have x and y not equal to 1 to get parallelism over both loops.
+                        add_arrays<<<blocks,threads>>>(a,b,out,nr,nc);
+                    You can perform a nested 2D parallel for loop rather than doing just a
+                    1D for loop.
+                   
+                    So the code in the kernel would look like this if you wanted to add two
+                    2D matrices:
+                        __global__ void add_arrays(
+                            const float* a, 
+                            const float* b, 
+                            float* out, 
+                            size_t nr,
+                            size_t nc
+                        )
+                        {
+                            for (auto r : grid_stride_range_y(0, nr))
+                            {
+                                for (auto c : grid_stride_range(0, nc))
+                                {
+                                    auto i = r*nc+c;
+                                    out[i] = a[i]+b[i];
+                                }
+                            }
+                        }
+
+                    Each thread starts at ibegin plus its own y index within the launch
+                    (blockDim.y*blockIdx.y + threadIdx.y) and advances by the total number
+                    of y threads (gridDim.y*blockDim.y) until it reaches or passes iend.
+            !*/
+
+        public:
+            // Iterate over the half open range [ibegin_, iend_).
+            __device__ grid_stride_range_y(
+                size_t ibegin_,
+                size_t iend_
+            ) : 
+                ibegin(ibegin_),
+                iend(iend_)
+            {}
+
+            class iterator
+            {
+            public:
+                // Note: the default constructor leaves pos uninitialized.
+                __device__ iterator() {}
+                __device__ iterator(size_t pos_) : pos(pos_) {}
+
+                __device__ size_t operator*() const
+                {
+                    return pos;
+                }
+
+                __device__ iterator& operator++()
+                {
+                    // Step over the indices handled by all the other y threads.
+                    pos += gridDim.y * blockDim.y;
+                    return *this;
+                }
+
+                // Deliberately implemented as "less than" rather than inequality.  A
+                // range-based for loop tests `it != end()`, and because pos moves in
+                // strides it can jump past iend without ever being equal to it.
+                __device__ bool operator!=(const iterator& item) const
+                { return pos < item.pos; }
+
+            private:
+                size_t pos;
+            };
+
+            __device__ iterator begin() const
+            {
+                // First index for the calling thread.
+                return iterator(ibegin+blockDim.y * blockIdx.y + threadIdx.y);
+            }
+            __device__ iterator end() const
+            {
+                return iterator(iend);
+            }
+        private:
+
+            size_t ibegin;
+            size_t iend;
+        };
+
+    // ------------------------------------------------------------------------------------
+
+    }
+}
+
+#endif // __CUDACC__
+
+// ----------------------------------------------------------------------------------------
+
+#endif // DLIB_CUDA_UtILS_H_
+
--- a/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cudnn_dlibapi.h
+++ b/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cudnn_dlibapi.h
@@ -0,0 +1,518 @@
+// Copyright (C) 2015  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuDNN_H_
+#define DLIB_DNN_CuDNN_H_
+
+#ifdef DLIB_USE_CUDA
+
+#include "cuda_errors.h"
+#include <memory>
+#include "cuda_data_ptr.h"
+
+namespace dlib
+{
+    class tensor;
+    class resizable_tensor;
+
+    namespace cuda 
+    {
+
+    // -----------------------------------------------------------------------------------
+
+        class tensor_descriptor
+        {
+            /*!
+                Each tensor object will carry a tensor_descriptor in it when compiled with
+                CUDA.  The object is a move-only wrapper around a single opaque handle.
+            !*/
+
+        public:
+            tensor_descriptor();
+            ~tensor_descriptor();
+
+            // Copying is disabled...
+            tensor_descriptor(const tensor_descriptor&) = delete;
+            tensor_descriptor& operator=(const tensor_descriptor&) = delete;
+
+            // ...but moving is allowed.  Both operations are expressed through swap().
+            tensor_descriptor(tensor_descriptor&& item) : tensor_descriptor()
+            {
+                swap(item);
+            }
+            tensor_descriptor& operator=(tensor_descriptor&& item)
+            {
+                swap(item);
+                return *this;
+            }
+
+            void set_size(int n, int k, int nr, int nc);
+            /*!
+                ensures
+                    - if any of the arguments are 0 then they are all set to 0 in the tensor.
+            !*/
+
+            // Out-parameter counterpart of set_size(); defined outside this header.
+            void get_size(int& n, int& k, int& nr, int& nc) const;
+
+            // Returns the stored handle as a pointer to const.
+            const void* get_handle() const { return handle; }
+
+        private:
+
+            // Exchanges the handle with item's.
+            void swap(tensor_descriptor& item) { std::swap(handle, item.handle); }
+
+            void* handle;
+        };
+
+        // ------------------------------------------------------------------------------------
+
+        void add(  // dest = beta*dest + alpha*src, broadcasting src dimensions equal to 1 (contract below)
+            float beta,
+            tensor& dest,
+            float alpha,
+            const tensor& src
+        );
+        /*!
+            requires
+                - One of the following is true: 
+                    - have_same_dimensions(src, dest)
+                    - src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1
+                    - src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc()
+                    - src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc()
+                - is_same_object(src,dest) == false
+            ensures
+                - performs: dest = beta*dest + alpha*src
+                  However, how the addition happens depends on the dimensions of src.  In
+                  particular, this function adds the scaled values of one src tensor to
+                  dest. Each dimension of the src tensor must match the corresponding
+                  dimension of the dest tensor or must be equal to 1. In the latter case,
+                  the same value from the src tensor, for those dimensions, will be used to
+                  add into the dest tensor.
+        !*/
+
+    // ------------------------------------------------------------------------------------
+
+        void assign_conv_bias_gradient (  // assigns to grad the gradient w.r.t. a per-channel (k) bias (contract below)
+            tensor& grad,
+            const tensor& gradient_input
+        );
+        /*!
+            requires
+                - grad.num_samples() == 1
+                - grad.k()  >= 1
+                - grad.nr() == 1
+                - grad.nc() == 1
+                - gradient_input.k() == grad.k()
+                - gradient_input.size() > 0
+                - is_same_object(grad,gradient_input) == false
+            ensures
+                - let BIAS be a tensor with all dimensions equal to 1 except for k which is >= 1.
+                - let OUT be the output of add(1,OUT,1,BIAS)
+                - let f(gradient_input,BIAS) == dot(gradient_input,OUT)
+                - Then this function computes the gradient of f() with respect to BIAS and
+                  assigns it to grad.
+        !*/
+
+    // ------------------------------------------------------------------------------------
+
+        void batch_normalize_inference (  // NOTE(review): contract not stated in this header; see the definition
+            const double eps,
+            resizable_tensor& dest,  // the only non-const tensor; presumably receives the result
+            const tensor& src,
+            const tensor& gamma, 
+            const tensor& beta,
+            const tensor& running_means,
+            const tensor& running_variances
+        );
+        // NOTE(review): unlike the inference form, means, invstds and both running_* tensors are non-const here.
+        void batch_normalize (
+            const double eps,
+            resizable_tensor& dest,
+            resizable_tensor& means,
+            resizable_tensor& invstds,
+            const double averaging_factor,
+            resizable_tensor& running_means,
+            resizable_tensor& running_variances,
+            const tensor& src,
+            const tensor& gamma, 
+            const tensor& beta 
+        );
+        // NOTE(review): the three *_grad tensors are the non-const (writable) arguments; contract not stated here.
+        void batch_normalize_gradient(
+            const double eps,
+            const tensor& gradient_input,
+            const tensor& means,
+            const tensor& invstds,
+            const tensor& src,
+            const tensor& gamma,
+            tensor& src_grad,
+            tensor& gamma_grad, 
+            tensor& beta_grad 
+        );
+
+    // ------------------------------------------------------------------------------------
+
+        void batch_normalize_conv_inference (  // NOTE(review): contract not stated in this header; see the definition
+            const double eps,
+            resizable_tensor& dest,  // the only non-const tensor; presumably receives the result
+            const tensor& src,
+            const tensor& gamma, 
+            const tensor& beta,
+            const tensor& running_means,
+            const tensor& running_variances
+        );
+        // NOTE(review): unlike the inference form, means, invstds and both running_* tensors are non-const here.
+        void batch_normalize_conv (
+            const double eps,
+            resizable_tensor& dest,
+            resizable_tensor& means,
+            resizable_tensor& invstds,
+            const double averaging_factor,
+            resizable_tensor& running_means,
+            resizable_tensor& running_variances,
+            const tensor& src,
+            const tensor& gamma, 
+            const tensor& beta 
+        );
+        // NOTE(review): the three *_grad tensors are the non-const (writable) arguments; contract not stated here.
+        void batch_normalize_conv_gradient(
+            const double eps,
+            const tensor& gradient_input,
+            const tensor& means,
+            const tensor& invstds,
+            const tensor& src,
+            const tensor& gamma,
+            tensor& src_grad,
+            tensor& gamma_grad, 
+            tensor& beta_grad 
+        );
+
+    // ------------------------------------------------------------------------------------
+
+        class tensor_conv  // non-copyable object holding a convolution configuration plus workspace buffers
+        {
+        public:
+            tensor_conv(const tensor_conv&) = delete;
+            tensor_conv& operator=(const tensor_conv&) = delete;
+
+            tensor_conv();
+
+            void clear(
+            );
+
+            ~tensor_conv (
+            );
+            // Two overloads that differ only in the output type (tensor vs. resizable_tensor).
+            void operator() (
+                const bool add_to_output,  // NOTE(review): presumably accumulate-vs-overwrite; confirm in the definition
+                tensor& output,
+                const tensor& data,
+                const tensor& filters
+            );
+
+            void operator() (
+                const bool add_to_output,
+                resizable_tensor& output,
+                const tensor& data,
+                const tensor& filters
+            );
+            // data_gradient is the only non-const tensor argument, i.e. the writable one.
+            void get_gradient_for_data (
+                const bool add_to_output,
+                const tensor& gradient_input, 
+                const tensor& filters,
+                tensor& data_gradient
+            );
+            // filters_gradient is the only non-const tensor argument, i.e. the writable one.
+            void get_gradient_for_filters (
+                const bool add_to_output,
+                const tensor& gradient_input, 
+                const tensor& data,
+                tensor& filters_gradient
+            );
+            // Per the member comment below, the last call to setup() is what the recorded configuration reflects.
+           void setup(
+                const tensor& data,
+                const tensor& filters,
+                int stride_y,
+                int stride_x,
+                int padding_y,
+                int padding_x
+            );
+
+        private:
+
+            // These variables record the type of data given to the last call to setup().
+            int stride_y;
+            int stride_x;
+            int padding_y;
+            int padding_x;
+            long data_num_samples, data_k, data_nr, data_nc;
+            long filters_num_samples, filters_k, filters_nr, filters_nc;
+
+            // Opaque handles.  NOTE(review): presumably the cuDNN filter and convolution descriptors; confirm.
+            void* filter_handle;
+            void* conv_handle;
+
+            // dimensions of the output tensor from operator()
+            int out_num_samples;
+            int out_k;
+            int out_nr;
+            int out_nc;
+            // Stored as plain ints.  NOTE(review): presumably cuDNN algorithm selections; confirm.
+            int forward_algo;
+            int backward_data_algo;
+            int backward_filters_algo;
+            // Byte counts (per their names) matching the three *_workspace members declared below.
+            size_t forward_workspace_size_in_bytes;
+            size_t backward_data_workspace_size_in_bytes;
+            size_t backward_filters_workspace_size_in_bytes;
+            std::shared_ptr<resizable_cuda_buffer> workspace;
+            cuda_data_void_ptr forward_workspace;
+            cuda_data_void_ptr backward_data_workspace;
+            cuda_data_void_ptr backward_filters_workspace;
+        };
+
+    // ------------------------------------------------------------------------------------
+
+        class pooling  // non-copyable object configured for either max or average pooling
+        {
+        public:
+            // Not copyable.
+            pooling(const pooling&) = delete;
+            pooling& operator=(const pooling&) = delete;
+
+            pooling (
+            );
+
+            ~pooling(
+            );
+
+            void clear(
+            );
+            // Both setup_* functions take the same six arguments; the private setup() adds pooling_mode.
+            void setup_max_pooling(
+                int window_height,
+                int window_width,
+                int stride_y,
+                int stride_x,
+                int padding_y,
+                int padding_x
+            );
+
+            void setup_avg_pooling(
+                int window_height,
+                int window_width,
+                int stride_y,
+                int stride_x,
+                int padding_y,
+                int padding_x
+            );
+            // Returns the stored do_max_pooling flag.
+            bool does_max_pooling(
+            ) const { return do_max_pooling; }
+            // dest is the only non-const argument.  NOTE(review): presumably resized and filled from src; confirm.
+            void operator() (
+                resizable_tensor& dest,
+                const tensor& src
+            );
+            // grad is the only non-const argument.  NOTE(review): assign-vs-accumulate is not stated here.
+            void get_gradient(
+                const tensor& gradient_input, 
+                const tensor& dest,
+                const tensor& src,
+                tensor& grad 
+            );
+
+        private:
+            // NOTE(review): presumably the common backend of both public setup_* functions; confirm.
+            void setup(
+                int window_height,
+                int window_width,
+                int stride_y,
+                int stride_x,
+                int padding_y,
+                int padding_x,
+                int pooling_mode
+            );
+            // Opaque handle.  NOTE(review): presumably a cuDNN pooling descriptor; confirm.
+            void* handle;
+            int window_height;
+            int window_width;
+            int stride_y;
+            int stride_x;
+            int padding_y;
+            int padding_x;
+            bool do_max_pooling;
+        };
+
+    // ------------------------------------------------------------------------------------
+
+        void softmax (  // softmax across the channels at each spatial location (contract below)
+            tensor& dest,
+            const tensor& src
+        );
+        /*!
+            requires
+                - have_same_dimensions(dest, src) == true
+            ensures
+                - Note that the softmax function is a vector valued function: 
+                    s(x) == exp(x)/sum(exp(x)) 
+                - Computes the softmax function on src and writes the results to dest.  The
+                  softmax is computed per spatial location across the different channels at
+                  each location.  That is, softmax() outputs a new tensor, #dest, where
+                  each of the spatial locations in dest (i.e. image idx, row idx, and
+                  column idx) contains the output of s() evaluated over the channel values
+                  at each location.
+                - This function supports in-place operation, i.e. having
+                  is_same_object(dest, src)==true
+        !*/
+
+        void softmax_gradient (  // gradient of dot(gradient_input,dest) w.r.t. the softmax input SRC (contract below)
+            tensor& grad,
+            const tensor& dest,
+            const tensor& gradient_input
+        );
+        /*!
+            requires
+                - have_same_dimensions(dest,gradient_input) == true 
+                - have_same_dimensions(dest,grad) == true 
+                - is_same_object(grad, dest)==false
+            ensures
+                - We interpret dest as the output of softmax(dest,SRC) for some SRC tensor.
+                  Then let f(SRC) == dot(gradient_input,dest).  Then this function computes
+                  the gradient of f() with respect to SRC and assigns it to grad.
+                - This function supports in-place operation, i.e. having
+                  is_same_object(grad, gradient_input)==true
+        !*/
+
+    // ------------------------------------------------------------------------------------
+
+        void softmax_all (  // NOTE(review): no contract stated; presumably softmax over a whole sample, not per location
+            tensor& dest,
+            const tensor& src
+        );
+        // NOTE(review): same parameter list as softmax_gradient(); presumably the matching gradient. Confirm.
+        void softmax_all_gradient (
+            tensor& grad,
+            const tensor& dest,
+            const tensor& gradient_input
+        );
+
+    // ------------------------------------------------------------------------------------
+
+        void sigmoid (  // element-wise 1/(1+exp(-x)); in-place use is allowed (contract below)
+            tensor& dest,
+            const tensor& src
+        );
+        /*!
+            requires
+                - have_same_dimensions(dest, src) == true
+            ensures
+                - for all valid i:
+                    - #dest.host()[i] == 1/(1+std::exp(-src.host()[i])) 
+                - This function supports in-place operation, i.e. having
+                  is_same_object(dest, src)==true
+        !*/
+
+        void sigmoid_gradient (  // gradient of dot(gradient_input,dest) w.r.t. the sigmoid input SRC (contract below)
+            tensor& grad,
+            const tensor& dest,
+            const tensor& gradient_input
+        );
+        /*!
+            requires
+                - have_same_dimensions(dest,gradient_input) == true 
+                - have_same_dimensions(dest,grad) == true 
+                - is_same_object(grad,dest) == false
+            ensures
+                - Recalling that dest is the output of sigmoid(dest,SRC) for some SRC tensor,
+                  let f(SRC) == dot(gradient_input,dest)
+                - Then this function computes the gradient of f() with respect to SRC and
+                  assigns it to grad.
+                - This function supports in-place operation, i.e. having
+                  is_same_object(grad, gradient_input)==true
+        !*/
+
+    // ------------------------------------------------------------------------------------
+
+        void relu (  // element-wise max(0,x); in-place use is allowed (contract below)
+            tensor& dest,
+            const tensor& src
+        );
+        /*!
+            requires
+                - have_same_dimensions(dest, src) == true
+            ensures
+                - for all valid i:
+                    - #dest.host()[i] == std::max(0,src.host()[i]) 
+                - This function supports in-place operation, i.e. having
+                  is_same_object(dest, src)==true
+        !*/
+
+        void relu_gradient (  // gradient of dot(gradient_input,dest) w.r.t. the relu input SRC (contract below)
+            tensor& grad,
+            const tensor& dest,
+            const tensor& gradient_input
+        );
+        /*!
+            requires
+                - have_same_dimensions(dest,gradient_input) == true 
+                - have_same_dimensions(dest,grad) == true 
+                - is_same_object(grad,dest) == false
+            ensures
+                - Recalling that dest is the output of relu(dest,SRC) for some SRC tensor,
+                  let f(SRC) == dot(gradient_input,dest)
+                - Then this function computes the gradient of f() with respect to SRC and
+                  assigns it to grad.
+                - This function supports in-place operation, i.e. having
+                  is_same_object(grad, gradient_input)==true
+        !*/
+
+    // ------------------------------------------------------------------------------------
+
+        void tanh (  // element-wise std::tanh; in-place use is allowed (contract below)
+            tensor& dest,
+            const tensor& src
+        );
+        /*!
+            requires
+                - have_same_dimensions(dest, src) == true
+            ensures
+                - for all valid i:
+                    - #dest.host()[i] == std::tanh(src.host()[i]) 
+                - This function supports in-place operation, i.e. having
+                  is_same_object(dest, src)==true
+        !*/
+
+        void tanh_gradient (  // gradient of dot(gradient_input,dest) w.r.t. the tanh input SRC (contract below)
+            tensor& grad,
+            const tensor& dest,
+            const tensor& gradient_input
+        );
+        /*!
+            requires
+                - have_same_dimensions(dest,gradient_input) == true 
+                - have_same_dimensions(dest,grad) == true 
+                - is_same_object(grad,dest) == false
+            ensures
+                - Recalling that dest is the output of tanh(dest,SRC) for some SRC tensor,
+                  let f(SRC) == dot(gradient_input,dest)
+                - Then this function computes the gradient of f() with respect to SRC and
+                  assigns it to grad.
+                - This function supports in-place operation, i.e. having
+                  is_same_object(grad, gradient_input)==true
+        !*/
+
+
+
+    // ------------------------------------------------------------------------------------
+
+    } 
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuDNN_H_
+
--- a/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/curand_dlibapi.h
+++ b/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/curand_dlibapi.h
@@ -0,0 +1,75 @@
+// Copyright (C) 2015  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuRAND_H_
+#define DLIB_DNN_CuRAND_H_
+
+#ifdef DLIB_USE_CUDA
+
+#include "tensor.h"
+#include "cuda_errors.h"
+#include "cuda_data_ptr.h"
+
+namespace dlib
+{
+    namespace cuda 
+    {
+
+    // -----------------------------------------------------------------------------------
+
+        class curand_generator  // non-copyable generator with fill routines for cuda_data_ptr and tensor objects
+        {
+        public:
+            // not copyable
+            curand_generator(const curand_generator&) = delete;
+            curand_generator& operator=(const curand_generator&) = delete;
+            // Default construction delegates to the seeded constructor with seed 0.
+            curand_generator() : curand_generator(0) {}
+            curand_generator(unsigned long long seed);
+            ~curand_generator();
+
+            void fill (
+                cuda_data_ptr<unsigned int>& data
+            );
+            /*!
+                ensures
+                    - Fills data with random 32-bit unsigned integers.
+            !*/
+
+            void fill_gaussian (
+                tensor& data,
+                float mean = 0,
+                float stddev = 1
+            );
+            /*!
+                requires
+                    - data.size()%2 == 0
+                    - stddev >= 0
+                ensures
+                    - Fills data with random numbers drawn from a Gaussian distribution
+                      with the given mean and standard deviation.
+            !*/
+
+            void fill_uniform (
+                tensor& data
+            );
+            /*!
+                ensures
+                    - Fills data with uniform random numbers in the range (0.0, 1.0].
+            !*/
+
+        private:
+            // Opaque handle.  NOTE(review): presumably the cuRAND generator set up by the constructor; confirm.
+            void* handle;
+        };
+
+    // -----------------------------------------------------------------------------------
+
+    }  
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuRAND_H_
+
+
+
--- a/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cusolver_dlibapi.h
+++ b/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cusolver_dlibapi.h
@@ -0,0 +1,75 @@
+// Copyright (C) 2017  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_DNN_CuSOLVER_H_
+#define DLIB_DNN_CuSOLVER_H_
+
+#ifdef DLIB_USE_CUDA
+
+#include "tensor.h"
+#include "cuda_errors.h"
+#include "cuda_data_ptr.h"
+#include "../noncopyable.h"
+
+namespace dlib
+{
+    namespace cuda 
+    {
+
+    // -----------------------------------------------------------------------------------
+
+        class inv : noncopyable
+        {
+            /*!
+                WHAT THIS OBJECT REPRESENTS
+                    This is a functor for doing matrix inversion on the GPU.  The only
+                    reason it's an object is to avoid the reallocation of some GPU memory
+                    blocks if you want to do a bunch of matrix inversions in a row.
+            !*/
+
+        public:
+            // Defaulted constructor: did_work_lately starts false via its in-class initializer.
+            inv() = default;
+            ~inv();
+
+            void operator() (
+                const tensor& m,
+                resizable_tensor& out
+            );
+            /*!
+                requires
+                    - m.size() == m.num_samples()*m.num_samples()
+                      (i.e. mat(m) must be a square matrix)
+                ensures
+                    - out == inv(mat(m));
+            !*/
+            // Non-const.  NOTE(review): presumably because it may call sync_if_needed() first; confirm.
+            int get_last_status(
+            );
+            /*!
+                ensures
+                    - returns 0 if the last matrix inversion was successful and != 0
+                      otherwise.
+            !*/
+
+        private:
+
+            void sync_if_needed();
+            // State kept between calls; the class comment says this avoids reallocating GPU memory blocks.
+            bool did_work_lately = false;
+            resizable_tensor m;
+            cuda_data_ptr<float> workspace;
+            cuda_data_ptr<int> Ipiv;
+            cuda_data_ptr<int> info;
+        };
+
+    // ------------------------------------------------------------------------------------
+
+    }  
+}
+
+#endif // DLIB_USE_CUDA
+
+#endif // DLIB_DNN_CuSOLVER_H_
+
+
+
--- a/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/gpu_data.h
+++ b/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/gpu_data.h
@@ -0,0 +1,266 @@
+// Copyright (C) 2015  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_GPU_DaTA_H_
+#define DLIB_GPU_DaTA_H_
+
+#include "gpu_data_abstract.h"
+#include <memory>
+#include <cstring>
+#include "cuda_errors.h"
+#include "../serialize.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class gpu_data 
+    {
+        /*!
+            CONVENTION
+                - if (size() != 0) then
+                    - data_host == a pointer to size() floats in CPU memory.
+                - if (data_device) then 
+                    - data_device == a pointer to size() floats in device memory.
+
+                - if (there might be an active async transfer from host to device) then
+                    - have_active_transfer == true
+
+                - We use the host_current and device_current bools to keep track of which
+                  copy of the data (or both) are most current.  e.g. if the CPU has
+                  modified the data and it hasn't been copied to the device yet then
+                  host_current==true and device_current==false.
+
+                  Similarly, we use device_in_use==true to indicate that device() has been
+                  called and no operation to wait for all CUDA kernel completion has been
+                  executed.  So if device_in_use==true then there might be a CUDA kernel
+                  executing that is using the device memory block contained in this object.
+                - swap() exchanges all of this state, device_in_use included.
+        !*/
+    public:
+        // Empty block: size 0, both copies flagged current, nothing in flight, device id 0.
+        gpu_data(
+        ) : data_size(0), host_current(true), device_current(true),have_active_transfer(false),device_in_use(false), the_device_id(0)
+        {
+        }
+
+        // Not copyable
+        gpu_data(const gpu_data&) = delete;
+        gpu_data& operator=(const gpu_data&) = delete;
+
+        // but is movable
+        gpu_data(gpu_data&& item) : gpu_data() { swap(item); }
+        gpu_data& operator=(gpu_data&& item) { swap(item); return *this; }
+        // Returns the stored device id (0 for a default-constructed object).
+        int device_id() const { return the_device_id; }
+
+#ifdef DLIB_USE_CUDA
+        void async_copy_to_device() const; 
+        void set_size(size_t new_size);
+#else
+        // Note that calls to host() or device() will block until any async transfers are complete.
+        void async_copy_to_device() const{}
+        // Host-only resize.  Contents are not preserved: a changed nonzero size gets a fresh, uninitialized block.
+        void set_size(size_t new_size)
+        {
+            if (new_size == 0)
+            {
+                data_size = 0;
+                host_current = true;
+                device_current = true;
+                device_in_use = false;
+                data_host.reset();
+                data_device.reset();
+            }
+            else if (new_size != data_size)
+            {
+                data_size = new_size;
+                host_current = true;
+                device_current = true;
+                device_in_use = false;
+                data_host.reset(new float[new_size], std::default_delete<float[]>());
+                data_device.reset();
+            }
+        }
+#endif
+        // Read-only host access: calls copy_to_host() and returns the host pointer.
+        const float* host() const 
+        { 
+            copy_to_host();
+            return data_host.get(); 
+        }
+        // Mutable host access: after copy_to_host(), flags the device copy as stale.
+        float* host() 
+        {
+            copy_to_host();
+            device_current = false;
+            return data_host.get(); 
+        }
+        // Host pointer for callers that overwrite everything: no copy_to_host() is performed.
+        float* host_write_only() 
+        {
+            host_current = true;
+            device_current = false;
+            return data_host.get(); 
+        }
+        // Read-only device access; asserts when built without DLIB_USE_CUDA.
+        const float* device() const 
+        { 
+#ifndef DLIB_USE_CUDA
+            DLIB_CASSERT(false, "CUDA NOT ENABLED");
+#endif
+            copy_to_device();
+            device_in_use = true;
+            return data_device.get(); 
+        }
+        // Mutable device access: after copy_to_device(), flags the host copy as stale.
+        float* device() 
+        {
+#ifndef DLIB_USE_CUDA
+            DLIB_CASSERT(false, "CUDA NOT ENABLED");
+#endif
+            copy_to_device();
+            host_current = false;
+            device_in_use = true;
+            return data_device.get(); 
+        }
+        // Device pointer for full overwrite: waits for any transfer but performs no copy_to_device().
+        float* device_write_only()
+        {
+#ifndef DLIB_USE_CUDA
+            DLIB_CASSERT(false, "CUDA NOT ENABLED");
+#endif
+            wait_for_transfer_to_finish();
+            host_current = false;
+            device_current = true;
+            device_in_use = true;
+            return data_device.get(); 
+        }
+        // True when the host copy is flagged current.
+        bool host_ready (
+        ) const { return host_current; }
+        // True when the device copy is flagged current and no transfer is flagged as active.
+        bool device_ready (
+        ) const { return device_current && !have_active_transfer; }
+        // Number of floats in the block.
+        size_t size() const { return data_size; }
+        // Exchanges the complete state of the two objects; the move operations are built on this.
+        void swap (gpu_data& item)
+        {
+            std::swap(data_size, item.data_size);
+            std::swap(host_current, item.host_current);
+            std::swap(device_current, item.device_current);
+            std::swap(have_active_transfer, item.have_active_transfer);
+            // device_in_use describes the device block held in data_device, so it must travel with it.
+            std::swap(device_in_use, item.device_in_use);
+            std::swap(data_host, item.data_host);
+            std::swap(data_device, item.data_device);
+            std::swap(cuda_stream, item.cuda_stream);
+            std::swap(the_device_id, item.the_device_id);
+        }
+
+    private:
+#ifdef DLIB_USE_CUDA
+        void copy_to_device() const;
+        void copy_to_host() const;
+        void wait_for_transfer_to_finish() const;
+#else
+        void copy_to_device() const{}
+        void copy_to_host() const{}
+        void wait_for_transfer_to_finish() const{}
+#endif
+
+        size_t data_size;
+        mutable bool host_current;
+        mutable bool device_current;
+        mutable bool have_active_transfer;
+        mutable bool device_in_use;
+
+        std::shared_ptr<float> data_host;
+        std::shared_ptr<float> data_device;
+        std::shared_ptr<void> cuda_stream;
+        int the_device_id;
+    };
+
+    inline void serialize(const gpu_data& item, std::ostream& out)
+    {
+        // Wire format: int version tag (1), the element count, then every float in index order.
+        int format_version = 1;
+        serialize(format_version, out);
+        serialize(item.size(), out);
+        const float* values = item.host();
+        for (size_t idx = 0; idx != item.size(); ++idx) serialize(values[idx], out);
+    }
+
+    inline void deserialize(gpu_data& item, std::istream& in)
+    {
+        // Reads a version tag (must be 1), an element count, then that many floats into item.
+        int format_version;
+        deserialize(format_version, in);
+        if (!(format_version == 1))
+            throw serialization_error("Unexpected version found while deserializing dlib::gpu_data.");
+        size_t count;
+        deserialize(count, in);
+        item.set_size(count);
+        float* values = item.host();
+        for (size_t idx = 0; idx != item.size(); ++idx) deserialize(values[idx], in);
+    }
+
+#ifdef DLIB_USE_CUDA
+    void memcpy (gpu_data& dest, const gpu_data& src);  // CUDA build: declared here, defined out of line
+    // NOTE(review): presumably the same contract as the inline non-CUDA overload in the #else branch; confirm.
+    void memcpy (
+        gpu_data& dest, 
+        size_t dest_offset,
+        const gpu_data& src,
+        size_t src_offset,
+        size_t num
+    );
+
+#else
+
+    inline void memcpy (gpu_data& dest, const gpu_data& src)
+    {
+        DLIB_CASSERT(dest.size() == src.size());
+        // Whole-buffer host copy; skipped when empty or when dest and src are one object.
+        if (src.size() != 0 && &dest != &src)
+            std::memcpy(dest.host_write_only(), src.host(), src.size()*sizeof(float));
+    }
+
+    inline void memcpy (
+        gpu_data& dest, 
+        size_t dest_offset,
+        const gpu_data& src,
+        size_t src_offset,
+        size_t num
+    )
+    {
+        DLIB_CASSERT(dest_offset + num <= dest.size());
+        DLIB_CASSERT(src_offset + num <= src.size());
+        if (num == 0)
+            return;
+        const size_t nbytes = sizeof(float)*num;
+        const bool same_object = (&dest == &src);
+        const size_t lo = std::min(dest_offset, src_offset);
+        const size_t hi = std::max(dest_offset, src_offset);
+        if (same_object && hi < lo+num)
+        {
+            // Overlapping ranges inside one buffer.  Identical offsets are a no-op;
+            // anything else needs memmove() because memcpy() forbids overlap.
+            if (dest_offset != src_offset)
+                std::memmove(dest.host()+dest_offset, src.host()+src_offset, nbytes);
+            return;
+        }
+        // Non-overlapping.  A copy that covers all of dest may use host_write_only().
+        const bool overwrites_all = (dest_offset == 0 && num == dest.size());
+        float* const target = overwrites_all ? dest.host_write_only() : dest.host()+dest_offset;
+        std::memcpy(target, src.host()+src_offset, nbytes);
+    }
+#endif
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_GPU_DaTA_H_
+
--- a/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/gpu_data_abstract.h
+++ b/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/gpu_data_abstract.h
@@ -0,0 +1,266 @@
+// Copyright (C) 2015  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_GPU_DaTA_ABSTRACT_H_
+#ifdef DLIB_GPU_DaTA_ABSTRACT_H_
+
+#include "cuda_errors.h"
+#include "../serialize.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class gpu_data 
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object is a block of size() floats, all stored contiguously in memory.
+                Importantly, it keeps two copies of the floats, one on the host CPU side
+                and another on the GPU device side. It automatically performs the necessary
+                host/device transfers to keep these two copies of the data in sync.
+
+                All transfers to the device happen asynchronously with respect to the
+                default CUDA stream so that CUDA kernel computations can overlap with data
+                transfers.  However, any transfers from the device to the host happen
+                synchronously in the default CUDA stream.  Therefore, you should perform
+                all your CUDA kernel launches on the default stream so that transfers back
+                to the host do not happen before the relevant computations have completed.
+
+                If DLIB_USE_CUDA is not #defined then this object will not use CUDA at all.
+                Instead, it will simply store one host side memory block of floats.  
+
+            THREAD SAFETY
+                Instances of this object are not thread-safe.  So don't touch one from
+                multiple threads at the same time.
+        !*/
+    public:
+
+        gpu_data(
+        );
+        /*!
+            ensures
+                - #size() == 0
+                - #host() == nullptr 
+                - #device() == nullptr 
+                - #host_ready() == true
+                - #device_ready() == true
+                - #device_id() == 0
+        !*/
+
+        // This object is not copyable, however, it is movable.
+        gpu_data(const gpu_data&) = delete;
+        gpu_data& operator=(const gpu_data&) = delete;
+        gpu_data(gpu_data&& item);
+        gpu_data& operator=(gpu_data&& item);
+
+        int device_id(
+        ) const; 
+        /*!
+            ensures
+                - returns the ID of the CUDA device that allocated this memory. I.e. the
+                  number returned by cudaGetDevice() when the memory was allocated.
+                - If CUDA is not being used then this function always returns 0.
+        !*/
+
+        void async_copy_to_device(
+        ); 
+        /*!
+            ensures
+                - if (!device_ready()) then
+                    - Begins asynchronously copying host data to the device once it is safe
+                      to do so.  I.e. This function will wait until any previously
+                      scheduled CUDA kernels, which are using the device() memory block,
+                      have completed before transferring the new data to the device.
+                    - A call to device() that happens before the transfer completes will
+                      block until the transfer is complete.  That is, it is safe to call
+                      async_copy_to_device() and then immediately call device().
+        !*/
+
+        void set_size(
+            size_t new_size
+        );
+        /*!
+            ensures
+                - #size() == new_size
+        !*/
+
+        bool host_ready (
+        ) const;
+        /*!
+            ensures
+                - returns true if and only if the host's copy of the data is current.  The
+                  host's data is current if there aren't any modifications to the data
+                  which were made on the device side that have yet to be copied to the
+                  host.
+        !*/
+
+        bool device_ready (
+        ) const; 
+        /*!
+            ensures
+                - returns true if and only if the device's copy of the data is current.
+                  The device's data is current if there aren't any modifications to the
+                  data which were made on the host side that have yet to be copied to the
+                  device.
+        !*/
+
+        const float* host(
+        ) const;
+        /*!
+            ensures
+                - returns a pointer to the host memory block of size() contiguous float
+                  values or nullptr if size()==0.
+                - if (!host_ready()) then
+                    - copies the data from the device to the host.  The call to host()
+                      blocks while this is happening.
+                - #host_ready() == true 
+        !*/
+
+        float* host(
+        );
+        /*!
+            ensures
+                - returns a pointer to the host memory block of size() contiguous float
+                  values or nullptr if size()==0.
+                - if (!host_ready()) then
+                    - copies the data from the device to the host.  The call to host()
+                      blocks while this is happening.
+                - #host_ready() == true 
+                - #device_ready() == false
+                  I.e. Marks the device side data as out of date so that the next call to
+                  device() will perform a host to device transfer.  If you want to begin
+                  the transfer immediately then you can call async_copy_to_device() after
+                  calling host().
+        !*/
+
+        float* host_write_only(
+        );
+        /*!
+            ensures
+                - This function returns the same pointer as host(), except that it never
+                  performs a device to host memory copy.  Instead, it immediately marks the
+                  device side data as out of date, effectively discarding it.  Therefore,
+                  the values in the data pointed to by host_write_only() are undefined and
+                  you should only call host_write_only() if you are going to assign to
+                  every memory location in the returned memory block.  
+                - #host_ready() == true
+                - #device_ready() == false 
+        !*/
+
+        const float* device(
+        ) const;
+        /*!
+            requires
+                - DLIB_USE_CUDA is #defined
+            ensures
+                - returns a pointer to the device memory block of size() contiguous float
+                  values or nullptr if size()==0.
+                - if (!device_ready()) then
+                    - copies the data from the host to the device.  The call to device()
+                      blocks while this is happening.
+                - #device_ready() == true
+        !*/
+
+        float* device(
+        );
+        /*!
+            requires
+                - DLIB_USE_CUDA is #defined
+            ensures
+                - returns a pointer to the device memory block of size() contiguous float
+                  values or nullptr if size()==0.
+                - if (!device_ready()) then
+                    - copies the data from the host to the device.  The call to device()
+                      blocks while this is happening.
+                - #host_ready() == false
+                - #device_ready() == true
+        !*/
+
+        float* device_write_only(
+        );
+        /*!
+            requires
+                - DLIB_USE_CUDA is #defined
+            ensures
+                - This function returns the same pointer as device(), except that it never
+                  performs a host to device memory copy.  Instead, it immediately marks the
+                  host side data as out of date, effectively discarding it.  Therefore, the
+                  values in the data pointed to by device_write_only() are undefined and
+                  you should only call device_write_only() if you are going to assign to
+                  every memory location in the returned memory block.  
+                - #host_ready() == false 
+                - #device_ready() == true 
+        !*/
+
+
+        size_t size(
+        ) const; 
+        /*!
+            ensures
+                - returns the number of floats contained in this object.
+        !*/
+
+        void swap (
+            gpu_data& item
+        );
+        /*!
+            ensures
+                - swaps the state of *this and item
+        !*/
+
+    };
+
+    void serialize(const gpu_data& item, std::ostream& out);
+    void deserialize(gpu_data& item, std::istream& in);
+    /*!
+        provides serialization support
+    !*/
+
+    void memcpy (
+        gpu_data& dest, 
+        const gpu_data& src
+    );
+    /*!
+        requires
+            - dest.size() == src.size()
+        ensures
+            - Copies the data in src to dest.  If the device data is current (i.e.
+              device_ready()==true) on both src and dest then the copy will happen entirely
+              on the device side.
+            - It doesn't matter what GPU device is selected by cudaSetDevice().  You can
+              always copy gpu_data objects to and from each other regardless.
+            - This function blocks until the copy has completed.
+    !*/
+
+    void memcpy (
+        gpu_data& dest, 
+        size_t dest_offset,
+        const gpu_data& src,
+        size_t src_offset,
+        size_t num
+    );
+    /*!
+        requires
+            - dest_offset + num <= dest.size()
+            - src_offset  + num <= src.size()
+        ensures
+            - Copies the data in src to dest, but only copies data in the range
+              [src.host()+src_offset, src.host()+src_offset+num) to
+              [dest.host()+dest_offset, dest.host()+dest_offset+num).  Therefore, it is
+              just like the above memcpy() except that you can specify some subset of data
+              in a gpu_data object to be copied.
+            - Like the above version of memcpy(), the copy will happen in the most
+              efficient way, automatically using the appropriate type of host/device
+              transfers based on where data is currently resident. 
+            - It doesn't matter what GPU device is selected by cudaSetDevice().  You can
+              always copy gpu_data objects to and from each other regardless.
+            - This function blocks until the copy has completed.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_GPU_DaTA_ABSTRACT_H_
+
--- a/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/tensor.h
+++ b/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/tensor.h
@@ -0,0 +1,686 @@
+// Copyright (C) 2015  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#ifndef DLIB_DNn_TENSOR_H_
+#define DLIB_DNn_TENSOR_H_
+
+#include "tensor_abstract.h"
+#include <cstring>
+#include "../matrix.h"
+#include "cudnn_dlibapi.h"
+#include "gpu_data.h"
+#include "../byte_orderer.h"
+#include <memory>
+#include "../any.h"
+
+namespace dlib
+{
+
+// ----------------------------------------------------------------------------------------
+
+    class tensor;
+    namespace cuda
+    {
+        void set_tensor (
+            tensor& t,
+            float value
+        );
+
+        void scale_tensor (
+            tensor& t,
+            float value
+        );
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    class tensor
+    {
+        // Abstract base class for tensors: a block of floats described by the four
+        // dimensions num_samples(), k(), nr(), and nc().  The storage itself is supplied
+        // by derived classes through the pure virtual host()/device()/data() functions;
+        // this class stores the dimensions and builds element-wise helpers on top.
+    public:
+
+        // Starts out empty: all four dimensions and size() are 0.
+        tensor (
+        ) : 
+            m_n(0), m_k(0), m_nr(0), m_nc(0), m_size(0)
+        {
+        }
+
+        virtual ~tensor() {}
+
+        long long num_samples() const { return m_n; }
+        long long k() const { return m_k; }
+        long long nr() const { return m_nr; }
+        long long nc() const { return m_nc; }
+        size_t size() const { return m_size; }
+
+        // Iteration walks the host side floats: begin() is host() and end() is
+        // host()+size().
+        typedef float* iterator;
+        typedef const float* const_iterator;
+        iterator       begin()       { return host(); }
+        const_iterator begin() const { return host(); }
+        iterator       end()         { return host()+size(); }
+        const_iterator end() const   { return host()+size(); }
+
+        // Forwards to the underlying gpu_data object returned by data().
+        void async_copy_to_device() const
+        {
+            data().async_copy_to_device();
+        }
+
+        // Storage and annotation access, implemented by derived classes.
+        virtual const float* host() const = 0;
+        virtual float*       host() = 0; 
+        virtual float*       host_write_only() = 0;
+        virtual const float* device() const = 0;
+        virtual float*       device() = 0;
+        virtual float*       device_write_only() = 0;
+
+        virtual const any&   annotation() const = 0;
+        virtual any&         annotation() = 0;
+
+        int device_id() const { return data().device_id(); }
+
+        // Sets every element of the tensor to val.
+        tensor& operator= (float val)
+        {
+#ifdef DLIB_USE_CUDA
+            // If you are using CUDA then presumably you will be mostly using tensors on
+            // the GPU.  So unless you seem to be actively working with the host side's
+            // data then we do this initialization on the device side since this avoids a
+            // host to device transfer that would likely immediately follow.
+            if (data().device_ready())
+            {
+                cuda::set_tensor(*this, val);
+                return *this;
+            }
+#endif
+            // Every element is assigned below, so the previous host contents are not
+            // needed and host_write_only() can be used.
+            auto d = host_write_only();
+            for (size_t i = 0; i < size(); ++i)
+                d[i] = val;
+
+            return *this;
+        }
+
+        // Multiplies every element by val: through cuda::scale_tensor() when CUDA is
+        // enabled, otherwise with a loop over the host data.
+        tensor& operator*= (float val)
+        {
+#ifdef DLIB_USE_CUDA
+            cuda::scale_tensor(*this, val);
+            return *this;
+#else
+            for (auto& d : *this)
+                d *= val;
+
+            return *this;
+#endif
+        }
+        
+        // Divides every element by val, implemented as a multiplication by 1.0/val.
+        tensor& operator/= (float val)
+        {
+            *this *= 1.0/val;
+            return *this;
+        }
+
+        // Overwrites the tensor with a float matrix that has num_samples() rows and
+        // nr()*nc()*k() columns.  The whole block is assigned, hence host_write_only().
+        template <typename EXP>
+        tensor& operator= (const matrix_exp<EXP>& item)
+        {
+            DLIB_CASSERT(num_samples() == item.nr() &&
+                         nr()*nc()*k() == item.nc());
+            static_assert((is_same_type<float, typename EXP::type>::value == true),
+                "To assign a matrix to a tensor the matrix must contain float values");
+
+            set_ptrm(host_write_only(), m_n, m_nr*m_nc*m_k) = item;
+            return *this;
+        }
+
+        // Adds a float matrix of the same shape as required by operator= to the tensor.
+        // The existing values are read, so host() is used.
+        template <typename EXP>
+        tensor& operator+= (const matrix_exp<EXP>& item)
+        {
+            DLIB_CASSERT(num_samples() == item.nr() &&
+                         nr()*nc()*k() == item.nc());
+            static_assert((is_same_type<float, typename EXP::type>::value == true),
+                "To assign a matrix to a tensor the matrix must contain float values");
+            set_ptrm(host(), m_n, m_nr*m_nc*m_k) += item;
+            return *this;
+        }
+
+        // Subtracts a float matrix of the same shape as required by operator= from the
+        // tensor.
+        template <typename EXP>
+        tensor& operator-= (const matrix_exp<EXP>& item)
+        {
+            DLIB_CASSERT(num_samples() == item.nr() &&
+                         nr()*nc()*k() == item.nc());
+            static_assert((is_same_type<float, typename EXP::type>::value == true),
+                "To assign a matrix to a tensor the matrix must contain float values");
+            set_ptrm(host(), m_n, m_nr*m_nc*m_k) -= item;
+            return *this;
+        }
+
+        // Overwrites sample number idx with item, which must contain nr()*nc()*k()
+        // floats.  host() is used (not host_write_only()) because only part of the block
+        // is assigned.
+        template <typename EXP>
+        void set_sample (
+            unsigned long long idx,
+            const matrix_exp<EXP>& item
+        )
+        {
+            DLIB_CASSERT(idx < (unsigned long long)num_samples());
+            DLIB_CASSERT(item.size() == nr()*nc()*k());
+            static_assert((is_same_type<float, typename EXP::type>::value == true),
+                "To assign a matrix to a tensor the matrix must contain float values");
+            set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) = item;
+        }
+
+
+        // Adds item, which must contain nr()*nc()*k() floats, to sample number idx.
+        template <typename EXP>
+        void add_to_sample (
+            unsigned long long idx,
+            const matrix_exp<EXP>& item
+        )
+        {
+            DLIB_CASSERT(idx < (unsigned long long)num_samples());
+            DLIB_CASSERT(item.size() == nr()*nc()*k());
+            static_assert((is_same_type<float, typename EXP::type>::value == true),
+                "To assign a matrix to a tensor the matrix must contain float values");
+            set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) += item;
+        }
+
+
+#ifdef DLIB_USE_CUDA
+        virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor (
+        ) const = 0; 
+#endif
+
+        // Copies all of src into dest (sizes must match) by forwarding to the gpu_data
+        // memcpy() overload that takes offsets, so aliasing tensors copy only their own
+        // range of the underlying block.
+        friend void memcpy (
+            tensor& dest, 
+            const tensor& src
+        )
+        {
+            DLIB_CASSERT(dest.size() == src.size());
+            memcpy(dest.data(), dest.get_alias_offset(),  
+                   src.data(),  src.get_alias_offset(), 
+                   src.size());
+        }
+
+
+    protected:
+
+        friend class alias_tensor;
+
+        // The gpu_data object holding this tensor's elements.
+        virtual gpu_data& data() = 0;
+        virtual const gpu_data& data() const = 0;
+        // Element offset of this tensor inside data(); 0 unless overridden.
+        virtual size_t get_alias_offset() const { return 0; } // needed by alias_tensor.
+
+        long long m_n;
+        long long m_k;
+        long long m_nr;
+        long long m_nc;
+        long long m_size; // always equal to m_n*m_k*m_nr*m_nc
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    inline bool is_vector (
+        const tensor& t
+    )
+    {
+        // True when size() equals one of the four dimensions.
+        const size_t total = t.size();
+        return total == (size_t)t.num_samples() || total == (size_t)t.k() ||
+               total == (size_t)t.nr()          || total == (size_t)t.nc();
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline const matrix_op<op_pointer_to_mat<float> > mat (
+        const tensor& t,
+        long long nr,
+        long long nc
+    )
+    {
+        // Wraps the host data of t in an nr by nc matrix expression.  nr*nc must equal
+        // t.size().
+        DLIB_ASSERT(nr >= 0 && nc >= 0 , 
+                    "\tconst matrix_exp mat(tensor, nr, nc)"
+                    << "\n\t nr and nc must be >= 0"
+                    << "\n\t nr: " << nr
+                    << "\n\t nc: " << nc
+        );
+        DLIB_ASSERT(nr*nc == (long long)t.size() , 
+                    "\tconst matrix_exp mat(tensor, nr, nc)"
+                    << "\n\t The sizes don't match up."
+                    << "\n\t nr*nc:    " << nr*nc
+                    << "\n\t t.size(): " << t.size()
+        );
+        return matrix_op<op_pointer_to_mat<float> >(op_pointer_to_mat<float>(t.host(),nr,nc));
+    }
+
+    inline const matrix_op<op_pointer_to_mat<float> > mat (
+        const tensor& t
+    )
+    {
+        // An empty tensor maps to a 0x0 matrix over a null pointer.
+        if (t.size() == 0)
+            return mat((float*)0,0,0);
+
+        // Otherwise the matrix has one row per sample.
+        return mat(t, t.num_samples(), t.size()/t.num_samples());
+    }
+
+    inline const matrix_op<op_pointer_to_mat<float> > image_plane (
+        const tensor& t,
+        long long sample = 0,
+        long long k = 0
+    )
+    {
+        DLIB_ASSERT(0 <= sample && sample < t.num_samples() &&
+                    0 <= k && k < t.k() &&
+                    t.size() != 0, 
+                    "\tconst matrix_exp image_plane(tensor,sample,k)"
+                    << "\n\t Invalid arguments were given to this function."
+                    << "\n\t sample: " << sample
+                    << "\n\t k:      " << k 
+                    << "\n\t t.num_samples(): " << t.num_samples() 
+                    << "\n\t t.k():           " << t.k() 
+                    << "\n\t t.size():        " << t.size() 
+        );
+
+        // The requested plane starts (sample*k() + k)*nr()*nc() floats past host() and
+        // is viewed as an nr() by nc() matrix.
+        const float* plane = t.host() + ((sample*t.k() + k)*t.nr())*t.nc();
+        typedef op_pointer_to_mat<float> op;
+        return matrix_op<op>(op(plane, t.nr(), t.nc()));
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline bool have_same_dimensions (
+        const tensor& a,
+        const tensor& b
+    )
+    {
+        // Compare the four dimensions in order, stopping at the first mismatch.
+        if (a.num_samples() != b.num_samples())
+            return false;
+        if (a.k() != b.k())
+            return false;
+        return a.nr() == b.nr() && a.nc() == b.nc();
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    class resizable_tensor : public tensor
+    {
+        // A tensor that owns its storage (a gpu_data member) and whose dimensions can be
+        // changed after construction with set_size()/copy_size().
+    public:
+        resizable_tensor(
+        )
+        {}
+
+        // Builds a tensor with item.nr() samples and k() == item.nc() (nr() and nc()
+        // take their default of 1), then copies item into it.
+        template <typename EXP>
+        resizable_tensor(
+            const matrix_exp<EXP>& item
+        )
+        {
+            set_size(item.nr(), item.nc());
+            *this = item;
+        }
+
+        // Builds a tensor with the given dimensions.  The elements are not assigned
+        // here.
+        explicit resizable_tensor(
+            long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+        ) 
+        {
+            DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0);
+
+            set_size(n_,k_,nr_,nc_);
+        }
+
+        // Copy construction duplicates the annotation, the dimensions, and the elements.
+        resizable_tensor(const resizable_tensor& item) : _annotation(item.annotation()) 
+        {
+            copy_size(item);
+            memcpy(*this, item);
+        }
+        resizable_tensor(const tensor& item) : _annotation(item.annotation()) 
+        {
+            copy_size(item);
+            memcpy(*this, item);
+        }
+
+        // Moves are implemented by swapping state with item.
+        resizable_tensor(resizable_tensor&& item) { swap(item); }
+        resizable_tensor& operator=(resizable_tensor&& item) { swap(item); return *this; }
+
+        // Storage accessors forward directly to the owned gpu_data object.
+        virtual const float* host() const { return data_instance.host(); }
+        virtual float*       host()       { return data_instance.host(); }
+        virtual float*       host_write_only() { return data_instance.host_write_only(); }
+        virtual const float* device() const { return data_instance.device(); }
+        virtual float*       device()       { return data_instance.device(); }
+        virtual float*       device_write_only() { return data_instance.device_write_only(); }
+
+        virtual const any&   annotation() const { return _annotation; }
+        virtual any&         annotation() { return _annotation; }
+
+        // Resets the tensor to the empty state and clears the annotation.
+        void clear(
+        )
+        {
+            set_size(0,0,0,0);
+            _annotation.clear();
+            // free underlying memory
+            // (set_size() above never shrinks data_instance, so it is resized
+            // explicitly here.)
+            data_instance.set_size(0);
+        }
+
+        // Gives this tensor the same four dimensions as item.  No elements are copied.
+        void copy_size (
+            const tensor& item
+        )
+        {
+            set_size(item.num_samples(), item.k(), item.nr(), item.nc());
+        }
+
+        // Sets every element to val; see tensor::operator=(float).
+        resizable_tensor& operator= (float val)
+        {
+            tensor::operator=(val);
+            return *this;
+        }
+
+        // Assigns a float matrix, first resizing to item.nr() samples with
+        // k() == item.nc() if the current shape does not already fit item.
+        template <typename EXP>
+        resizable_tensor& operator= (
+            const matrix_exp<EXP>& item
+        )
+        {
+            if (!(num_samples() == item.nr() && k()*nr()*nc() == item.nc()))
+                set_size(item.nr(), item.nc());
+            tensor::operator=(item);
+            return *this;
+        }
+
+        // Changes the dimensions.  The underlying storage only ever grows here:
+        // data_instance is resized only when the new element count exceeds its current
+        // size.
+        void set_size(
+            long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+        )
+        {
+            DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0);
+
+            m_n = n_;
+            m_k = k_;
+            m_nr = nr_;
+            m_nc = nc_;
+            m_size = n_*k_*nr_*nc_;
+            if ((long long)data_instance.size() < m_size)
+                data_instance.set_size(m_size);
+#ifdef DLIB_USE_CUDA
+            cudnn_descriptor.set_size(m_n,m_k,m_nr,m_nc);
+#endif
+        }
+
+
+        // Copy assignment builds a temporary copy of item and swaps it into *this.
+        resizable_tensor& operator= (const resizable_tensor& item) 
+        {
+            resizable_tensor temp(item);
+            temp.swap(*this);
+            return *this;
+        }
+
+        resizable_tensor& operator= (const tensor& item) 
+        {
+            resizable_tensor temp(item);
+            temp.swap(*this);
+            return *this;
+        }
+
+
+        // Exchanges dimensions, storage, annotation, and (with CUDA) the cuDNN
+        // descriptor with item.
+        void swap(resizable_tensor& item)
+        {
+            std::swap(m_n,    item.m_n);
+            std::swap(m_k,    item.m_k);
+            std::swap(m_nr,   item.m_nr);
+            std::swap(m_nc,   item.m_nc);
+            std::swap(m_size, item.m_size);
+            std::swap(data_instance, item.data_instance);
+            std::swap(_annotation, item._annotation);
+#ifdef DLIB_USE_CUDA
+            std::swap(cudnn_descriptor, item.cudnn_descriptor);
+#endif
+        }
+
+#ifdef DLIB_USE_CUDA
+        virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor (
+        ) const { return cudnn_descriptor; }
+#endif
+
+    private:
+
+#ifdef DLIB_USE_CUDA
+        // Kept in step with the dimensions by set_size().
+        cuda::tensor_descriptor cudnn_descriptor;
+#endif 
+
+        // The owned element storage; it may hold more than size() floats because
+        // set_size() never shrinks it.
+        gpu_data data_instance;
+        any _annotation;
+        virtual gpu_data& data() { return data_instance; }
+        virtual const gpu_data& data() const { return data_instance; }
+    };
+
+    inline void serialize(const tensor& item, std::ostream& out)
+    {
+        // Layout: the version number, the four dimensions, then every element in
+        // iteration order (begin() to end()).
+        // Throws serialization_error if the element data cannot be written.
+        int version = 2;
+        serialize(version, out);
+        serialize(item.num_samples(), out);
+        serialize(item.k(), out);
+        serialize(item.nr(), out);
+        serialize(item.nc(), out);
+        byte_orderer bo;
+        auto sbuf = out.rdbuf();
+        for (auto d : item)
+        {
+            // Write out our data as 4byte little endian IEEE floats rather than using
+            // dlib's default float serialization.  We do this because it will result in
+            // more compact outputs.  It's slightly less portable but it seems doubtful
+            // that any CUDA enabled platform isn't going to use IEEE floats.  But if one
+            // does we can just update the serialization code here to handle it if such a
+            // platform is encountered.
+            bo.host_to_little(d);
+            static_assert(sizeof(d)==4, "This serialization code assumes we are writing 4 byte floats");
+            // A short write means the stream failed.  Flag the stream and throw, the
+            // same way deserialize() treats a short read, rather than silently
+            // producing a truncated file.
+            if (sbuf->sputn((char*)&d, sizeof(d)) != sizeof(d))
+            {
+                out.setstate(std::ios::badbit);
+                throw serialization_error("Error writing data while serializing dlib::tensor.");
+            }
+        }
+    }
+
+    inline void deserialize(resizable_tensor& item, std::istream& in)
+    {
+        // Reads the format written by serialize(const tensor&, ...): the version number,
+        // the four dimensions, then the elements as 4 byte little endian floats.
+        // Throws serialization_error on a version mismatch, invalid dimensions, or
+        // truncated element data.
+        int version = 0;
+        deserialize(version, in);
+        if (version != 2)
+            throw serialization_error("Unexpected version found while deserializing dlib::resizable_tensor.");
+
+        long long num_samples=0, k=0, nr=0, nc=0;
+        deserialize(num_samples, in);
+        deserialize(k, in);
+        deserialize(nr, in);
+        deserialize(nc, in);
+        // A corrupt stream could contain negative dimensions.  set_size() only guards
+        // against them with DLIB_ASSERT (which may be compiled out), and a bogus size()
+        // would let the loop below write outside the allocated block, so reject them
+        // up front.
+        if (num_samples < 0 || k < 0 || nr < 0 || nc < 0)
+            throw serialization_error("Invalid dimensions found while deserializing dlib::resizable_tensor.");
+        item.set_size(num_samples, k, nr, nc);
+        byte_orderer bo;
+        auto sbuf = in.rdbuf();
+        for (auto& d : item)
+        {
+            static_assert(sizeof(d)==4, "This serialization code assumes we are writing 4 byte floats");
+            // A short read means the stream ended early or failed.
+            if (sbuf->sgetn((char*)&d,sizeof(d)) != sizeof(d))
+            {
+                in.setstate(std::ios::badbit);
+                throw serialization_error("Error reading data while deserializing dlib::resizable_tensor.");
+            }
+            bo.little_to_host(d);
+        }
+    }
+
+// ----------------------------------------------------------------------------------------
+
+    inline double dot(
+        const tensor& a,
+        const tensor& b
+    )
+    {
+        // Dot product over the host data of two equally sized tensors.
+        DLIB_CASSERT(a.size() == b.size());
+        const float* lhs = a.host();
+        const float* rhs = b.host();
+        const float* const lhs_end = lhs + a.size();
+        // Each product is formed from two floats and then added to the double total.
+        double total = 0;
+        for (; lhs != lhs_end; ++lhs, ++rhs)
+            total += (*lhs)*(*rhs);
+        return total;
+    }
+
+// ----------------------------------------------------------------------------------------
+// ----------------------------------------------------------------------------------------
+
+    class alias_tensor_instance : public tensor
+    {
+        // A tensor view that does not own its storage.  It holds raw pointers to some
+        // other tensor's gpu_data and annotation, plus an element offset into that data.
+        // The default constructor is private, so instances are created through the
+        // friend classes declared below.
+        alias_tensor_instance(
+        ) : data_instance(0), _annotation(0), data_offset(0) {}
+
+    public:
+        friend class alias_tensor;
+        friend class alias_tensor_const_instance;
+
+        // Sets every aliased element to val; see tensor::operator=(float).
+        alias_tensor_instance& operator= (float val)
+        {
+            tensor::operator=(val);
+            return *this;
+        }
+
+        // Assigns a float matrix to the aliased elements; see the matching
+        // tensor::operator=.
+        template <typename EXP>
+        alias_tensor_instance& operator= (const matrix_exp<EXP>& item)
+        {
+            tensor::operator=(item);
+            return *this;
+        }
+
+        // Every accessor returns the underlying block's pointer advanced by data_offset.
+        // Note that the *_write_only() versions call host()/device() rather than the
+        // write-only accessors of the underlying gpu_data.
+        // NOTE(review): presumably because an alias may cover only part of the block,
+        // so the rest of the data must stay valid -- confirm.
+        virtual const float* host() const { return data_instance->host()+data_offset; }
+        virtual float*       host()       { return data_instance->host()+data_offset; }
+        virtual float*       host_write_only()    { return data_instance->host()+data_offset; }
+        virtual const float* device() const { return data_instance->device()+data_offset; }
+        virtual float*       device()       { return data_instance->device()+data_offset; }
+        virtual float*       device_write_only()  { return data_instance->device()+data_offset; }
+
+        // The annotation is the one belonging to the aliased tensor, not a copy.
+        virtual const any&   annotation() const { return *_annotation; }
+        virtual any&         annotation() { return *_annotation; }
+
+#ifdef DLIB_USE_CUDA
+        virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor (
+        ) const { return *cudnn_descriptor; }
+#endif
+    private:
+
+        virtual size_t get_alias_offset() const { return data_offset; } 
+
+#ifdef DLIB_USE_CUDA
+        std::shared_ptr<cuda::tensor_descriptor> cudnn_descriptor;
+#endif
+        // Non-owning pointers; both start out null and are dereferenced without a
+        // check, so they must be set before the accessors above are used.
+        gpu_data* data_instance;
+        any* _annotation;
+        // Offset, in floats, of the first aliased element inside *data_instance.
+        size_t data_offset;
+        virtual gpu_data& data() { return *data_instance; }
+        virtual const gpu_data& data() const { return *data_instance; }
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class alias_tensor_const_instance 
+    {
+        // Read-only wrapper around an alias_tensor_instance.  The wrapped view is only
+        // ever exposed as a const tensor&.
+    public:
+        const tensor& get() const { return inst; }
+        // Const-qualified, like get(), so that const wrappers convert implicitly too.
+        operator const tensor& () const { return inst; }
+
+        alias_tensor_const_instance(const alias_tensor_instance& item) : inst(item) {}
+
+    private:
+        alias_tensor_instance inst;
+
+        // Default construction is reserved for alias_tensor, which fills in inst itself.
+        friend class alias_tensor;
+        alias_tensor_const_instance() {}
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    class alias_tensor 
+    {
+        // Describes the shape of a view onto another tensor's memory.  operator() binds
+        // that shape to a concrete tensor (plus an element offset) and returns an
+        // instance that refers to, but does not own, the tensor's data and annotation.
+    public:
+
+        alias_tensor (
+        ) {}
+
+        alias_tensor (
+            long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+        ) 
+        {
+            DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0);
+
+            // Only the shape is recorded here; the storage pointers are filled in by
+            // operator().
+            inst.m_size = n_*k_*nr_*nc_;
+            inst.m_n  = n_;
+            inst.m_k  = k_;
+            inst.m_nr = nr_;
+            inst.m_nc = nc_;
+        }
+
+        // Shape accessors, read from the stored instance.
+        long long num_samples() const { return inst.m_n; }
+        long long k() const           { return inst.m_k; }
+        long long nr() const          { return inst.m_nr; }
+        long long nc() const          { return inst.m_nc; }
+        size_t size() const           { return inst.m_size; }
+
+        // Returns a view of size() elements of t, starting offset elements into t.
+        alias_tensor_instance operator() (
+            tensor& t,
+            size_t offset = 0
+        ) const
+        {
+            DLIB_CASSERT(offset+size() <= t.size(), 
+                "offset: "<<offset <<"\n"<<
+                "size(): "<<size() <<"\n"<<
+                "t.size(): "<<t.size() <<"\n");
+
+#ifdef DLIB_USE_CUDA
+            // The descriptor is created on first use and, being held by shared_ptr, is
+            // shared by every instance copied from inst.
+            if (!inst.cudnn_descriptor)
+            {
+                inst.cudnn_descriptor = std::make_shared<cuda::tensor_descriptor>();
+                inst.cudnn_descriptor->set_size(inst.m_n, inst.m_k, inst.m_nr, inst.m_nc);
+            }
+#endif
+            // t might itself be an aliasing tensor, so the requested offset is applied
+            // on top of t's own offset.
+            inst.data_offset   = offset + t.get_alias_offset();
+            inst._annotation   = &t.annotation();
+            inst.data_instance = &t.data();
+            return inst;
+        }
+
+        // Const overload: builds the view through the non-const overload and wraps it
+        // so that it can only be used as a const tensor.
+        alias_tensor_const_instance operator() (
+            const tensor& t,
+            size_t offset = 0
+        ) const
+        {
+            alias_tensor_const_instance result;
+            result.inst = operator()(const_cast<tensor&>(t), offset);
+            return result;
+        }
+
+    private:
+        // mutable because the const operator() overloads rebind it on every call.
+        mutable alias_tensor_instance inst;
+    };
+
+    inline void serialize(const alias_tensor& item, std::ostream& out)
+    {
+        // Only the version number and the four dimensions are written.
+        const int version = 1;
+        serialize(version, out);
+        const long long dims[] = { item.num_samples(), item.k(), item.nr(), item.nc() };
+        for (const long long dim : dims)
+            serialize(dim, out);
+    }
+
+    inline void deserialize(alias_tensor& item, std::istream& in)
+    {
+        // Reads the version number and four dimensions written by
+        // serialize(const alias_tensor&, ...) and rebuilds item from them.
+        // Throws serialization_error on a version mismatch.
+        int version = 0;
+        deserialize(version, in);
+        if (version != 1)
+            throw serialization_error("Unexpected version found while deserializing dlib::alias_tensor.");
+        // Initialized to 0, as in deserialize(resizable_tensor&, ...), so that no
+        // indeterminate value is ever passed on.
+        long long num_samples=0, k=0, nr=0, nc=0;
+        deserialize(num_samples, in);
+        deserialize(k, in);
+        deserialize(nr, in);
+        deserialize(nc, in);
+        item = alias_tensor(num_samples, k, nr, nc);
+    }
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_TENSOR_H_
+
--- a/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/tensor_abstract.h
+++ b/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/tensor_abstract.h
@@ -0,0 +1,727 @@
+// Copyright (C) 2015  Davis E. King (davis@dlib.net)
+// License: Boost Software License   See LICENSE.txt for the full license.
+#undef DLIB_DNn_TENSOR_ABSTRACT_H_
+#ifdef DLIB_DNn_TENSOR_ABSTRACT_H_
+
+#include "../matrix.h"
+#include "../any/any_abstract.h"
+
+namespace dlib
+{
+// ----------------------------------------------------------------------------------------
+
+    class tensor
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object represents a 4D array of float values, all stored contiguously
+                in memory.  Importantly, it keeps two copies of the floats, one on the host
+                CPU side and another on the GPU device side. It automatically performs the
+                necessary host/device transfers to keep these two copies of the data in
+                sync.
+
+                All transfers to the device happen asynchronously with respect to the
+                default CUDA stream so that CUDA kernel computations can overlap with data
+                transfers.  However, any transfers from the device to the host happen
+                synchronously in the default CUDA stream.  Therefore, you should perform
+                all your CUDA kernel launches on the default stream so that transfers back
+                to the host do not happen before the relevant computations have completed.
+
+                If DLIB_USE_CUDA is not #defined then this object will not use CUDA at all.
+                Instead, it will simply store one host side memory block of floats.  
+
+                Finally, the convention in dlib code is to interpret the tensor as a set of
+                num_samples() 3D arrays, each of dimension k() by nr() by nc().  Also,
+                while this class does not specify a memory layout, the convention is to
+                assume that indexing into an element at coordinates (sample,k,r,c) can be
+                accomplished via:
+                    host()[((sample*t.k() + k)*t.nr() + r)*t.nc() + c]
+
+            THREAD SAFETY
+                Instances of this object are not thread-safe.  So don't touch one from
+                multiple threads at the same time.
+        !*/
+
+    public:
+
+        virtual ~tensor();
+
+        long long num_samples(
+        ) const; 
+        /*!
+            ensures
+                - returns the number of 3D arrays of dimension k() by nr() by nc() there
+                  are in this object.  
+        !*/
+
+        long long k(
+        ) const; 
+        /*!
+            ensures
+                - returns the k dimension of this tensor.  Generally, we think of a tensor
+                  as containing num_samples() images of nr() by nc() rows and columns, each
+                  with k() channels.
+        !*/
+
+        long long nr(
+        ) const; 
+        /*!
+            ensures
+                - returns the number of rows in this tensor.
+        !*/
+
+        long long nc(
+        ) const; 
+        /*!
+            ensures
+                - returns the number of columns in this tensor.
+        !*/
+
+        size_t size(
+        ) const;
+        /*!
+            ensures
+                - returns num_samples()*k()*nr()*nc()
+                  (i.e. the total number of floats in this tensor)
+        !*/
+
+        void async_copy_to_device(
+        ) const;
+        /*!
+            ensures
+                - This function does not block.
+                - if (the host version of the data is newer than the device's copy) then
+                    - Begins asynchronously copying host data to the device.
+                    - A call to device() that happens before the transfer completes will
+                      block until the transfer is complete.  That is, it is safe to call
+                      async_copy_to_device() and then immediately call device().
+        !*/
+
+        typedef float* iterator;
+        typedef const float* const_iterator;
+        iterator       begin()       { return host(); }
+        const_iterator begin() const { return host(); }
+        iterator       end()         { return host()+size(); }
+        const_iterator end() const   { return host()+size(); }
+        /*!
+            ensures
+                - makes a tensor iterable just like the STL containers.   
+        !*/
+
+        virtual const float* host(
+        ) const = 0;
+        /*!
+            ensures
+                - returns a pointer to the host memory block of size() contiguous float
+                  values or nullptr if size()==0.
+                - if (the host's copy of the data is out of date) then
+                    - copies the data from the device to the host, while this is happening
+                      the call to host() blocks. 
+        !*/
+
+        virtual float* host(
+        ) = 0;
+        /*!
+            ensures
+                - returns a pointer to the host memory block of size() contiguous float
+                  values or nullptr if size()==0.
+                - if (the host's copy of the data is out of date) then
+                    - copies the data from the device to the host, while this is happening
+                      the call to host() blocks. 
+                - Marks the device side data as out of date so that the next call to
+                  device() will perform a host to device transfer.  If you want to begin
+                  the transfer immediately then you can call async_copy_to_device() after
+                  calling host().
+        !*/
+
+        virtual float* host_write_only(
+        ) = 0;
+        /*!
+            ensures
+                - This function returns the same pointer as host(), except that it never
+                  performs a device to host memory copy.  Instead, it immediately marks the
+                  device side data as out of date, effectively discarding it.  Therefore,
+                  the values in the data pointed to by host_write_only() are undefined and
+                  you should only call host_write_only() if you are going to assign to
+                  every memory location in the returned memory block.  
+        !*/
+
+        virtual const float* device(
+        ) const = 0;
+        /*!
+            requires
+                - DLIB_USE_CUDA is #defined
+            ensures
+                - returns a pointer to the device memory block of size() contiguous float
+                  values or nullptr if size()==0.
+                - if (the device's copy of the data is out of date) then
+                    - copies the data from the host to the device, while this is happening
+                      the call to device() blocks. 
+        !*/
+
+        virtual float* device(
+        ) = 0;
+        /*!
+            requires
+                - DLIB_USE_CUDA is #defined
+            ensures
+                - returns a pointer to the device memory block of size() contiguous float
+                  values or nullptr if size()==0.
+                - if (the device's copy of the data is out of date) then
+                    - copies the data from the host to the device, while this is happening
+                      the call to device() blocks. 
+                - Marks the host side data as out of date so that the next call to
+                  host() will perform a device to host transfer.
+        !*/
+
+        virtual float* device_write_only(
+        ) = 0;
+        /*!
+            requires
+                - DLIB_USE_CUDA is #defined
+            ensures
+                - This function returns the same pointer as device(), except that it never
+                  performs a host to device memory copy.  Instead, it immediately marks the
+                  host side data as out of date, effectively discarding it.  Therefore, the
+                  values in the data pointed to by device_write_only() are undefined and
+                  you should only call device_write_only() if you are going to assign to
+                  every memory location in the returned memory block.  
+        !*/
+
+        virtual const any& annotation(
+        ) const = 0;
+        /*!
+            ensures
+                - returns a const reference to the any object in this tensor.  The any
+                  object can be used to store any additional annotation you like in a
+                  tensor.  However, it should be noted that the annotation() is ignored by
+                  serialize() and therefore not saved when a tensor is serialized.
+        !*/
+
+        virtual any& annotation(
+        ) = 0;
+        /*!
+            ensures
+                - returns a non-const reference to the any object in this tensor.  The any
+                  object can be used to store any additional annotation you like in a
+                  tensor.  However, it should be noted that the annotation() is ignored by
+                  serialize() and therefore not saved when a tensor is serialized.
+        !*/
+
+        int device_id(
+        ) const; 
+        /*!
+            ensures
+                - returns the ID of the CUDA device that allocated this memory. I.e. the
+                  number returned by cudaGetDevice() when the memory was allocated.
+                - If CUDA is not being used then this function always returns 0.
+        !*/
+
+        tensor& operator= (
+            float val
+        );
+        /*!
+            ensures
+                - sets all elements of this tensor equal to val.
+                - returns *this
+        !*/
+
+        tensor& operator*= (
+            float val
+        );
+        /*!
+            ensures
+                - pointwise multiplies all elements of *this tensor by val.
+                - returns *this
+        !*/
+        
+        tensor& operator/= (
+            float val
+        );
+        /*!
+            ensures
+                - pointwise divides all elements of *this tensor by val.
+                - returns *this
+        !*/
+
+        template <typename EXP>
+        tensor& operator= (
+            const matrix_exp<EXP>& item
+        );
+        /*!
+            requires
+                - num_samples() == item.nr()
+                - k()*nr()*nc() == item.nc()
+                - item contains float values
+            ensures
+                - Assigns item to *this tensor by performing:
+                  set_ptrm(host(), num_samples(), k()*nr()*nc()) = item;
+        !*/
+
+        template <typename EXP>
+        tensor& operator+= (
+            const matrix_exp<EXP>& item
+        );
+        /*!
+            requires
+                - num_samples() == item.nr()
+                - k()*nr()*nc() == item.nc()
+                - item contains float values
+            ensures
+                - Adds item to *this tensor by performing:
+                  set_ptrm(host(), num_samples(), k()*nr()*nc()) += item;
+        !*/
+
+        template <typename EXP>
+        tensor& operator-= (
+            const matrix_exp<EXP>& item
+        );
+        /*!
+            requires
+                - num_samples() == item.nr()
+                - k()*nr()*nc() == item.nc()
+                - item contains float values
+            ensures
+                - Subtracts item from *this tensor by performing:
+                  set_ptrm(host(), num_samples(), k()*nr()*nc()) -= item;
+        !*/
+
+        template <typename EXP>
+        void set_sample (
+            unsigned long long idx,
+            const matrix_exp<EXP>& item
+        );
+        /*!
+            requires
+                - idx < num_samples()
+                - k()*nr()*nc() == item.size()
+                - item contains float values
+            ensures
+                - Assigns item to the idx'th sample in *this by performing:
+                  set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) = item;
+        !*/
+
+
+        template <typename EXP>
+        void add_to_sample (
+            unsigned long long idx,
+            const matrix_exp<EXP>& item
+        );
+        /*!
+            requires
+                - idx < num_samples()
+                - k()*nr()*nc() == item.size()
+                - item contains float values
+            ensures
+                - Adds item to the idx'th sample in *this by performing:
+                  set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) += item;
+        !*/
+
+    protected:
+
+        // You can't move or copy another tensor into *this since that might modify the
+        // tensor's dimensions.  If you want to do that sort of thing then use a
+        // resizable_tensor.
+        tensor(const tensor& item);  
+        tensor& operator= (const tensor& item); 
+        tensor(tensor&& item); 
+        tensor& operator=(tensor&& item); 
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    void memcpy (
+        tensor& dest, 
+        const tensor& src
+    );
+    /*!
+        requires
+            - dest.size() == src.size()
+        ensures
+            - Copies the data in src to dest.  If the device data is current on both src
+              and dest then the copy will happen entirely on the device side.
+            - It doesn't matter what GPU device is selected by cudaSetDevice().  You can
+              always copy tensor objects to and from each other regardless.
+            - This function blocks until the copy has completed.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    bool is_vector (
+        const tensor& t
+    );
+    /*!
+        ensures
+            - returns true if and only if one of the following is true:
+                - t.size() == t.num_samples() 
+                - t.size() == t.k() 
+                - t.size() == t.nr() 
+                - t.size() == t.nc()
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    const matrix_exp mat (
+        const tensor& t,
+        long long nr,
+        long long nc
+    );
+    /*!
+        requires
+            - nr >= 0
+            - nc >= 0
+            - nr*nc == t.size()
+        ensures
+            - returns a matrix M such that:
+                - M.nr() == nr
+                - M.nc() == nc
+                - for all valid r and c:
+                  M(r,c) == t.host()[r*nc + c]
+                  (i.e. the tensor is interpreted as a matrix laid out in memory
+                  in row major order)
+    !*/
+
+    const matrix_exp mat (
+        const tensor& t
+    );
+    /*!
+        ensures
+            - if (t.size() != 0) then
+                - returns mat(t, t.num_samples(), t.size()/t.num_samples())
+            - else
+                - returns an empty matrix.
+    !*/
+
+    const matrix_exp image_plane (
+        const tensor& t,
+        long long sample = 0,
+        long long k = 0
+    );
+    /*!
+        requires
+            - t.size() != 0
+            - 0 <= sample < t.num_samples()
+            - 0 <= k < t.k()
+        ensures
+            - returns the k-th image plane from the sample-th image in t.  That is,
+              returns a matrix M such that:
+                - M contains float valued elements.
+                - M.nr() == t.nr()
+                - M.nc() == t.nc()
+                - for all valid r and c:
+                    - M(r,c) == t.host()[((sample*t.k() + k)*t.nr() + r)*t.nc() + c]
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    bool have_same_dimensions (
+        const tensor& a,
+        const tensor& b
+    );
+    /*!
+        ensures
+            - returns true if and only if all of the following are satisfied:
+                - a.num_samples() == b.num_samples() 
+                - a.k()  == b.k() 
+                - a.nr() == b.nr() 
+                - a.nc() == b.nc()
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    class resizable_tensor : public tensor
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object is just a tensor with the additional ability to be resized.
+        !*/
+
+    public:
+        resizable_tensor(
+        );
+        /*!
+            ensures
+                - #size() == 0
+                - #num_samples() == 0
+                - #k() == 0
+                - #nr() == 0
+                - #nc() == 0
+                - #capacity() == 0
+        !*/
+
+        template <typename EXP>
+        resizable_tensor(
+            const matrix_exp<EXP>& item
+        );
+        /*!
+            requires
+                - item contains float values
+            ensures
+                - #num_samples() == item.nr()
+                - #k() == item.nc()
+                - #nr() == 1
+                - #nc() == 1
+                - Assigns item to *this tensor by performing:
+                  set_ptrm(host(), num_samples(), k()*nr()*nc()) = item;
+                - #capacity() == size()
+        !*/
+
+        explicit resizable_tensor(
+            long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+        );
+        /*!
+            requires
+                - n_ >= 0
+                - k_ >= 0
+                - nr_ >= 0
+                - nc_ >= 0
+            ensures
+                - #size() == n_*k_*nr_*nc_
+                - #num_samples() == n_
+                - #k() == k_
+                - #nr() == nr_
+                - #nc() == nc_
+                - #capacity() == size()
+        !*/
+
+        // This object is copyable and movable
+        resizable_tensor(const resizable_tensor&) = default;
+        resizable_tensor(resizable_tensor&&) = default;
+        resizable_tensor& operator= (const resizable_tensor&) = default;
+        resizable_tensor& operator= (resizable_tensor&&) = default;
+
+        size_t capacity (
+        ) const;
+        /*!
+            ensures
+                - returns the total number of floats allocated.  This might be different
+                  from the size() since calls to set_size() that make a tensor smaller
+                  don't trigger reallocations.  They simply adjust the nominal dimensions
+                  while keeping the same allocated memory block.  This makes calls to
+                  set_size() very fast.  If you need to deallocate a tensor then use
+                  clear().
+        !*/
+
+        void clear(
+        );
+        /*!
+            ensures
+                - #size() == 0
+                - #num_samples() == 0
+                - #k() == 0
+                - #nr() == 0
+                - #nc() == 0
+                - #annotation().is_empty() == true
+                - #capacity() == 0
+        !*/
+
+        void copy_size (
+            const tensor& item
+        );
+        /*!
+            ensures
+                - resizes *this so that: have_same_dimensions(#*this, item)==true
+        !*/
+
+        void set_size(
+            long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+        );
+        /*!
+            requires
+                - n_ >= 0
+                - k_ >= 0
+                - nr_ >= 0
+                - nc_ >= 0
+            ensures
+                - #size() == n_*k_*nr_*nc_
+                - #num_samples() == n_
+                - #k() == k_
+                - #nr() == nr_
+                - #nc() == nc_
+                - #capacity() == max(#size(), capacity())
+                  (i.e. capacity() never goes down when calling set_size().)
+        !*/
+
+        template <typename EXP>
+        resizable_tensor& operator= (
+            const matrix_exp<EXP>& item
+        );
+        /*!
+            requires
+                - item contains float values
+            ensures
+                - if (num_samples() == item.nr() && k()*nr()*nc() == item.nc()) then
+                    - the dimensions of this tensor are not changed
+                - else
+                    - #num_samples() == item.nr()
+                    - #k() == item.nc()
+                    - #nr() == 1
+                    - #nc() == 1
+                - Assigns item to *this tensor by performing:
+                  set_ptrm(host(), num_samples(), k()*nr()*nc()) = item;
+        !*/
+    };
+
+    void serialize(const tensor& item, std::ostream& out);
+    void deserialize(resizable_tensor& item, std::istream& in);
+    /*!
+        provides serialization support for tensor and resizable_tensor.  Note that you can
+        serialize to/from any combination of tensor and resizable_tensor objects.
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    double dot(
+        const tensor& a,
+        const tensor& b
+    );
+    /*!
+        requires
+            - a.size() == b.size()
+        ensures
+            - returns the dot product between a and b when they are both treated as
+              a.size() dimensional vectors.  That is, this function pointwise multiplies
+              the vectors together, then sums the result and returns it.
+
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+    class alias_tensor_instance : public tensor
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This object is a tensor that aliases another tensor.  That is, it doesn't
+                have its own block of memory but instead simply holds pointers to the
+                memory of another tensor object.  It therefore allows you to efficiently
+                break a tensor into pieces and pass those pieces into functions.
+
+                An alias_tensor_instance doesn't own the resources it points to in any sense.
+                So it is important to make sure that the underlying owning tensor doesn't get
+                destructed before any alias tensors which point to it are destructed.
+        !*/
+
+        // You can't default initialize this object.  You can only get instances of it from
+        // alias_tensor::operator().
+        alias_tensor_instance(
+        ); 
+    };
+
+    class alias_tensor_const_instance 
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This is essentially a const version of alias_tensor_instance and therefore
+                represents a tensor.  However, due to the mechanics of C++, this object
+                can't inherit from tensor.  So instead it provides a get() and an implicit
+                conversion to const tensor.
+        !*/
+
+    public:
+
+        // non-const alias tensors are convertible to const ones.
+        alias_tensor_const_instance(const alias_tensor_instance& item); 
+
+        // Methods that cast the alias to a tensor.
+        const tensor& get() const;
+        operator const tensor& (); 
+
+    private:
+        // You can't default initialize this object.  You can only get instances of it from
+        // alias_tensor::operator().
+        alias_tensor_const_instance();
+    };
+
+    class alias_tensor 
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This is a tool for creating tensor objects that alias other tensor objects.
+                That is, it allows you to make a tensor that references the memory space of
+                another tensor object rather than owning its own memory.  This allows you
+                to do things like interpret a single tensor in different ways or even as a
+                group of multiple tensors.
+        !*/
+    public:
+
+        alias_tensor (
+        );
+        /*!
+            ensures
+                - #size() == 0 
+                - #num_samples() == 0
+                - #k() == 0
+                - #nr() == 0
+                - #nc() == 0
+        !*/
+
+        alias_tensor (
+            long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
+        );
+        /*!
+            requires
+                - n_ >= 0
+                - k_ >= 0
+                - nr_ >= 0
+                - nc_ >= 0
+            ensures
+                - #size() == n_*k_*nr_*nc_
+                - #num_samples() == n_
+                - #k() == k_
+                - #nr() == nr_
+                - #nc() == nc_
+        !*/
+
+        long long num_samples() const;
+        long long k() const;
+        long long nr() const;
+        long long nc() const;
+        size_t size() const;
+
+        alias_tensor_instance operator() (
+            tensor& t,
+            size_t offset = 0
+        ) const;
+        /*!
+            requires
+                - offset+size() <= t.size()
+            ensures
+                - Returns a tensor that simply aliases the elements of t beginning with t's
+                  offset'th element.  Specifically, this function returns an aliasing
+                  tensor T such that:
+                    - T.size()   == size()
+                    - T.num_samples() == num_samples()
+                    - T.k()      == k()
+                    - T.nr()     == nr()
+                    - T.nc()     == nc()
+                    - T.host()   == t.host()+offset
+                    - T.device() == t.device()+offset
+                    - &T.annotation() == &t.annotation()
+        !*/
+
+        alias_tensor_const_instance operator() (
+            const tensor& t,
+            size_t offset = 0
+        ) const;
+        /*!
+            requires
+                - offset+size() <= t.size()
+            ensures
+                - This function is identical to the above version of operator() except that 
+                  it takes and returns const tensors instead of non-const tensors.
+        !*/
+    };
+
+    void serialize(const alias_tensor& item, std::ostream& out);
+    void deserialize(alias_tensor& item, std::istream& in);
+    /*!
+        provides serialization support for alias_tensor.  
+    !*/
+
+// ----------------------------------------------------------------------------------------
+
+}
+
+#endif // DLIB_DNn_TENSOR_ABSTRACT_H_
+
+
--- a/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/tensor_tools.h
+++ b/pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/tensor_tools.h