open source pkg v1
This commit is contained in:
505
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cpu_dlib.h
vendored
Normal file
505
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cpu_dlib.h
vendored
Normal file
@@ -0,0 +1,505 @@
|
||||
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
#ifndef DLIB_DNN_CPU_H_
|
||||
#define DLIB_DNN_CPU_H_
|
||||
|
||||
// This file contains CPU implementations of the GPU based functions in cuda_dlib.h
|
||||
// and cudnn_dlibapi.h
|
||||
|
||||
#include "tensor.h"
|
||||
#include "../geometry/rectangle.h"
|
||||
|
||||
namespace dlib
|
||||
{
|
||||
namespace cpu
|
||||
{
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
void multiply (
|
||||
bool add_to,
|
||||
tensor& dest,
|
||||
const tensor& src1,
|
||||
const tensor& src2
|
||||
);
|
||||
|
||||
void multiply_conv (
|
||||
bool add_to,
|
||||
tensor& dest,
|
||||
const tensor& src1,
|
||||
const tensor& src2
|
||||
);
|
||||
|
||||
void multiply_zero_padded (
|
||||
bool add_to,
|
||||
tensor& dest,
|
||||
const tensor& src1,
|
||||
const tensor& src2
|
||||
);
|
||||
|
||||
void scale_channels (
|
||||
bool add_to,
|
||||
tensor& dest,
|
||||
const tensor& src,
|
||||
const tensor& scales
|
||||
);
|
||||
|
||||
void add(
|
||||
float beta,
|
||||
tensor& dest,
|
||||
float alpha,
|
||||
const tensor& src
|
||||
);
|
||||
|
||||
void assign_bias_gradient (
|
||||
tensor& grad,
|
||||
const tensor& gradient_input
|
||||
);
|
||||
|
||||
void add (
|
||||
tensor& dest,
|
||||
const tensor& src1,
|
||||
const tensor& src2
|
||||
);
|
||||
|
||||
void assign_conv_bias_gradient (
|
||||
tensor& grad,
|
||||
const tensor& gradient_input
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
void affine_transform(
|
||||
tensor& dest,
|
||||
const tensor& src,
|
||||
const float A,
|
||||
const float B
|
||||
);
|
||||
|
||||
void affine_transform(
|
||||
tensor& dest,
|
||||
const tensor& src1,
|
||||
const tensor& src2,
|
||||
const float A,
|
||||
const float B,
|
||||
const float C
|
||||
);
|
||||
|
||||
void affine_transform(
|
||||
tensor& dest,
|
||||
const tensor& src1,
|
||||
const tensor& src2,
|
||||
const tensor& src3,
|
||||
const float A,
|
||||
const float B,
|
||||
const float C,
|
||||
const float D
|
||||
);
|
||||
|
||||
void affine_transform_range(
|
||||
size_t begin,
|
||||
size_t end,
|
||||
tensor& dest,
|
||||
const tensor& src1,
|
||||
const tensor& src2,
|
||||
const tensor& src3,
|
||||
const float A,
|
||||
const float B,
|
||||
const float C
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
void affine_transform(
|
||||
tensor& dest,
|
||||
const tensor& src,
|
||||
const tensor& A,
|
||||
const tensor& B
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
void affine_transform_conv(
|
||||
tensor& dest,
|
||||
const tensor& src,
|
||||
const tensor& A,
|
||||
const tensor& B
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
void affine_transform(
|
||||
const rectangle& rect,
|
||||
tensor& dest,
|
||||
const tensor& src1,
|
||||
const tensor& src2,
|
||||
const tensor& src3,
|
||||
float A,
|
||||
float B,
|
||||
float C
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
void compute_adam_update (
|
||||
size_t begin,
|
||||
size_t end,
|
||||
tensor& s,
|
||||
tensor& m,
|
||||
tensor& v,
|
||||
const float t,
|
||||
const float learning_rate,
|
||||
const float weight_decay,
|
||||
const float momentum1,
|
||||
const float momentum2,
|
||||
const tensor& params,
|
||||
const tensor& params_grad
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
void batch_normalize_inference (
|
||||
const double eps,
|
||||
resizable_tensor& dest,
|
||||
const tensor& src,
|
||||
const tensor& gamma,
|
||||
const tensor& beta,
|
||||
const tensor& running_means,
|
||||
const tensor& running_variances
|
||||
);
|
||||
|
||||
void batch_normalize (
|
||||
const double eps,
|
||||
resizable_tensor& dest,
|
||||
resizable_tensor& means,
|
||||
resizable_tensor& invstds,
|
||||
const double averaging_factor,
|
||||
resizable_tensor& running_means,
|
||||
resizable_tensor& running_variances,
|
||||
const tensor& src,
|
||||
const tensor& gamma,
|
||||
const tensor& beta
|
||||
);
|
||||
|
||||
void batch_normalize_gradient (
|
||||
const double eps,
|
||||
const tensor& gradient_input,
|
||||
const tensor& means,
|
||||
const tensor& invstds,
|
||||
const tensor& src,
|
||||
const tensor& gamma,
|
||||
tensor& src_grad,
|
||||
tensor& gamma_grad,
|
||||
tensor& beta_grad
|
||||
);
|
||||
|
||||
void batch_normalize_conv_inference (
|
||||
const double eps,
|
||||
resizable_tensor& dest,
|
||||
const tensor& src,
|
||||
const tensor& gamma,
|
||||
const tensor& beta,
|
||||
const tensor& running_means,
|
||||
const tensor& running_variances
|
||||
);
|
||||
|
||||
void batch_normalize_conv (
|
||||
const double eps,
|
||||
resizable_tensor& dest,
|
||||
resizable_tensor& means,
|
||||
resizable_tensor& invstds,
|
||||
const double averaging_factor,
|
||||
resizable_tensor& running_means,
|
||||
resizable_tensor& running_variances,
|
||||
const tensor& src,
|
||||
const tensor& gamma,
|
||||
const tensor& beta
|
||||
);
|
||||
|
||||
void batch_normalize_conv_gradient (
|
||||
const double eps,
|
||||
const tensor& gradient_input,
|
||||
const tensor& means,
|
||||
const tensor& invstds,
|
||||
const tensor& src,
|
||||
const tensor& gamma,
|
||||
tensor& src_grad,
|
||||
tensor& gamma_grad,
|
||||
tensor& beta_grad
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
void threshold (
|
||||
tensor& data,
|
||||
float thresh
|
||||
);
|
||||
|
||||
void dot (
|
||||
const tensor& a,
|
||||
const tensor& b,
|
||||
tensor& result,
|
||||
size_t idx
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
void softmax (
|
||||
tensor& dest,
|
||||
const tensor& src
|
||||
);
|
||||
|
||||
void softmax_gradient (
|
||||
tensor& grad,
|
||||
const tensor& dest,
|
||||
const tensor& gradient_input
|
||||
);
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
void softmax_all (
|
||||
tensor& dest,
|
||||
const tensor& src
|
||||
);
|
||||
|
||||
void softmax_all_gradient (
|
||||
tensor& grad,
|
||||
const tensor& dest,
|
||||
const tensor& gradient_input
|
||||
);
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
void sigmoid (
|
||||
tensor& dest,
|
||||
const tensor& src
|
||||
);
|
||||
|
||||
void sigmoid_gradient (
|
||||
tensor& grad,
|
||||
const tensor& dest,
|
||||
const tensor& gradient_input
|
||||
);
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
void relu (
|
||||
tensor& dest,
|
||||
const tensor& src
|
||||
);
|
||||
|
||||
void relu_gradient (
|
||||
tensor& grad,
|
||||
const tensor& dest,
|
||||
const tensor& gradient_input
|
||||
);
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
void prelu (
|
||||
tensor& dest,
|
||||
const tensor& src,
|
||||
const tensor& param
|
||||
);
|
||||
|
||||
void prelu_gradient (
|
||||
tensor& grad,
|
||||
const tensor& src,
|
||||
const tensor& gradient_input,
|
||||
const tensor& param,
|
||||
tensor& params_grad
|
||||
);
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
void tanh (
|
||||
tensor& dest,
|
||||
const tensor& src
|
||||
);
|
||||
|
||||
void tanh_gradient (
|
||||
tensor& grad,
|
||||
const tensor& dest,
|
||||
const tensor& gradient_input
|
||||
);
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
void resize_bilinear (
|
||||
tensor& dest,
|
||||
long dest_row_stride,
|
||||
long dest_channel_stride,
|
||||
const tensor& src,
|
||||
long src_row_stride,
|
||||
long src_channel_stride
|
||||
);
|
||||
|
||||
void resize_bilinear_gradient (
|
||||
tensor& grad,
|
||||
long grad_row_stride,
|
||||
long grad_channel_stride,
|
||||
const tensor& gradient_input,
|
||||
long gradient_input_row_stride,
|
||||
long gradient_input_channel_stride
|
||||
);
|
||||
|
||||
// Convenience overload of resize_bilinear() that derives the stride arguments
// from the tensors themselves: nc() is passed as the row stride and nr()*nc()
// as the channel stride, for both dest and src.
inline void resize_bilinear (
    tensor& dest,
    const tensor& src
)
{
    const long dest_row_stride     = dest.nc();
    const long dest_channel_stride = dest.nr()*dest.nc();
    const long src_row_stride      = src.nc();
    const long src_channel_stride  = src.nr()*src.nc();

    resize_bilinear(dest, dest_row_stride, dest_channel_stride,
                    src,  src_row_stride,  src_channel_stride);
}
|
||||
|
||||
// Convenience overload of resize_bilinear_gradient() that derives the stride
// arguments from the tensors themselves: nc() is passed as the row stride and
// nr()*nc() as the channel stride, for both grad and gradient_input.
inline void resize_bilinear_gradient (
    tensor& grad,
    const tensor& gradient_input
)
{
    const long grad_row_stride         = grad.nc();
    const long grad_channel_stride     = grad.nr()*grad.nc();
    const long gi_row_stride           = gradient_input.nc();
    const long gi_channel_stride       = gradient_input.nr()*gradient_input.nc();

    resize_bilinear_gradient(grad, grad_row_stride, grad_channel_stride,
                             gradient_input, gi_row_stride, gi_channel_stride);
}
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
class pooling
|
||||
{
|
||||
public:
|
||||
|
||||
pooling(const pooling&) = delete;
|
||||
pooling& operator=(const pooling&) = delete;
|
||||
|
||||
pooling (
|
||||
);
|
||||
|
||||
void clear(
|
||||
);
|
||||
|
||||
void setup_max_pooling(
|
||||
int window_height,
|
||||
int window_width,
|
||||
int stride_y,
|
||||
int stride_x,
|
||||
int padding_y,
|
||||
int padding_x
|
||||
);
|
||||
|
||||
void setup_avg_pooling(
|
||||
int window_height,
|
||||
int window_width,
|
||||
int stride_y,
|
||||
int stride_x,
|
||||
int padding_y,
|
||||
int padding_x
|
||||
);
|
||||
|
||||
bool does_max_pooling(
|
||||
) const { return do_max_pooling; }
|
||||
|
||||
void operator() (
|
||||
resizable_tensor& dest,
|
||||
const tensor& src
|
||||
);
|
||||
|
||||
void get_gradient(
|
||||
const tensor& gradient_input,
|
||||
const tensor& dest,
|
||||
const tensor& src,
|
||||
tensor& grad
|
||||
);
|
||||
|
||||
private:
|
||||
int window_height;
|
||||
int window_width;
|
||||
int stride_y;
|
||||
int stride_x;
|
||||
int padding_y;
|
||||
int padding_x;
|
||||
bool do_max_pooling;
|
||||
|
||||
};
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
class tensor_conv
|
||||
{
|
||||
public:
|
||||
tensor_conv(const tensor_conv&) = delete;
|
||||
tensor_conv& operator=(const tensor_conv&) = delete;
|
||||
|
||||
tensor_conv() {}
|
||||
|
||||
void clear(
|
||||
) {}
|
||||
|
||||
void setup(
|
||||
const tensor& data, /* not used but required for interface */
|
||||
const tensor& filters, /* not used but required for interface */
|
||||
int stride_y,
|
||||
int stride_x,
|
||||
int padding_y,
|
||||
int padding_x
|
||||
)
|
||||
{
|
||||
(void)data; /* silence compiler */
|
||||
DLIB_CASSERT(stride_y > 0 && stride_x > 0);
|
||||
DLIB_CASSERT(0 <= padding_y && padding_y < filters.nr());
|
||||
DLIB_CASSERT(0 <= padding_x && padding_x < filters.nc());
|
||||
last_stride_y = stride_y;
|
||||
last_stride_x = stride_x;
|
||||
last_padding_y = padding_y;
|
||||
last_padding_x = padding_x;
|
||||
}
|
||||
|
||||
void operator() (
|
||||
const bool add_to_output,
|
||||
resizable_tensor& output,
|
||||
const tensor& data,
|
||||
const tensor& filters
|
||||
);
|
||||
|
||||
void operator() (
|
||||
const bool add_to_output,
|
||||
tensor& output,
|
||||
const tensor& data,
|
||||
const tensor& filters
|
||||
);
|
||||
|
||||
void get_gradient_for_data (
|
||||
const bool add_to_output,
|
||||
const tensor& gradient_input,
|
||||
const tensor& filters,
|
||||
tensor& data_gradient
|
||||
);
|
||||
|
||||
void get_gradient_for_filters (
|
||||
const bool add_to_output,
|
||||
const tensor& gradient_input,
|
||||
const tensor& data,
|
||||
tensor& filters_gradient
|
||||
);
|
||||
|
||||
private:
|
||||
|
||||
long last_stride_y = 0;
|
||||
long last_stride_x = 0;
|
||||
long last_padding_y = 0;
|
||||
long last_padding_x = 0;
|
||||
};
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
void copy_tensor(
|
||||
bool add_to,
|
||||
tensor& dest,
|
||||
size_t dest_k_offset,
|
||||
const tensor& src,
|
||||
size_t src_k_offset,
|
||||
size_t count_k
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef NO_MAKEFILE
|
||||
#include "cpu_dlib.cpp"
|
||||
#endif
|
||||
|
||||
#endif // DLIB_DNN_CPU_H_
|
||||
|
||||
|
||||
50
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cublas_dlibapi.h
vendored
Normal file
50
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cublas_dlibapi.h
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
#ifndef DLIB_DNN_CuBLAS_H_
|
||||
#define DLIB_DNN_CuBLAS_H_
|
||||
|
||||
#ifdef DLIB_USE_CUDA
|
||||
|
||||
#include "tensor.h"
|
||||
#include "cuda_errors.h"
|
||||
|
||||
namespace dlib
|
||||
{
|
||||
namespace cuda
|
||||
{
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
void gemm (
|
||||
float beta,
|
||||
tensor& dest,
|
||||
float alpha,
|
||||
const tensor& lhs,
|
||||
bool trans_lhs,
|
||||
const tensor& rhs,
|
||||
bool trans_rhs
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- The dimensions of lhs and rhs must be compatible for matrix
|
||||
multiplication. In particular:
|
||||
- Let L == trans_lhs ? trans(mat(lhs)) : mat(lhs)
|
||||
- Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs)
|
||||
- Let D == mat(dest)
|
||||
- D.nr() == L.nr() && D.nc() == R.nc()
|
||||
(i.e. dest must be preallocated and have the correct output dimensions)
|
||||
- L.nc() == R.nr()
|
||||
ensures
|
||||
- performs: dest = alpha*L*R + beta*mat(dest)
|
||||
!*/
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif // DLIB_USE_CUDA
|
||||
|
||||
#endif // DLIB_DNN_CuBLAS_H_
|
||||
|
||||
|
||||
256
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cuda_data_ptr.h
vendored
Normal file
256
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cuda_data_ptr.h
vendored
Normal file
@@ -0,0 +1,256 @@
|
||||
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
#ifndef DLIB_DNN_CuDA_DATA_PTR_H_
|
||||
#define DLIB_DNN_CuDA_DATA_PTR_H_
|
||||
|
||||
#ifdef DLIB_USE_CUDA
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include "../assert.h"
|
||||
|
||||
namespace dlib
|
||||
{
|
||||
namespace cuda
|
||||
{
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
class cuda_data_void_ptr
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This is a block of memory on a CUDA device.
|
||||
!*/
|
||||
public:
|
||||
|
||||
cuda_data_void_ptr() = default;
|
||||
|
||||
cuda_data_void_ptr(size_t n);
|
||||
/*!
|
||||
ensures
|
||||
- This object will allocate a device memory buffer of n bytes.
|
||||
- #size() == n
|
||||
!*/
|
||||
|
||||
void* data() { return pdata.get(); }
|
||||
const void* data() const { return pdata.get(); }
|
||||
operator void*() { return pdata.get(); }
|
||||
operator const void*() const { return pdata.get(); }
|
||||
|
||||
void reset() { pdata.reset(); }
|
||||
|
||||
size_t size() const { return num; }
|
||||
/*!
|
||||
ensures
|
||||
- returns the length of this buffer, in bytes.
|
||||
!*/
|
||||
|
||||
cuda_data_void_ptr operator+ (size_t offset) const
|
||||
/*!
|
||||
requires
|
||||
- offset < size()
|
||||
ensures
|
||||
- returns a pointer that is offset by the given amount.
|
||||
!*/
|
||||
{
|
||||
DLIB_CASSERT(offset < num);
|
||||
cuda_data_void_ptr temp;
|
||||
temp.num = num-offset;
|
||||
temp.pdata = std::shared_ptr<void>(pdata, ((char*)pdata.get())+offset);
|
||||
return temp;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
size_t num = 0;
|
||||
std::shared_ptr<void> pdata;
|
||||
};
|
||||
|
||||
// Allows writing offset+ptr in addition to ptr+offset; simply forwards to
// cuda_data_void_ptr::operator+.
inline cuda_data_void_ptr operator+(size_t offset, const cuda_data_void_ptr& rhs)
{
    return rhs.operator+(offset);
}
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
void memcpy(
|
||||
void* dest,
|
||||
const cuda_data_void_ptr& src
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- dest == a pointer to at least src.size() bytes on the host machine.
|
||||
ensures
|
||||
- copies the GPU data from src into dest.
|
||||
- This routine is equivalent to performing: memcpy(dest,src,src.size())
|
||||
!*/
|
||||
|
||||
void memcpy(
|
||||
void* dest,
|
||||
const cuda_data_void_ptr& src,
|
||||
const size_t num
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- dest == a pointer to at least num bytes on the host machine.
|
||||
- num <= src.size()
|
||||
ensures
|
||||
- copies the GPU data from src into dest. Copies only the first num bytes
|
||||
of src to dest.
|
||||
!*/
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
void memcpy(
|
||||
cuda_data_void_ptr dest,
|
||||
const void* src
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- src == a pointer to at least dest.size() bytes on the host machine.
|
||||
ensures
|
||||
- copies the host data from src to the GPU memory buffer dest.
|
||||
- This routine is equivalent to performing: memcpy(dest,src,dest.size())
|
||||
!*/
|
||||
|
||||
void memcpy(
|
||||
cuda_data_void_ptr dest,
|
||||
const void* src,
|
||||
const size_t num
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- src == a pointer to at least num bytes on the host machine.
|
||||
- num <= dest.size()
|
||||
ensures
|
||||
- copies the host data from src to the GPU memory buffer dest. Copies only
|
||||
the first num bytes of src to dest.
|
||||
!*/
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
// ------------------------------------------------------------------------------------
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
template <typename T>
|
||||
class cuda_data_ptr
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This is a block of memory on a CUDA device. It is just a type safe
|
||||
version of cuda_data_void_ptr.
|
||||
!*/
|
||||
|
||||
public:
|
||||
|
||||
static_assert(std::is_standard_layout<T>::value, "You can only create basic standard layout types on the GPU");
|
||||
|
||||
cuda_data_ptr() = default;
|
||||
cuda_data_ptr(size_t n) : num(n)
|
||||
/*!
|
||||
ensures
|
||||
- This object will allocate a device memory buffer of n T objects.
|
||||
- #size() == n
|
||||
!*/
|
||||
{
|
||||
if (n == 0)
|
||||
return;
|
||||
|
||||
pdata = cuda_data_void_ptr(n*sizeof(T));
|
||||
}
|
||||
|
||||
T* data() { return (T*)pdata.data(); }
|
||||
const T* data() const { return (T*)pdata.data(); }
|
||||
|
||||
operator T*() { return (T*)pdata.data(); }
|
||||
operator const T*() const { return (T*)pdata.data(); }
|
||||
|
||||
void reset() { pdata.reset(); }
|
||||
|
||||
size_t size() const { return num; }
|
||||
|
||||
|
||||
friend void memcpy(
|
||||
std::vector<T>& dest,
|
||||
const cuda_data_ptr& src
|
||||
)
|
||||
{
|
||||
dest.resize(src.size());
|
||||
if (src.size() != 0)
|
||||
memcpy(dest.data(), src.pdata);
|
||||
}
|
||||
|
||||
friend void memcpy(
|
||||
cuda_data_ptr& src,
|
||||
const std::vector<T>& dest
|
||||
)
|
||||
{
|
||||
if (dest.size() != src.size())
|
||||
dest = cuda_data_ptr<T>(src.size());
|
||||
|
||||
if (src.size() != 0)
|
||||
memcpy(src.pdata, dest.data());
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
size_t num = 0;
|
||||
cuda_data_void_ptr pdata;
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
class resizable_cuda_buffer
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This is a block of memory on a CUDA device that will be automatically
|
||||
resized if requested size is larger than allocated.
|
||||
!*/
|
||||
public:
|
||||
cuda_data_void_ptr get(size_t size)
|
||||
/*!
|
||||
ensures
|
||||
- This object will return the buffer of requested size or larger.
|
||||
- buffer.size() >= size
|
||||
- Client code should not hold the returned cuda_data_void_ptr for long
|
||||
durations, but instead should call get() whenever the buffer is
|
||||
needed. Doing so ensures that multiple buffers are not kept around
|
||||
in the event of a resize.
|
||||
!*/
|
||||
{
|
||||
if (buffer.size() < size)
|
||||
{
|
||||
buffer.reset();
|
||||
buffer = cuda_data_void_ptr(size);
|
||||
}
|
||||
return buffer;
|
||||
}
|
||||
private:
|
||||
cuda_data_void_ptr buffer;
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
std::shared_ptr<resizable_cuda_buffer> device_global_buffer(
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- Returns a pointer to a globally shared CUDA memory buffer on the
|
||||
currently selected CUDA device. The buffer is also thread local. So
|
||||
each host thread will get its own buffer. You can use this global buffer
|
||||
as scratch space for CUDA computations that all take place on the default
|
||||
stream. Using it in this way ensures that there aren't any race conditions
|
||||
involving the use of the buffer.
|
||||
- The global buffer is deallocated once all references to it are
|
||||
destructed. It will be reallocated as required. So if you want to avoid
|
||||
these reallocations then hold a copy of the shared_ptr returned by this
|
||||
function.
|
||||
!*/
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif // DLIB_USE_CUDA
|
||||
|
||||
#endif // DLIB_DNN_CuDA_DATA_PTR_H_
|
||||
|
||||
530
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cuda_dlib.h
vendored
Normal file
530
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cuda_dlib.h
vendored
Normal file
@@ -0,0 +1,530 @@
|
||||
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
#ifndef DLIB_DNN_CuDA_H_
|
||||
#define DLIB_DNN_CuDA_H_
|
||||
|
||||
|
||||
#include "tensor.h"
|
||||
#include "../geometry/rectangle.h"
|
||||
|
||||
namespace dlib
|
||||
{
|
||||
namespace cuda
|
||||
{
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
void set_device (
|
||||
int dev
|
||||
);
|
||||
|
||||
int get_device (
|
||||
);
|
||||
|
||||
int get_num_devices (
|
||||
);
|
||||
|
||||
std::string get_device_name (
|
||||
int device
|
||||
);
|
||||
|
||||
void set_current_device_blocking_sync(
|
||||
);
|
||||
|
||||
bool can_access_peer (int device_id, int peer_device_id);
|
||||
bool can_access_peer (const tensor& device, const tensor& peer_device);
|
||||
|
||||
void device_synchronize (int dev);
|
||||
void device_synchronize (const tensor& dev);
|
||||
|
||||
|
||||
class raii_set_device
|
||||
{
|
||||
public:
|
||||
raii_set_device() = delete;
|
||||
raii_set_device(const raii_set_device&) = delete;
|
||||
raii_set_device& operator=(const raii_set_device&) = delete;
|
||||
|
||||
raii_set_device(int dev)
|
||||
{
|
||||
prev_dev = get_device();
|
||||
set_device(dev);
|
||||
}
|
||||
|
||||
raii_set_device(const tensor& dev)
|
||||
{
|
||||
prev_dev = get_device();
|
||||
set_device(dev.device_id());
|
||||
}
|
||||
|
||||
void operator() (int dev)
|
||||
{
|
||||
set_device(dev);
|
||||
}
|
||||
|
||||
void operator() (const tensor& dev)
|
||||
{
|
||||
set_device(dev.device_id());
|
||||
}
|
||||
|
||||
~raii_set_device() noexcept(false)
|
||||
{
|
||||
set_device(prev_dev);
|
||||
}
|
||||
|
||||
private:
|
||||
int prev_dev;
|
||||
};
|
||||
|
||||
|
||||
#ifdef DLIB_USE_CUDA
|
||||
|
||||
class enable_peer_access
|
||||
{
|
||||
public:
|
||||
|
||||
enable_peer_access() = delete;
|
||||
enable_peer_access(const enable_peer_access&) = delete;
|
||||
enable_peer_access& operator=(const enable_peer_access&) = delete;
|
||||
|
||||
enable_peer_access(
|
||||
int device_id,
|
||||
int peer_device_id
|
||||
);
|
||||
|
||||
enable_peer_access(
|
||||
const tensor& device,
|
||||
const tensor& peer_device
|
||||
) : enable_peer_access(device.device_id(), peer_device.device_id())
|
||||
{}
|
||||
|
||||
~enable_peer_access() noexcept(false);
|
||||
|
||||
private:
|
||||
|
||||
bool call_disable;
|
||||
int device_id;
|
||||
int peer_device_id;
|
||||
};
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
void inverse_norms (
|
||||
resizable_tensor& invnorms,
|
||||
const tensor& data,
|
||||
const double eps
|
||||
);
|
||||
|
||||
void dot_prods (
|
||||
resizable_tensor& out,
|
||||
const tensor& lhs,
|
||||
const tensor& rhs
|
||||
);
|
||||
|
||||
void dot_prods (
|
||||
bool add_to,
|
||||
tensor& out,
|
||||
const tensor& lhs,
|
||||
const tensor& rhs
|
||||
);
|
||||
|
||||
void scale_columns (
|
||||
tensor& out,
|
||||
const tensor& m,
|
||||
const tensor& v
|
||||
);
|
||||
|
||||
void scale_rows (
|
||||
tensor& out,
|
||||
const tensor& m,
|
||||
const tensor& v
|
||||
);
|
||||
|
||||
void scale_rows2 (
|
||||
float beta,
|
||||
tensor& out,
|
||||
const tensor& m1,
|
||||
const tensor& m2,
|
||||
const tensor& v1,
|
||||
const tensor& v2
|
||||
);
|
||||
|
||||
void exp (
|
||||
tensor& dest,
|
||||
const tensor& src
|
||||
);
|
||||
|
||||
void log (
|
||||
tensor& dest,
|
||||
const tensor& src
|
||||
);
|
||||
|
||||
void log10 (
|
||||
tensor& dest,
|
||||
const tensor& src
|
||||
);
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
void set_tensor (
|
||||
tensor& t,
|
||||
float value
|
||||
);
|
||||
|
||||
void scale_tensor (
|
||||
tensor& t,
|
||||
float value
|
||||
);
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
void multiply (
|
||||
bool add_to,
|
||||
tensor& dest,
|
||||
const tensor& src1,
|
||||
const tensor& src2
|
||||
);
|
||||
|
||||
void multiply_conv (
|
||||
bool add_to,
|
||||
tensor& dest,
|
||||
const tensor& src1,
|
||||
const tensor& src2
|
||||
);
|
||||
|
||||
void multiply_zero_padded (
|
||||
bool add_to,
|
||||
tensor& dest,
|
||||
const tensor& src1,
|
||||
const tensor& src2
|
||||
);
|
||||
|
||||
void scale_channels (
|
||||
bool add_to,
|
||||
tensor& dest,
|
||||
const tensor& src,
|
||||
const tensor& scales
|
||||
);
|
||||
|
||||
void add (
|
||||
tensor& dest,
|
||||
const tensor& src1,
|
||||
const tensor& src2
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
void affine_transform(
|
||||
tensor& dest,
|
||||
const tensor& src,
|
||||
const float A,
|
||||
const float B
|
||||
);
|
||||
|
||||
void affine_transform(
|
||||
tensor& dest,
|
||||
const tensor& src,
|
||||
const float A
|
||||
);
|
||||
|
||||
void affine_transform(
|
||||
tensor& dest,
|
||||
const tensor& src1,
|
||||
const tensor& src2,
|
||||
const float A,
|
||||
const float B,
|
||||
const float C
|
||||
);
|
||||
|
||||
void affine_transform(
|
||||
tensor& dest,
|
||||
const tensor& src1,
|
||||
const tensor& src2,
|
||||
const float A,
|
||||
const float B
|
||||
);
|
||||
|
||||
void affine_transform(
|
||||
tensor& dest,
|
||||
const tensor& src1,
|
||||
const tensor& src2,
|
||||
const tensor& src3,
|
||||
const float A,
|
||||
const float B,
|
||||
const float C,
|
||||
const float D
|
||||
);
|
||||
|
||||
void affine_transform_range(
|
||||
size_t begin,
|
||||
size_t end,
|
||||
tensor& dest,
|
||||
const tensor& src1,
|
||||
const tensor& src2,
|
||||
const tensor& src3,
|
||||
const float A,
|
||||
const float B,
|
||||
const float C
|
||||
);
|
||||
|
||||
void affine_transform(
|
||||
const rectangle& rect,
|
||||
tensor& dest,
|
||||
const tensor& src1,
|
||||
const tensor& src2,
|
||||
const tensor& src3,
|
||||
float A,
|
||||
float B,
|
||||
float C
|
||||
);
|
||||
|
||||
// Note that this function isn't in the tt:: namespace because add_scaled() is
|
||||
// called by cuda::add() so we don't need a tt:: version of add_scaled().
|
||||
void add_scaled(
|
||||
tensor& dest,
|
||||
const float scale,
|
||||
const tensor& src
|
||||
);
|
||||
|
||||
void add_cv_to_all_columns(
|
||||
float beta,
|
||||
tensor& dest,
|
||||
float alpha,
|
||||
const tensor& src
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
void affine_transform(
|
||||
tensor& dest,
|
||||
const tensor& src,
|
||||
const tensor& A,
|
||||
const tensor& B
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
void affine_transform_conv(
|
||||
tensor& dest,
|
||||
const tensor& src,
|
||||
const tensor& A,
|
||||
const tensor& B
|
||||
);
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
void compute_adam_update (
|
||||
size_t begin,
|
||||
size_t end,
|
||||
tensor& s,
|
||||
tensor& m,
|
||||
tensor& v,
|
||||
const float t,
|
||||
const float learning_rate,
|
||||
const float weight_decay,
|
||||
const float momentum1,
|
||||
const float momentum2,
|
||||
const tensor& params,
|
||||
const tensor& params_grad
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
void assign_bias_gradient (
|
||||
tensor& grad,
|
||||
const tensor& gradient_input
|
||||
);
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
void threshold (
|
||||
tensor& data,
|
||||
float thresh
|
||||
);
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
void dot (
|
||||
const tensor& a,
|
||||
const tensor& b,
|
||||
tensor& result,
|
||||
size_t idx
|
||||
);
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
void prelu (
|
||||
tensor& dest,
|
||||
const tensor& src,
|
||||
const tensor& param
|
||||
);
|
||||
|
||||
void prelu_gradient (
|
||||
tensor& grad,
|
||||
const tensor& src,
|
||||
const tensor& gradient_input,
|
||||
const tensor& param,
|
||||
tensor& params_grad
|
||||
);
|
||||
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
void resize_bilinear (
|
||||
tensor& dest,
|
||||
long dest_row_stride,
|
||||
long dest_channel_stride,
|
||||
const tensor& src,
|
||||
long src_row_stride,
|
||||
long src_channel_stride
|
||||
);
|
||||
|
||||
void resize_bilinear_gradient (
|
||||
tensor& grad,
|
||||
long grad_row_stride,
|
||||
long grad_channel_stride,
|
||||
const tensor& gradient_input,
|
||||
long gradient_input_row_stride,
|
||||
long gradient_input_channel_stride
|
||||
);
|
||||
|
||||
// Convenience overload of resize_bilinear() that derives the stride arguments
// from the tensors themselves: nc() is passed as the row stride and nr()*nc()
// as the channel stride, for both dest and src.
inline void resize_bilinear (
    tensor& dest,
    const tensor& src
)
{
    const long dest_row_stride     = dest.nc();
    const long dest_channel_stride = dest.nr()*dest.nc();
    const long src_row_stride      = src.nc();
    const long src_channel_stride  = src.nr()*src.nc();

    resize_bilinear(dest, dest_row_stride, dest_channel_stride,
                    src,  src_row_stride,  src_channel_stride);
}
|
||||
|
||||
// Convenience overload of resize_bilinear_gradient() that derives the stride
// arguments from the tensors themselves: nc() is passed as the row stride and
// nr()*nc() as the channel stride, for both grad and gradient_input.
inline void resize_bilinear_gradient (
    tensor& grad,
    const tensor& gradient_input
)
{
    const long grad_row_stride     = grad.nc();
    const long grad_channel_stride = grad.nr()*grad.nc();
    const long gi_row_stride       = gradient_input.nc();
    const long gi_channel_stride   = gradient_input.nr()*gradient_input.nc();

    resize_bilinear_gradient(grad, grad_row_stride, grad_channel_stride,
                             gradient_input, gi_row_stride, gi_channel_stride);
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
void copy_tensor(
|
||||
bool add_to,
|
||||
tensor& dest,
|
||||
size_t dest_k_offset,
|
||||
const tensor& src,
|
||||
size_t src_k_offset,
|
||||
size_t count_k
|
||||
);
|
||||
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
class compute_loss_multiclass_log_per_pixel
|
||||
{
|
||||
/*!
|
||||
The point of this class is to compute the loss computed by
|
||||
loss_multiclass_log_per_pixel, but to do so with CUDA.
|
||||
!*/
|
||||
public:
|
||||
|
||||
compute_loss_multiclass_log_per_pixel(
|
||||
)
|
||||
{
|
||||
work = device_global_buffer();
|
||||
}
|
||||
|
||||
template <
|
||||
typename const_label_iterator
|
||||
>
|
||||
void operator() (
|
||||
const_label_iterator truth,
|
||||
const tensor& subnetwork_output,
|
||||
tensor& gradient,
|
||||
double& loss
|
||||
) const
|
||||
{
|
||||
const size_t bytes_per_plane = subnetwork_output.nr()*subnetwork_output.nc()*sizeof(uint16_t);
|
||||
// Allocate a cuda buffer to store all the truth images and also one float
|
||||
// for the scalar loss output.
|
||||
cuda_data_void_ptr buf = work->get(subnetwork_output.num_samples()*bytes_per_plane + sizeof(float));
|
||||
|
||||
cuda_data_void_ptr loss_buf = buf;
|
||||
buf = buf+sizeof(float);
|
||||
|
||||
|
||||
// copy the truth data into a cuda buffer.
|
||||
for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth)
|
||||
{
|
||||
const matrix<uint16_t>& t = *truth;
|
||||
DLIB_ASSERT(t.nr() == subnetwork_output.nr());
|
||||
DLIB_ASSERT(t.nc() == subnetwork_output.nc());
|
||||
memcpy(buf + i*bytes_per_plane, &t(0,0), bytes_per_plane);
|
||||
}
|
||||
|
||||
do_work(static_cast<float*>(loss_buf.data()), static_cast<uint16_t*>(buf.data()), subnetwork_output, gradient, loss);
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
static void do_work(
|
||||
float* loss_cuda_work_buffer,
|
||||
const uint16_t* truth_buffer,
|
||||
const tensor& subnetwork_output,
|
||||
tensor& gradient,
|
||||
double& loss
|
||||
);
|
||||
|
||||
std::shared_ptr<resizable_cuda_buffer> work;
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
// ------------------------------------------------------------------------------------
|
||||
// ------------------------------------------------------------------------------------
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
#else // if DLIB_USE_CUDA NOT DEFINED
|
||||
|
||||
inline void set_device (
|
||||
int id
|
||||
)
|
||||
{
|
||||
DLIB_CASSERT(id == 0, "dlib::cuda::set_device(id) called with an invalid device id.");
|
||||
}
|
||||
|
||||
// Stub used when dlib is built without CUDA: the current device is always 0.
inline int get_device ()
{
    return 0;
}
|
||||
|
||||
// Stub used when dlib is built without CUDA: always reports one device.
inline int get_num_devices ()
{
    return 1;
}
|
||||
|
||||
inline std::string get_device_name (
|
||||
int device
|
||||
)
|
||||
{
|
||||
DLIB_CASSERT(device == 0, "dlib::cuda::set_device(id) called with an invalid device id.");
|
||||
return "CUDA_DISABLED";
|
||||
}
|
||||
|
||||
// Stub used when dlib is built without CUDA: does nothing.
inline void set_current_device_blocking_sync ()
{
}
|
||||
|
||||
|
||||
inline bool can_access_peer (int , int )
|
||||
{ return false; }
|
||||
inline bool can_access_peer (const tensor& , const tensor& )
|
||||
{ return false; }
|
||||
|
||||
inline void device_synchronize (int ){}
|
||||
inline void device_synchronize (const tensor& ){}
|
||||
|
||||
class enable_peer_access
|
||||
{
|
||||
public:
|
||||
enable_peer_access() = delete;
|
||||
enable_peer_access(const enable_peer_access&) = delete;
|
||||
enable_peer_access& operator=(const enable_peer_access&) = delete;
|
||||
enable_peer_access( int, int ){}
|
||||
enable_peer_access( const tensor&, const tensor& ) {}
|
||||
};
|
||||
|
||||
#endif // DLIB_USE_CUDA
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#endif // DLIB_DNN_CuDA_H_
|
||||
|
||||
70
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cuda_errors.h
vendored
Normal file
70
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cuda_errors.h
vendored
Normal file
@@ -0,0 +1,70 @@
|
||||
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
#ifndef DLIB_CUDA_ERRORs_H_
|
||||
#define DLIB_CUDA_ERRORs_H_
|
||||
|
||||
|
||||
#include "../error.h"
|
||||
|
||||
namespace dlib
|
||||
{
|
||||
struct cuda_error : public error
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This is the exception thrown if any calls to the NVIDIA CUDA runtime
|
||||
returns an error.
|
||||
!*/
|
||||
|
||||
cuda_error(const std::string& message): error(message) {}
|
||||
};
|
||||
|
||||
|
||||
struct cudnn_error : public cuda_error
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This is the exception thrown if any calls to the NVIDIA cuDNN library
|
||||
returns an error.
|
||||
!*/
|
||||
|
||||
cudnn_error(const std::string& message): cuda_error(message) {}
|
||||
};
|
||||
|
||||
struct curand_error : public cuda_error
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This is the exception thrown if any calls to the NVIDIA cuRAND library
|
||||
returns an error.
|
||||
!*/
|
||||
|
||||
curand_error(const std::string& message): cuda_error(message) {}
|
||||
};
|
||||
|
||||
struct cublas_error : public cuda_error
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This is the exception thrown if any calls to the NVIDIA cuBLAS library
|
||||
returns an error.
|
||||
!*/
|
||||
|
||||
cublas_error(const std::string& message): cuda_error(message) {}
|
||||
};
|
||||
|
||||
struct cusolver_error : public cuda_error
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This is the exception thrown if any calls to the NVIDIA cuSolver library
|
||||
returns an error.
|
||||
!*/
|
||||
|
||||
cusolver_error(const std::string& message): cuda_error(message) {}
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
#endif // DLIB_CUDA_ERRORs_H_
|
||||
|
||||
413
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cuda_utils.h
vendored
Normal file
413
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cuda_utils.h
vendored
Normal file
@@ -0,0 +1,413 @@
|
||||
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
#ifndef DLIB_CUDA_UtILS_H_
|
||||
#define DLIB_CUDA_UtILS_H_
|
||||
|
||||
#ifndef DLIB_USE_CUDA
|
||||
#error "This file shouldn't be #included unless DLIB_USE_CUDA is #defined"
|
||||
#endif
|
||||
|
||||
#include "cuda_errors.h"
|
||||
#include "../algs.h"
|
||||
#include <cmath>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
#include <type_traits>
|
||||
|
||||
|
||||
// Check the return value of a call to the CUDA runtime for an error condition.
// On anything other than cudaSuccess this throws dlib::cuda_error with a
// message containing the text of the call, the file and line, the numeric
// error code and cudaGetErrorString().
//
// The call expression is parenthesized, and the status is stored in a variable
// with a reserved-looking name so that a call which itself mentions a variable
// named "error" is not silently rebound to the macro's local.
#define CHECK_CUDA(call)                                                       \
do{                                                                            \
    const cudaError_t dlib_check_cuda_status_ = (call);                        \
    if (dlib_check_cuda_status_ != cudaSuccess)                                \
    {                                                                          \
        std::ostringstream sout;                                               \
        sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
        sout << "code: " << dlib_check_cuda_status_ << ", reason: " << cudaGetErrorString(dlib_check_cuda_status_);\
        throw dlib::cuda_error(sout.str());                                    \
    }                                                                          \
}while(false)
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
#ifdef __CUDACC__
|
||||
|
||||
namespace dlib
|
||||
{
|
||||
namespace cuda
|
||||
{
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
__inline__ __device__ size_t pack_idx (
    size_t dim_size3,
    size_t dim_size2,
    size_t dim_size1,
    size_t idx4,
    size_t idx3,
    size_t idx2,
    size_t idx1
)
/*!
    ensures
        - Maps a 4D array index onto the matching 1D index for a row major
          layout.  That is, given an array declared as
            int ARRAY[anything][dim_size3][dim_size2][dim_size1];
          the element
            ARRAY[idx4][idx3][idx2][idx1]
          is the same element as
            ((int*)ARRAY)[pack_idx(dim_size3,dim_size2,dim_size1, idx4,idx3,idx2,idx1)]
!*/
{
    // Fold in one dimension at a time, outermost first.
    size_t packed = idx4;
    packed = packed*dim_size3 + idx3;
    packed = packed*dim_size2 + idx2;
    packed = packed*dim_size1 + idx1;
    return packed;
}
|
||||
|
||||
__inline__ __device__ void unpack_idx (
    size_t idx,
    size_t dim_size3,
    size_t dim_size2,
    size_t dim_size1,
    size_t& idx4,
    size_t& idx3,
    size_t& idx2,
    size_t& idx1
)
/*!
    ensures
        - This is the inverse of pack_idx().  So if
            PACKED == pack_idx(dim_size3,dim_size2,dim_size1, idx4,idx3,idx2,idx1)
          then calling
            unpack_idx(PACKED,dim_size3,dim_size2,dim_size1, IDX4,IDX3,IDX2,IDX1)
          gives:
            - IDX1 == idx1
            - IDX2 == idx2
            - IDX3 == idx3
            - IDX4 == idx4
!*/
{
    // Peel off one dimension at a time, innermost first.
    size_t remaining = idx;

    idx1 = remaining % dim_size1;
    remaining = remaining / dim_size1;

    idx2 = remaining % dim_size2;
    remaining = remaining / dim_size2;

    idx3 = remaining % dim_size3;
    remaining = remaining / dim_size3;

    idx4 = remaining;
}
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
// This function is from the article:
|
||||
// http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
|
||||
__inline__ __device__ float warp_reduce_sum(float val)
|
||||
{
|
||||
for (int offset = warpSize/2; offset > 0; offset /= 2)
|
||||
#if CUDART_VERSION >= 9000
|
||||
val += __shfl_down_sync(0xFFFFFFFF,val, offset);
|
||||
#else
|
||||
val += __shfl_down(val, offset);
|
||||
#endif
|
||||
return val;
|
||||
}
|
||||
|
||||
// True when threadIdx.x masked with (warpSize - 1) is zero.  Only the x
// component of the thread index is examined.
__inline__ __device__ bool is_first_thread_in_warp()
{
    const auto lane_bits = threadIdx.x & (warpSize - 1);
    return lane_bits == 0;
}
|
||||
|
||||
__inline__ __device__ void warp_reduce_atomic_add(
    float& out,
    float val
)
/*!
    ensures
        - Atomically adds the val variables of every thread in the current warp
          to out.  For an extended discussion see:
          http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
!*/
{
    // Every thread takes part in the reduction, but only the first thread of
    // the warp issues the atomic add.
    const float warp_total = warp_reduce_sum(val);
    if (!is_first_thread_in_warp())
        return;
    atomicAdd(&out, warp_total);
}
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
// Upper bound on the number of jobs a kernel launch has to cover, given to
// launch_kernel().  num_y is 1 unless a second dimension is supplied.
struct max_jobs
{
    max_jobs(int x, int y) : num_x(x), num_y(y) {}
    max_jobs(int x) : max_jobs(x, 1) {}

    int num_x;
    int num_y = 1;
};
|
||||
|
||||
template <typename Kernel, typename... T>
|
||||
void launch_kernel (
|
||||
Kernel K,
|
||||
T ...args
|
||||
)
|
||||
/*!
|
||||
ensures
|
||||
- launches the given kernel K(args...). The point of this function is to
|
||||
automatically set the kernel launch parameters to something reasonable
|
||||
based on the properties of the kernel and the current GPU card.
|
||||
!*/
|
||||
{
|
||||
int num_blocks, num_threads;
|
||||
CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&num_blocks,&num_threads,K));
|
||||
K<<<num_blocks,num_threads>>>(args...);
|
||||
}
|
||||
|
||||
template <typename Kernel, typename... T>
|
||||
void launch_kernel (
|
||||
Kernel K,
|
||||
max_jobs m,
|
||||
T ...args
|
||||
)
|
||||
/*!
|
||||
ensures
|
||||
- This function is just like launch_kernel(K,args...) except that you can
|
||||
additionally supply a max_jobs number that tells it how many possible
|
||||
total threads could be used. This is useful when launching potentially
|
||||
small jobs that might not need the number of threads suggested by
|
||||
launch_kernel().
|
||||
!*/
|
||||
{
|
||||
if (m.num_x == 0 || m.num_y == 0)
|
||||
return;
|
||||
int num_blocks, num_threads;
|
||||
CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&num_blocks,&num_threads,K));
|
||||
// Check if the job is really small and we don't really need to launch a kernel
|
||||
// with this many blocks and threads.
|
||||
if (num_blocks*num_threads > m.num_x*m.num_y)
|
||||
num_blocks = (m.num_x*m.num_y+num_threads-1)/num_threads;
|
||||
|
||||
if (m.num_y == 1)
|
||||
{
|
||||
K<<<num_blocks,num_threads>>>(args...);
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
In general, the reason m.num_y!=1 (i.e. the reason you are in this
|
||||
code path) is because we are using nested grid-stride loops. There are
|
||||
two important things to note about what we are doing here. To
|
||||
illustrate them we will talk about this little CUDA code snippet:
|
||||
|
||||
// initialize out before we begin.
|
||||
for (auto i : grid_stride_range_y(0, nr))
|
||||
for (auto j : grid_stride_range(0, 1))
|
||||
out[i] = 0;
|
||||
|
||||
__syncthreads(); // synchronize threads in block
|
||||
|
||||
// loop over some 2D thing and sum and store things into out.
|
||||
for (auto i : grid_stride_range_y(0, nr))
|
||||
{
|
||||
float temp = 0;
|
||||
for (auto j : grid_stride_range(0, nc))
|
||||
temp += whatever[i*nc+j];
|
||||
|
||||
// store the sum into out[i]
|
||||
warp_reduce_atomic_add(out[i], temp);
|
||||
}
|
||||
|
||||
First, we make sure the number of x threads is a multiple of 32 so that
|
||||
you can use warp_reduce_atomic_add() inside the y loop.
|
||||
|
||||
Second, we put the x block size to 1 so inter-block synchronization is
|
||||
easier. For example, if the number of x blocks wasn't 1 the above code
|
||||
would have a race condition in it. This is because the execution of
|
||||
out[i]=0 would be done by blocks with blockIdx.x==0, but then in the
|
||||
second set of loops, *all* the x blocks use out[i]. Since
|
||||
__syncthreads() doesn't do any synchronization between blocks some of
|
||||
the blocks might begin before the out[i]=0 statements finished and that
|
||||
would be super bad.
|
||||
*/
|
||||
|
||||
// Try and make sure that the ratio of x to y threads is reasonable based
|
||||
// on the respective size of our loops.
|
||||
int x_threads = 32;
|
||||
int y_threads = num_threads/32;
|
||||
const int ratio = static_cast<int>(std::round(put_in_range(1, y_threads, m.num_x/(double)m.num_y)));
|
||||
x_threads *= ratio;
|
||||
y_threads /= ratio;
|
||||
|
||||
dim3 blocks(1,num_blocks);
|
||||
dim3 threads(x_threads,y_threads);
|
||||
K<<<blocks,threads>>>(args...);
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
class grid_stride_range
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This is a tool for making a for loop that loops over an entire block of
|
||||
memory inside a kernel, but doing so in a way that parallelizes
|
||||
appropriately across all the threads in a kernel launch. For example,
|
||||
the following kernel would add the vector a to the vector b and store
|
||||
the output in out (assuming all vectors are of dimension n):
|
||||
__global__ void add_arrays(
|
||||
const float* a,
|
||||
const float* b,
|
||||
float* out,
|
||||
size_t n
|
||||
)
|
||||
{
|
||||
for (auto i : grid_stride_range(0, n))
|
||||
{
|
||||
out[i] = a[i]+b[i];
|
||||
}
|
||||
}
|
||||
!*/
|
||||
|
||||
public:
|
||||
__device__ grid_stride_range(
|
||||
size_t ibegin_,
|
||||
size_t iend_
|
||||
) :
|
||||
ibegin(ibegin_),
|
||||
iend(iend_)
|
||||
{}
|
||||
|
||||
class iterator
|
||||
{
|
||||
public:
|
||||
__device__ iterator() {}
|
||||
__device__ iterator(size_t pos_) : pos(pos_) {}
|
||||
|
||||
__device__ size_t operator*() const
|
||||
{
|
||||
return pos;
|
||||
}
|
||||
|
||||
__device__ iterator& operator++()
|
||||
{
|
||||
pos += gridDim.x * blockDim.x;
|
||||
return *this;
|
||||
}
|
||||
|
||||
__device__ bool operator!=(const iterator& item) const
|
||||
{ return pos < item.pos; }
|
||||
|
||||
private:
|
||||
size_t pos;
|
||||
};
|
||||
|
||||
__device__ iterator begin() const
|
||||
{
|
||||
return iterator(ibegin+blockDim.x * blockIdx.x + threadIdx.x);
|
||||
}
|
||||
__device__ iterator end() const
|
||||
{
|
||||
return iterator(iend);
|
||||
}
|
||||
private:
|
||||
|
||||
size_t ibegin;
|
||||
size_t iend;
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
class grid_stride_range_y
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This object is just like grid_stride_range except that it looks at
|
||||
CUDA's y thread index (e.g. threadIdx.y) instead of the x index.
|
||||
Therefore, if you launch a cuda kernel with a statement like:
|
||||
dim3 blocks(1,10);
|
||||
dim3 threads(32,32); // You need to have x and y not equal to 1 to get parallelism over both loops.
|
||||
add_arrays<<<blocks,threads>>>(a,b,out,nr,nc);
|
||||
You can perform a nested 2D parallel for loop rather than doing just a
|
||||
1D for loop.
|
||||
|
||||
So the code in the kernel would look like this if you wanted to add two
|
||||
2D matrices:
|
||||
__global__ void add_arrays(
|
||||
const float* a,
|
||||
const float* b,
|
||||
float* out,
|
||||
size_t nr,
|
||||
size_t nc
|
||||
)
|
||||
{
|
||||
for (auto r : grid_stride_range_y(0, nr))
|
||||
{
|
||||
for (auto c : grid_stride_range(0, nc))
|
||||
{
|
||||
auto i = r*nc+c;
|
||||
out[i] = a[i]+b[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
!*/
|
||||
|
||||
public:
|
||||
__device__ grid_stride_range_y(
|
||||
size_t ibegin_,
|
||||
size_t iend_
|
||||
) :
|
||||
ibegin(ibegin_),
|
||||
iend(iend_)
|
||||
{}
|
||||
|
||||
class iterator
|
||||
{
|
||||
public:
|
||||
__device__ iterator() {}
|
||||
__device__ iterator(size_t pos_) : pos(pos_) {}
|
||||
|
||||
__device__ size_t operator*() const
|
||||
{
|
||||
return pos;
|
||||
}
|
||||
|
||||
__device__ iterator& operator++()
|
||||
{
|
||||
pos += gridDim.y * blockDim.y;
|
||||
return *this;
|
||||
}
|
||||
|
||||
__device__ bool operator!=(const iterator& item) const
|
||||
{ return pos < item.pos; }
|
||||
|
||||
private:
|
||||
size_t pos;
|
||||
};
|
||||
|
||||
__device__ iterator begin() const
|
||||
{
|
||||
return iterator(ibegin+blockDim.y * blockIdx.y + threadIdx.y);
|
||||
}
|
||||
__device__ iterator end() const
|
||||
{
|
||||
return iterator(iend);
|
||||
}
|
||||
private:
|
||||
|
||||
size_t ibegin;
|
||||
size_t iend;
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif // __CUDACC__
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
#endif // DLIB_CUDA_UtILS_H_
|
||||
|
||||
518
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cudnn_dlibapi.h
vendored
Normal file
518
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cudnn_dlibapi.h
vendored
Normal file
@@ -0,0 +1,518 @@
|
||||
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
#ifndef DLIB_DNN_CuDNN_H_
|
||||
#define DLIB_DNN_CuDNN_H_
|
||||
|
||||
#ifdef DLIB_USE_CUDA
|
||||
|
||||
#include "cuda_errors.h"
|
||||
#include <memory>
|
||||
#include "cuda_data_ptr.h"
|
||||
|
||||
namespace dlib
|
||||
{
|
||||
class tensor;
|
||||
class resizable_tensor;
|
||||
|
||||
namespace cuda
|
||||
{
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
class tensor_descriptor
{
    /*!
        Each tensor object will carry a tensor_descriptor in it when compiled with
        CUDA.
    !*/

public:
    tensor_descriptor();
    ~tensor_descriptor();

    // Copying is disabled.
    tensor_descriptor(const tensor_descriptor&) = delete;
    tensor_descriptor& operator=(const tensor_descriptor&) = delete;

    // Moving is allowed.  Move construction default constructs *this and then
    // swaps handles with the source; move assignment just swaps handles.
    tensor_descriptor(tensor_descriptor&& other) : tensor_descriptor()
    {
        swap(other);
    }
    tensor_descriptor& operator=(tensor_descriptor&& other)
    {
        swap(other);
        return *this;
    }

    void set_size(
        int n,
        int k,
        int nr,
        int nc
    );
    /*!
        ensures
            - if any of the arguments are 0 then they are all set to 0 in the tensor.
    !*/

    void get_size (
        int& n,
        int& k,
        int& nr,
        int& nc
    ) const;

    const void* get_handle (
    ) const
    {
        return handle;
    }

private:

    void swap(tensor_descriptor& other)
    {
        std::swap(handle, other.handle);
    }

    // Opaque handle; set up and released by the out of line constructor and
    // destructor.
    void* handle;
};
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
void add(
|
||||
float beta,
|
||||
tensor& dest,
|
||||
float alpha,
|
||||
const tensor& src
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- One of the following is true:
|
||||
- have_same_dimensions(src, dest)
|
||||
- src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1
|
||||
- src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc()
|
||||
- src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc()
|
||||
- is_same_object(src,dest) == false
|
||||
ensures
|
||||
- performs: dest = beta*dest + alpha*src
|
||||
However, how the addition happens depends on the dimensions of src. In
|
||||
particular, this function adds the scaled values of one src tensor to
|
||||
dest. Each dimension of the src tensor must match the corresponding
|
||||
dimension of the dest tensor or must be equal to 1. In the latter case,
|
||||
the same value from the src tensor, for those dimensions, will be used to
|
||||
add into the dest tensor.
|
||||
!*/
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
void assign_conv_bias_gradient (tensor& grad, const tensor& gradient_input);
/*!
    requires
        - grad.num_samples() == 1
        - grad.k() >= 1
        - grad.nr() == 1
        - grad.nc() == 1
        - gradient_input.k() == grad.k()
        - gradient_input.size() > 0
        - is_same_object(grad,gradient_input) == false
    ensures
        - let BIAS be a tensor with all dimensions equal to 1 except for k which is >= 1.
        - let OUT be the output of add(1,OUT,1,BIAS)
        - let f(gradient_input,BIAS) == dot(gradient_input,OUT)
        - Then this function computes the gradient of f() with respect to BIAS
          and assigns it to grad.
!*/
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
void batch_normalize_inference (
|
||||
const double eps,
|
||||
resizable_tensor& dest,
|
||||
const tensor& src,
|
||||
const tensor& gamma,
|
||||
const tensor& beta,
|
||||
const tensor& running_means,
|
||||
const tensor& running_variances
|
||||
);
|
||||
|
||||
void batch_normalize (
|
||||
const double eps,
|
||||
resizable_tensor& dest,
|
||||
resizable_tensor& means,
|
||||
resizable_tensor& invstds,
|
||||
const double averaging_factor,
|
||||
resizable_tensor& running_means,
|
||||
resizable_tensor& running_variances,
|
||||
const tensor& src,
|
||||
const tensor& gamma,
|
||||
const tensor& beta
|
||||
);
|
||||
|
||||
void batch_normalize_gradient(
|
||||
const double eps,
|
||||
const tensor& gradient_input,
|
||||
const tensor& means,
|
||||
const tensor& invstds,
|
||||
const tensor& src,
|
||||
const tensor& gamma,
|
||||
tensor& src_grad,
|
||||
tensor& gamma_grad,
|
||||
tensor& beta_grad
|
||||
);
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
void batch_normalize_conv_inference (
|
||||
const double eps,
|
||||
resizable_tensor& dest,
|
||||
const tensor& src,
|
||||
const tensor& gamma,
|
||||
const tensor& beta,
|
||||
const tensor& running_means,
|
||||
const tensor& running_variances
|
||||
);
|
||||
|
||||
void batch_normalize_conv (
|
||||
const double eps,
|
||||
resizable_tensor& dest,
|
||||
resizable_tensor& means,
|
||||
resizable_tensor& invstds,
|
||||
const double averaging_factor,
|
||||
resizable_tensor& running_means,
|
||||
resizable_tensor& running_variances,
|
||||
const tensor& src,
|
||||
const tensor& gamma,
|
||||
const tensor& beta
|
||||
);
|
||||
|
||||
void batch_normalize_conv_gradient(
|
||||
const double eps,
|
||||
const tensor& gradient_input,
|
||||
const tensor& means,
|
||||
const tensor& invstds,
|
||||
const tensor& src,
|
||||
const tensor& gamma,
|
||||
tensor& src_grad,
|
||||
tensor& gamma_grad,
|
||||
tensor& beta_grad
|
||||
);
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
class tensor_conv
|
||||
{
|
||||
public:
|
||||
tensor_conv(const tensor_conv&) = delete;
|
||||
tensor_conv& operator=(const tensor_conv&) = delete;
|
||||
|
||||
tensor_conv();
|
||||
|
||||
void clear(
|
||||
);
|
||||
|
||||
~tensor_conv (
|
||||
);
|
||||
|
||||
void operator() (
|
||||
const bool add_to_output,
|
||||
tensor& output,
|
||||
const tensor& data,
|
||||
const tensor& filters
|
||||
);
|
||||
|
||||
void operator() (
|
||||
const bool add_to_output,
|
||||
resizable_tensor& output,
|
||||
const tensor& data,
|
||||
const tensor& filters
|
||||
);
|
||||
|
||||
void get_gradient_for_data (
|
||||
const bool add_to_output,
|
||||
const tensor& gradient_input,
|
||||
const tensor& filters,
|
||||
tensor& data_gradient
|
||||
);
|
||||
|
||||
void get_gradient_for_filters (
|
||||
const bool add_to_output,
|
||||
const tensor& gradient_input,
|
||||
const tensor& data,
|
||||
tensor& filters_gradient
|
||||
);
|
||||
|
||||
void setup(
|
||||
const tensor& data,
|
||||
const tensor& filters,
|
||||
int stride_y,
|
||||
int stride_x,
|
||||
int padding_y,
|
||||
int padding_x
|
||||
);
|
||||
|
||||
private:
|
||||
|
||||
// These variables record the type of data given to the last call to setup().
|
||||
int stride_y;
|
||||
int stride_x;
|
||||
int padding_y;
|
||||
int padding_x;
|
||||
long data_num_samples, data_k, data_nr, data_nc;
|
||||
long filters_num_samples, filters_k, filters_nr, filters_nc;
|
||||
|
||||
|
||||
void* filter_handle;
|
||||
void* conv_handle;
|
||||
|
||||
// dimensions of the output tensor from operator()
|
||||
int out_num_samples;
|
||||
int out_k;
|
||||
int out_nr;
|
||||
int out_nc;
|
||||
|
||||
int forward_algo;
|
||||
int backward_data_algo;
|
||||
int backward_filters_algo;
|
||||
|
||||
size_t forward_workspace_size_in_bytes;
|
||||
size_t backward_data_workspace_size_in_bytes;
|
||||
size_t backward_filters_workspace_size_in_bytes;
|
||||
std::shared_ptr<resizable_cuda_buffer> workspace;
|
||||
cuda_data_void_ptr forward_workspace;
|
||||
cuda_data_void_ptr backward_data_workspace;
|
||||
cuda_data_void_ptr backward_filters_workspace;
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
class pooling
|
||||
{
|
||||
public:
|
||||
|
||||
pooling(const pooling&) = delete;
|
||||
pooling& operator=(const pooling&) = delete;
|
||||
|
||||
pooling (
|
||||
);
|
||||
|
||||
~pooling(
|
||||
);
|
||||
|
||||
void clear(
|
||||
);
|
||||
|
||||
void setup_max_pooling(
|
||||
int window_height,
|
||||
int window_width,
|
||||
int stride_y,
|
||||
int stride_x,
|
||||
int padding_y,
|
||||
int padding_x
|
||||
);
|
||||
|
||||
void setup_avg_pooling(
|
||||
int window_height,
|
||||
int window_width,
|
||||
int stride_y,
|
||||
int stride_x,
|
||||
int padding_y,
|
||||
int padding_x
|
||||
);
|
||||
|
||||
bool does_max_pooling(
|
||||
) const { return do_max_pooling; }
|
||||
|
||||
void operator() (
|
||||
resizable_tensor& dest,
|
||||
const tensor& src
|
||||
);
|
||||
|
||||
void get_gradient(
|
||||
const tensor& gradient_input,
|
||||
const tensor& dest,
|
||||
const tensor& src,
|
||||
tensor& grad
|
||||
);
|
||||
|
||||
private:
|
||||
|
||||
void setup(
|
||||
int window_height,
|
||||
int window_width,
|
||||
int stride_y,
|
||||
int stride_x,
|
||||
int padding_y,
|
||||
int padding_x,
|
||||
int pooling_mode
|
||||
);
|
||||
|
||||
void* handle;
|
||||
int window_height;
|
||||
int window_width;
|
||||
int stride_y;
|
||||
int stride_x;
|
||||
int padding_y;
|
||||
int padding_x;
|
||||
bool do_max_pooling;
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
void softmax (tensor& dest, const tensor& src);
/*!
    requires
        - have_same_dimensions(dest, src) == true
    ensures
        - Note that the softmax function is a vector valued function:
            s(x) == exp(x)/sum(exp(x))
        - Computes the softmax function on src and writes the results to dest.
          The softmax is evaluated separately at every spatial location, across
          the channels at that location.  That is, softmax() outputs a new
          tensor, #dest, where each spatial location in dest (i.e. image idx,
          row idx, and column idx) holds the output of s() evaluated over the
          channel values at that location.
        - This function supports in-place operation, i.e. having
          is_same_object(dest, src)==true
!*/
|
||||
|
||||
void softmax_gradient (tensor& grad, const tensor& dest, const tensor& gradient_input);
/*!
    requires
        - have_same_dimensions(dest,gradient_input) == true
        - have_same_dimensions(dest,grad) == true
        - is_same_object(grad, dest)==false
    ensures
        - We interpret dest as the output of softmax(dest,SRC) for some SRC tensor.
          Then let f(SRC) == dot(gradient_input,dest).  This function computes
          the gradient of f() with respect to SRC and assigns it to grad.
        - This function supports in-place operation, i.e. having
          is_same_object(grad, gradient_input)==true
!*/
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
// Softmax variant: reads src, writes dest.
// NOTE(review): presumably normalizes over all the elements of each sample
// rather than per spatial location - confirm against the implementation.
void softmax_all (tensor& dest, const tensor& src);
|
||||
|
||||
// Backward pass of softmax_all(): grad is the output, dest and gradient_input
// are read only.
void softmax_all_gradient (tensor& grad, const tensor& dest, const tensor& gradient_input);
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
void sigmoid (tensor& dest, const tensor& src);
/*!
    requires
        - have_same_dimensions(dest, src) == true
    ensures
        - for all valid i:
            - #dest.host()[i] == 1/(1+std::exp(-src.host()[i]))
        - This function supports in-place operation, i.e. having
          is_same_object(dest, src)==true
!*/
|
||||
|
||||
void sigmoid_gradient (tensor& grad, const tensor& dest, const tensor& gradient_input);
/*!
    requires
        - have_same_dimensions(dest,gradient_input) == true
        - have_same_dimensions(dest,grad) == true
        - is_same_object(grad,dest) == false
    ensures
        - Recalling that dest is the output of sigmoid(dest,SRC) for some SRC tensor,
          let f(SRC) == dot(gradient_input,dest)
        - Then this function computes the gradient of f() with respect to SRC and
          assigns it to grad.
        - This function supports in-place operation, i.e. having
          is_same_object(grad, gradient_input)==true
!*/
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
void relu (tensor& dest, const tensor& src);
/*!
    requires
        - have_same_dimensions(dest, src) == true
    ensures
        - for all valid i:
            - #dest.host()[i] == std::max(0,src.host()[i])
        - This function supports in-place operation, i.e. having
          is_same_object(dest, src)==true
!*/
|
||||
|
||||
void relu_gradient (
|
||||
tensor& grad,
|
||||
const tensor& dest,
|
||||
const tensor& gradient_input
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- have_same_dimensions(dest,gradient_input) == true
|
||||
- have_same_dimensions(dest,grad) == true
|
||||
- is_same_object(grad,dest) == false
|
||||
ensures
|
||||
- Recalling that dest is the output of relu(dest,SRC) for some SRC tensor,
|
||||
let f(SRC) == dot(gradient_input,dest)
|
||||
- Then this function computes the gradient of f() with respect to SRC and
|
||||
assigns it to grad.
|
||||
- This function supports in-place operation, i.e. having
|
||||
is_same_object(grad, gradient_input)==true
|
||||
!*/
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
void tanh (
|
||||
tensor& dest,
|
||||
const tensor& src
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- have_same_dimensions(dest, src) == true
|
||||
ensures
|
||||
- for all valid i:
|
||||
- #dest.host()[i] == std::tanh(src.host()[i])
|
||||
- This function supports in-place operation, i.e. having
|
||||
is_same_object(dest, src)==true
|
||||
!*/
|
||||
|
||||
void tanh_gradient (
|
||||
tensor& grad,
|
||||
const tensor& dest,
|
||||
const tensor& gradient_input
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- have_same_dimensions(dest,gradient_input) == true
|
||||
- have_same_dimensions(dest,grad) == true
|
||||
- is_same_object(grad,dest) == false
|
||||
ensures
|
||||
- Recalling that dest is the output of tanh(dest,SRC) for some SRC tensor,
|
||||
let f(SRC) == dot(gradient_input,dest)
|
||||
- Then this function computes the gradient of f() with respect to SRC and
|
||||
assigns it to grad.
|
||||
- This function supports in-place operation, i.e. having
|
||||
is_same_object(grad, gradient_input)==true
|
||||
!*/
|
||||
|
||||
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif // DLIB_USE_CUDA
|
||||
|
||||
#endif // DLIB_DNN_CuDNN_H_
|
||||
|
||||
75
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/curand_dlibapi.h
vendored
Normal file
75
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/curand_dlibapi.h
vendored
Normal file
@@ -0,0 +1,75 @@
|
||||
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
#ifndef DLIB_DNN_CuRAND_H_
|
||||
#define DLIB_DNN_CuRAND_H_
|
||||
|
||||
#ifdef DLIB_USE_CUDA
|
||||
|
||||
#include "tensor.h"
|
||||
#include "cuda_errors.h"
|
||||
#include "cuda_data_ptr.h"
|
||||
|
||||
namespace dlib
|
||||
{
|
||||
namespace cuda
|
||||
{
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
class curand_generator
|
||||
{
|
||||
public:
|
||||
// not copyable
|
||||
curand_generator(const curand_generator&) = delete;
|
||||
curand_generator& operator=(const curand_generator&) = delete;
|
||||
|
||||
curand_generator() : curand_generator(0) {}
|
||||
curand_generator(unsigned long long seed);
|
||||
~curand_generator();
|
||||
|
||||
void fill (
|
||||
cuda_data_ptr<unsigned int>& data
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- Fills data with random 32-bit unsigned integers.
|
||||
!*/
|
||||
|
||||
void fill_gaussian (
|
||||
tensor& data,
|
||||
float mean = 0,
|
||||
float stddev = 1
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- data.size()%2 == 0
|
||||
- stddev >= 0
|
||||
ensures
|
||||
- Fills data with random numbers drawn from a Gaussian distribution
|
||||
with the given mean and standard deviation.
|
||||
!*/
|
||||
|
||||
void fill_uniform (
|
||||
tensor& data
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- Fills data with uniform random numbers in the range (0.0, 1.0].
|
||||
!*/
|
||||
|
||||
private:
|
||||
|
||||
void* handle;
|
||||
};
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif // DLIB_USE_CUDA
|
||||
|
||||
#endif // DLIB_DNN_CuRAND_H_
|
||||
|
||||
|
||||
|
||||
75
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cusolver_dlibapi.h
vendored
Normal file
75
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/cusolver_dlibapi.h
vendored
Normal file
@@ -0,0 +1,75 @@
|
||||
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
#ifndef DLIB_DNN_CuSOLVER_H_
|
||||
#define DLIB_DNN_CuSOLVER_H_
|
||||
|
||||
#ifdef DLIB_USE_CUDA
|
||||
|
||||
#include "tensor.h"
|
||||
#include "cuda_errors.h"
|
||||
#include "cuda_data_ptr.h"
|
||||
#include "../noncopyable.h"
|
||||
|
||||
namespace dlib
|
||||
{
|
||||
namespace cuda
|
||||
{
|
||||
|
||||
// -----------------------------------------------------------------------------------
|
||||
|
||||
class inv : noncopyable
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This is a functor for doing matrix inversion on the GPU. The only
|
||||
reason it's an object is to avoid the reallocation of some GPU memory
|
||||
blocks if you want to do a bunch of matrix inversions in a row.
|
||||
!*/
|
||||
|
||||
public:
|
||||
|
||||
inv() = default;
|
||||
~inv();
|
||||
|
||||
void operator() (
|
||||
const tensor& m,
|
||||
resizable_tensor& out
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- m.size() == m.num_samples()*m.num_samples()
|
||||
(i.e. mat(m) must be a square matrix)
|
||||
ensures
|
||||
- out == inv(mat(m));
|
||||
!*/
|
||||
|
||||
int get_last_status(
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- returns 0 if the last matrix inversion was successful and != 0
|
||||
otherwise.
|
||||
!*/
|
||||
|
||||
private:
|
||||
|
||||
void sync_if_needed();
|
||||
|
||||
bool did_work_lately = false;
|
||||
resizable_tensor m;
|
||||
cuda_data_ptr<float> workspace;
|
||||
cuda_data_ptr<int> Ipiv;
|
||||
cuda_data_ptr<int> info;
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------------------------------
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif // DLIB_USE_CUDA
|
||||
|
||||
#endif // DLIB_DNN_CuSOLVER_H_
|
||||
|
||||
|
||||
|
||||
266
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/gpu_data.h
vendored
Normal file
266
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/gpu_data.h
vendored
Normal file
@@ -0,0 +1,266 @@
|
||||
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
#ifndef DLIB_GPU_DaTA_H_
|
||||
#define DLIB_GPU_DaTA_H_
|
||||
|
||||
#include "gpu_data_abstract.h"
|
||||
#include <memory>
|
||||
#include <cstring>
|
||||
#include "cuda_errors.h"
|
||||
#include "../serialize.h"
|
||||
|
||||
namespace dlib
|
||||
{
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
class gpu_data
|
||||
{
|
||||
/*!
|
||||
CONVENTION
|
||||
- if (size() != 0) then
|
||||
- data_host == a pointer to size() floats in CPU memory.
|
||||
- if (data_device) then
|
||||
- data_device == a pointer to size() floats in device memory.
|
||||
|
||||
- if (there might be an active async transfer from host to device) then
|
||||
- have_active_transfer == true
|
||||
|
||||
- We use the host_current and device_current bools to keep track of which
|
||||
copy of the data (or both) are most current. e.g. if the CPU has
|
||||
modified the data and it hasn't been copied to the device yet then
|
||||
host_current==true and device_current==false.
|
||||
|
||||
Similarly, we use device_in_use==true to indicate that device() has been
|
||||
called and no operation to wait for all CUDA kernel completion has been
|
||||
executed. So if device_in_use==true then there might be a CUDA kernel
|
||||
executing that is using the device memory block contained in this object.
|
||||
|
||||
!*/
|
||||
public:
|
||||
|
||||
gpu_data(
|
||||
) : data_size(0), host_current(true), device_current(true),have_active_transfer(false),device_in_use(false), the_device_id(0)
|
||||
{
|
||||
}
|
||||
|
||||
// Not copyable
|
||||
gpu_data(const gpu_data&) = delete;
|
||||
gpu_data& operator=(const gpu_data&) = delete;
|
||||
|
||||
// but is movable
|
||||
gpu_data(gpu_data&& item) : gpu_data() { swap(item); }
|
||||
gpu_data& operator=(gpu_data&& item) { swap(item); return *this; }
|
||||
|
||||
int device_id() const { return the_device_id; }
|
||||
|
||||
#ifdef DLIB_USE_CUDA
|
||||
void async_copy_to_device() const;
|
||||
void set_size(size_t new_size);
|
||||
#else
|
||||
// Note that calls to host() or device() will block until any async transfers are complete.
|
||||
void async_copy_to_device() const{}
|
||||
|
||||
void set_size(size_t new_size)
|
||||
{
|
||||
if (new_size == 0)
|
||||
{
|
||||
data_size = 0;
|
||||
host_current = true;
|
||||
device_current = true;
|
||||
device_in_use = false;
|
||||
data_host.reset();
|
||||
data_device.reset();
|
||||
}
|
||||
else if (new_size != data_size)
|
||||
{
|
||||
data_size = new_size;
|
||||
host_current = true;
|
||||
device_current = true;
|
||||
device_in_use = false;
|
||||
data_host.reset(new float[new_size], std::default_delete<float[]>());
|
||||
data_device.reset();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
const float* host() const
|
||||
{
|
||||
copy_to_host();
|
||||
return data_host.get();
|
||||
}
|
||||
|
||||
float* host()
|
||||
{
|
||||
copy_to_host();
|
||||
device_current = false;
|
||||
return data_host.get();
|
||||
}
|
||||
|
||||
float* host_write_only()
|
||||
{
|
||||
host_current = true;
|
||||
device_current = false;
|
||||
return data_host.get();
|
||||
}
|
||||
|
||||
const float* device() const
|
||||
{
|
||||
#ifndef DLIB_USE_CUDA
|
||||
DLIB_CASSERT(false, "CUDA NOT ENABLED");
|
||||
#endif
|
||||
copy_to_device();
|
||||
device_in_use = true;
|
||||
return data_device.get();
|
||||
}
|
||||
|
||||
float* device()
|
||||
{
|
||||
#ifndef DLIB_USE_CUDA
|
||||
DLIB_CASSERT(false, "CUDA NOT ENABLED");
|
||||
#endif
|
||||
copy_to_device();
|
||||
host_current = false;
|
||||
device_in_use = true;
|
||||
return data_device.get();
|
||||
}
|
||||
|
||||
float* device_write_only()
|
||||
{
|
||||
#ifndef DLIB_USE_CUDA
|
||||
DLIB_CASSERT(false, "CUDA NOT ENABLED");
|
||||
#endif
|
||||
wait_for_transfer_to_finish();
|
||||
host_current = false;
|
||||
device_current = true;
|
||||
device_in_use = true;
|
||||
return data_device.get();
|
||||
}
|
||||
|
||||
bool host_ready (
|
||||
) const { return host_current; }
|
||||
|
||||
bool device_ready (
|
||||
) const { return device_current && !have_active_transfer; }
|
||||
|
||||
size_t size() const { return data_size; }
|
||||
|
||||
void swap (gpu_data& item)
/*!
    ensures
        - swaps the complete state of *this and item: size, currency flags,
          transfer/in-use flags, host and device memory blocks, stream and
          device id.
!*/
{
    std::swap(data_size, item.data_size);
    std::swap(host_current, item.host_current);
    std::swap(device_current, item.device_current);
    std::swap(have_active_transfer, item.have_active_transfer);
    // device_in_use records that device() has handed out the device block and
    // no wait for kernel completion has happened since (see the CONVENTION
    // comment of this class).  That is a property of the memory block, so it
    // has to move together with data_device; the original swap left it behind.
    std::swap(device_in_use, item.device_in_use);
    std::swap(data_host, item.data_host);
    std::swap(data_device, item.data_device);
    std::swap(cuda_stream, item.cuda_stream);
    std::swap(the_device_id, item.the_device_id);
}
|
||||
|
||||
private:
|
||||
|
||||
#ifdef DLIB_USE_CUDA
|
||||
void copy_to_device() const;
|
||||
void copy_to_host() const;
|
||||
void wait_for_transfer_to_finish() const;
|
||||
#else
|
||||
void copy_to_device() const{}
|
||||
void copy_to_host() const{}
|
||||
void wait_for_transfer_to_finish() const{}
|
||||
#endif
|
||||
|
||||
|
||||
size_t data_size;
|
||||
mutable bool host_current;
|
||||
mutable bool device_current;
|
||||
mutable bool have_active_transfer;
|
||||
mutable bool device_in_use;
|
||||
|
||||
std::shared_ptr<float> data_host;
|
||||
std::shared_ptr<float> data_device;
|
||||
std::shared_ptr<void> cuda_stream;
|
||||
int the_device_id;
|
||||
};
|
||||
|
||||
inline void serialize(const gpu_data& item, std::ostream& out)
|
||||
{
|
||||
int version = 1;
|
||||
serialize(version, out);
|
||||
serialize(item.size(), out);
|
||||
auto data = item.host();
|
||||
for (size_t i = 0; i < item.size(); ++i)
|
||||
serialize(data[i], out);
|
||||
}
|
||||
|
||||
// Reads back what serialize() wrote: a format version (which must be 1), the
// number of floats, and then that many floats into item's host side data.
// Throws serialization_error if the version is not 1.
inline void deserialize(gpu_data& item, std::istream& in)
{
    int version;
    deserialize(version, in);
    if (version != 1)
        throw serialization_error("Unexpected version found while deserializing dlib::gpu_data.");

    size_t num_floats;
    deserialize(num_floats, in);
    item.set_size(num_floats);

    // Fill the host block element by element.
    float* cur = item.host();
    float* const stop = cur + item.size();
    for (; cur != stop; ++cur)
        deserialize(*cur, in);
}
|
||||
|
||||
#ifdef DLIB_USE_CUDA
|
||||
void memcpy (gpu_data& dest, const gpu_data& src);
|
||||
|
||||
void memcpy (
|
||||
gpu_data& dest,
|
||||
size_t dest_offset,
|
||||
const gpu_data& src,
|
||||
size_t src_offset,
|
||||
size_t num
|
||||
);
|
||||
|
||||
#else
|
||||
|
||||
// Host-only (non-CUDA) whole-object copy.  Requires dest.size() == src.size().
// Copying an empty object, or an object onto itself, is a no-op.
inline void memcpy (gpu_data& dest, const gpu_data& src)
{
    DLIB_CASSERT(dest.size() == src.size());

    const size_t count = src.size();
    const bool self_copy = (&dest == &src);
    if (count != 0 && !self_copy)
    {
        // Every element of dest is overwritten, so host_write_only() is enough.
        std::memcpy(dest.host_write_only(), src.host(), sizeof(float)*count);
    }
}
|
||||
|
||||
// Host-only (non-CUDA) ranged copy of num floats from
// [src.host()+src_offset, ...) to [dest.host()+dest_offset, ...).
//
// requires
//     - dest_offset + num <= dest.size()
//     - src_offset + num <= src.size()
// The ranges may belong to the same object and may overlap.
inline void memcpy (
    gpu_data& dest,
    size_t dest_offset,
    const gpu_data& src,
    size_t src_offset,
    size_t num
)
{
    // Bounds checks written so they cannot wrap around size_t.  The form
    // "offset + num <= size" passes for e.g. offset == SIZE_MAX, num == 1.
    DLIB_CASSERT(num <= dest.size() && dest_offset <= dest.size() - num);
    DLIB_CASSERT(num <= src.size() && src_offset <= src.size() - num);
    if (num == 0)
        return;

    const size_t num_bytes = sizeof(float)*num;

    if (&dest == &src)
    {
        // if they perfectly alias each other then there is nothing to do
        if (dest_offset == src_offset)
            return;

        // Two length-num ranges overlap when each one starts before the other
        // ends.  The sums cannot overflow because of the checks above.
        const bool ranges_overlap = dest_offset < src_offset + num &&
                                    src_offset < dest_offset + num;
        if (ranges_overlap)
        {
            std::memmove(dest.host()+dest_offset, src.host()+src_offset, num_bytes);
            return;
        }
    }

    // if we write to the entire thing then we can use host_write_only()
    if (dest_offset == 0 && num == dest.size())
        std::memcpy(dest.host_write_only(), src.host()+src_offset, num_bytes);
    else
        std::memcpy(dest.host()+dest_offset, src.host()+src_offset, num_bytes);
}
|
||||
#endif
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
}
|
||||
|
||||
#endif // DLIB_GPU_DaTA_H_
|
||||
|
||||
266
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/gpu_data_abstract.h
vendored
Normal file
266
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/gpu_data_abstract.h
vendored
Normal file
@@ -0,0 +1,266 @@
|
||||
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
#undef DLIB_GPU_DaTA_ABSTRACT_H_
|
||||
#ifdef DLIB_GPU_DaTA_ABSTRACT_H_
|
||||
|
||||
#include "cuda_errors.h"
|
||||
#include "../serialize.h"
|
||||
|
||||
namespace dlib
|
||||
{
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
class gpu_data
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This object is a block of size() floats, all stored contiguously in memory.
|
||||
Importantly, it keeps two copies of the floats, one on the host CPU side
|
||||
and another on the GPU device side. It automatically performs the necessary
|
||||
host/device transfers to keep these two copies of the data in sync.
|
||||
|
||||
All transfers to the device happen asynchronously with respect to the
|
||||
default CUDA stream so that CUDA kernel computations can overlap with data
|
||||
transfers. However, any transfers from the device to the host happen
|
||||
synchronously in the default CUDA stream. Therefore, you should perform
|
||||
all your CUDA kernel launches on the default stream so that transfers back
|
||||
to the host do not happen before the relevant computations have completed.
|
||||
|
||||
If DLIB_USE_CUDA is not #defined then this object will not use CUDA at all.
|
||||
Instead, it will simply store one host side memory block of floats.
|
||||
|
||||
THREAD SAFETY
|
||||
Instances of this object are not thread-safe. So don't touch one from
|
||||
multiple threads at the same time.
|
||||
!*/
|
||||
public:
|
||||
|
||||
gpu_data(
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- #size() == 0
|
||||
- #host() == nullptr
|
||||
- #device() == nullptr
|
||||
- #host_ready() == true
|
||||
- #device_ready() == true
|
||||
- #device_id() == 0
|
||||
!*/
|
||||
|
||||
// This object is not copyable, however, it is movable.
|
||||
gpu_data(const gpu_data&) = delete;
|
||||
gpu_data& operator=(const gpu_data&) = delete;
|
||||
gpu_data(gpu_data&& item);
|
||||
gpu_data& operator=(gpu_data&& item);
|
||||
|
||||
int device_id(
|
||||
) const;
|
||||
/*!
|
||||
ensures
|
||||
- returns the ID of the CUDA device that allocated this memory. I.e. the
|
||||
number returned by cudaGetDevice() when the memory was allocated.
|
||||
- If CUDA is not being used then this function always returns 0.
|
||||
!*/
|
||||
|
||||
void async_copy_to_device(
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- if (!device_ready()) then
|
||||
- Begins asynchronously copying host data to the device once it is safe
|
||||
to do so. I.e. This function will wait until any previously
|
||||
scheduled CUDA kernels, which are using the device() memory block,
|
||||
have completed before transferring the new data to the device.
|
||||
- A call to device() that happens before the transfer completes will
|
||||
block until the transfer is complete. That is, it is safe to call
|
||||
async_copy_to_device() and then immediately call device().
|
||||
!*/
|
||||
|
||||
void set_size(
|
||||
size_t new_size
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- #size() == new_size
|
||||
!*/
|
||||
|
||||
bool host_ready (
|
||||
) const;
|
||||
/*!
|
||||
ensures
|
||||
- returns true if and only if the host's copy of the data is current. The
|
||||
host's data is current if there aren't any modifications to the data
|
||||
which were made on the device side that have yet to be copied to the
|
||||
host.
|
||||
!*/
|
||||
|
||||
bool device_ready (
|
||||
) const;
|
||||
/*!
|
||||
ensures
|
||||
- returns true if and only if the device's copy of the data is current.
|
||||
The device's data is current if there aren't any modifications to the
|
||||
data which were made on the host side that have yet to be copied to the
|
||||
device.
|
||||
!*/
|
||||
|
||||
const float* host(
|
||||
) const;
|
||||
/*!
|
||||
ensures
|
||||
- returns a pointer to the host memory block of size() contiguous float
|
||||
values or nullptr if size()==0.
|
||||
- if (!host_ready()) then
|
||||
- copies the data from the device to the host, while this is happening
|
||||
the call to host() blocks.
|
||||
- #host_ready() == true
|
||||
!*/
|
||||
|
||||
float* host(
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- returns a pointer to the host memory block of size() contiguous float
|
||||
values or nullptr if size()==0.
|
||||
- if (!host_ready()) then
|
||||
- copies the data from the device to the host, while this is happening
|
||||
the call to host() blocks.
|
||||
- #host_ready() == true
|
||||
- #device_ready() == false
|
||||
I.e. Marks the device side data as out of date so that the next call to
|
||||
device() will perform a host to device transfer. If you want to begin
|
||||
the transfer immediately then you can call async_copy_to_device() after
|
||||
calling host().
|
||||
!*/
|
||||
|
||||
float* host_write_only(
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- This function returns the same pointer as host(), except that it never
|
||||
performs a device to host memory copy. Instead, it immediately marks the
|
||||
device side data as out of date, effectively discarding it. Therefore,
|
||||
the values in the data pointed to by host_write_only() are undefined and
|
||||
you should only call host_write_only() if you are going to assign to
|
||||
every memory location in the returned memory block.
|
||||
- #host_ready() == true
|
||||
- #device_ready() == false
|
||||
!*/
|
||||
|
||||
const float* device(
|
||||
) const;
|
||||
/*!
|
||||
requires
|
||||
- DLIB_USE_CUDA is #defined
|
||||
ensures
|
||||
- returns a pointer to the device memory block of size() contiguous float
|
||||
values or nullptr if size()==0.
|
||||
- if (!device_ready()) then
|
||||
- copies the data from the host to the device, while this is happening
|
||||
the call to device() blocks.
|
||||
- #device_ready() == true
|
||||
!*/
|
||||
|
||||
float* device(
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- DLIB_USE_CUDA is #defined
|
||||
ensures
|
||||
- returns a pointer to the device memory block of size() contiguous float
|
||||
values or nullptr if size()==0.
|
||||
- if (!device_ready()) then
|
||||
- copies the data from the host to the device, while this is happening
|
||||
the call to device() blocks.
|
||||
- #host_ready() == false
|
||||
- #device_ready() == true
|
||||
!*/
|
||||
|
||||
float* device_write_only(
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- DLIB_USE_CUDA is #defined
|
||||
ensures
|
||||
- This function returns the same pointer as device(), except that it never
|
||||
performs a host to device memory copy. Instead, it immediately marks the
|
||||
host side data as out of date, effectively discarding it. Therefore, the
|
||||
values in the data pointed to by device_write_only() are undefined and
|
||||
you should only call device_write_only() if you are going to assign to
|
||||
every memory location in the returned memory block.
|
||||
- #host_ready() == false
|
||||
- #device_ready() == true
|
||||
!*/
|
||||
|
||||
|
||||
size_t size(
|
||||
) const;
|
||||
/*!
|
||||
ensures
|
||||
- returns the number of floats contained in this object.
|
||||
!*/
|
||||
|
||||
void swap (
|
||||
gpu_data& item
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- swaps the state of *this and item
|
||||
!*/
|
||||
|
||||
};
|
||||
|
||||
void serialize(const gpu_data& item, std::ostream& out);
|
||||
void deserialize(gpu_data& item, std::istream& in);
|
||||
/*!
|
||||
provides serialization support
|
||||
!*/
|
||||
|
||||
void memcpy (
|
||||
gpu_data& dest,
|
||||
const gpu_data& src
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- dest.size() == src.size()
|
||||
ensures
|
||||
- Copies the data in src to dest. If the device data is current (i.e.
|
||||
device_ready()==true) on both src and dest then the copy will happen entirely
|
||||
on the device side.
|
||||
- It doesn't matter what GPU device is selected by cudaSetDevice(). You can
|
||||
always copy gpu_data objects to and from each other regardless.
|
||||
- This function blocks until the copy has completed.
|
||||
!*/
|
||||
|
||||
void memcpy (
|
||||
gpu_data& dest,
|
||||
size_t dest_offset,
|
||||
const gpu_data& src,
|
||||
size_t src_offset,
|
||||
size_t num
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- dest_offset + num <= dest.size()
|
||||
- src_offset + num <= src.size()
|
||||
ensures
|
||||
- Copies the data in src to dest, but only copies data in the range
|
||||
[src.host()+src_offset, src.host()+src_offset+num) to
|
||||
[dest.host()+dest_offset, dest.host()+dest_offset+num). Therefore, it is
|
||||
just like the above memcpy() except that you can specify some subset of data
|
||||
in a gpu_data object to be copied.
|
||||
- Like the above version of memcpy(), the copy will happen in the most
|
||||
efficient way, automatically using the appropriate type of host/device
|
||||
transfers based on where data is currently resident.
|
||||
- It doesn't matter what GPU device is selected by cudaSetDevice(). You can
|
||||
always copy gpu_data objects to and from each other regardless.
|
||||
- This function blocks until the copy has completed.
|
||||
!*/
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
}
|
||||
|
||||
#endif // DLIB_GPU_DaTA_ABSTRACT_H_
|
||||
|
||||
686
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/tensor.h
vendored
Normal file
686
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/tensor.h
vendored
Normal file
@@ -0,0 +1,686 @@
|
||||
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
#ifndef DLIB_DNn_TENSOR_H_
|
||||
#define DLIB_DNn_TENSOR_H_
|
||||
|
||||
#include "tensor_abstract.h"
|
||||
#include <cstring>
|
||||
#include "../matrix.h"
|
||||
#include "cudnn_dlibapi.h"
|
||||
#include "gpu_data.h"
|
||||
#include "../byte_orderer.h"
|
||||
#include <memory>
|
||||
#include "../any.h"
|
||||
|
||||
namespace dlib
|
||||
{
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
class tensor;
|
||||
namespace cuda
|
||||
{
|
||||
void set_tensor (
|
||||
tensor& t,
|
||||
float value
|
||||
);
|
||||
|
||||
void scale_tensor (
|
||||
tensor& t,
|
||||
float value
|
||||
);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
class tensor
|
||||
{
|
||||
public:
|
||||
|
||||
tensor (
|
||||
) :
|
||||
m_n(0), m_k(0), m_nr(0), m_nc(0), m_size(0)
|
||||
{
|
||||
}
|
||||
|
||||
virtual ~tensor() {}
|
||||
|
||||
long long num_samples() const { return m_n; }
|
||||
long long k() const { return m_k; }
|
||||
long long nr() const { return m_nr; }
|
||||
long long nc() const { return m_nc; }
|
||||
size_t size() const { return m_size; }
|
||||
|
||||
typedef float* iterator;
|
||||
typedef const float* const_iterator;
|
||||
iterator begin() { return host(); }
|
||||
const_iterator begin() const { return host(); }
|
||||
iterator end() { return host()+size(); }
|
||||
const_iterator end() const { return host()+size(); }
|
||||
|
||||
void async_copy_to_device() const
|
||||
{
|
||||
data().async_copy_to_device();
|
||||
}
|
||||
|
||||
virtual const float* host() const = 0;
|
||||
virtual float* host() = 0;
|
||||
virtual float* host_write_only() = 0;
|
||||
virtual const float* device() const = 0;
|
||||
virtual float* device() = 0;
|
||||
virtual float* device_write_only() = 0;
|
||||
|
||||
virtual const any& annotation() const = 0;
|
||||
virtual any& annotation() = 0;
|
||||
|
||||
int device_id() const { return data().device_id(); }
|
||||
|
||||
// Sets every element of this tensor to val and returns *this.
tensor& operator= (float val)
{
#ifdef DLIB_USE_CUDA
    // With CUDA enabled tensors are presumably used mostly on the GPU.  So
    // unless the host side copy appears to be the one in active use, do the
    // fill on the device; that avoids a host to device transfer that would
    // likely follow right away.
    if (data().device_ready())
    {
        cuda::set_tensor(*this, val);
        return *this;
    }
#endif
    // Host side fill.  Every element is overwritten, so the write-only
    // accessor is sufficient.
    float* out = host_write_only();
    float* const stop = out + size();
    while (out != stop)
    {
        *out = val;
        ++out;
    }

    return *this;
}
|
||||
|
||||
tensor& operator*= (float val)
|
||||
{
|
||||
#ifdef DLIB_USE_CUDA
|
||||
cuda::scale_tensor(*this, val);
|
||||
return *this;
|
||||
#else
|
||||
for (auto& d : *this)
|
||||
d *= val;
|
||||
|
||||
return *this;
|
||||
#endif
|
||||
}
|
||||
|
||||
tensor& operator/= (float val)
|
||||
{
|
||||
*this *= 1.0/val;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename EXP>
|
||||
tensor& operator= (const matrix_exp<EXP>& item)
|
||||
{
|
||||
DLIB_CASSERT(num_samples() == item.nr() &&
|
||||
nr()*nc()*k() == item.nc());
|
||||
static_assert((is_same_type<float, typename EXP::type>::value == true),
|
||||
"To assign a matrix to a tensor the matrix must contain float values");
|
||||
|
||||
set_ptrm(host_write_only(), m_n, m_nr*m_nc*m_k) = item;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename EXP>
|
||||
tensor& operator+= (const matrix_exp<EXP>& item)
|
||||
{
|
||||
DLIB_CASSERT(num_samples() == item.nr() &&
|
||||
nr()*nc()*k() == item.nc());
|
||||
static_assert((is_same_type<float, typename EXP::type>::value == true),
|
||||
"To assign a matrix to a tensor the matrix must contain float values");
|
||||
set_ptrm(host(), m_n, m_nr*m_nc*m_k) += item;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename EXP>
|
||||
tensor& operator-= (const matrix_exp<EXP>& item)
|
||||
{
|
||||
DLIB_CASSERT(num_samples() == item.nr() &&
|
||||
nr()*nc()*k() == item.nc());
|
||||
static_assert((is_same_type<float, typename EXP::type>::value == true),
|
||||
"To assign a matrix to a tensor the matrix must contain float values");
|
||||
set_ptrm(host(), m_n, m_nr*m_nc*m_k) -= item;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename EXP>
|
||||
void set_sample (
|
||||
unsigned long long idx,
|
||||
const matrix_exp<EXP>& item
|
||||
)
|
||||
{
|
||||
DLIB_CASSERT(idx < (unsigned long long)num_samples());
|
||||
DLIB_CASSERT(item.size() == nr()*nc()*k());
|
||||
static_assert((is_same_type<float, typename EXP::type>::value == true),
|
||||
"To assign a matrix to a tensor the matrix must contain float values");
|
||||
set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) = item;
|
||||
}
|
||||
|
||||
|
||||
template <typename EXP>
|
||||
void add_to_sample (
|
||||
unsigned long long idx,
|
||||
const matrix_exp<EXP>& item
|
||||
)
|
||||
{
|
||||
DLIB_CASSERT(idx < (unsigned long long)num_samples());
|
||||
DLIB_CASSERT(item.size() == nr()*nc()*k());
|
||||
static_assert((is_same_type<float, typename EXP::type>::value == true),
|
||||
"To assign a matrix to a tensor the matrix must contain float values");
|
||||
set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) += item;
|
||||
}
|
||||
|
||||
|
||||
#ifdef DLIB_USE_CUDA
|
||||
virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor (
|
||||
) const = 0;
|
||||
#endif
|
||||
|
||||
friend void memcpy (
|
||||
tensor& dest,
|
||||
const tensor& src
|
||||
)
|
||||
{
|
||||
DLIB_CASSERT(dest.size() == src.size());
|
||||
memcpy(dest.data(), dest.get_alias_offset(),
|
||||
src.data(), src.get_alias_offset(),
|
||||
src.size());
|
||||
}
|
||||
|
||||
|
||||
protected:
|
||||
|
||||
friend class alias_tensor;
|
||||
|
||||
virtual gpu_data& data() = 0;
|
||||
virtual const gpu_data& data() const = 0;
|
||||
virtual size_t get_alias_offset() const { return 0; } // needed by alias_tensor.
|
||||
|
||||
long long m_n;
|
||||
long long m_k;
|
||||
long long m_nr;
|
||||
long long m_nc;
|
||||
long long m_size; // always equal to m_n*m_k*m_nr*m_nc
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
inline bool is_vector (
|
||||
const tensor& t
|
||||
)
|
||||
{
|
||||
return t.size() == (size_t)t.num_samples() ||
|
||||
t.size() == (size_t)t.k() ||
|
||||
t.size() == (size_t)t.nr() ||
|
||||
t.size() == (size_t)t.nc();
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
// Returns a matrix expression with nr rows and nc columns that views t's host
// data in place.  nr and nc must be non-negative and nr*nc must equal
// t.size(); both conditions are checked with DLIB_ASSERT.
inline const matrix_op<op_pointer_to_mat<float> > mat (
    const tensor& t,
    long long nr,
    long long nc
)
{
    DLIB_ASSERT(nr >= 0 && nc >= 0 ,
        "\tconst matrix_exp mat(tensor, nr, nc)"
        << "\n\t nr and nc must be >= 0"
        << "\n\t nr: " << nr
        << "\n\t nc: " << nc
        );
    DLIB_ASSERT(nr*nc == (long long)t.size() ,
        "\tconst matrix_exp mat(tensor, nr, nc)"
        << "\n\t The sizes don't match up."
        << "\n\t nr*nc: " << nr*nc
        << "\n\t t.size(): " << t.size()
        );

    using view_op = op_pointer_to_mat<float>;
    const view_op view(t.host(),nr,nc);
    return matrix_op<view_op>(view);
}
|
||||
|
||||
// Views t as a matrix with t.num_samples() rows and
// t.size()/t.num_samples() columns.  An empty tensor yields a 0x0 matrix
// over a null pointer.
inline const matrix_op<op_pointer_to_mat<float> > mat (
    const tensor& t
)
{
    // Handle the empty case first so the division below never sees a tensor
    // without elements.
    if (t.size() == 0)
        return mat((float*)0,0,0);

    return mat(t, t.num_samples(), t.size()/t.num_samples());
}
|
||||
|
||||
// Returns a t.nr() by t.nc() matrix expression that refers to plane k of the
// given sample inside t's host memory.  DLIB_ASSERT checks that sample and k
// are in range and that t is not empty.
inline const matrix_op<op_pointer_to_mat<float> > image_plane (
    const tensor& t,
    long long sample = 0,
    long long k = 0
)
{
    DLIB_ASSERT(0 <= sample && sample < t.num_samples() &&
                0 <= k && k < t.k() &&
                t.size() != 0,
                "\tconst matrix_exp image_plane(tensor,sample,k)"
                << "\n\t Invalid arguments were given to this function."
                << "\n\t sample: " << sample
                << "\n\t k: " << k
                << "\n\t t.num_samples(): " << t.num_samples()
                << "\n\t t.k(): " << t.k()
                << "\n\t t.size(): " << t.size()
    );

    // Number of floats that precede the requested plane in the host block.
    const long long plane_start = ((sample*t.k() + k)*t.nr())*t.nc();
    const float* const first = t.host() + plane_start;
    return matrix_op<op_pointer_to_mat<float> >(
        op_pointer_to_mat<float>(first, t.nr(), t.nc()));
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
inline bool have_same_dimensions (
|
||||
const tensor& a,
|
||||
const tensor& b
|
||||
)
|
||||
{
|
||||
return a.num_samples() == b.num_samples() &&
|
||||
a.k() == b.k() &&
|
||||
a.nr() == b.nr() &&
|
||||
a.nc() == b.nc();
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
class resizable_tensor : public tensor
|
||||
{
|
||||
public:
|
||||
resizable_tensor(
|
||||
)
|
||||
{}
|
||||
|
||||
template <typename EXP>
|
||||
resizable_tensor(
|
||||
const matrix_exp<EXP>& item
|
||||
)
|
||||
{
|
||||
set_size(item.nr(), item.nc());
|
||||
*this = item;
|
||||
}
|
||||
|
||||
explicit resizable_tensor(
|
||||
long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
|
||||
)
|
||||
{
|
||||
DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0);
|
||||
|
||||
set_size(n_,k_,nr_,nc_);
|
||||
}
|
||||
|
||||
resizable_tensor(const resizable_tensor& item) : _annotation(item.annotation())
|
||||
{
|
||||
copy_size(item);
|
||||
memcpy(*this, item);
|
||||
}
|
||||
resizable_tensor(const tensor& item) : _annotation(item.annotation())
|
||||
{
|
||||
copy_size(item);
|
||||
memcpy(*this, item);
|
||||
}
|
||||
|
||||
resizable_tensor(resizable_tensor&& item) { swap(item); }
|
||||
resizable_tensor& operator=(resizable_tensor&& item) { swap(item); return *this; }
|
||||
|
||||
virtual const float* host() const { return data_instance.host(); }
|
||||
virtual float* host() { return data_instance.host(); }
|
||||
virtual float* host_write_only() { return data_instance.host_write_only(); }
|
||||
virtual const float* device() const { return data_instance.device(); }
|
||||
virtual float* device() { return data_instance.device(); }
|
||||
virtual float* device_write_only() { return data_instance.device_write_only(); }
|
||||
|
||||
virtual const any& annotation() const { return _annotation; }
|
||||
virtual any& annotation() { return _annotation; }
|
||||
|
||||
void clear(
|
||||
)
|
||||
{
|
||||
set_size(0,0,0,0);
|
||||
_annotation.clear();
|
||||
// free underlying memory
|
||||
data_instance.set_size(0);
|
||||
}
|
||||
|
||||
void copy_size (
|
||||
const tensor& item
|
||||
)
|
||||
{
|
||||
set_size(item.num_samples(), item.k(), item.nr(), item.nc());
|
||||
}
|
||||
|
||||
resizable_tensor& operator= (float val)
|
||||
{
|
||||
tensor::operator=(val);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename EXP>
|
||||
resizable_tensor& operator= (
|
||||
const matrix_exp<EXP>& item
|
||||
)
|
||||
{
|
||||
if (!(num_samples() == item.nr() && k()*nr()*nc() == item.nc()))
|
||||
set_size(item.nr(), item.nc());
|
||||
tensor::operator=(item);
|
||||
return *this;
|
||||
}
|
||||
|
||||
void set_size(
|
||||
long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
|
||||
)
|
||||
{
|
||||
DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0);
|
||||
|
||||
m_n = n_;
|
||||
m_k = k_;
|
||||
m_nr = nr_;
|
||||
m_nc = nc_;
|
||||
m_size = n_*k_*nr_*nc_;
|
||||
if ((long long)data_instance.size() < m_size)
|
||||
data_instance.set_size(m_size);
|
||||
#ifdef DLIB_USE_CUDA
|
||||
cudnn_descriptor.set_size(m_n,m_k,m_nr,m_nc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
resizable_tensor& operator= (const resizable_tensor& item)
|
||||
{
|
||||
resizable_tensor temp(item);
|
||||
temp.swap(*this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
resizable_tensor& operator= (const tensor& item)
|
||||
{
|
||||
resizable_tensor temp(item);
|
||||
temp.swap(*this);
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
void swap(resizable_tensor& item)
|
||||
{
|
||||
std::swap(m_n, item.m_n);
|
||||
std::swap(m_k, item.m_k);
|
||||
std::swap(m_nr, item.m_nr);
|
||||
std::swap(m_nc, item.m_nc);
|
||||
std::swap(m_size, item.m_size);
|
||||
std::swap(data_instance, item.data_instance);
|
||||
std::swap(_annotation, item._annotation);
|
||||
#ifdef DLIB_USE_CUDA
|
||||
std::swap(cudnn_descriptor, item.cudnn_descriptor);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef DLIB_USE_CUDA
|
||||
virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor (
|
||||
) const { return cudnn_descriptor; }
|
||||
#endif
|
||||
|
||||
private:
|
||||
|
||||
#ifdef DLIB_USE_CUDA
|
||||
cuda::tensor_descriptor cudnn_descriptor;
|
||||
#endif
|
||||
|
||||
gpu_data data_instance;
|
||||
any _annotation;
|
||||
virtual gpu_data& data() { return data_instance; }
|
||||
virtual const gpu_data& data() const { return data_instance; }
|
||||
};
|
||||
|
||||
inline void serialize(const tensor& item, std::ostream& out)
|
||||
{
|
||||
int version = 2;
|
||||
serialize(version, out);
|
||||
serialize(item.num_samples(), out);
|
||||
serialize(item.k(), out);
|
||||
serialize(item.nr(), out);
|
||||
serialize(item.nc(), out);
|
||||
byte_orderer bo;
|
||||
auto sbuf = out.rdbuf();
|
||||
for (auto d : item)
|
||||
{
|
||||
// Write out our data as 4byte little endian IEEE floats rather than using
|
||||
// dlib's default float serialization. We do this because it will result in
|
||||
// more compact outputs. It's slightly less portable but it seems doubtful
|
||||
// that any CUDA enabled platform isn't going to use IEEE floats. But if one
|
||||
// does we can just update the serialization code here to handle it if such a
|
||||
// platform is encountered.
|
||||
bo.host_to_little(d);
|
||||
static_assert(sizeof(d)==4, "This serialization code assumes we are writing 4 byte floats");
|
||||
sbuf->sputn((char*)&d, sizeof(d));
|
||||
}
|
||||
}
|
||||
|
||||
// Reads a tensor previously written by serialize(const tensor&, std::ostream&).
//
// The stream must contain version 2, the four dimensions and then size() raw
// 4 byte little endian floats.
//
// Throws serialization_error if the version is unexpected, if any stored
// dimension is negative, or if the stream ends before all elements were read
// (in which case badbit is also set on the stream).
inline void deserialize(resizable_tensor& item, std::istream& in)
{
    int version;
    deserialize(version, in);
    if (version != 2)
        throw serialization_error("Unexpected version found while deserializing dlib::resizable_tensor.");

    long long num_samples=0, k=0, nr=0, nc=0;
    deserialize(num_samples, in);
    deserialize(k, in);
    deserialize(nr, in);
    deserialize(nc, in);

    // The dimensions come from the stream and so can't be trusted.  set_size()
    // only guards them with DLIB_ASSERT, so reject negative values here before
    // they can produce a bogus size() and an out of bounds write in the loop
    // below.
    if (num_samples < 0 || k < 0 || nr < 0 || nc < 0)
        throw serialization_error("Invalid negative dimension found while deserializing dlib::resizable_tensor.");

    item.set_size(num_samples, k, nr, nc);

    byte_orderer bo;
    auto sbuf = in.rdbuf();
    for (auto& d : item)
    {
        static_assert(sizeof(d)==4, "This serialization code assumes we are writing 4 byte floats");
        if (sbuf->sgetn((char*)&d,sizeof(d)) != sizeof(d))
        {
            in.setstate(std::ios::badbit);
            throw serialization_error("Error reading data while deserializing dlib::resizable_tensor.");
        }
        // Stored values are little endian; convert to the host's byte order.
        bo.little_to_host(d);
    }
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
inline double dot(
|
||||
const tensor& a,
|
||||
const tensor& b
|
||||
)
|
||||
{
|
||||
DLIB_CASSERT(a.size() == b.size());
|
||||
const float* da = a.host();
|
||||
const float* db = b.host();
|
||||
double sum = 0;
|
||||
for (size_t i = 0; i < a.size(); ++i)
|
||||
sum += da[i]*db[i];
|
||||
return sum;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
class alias_tensor_instance : public tensor
|
||||
{
|
||||
alias_tensor_instance(
|
||||
) : data_instance(0), _annotation(0), data_offset(0) {}
|
||||
|
||||
public:
|
||||
friend class alias_tensor;
|
||||
friend class alias_tensor_const_instance;
|
||||
|
||||
alias_tensor_instance& operator= (float val)
|
||||
{
|
||||
tensor::operator=(val);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename EXP>
|
||||
alias_tensor_instance& operator= (const matrix_exp<EXP>& item)
|
||||
{
|
||||
tensor::operator=(item);
|
||||
return *this;
|
||||
}
|
||||
|
||||
virtual const float* host() const { return data_instance->host()+data_offset; }
|
||||
virtual float* host() { return data_instance->host()+data_offset; }
|
||||
virtual float* host_write_only() { return data_instance->host()+data_offset; }
|
||||
virtual const float* device() const { return data_instance->device()+data_offset; }
|
||||
virtual float* device() { return data_instance->device()+data_offset; }
|
||||
virtual float* device_write_only() { return data_instance->device()+data_offset; }
|
||||
|
||||
virtual const any& annotation() const { return *_annotation; }
|
||||
virtual any& annotation() { return *_annotation; }
|
||||
|
||||
#ifdef DLIB_USE_CUDA
|
||||
virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor (
|
||||
) const { return *cudnn_descriptor; }
|
||||
#endif
|
||||
private:
|
||||
|
||||
virtual size_t get_alias_offset() const { return data_offset; }
|
||||
|
||||
#ifdef DLIB_USE_CUDA
|
||||
std::shared_ptr<cuda::tensor_descriptor> cudnn_descriptor;
|
||||
#endif
|
||||
gpu_data* data_instance;
|
||||
any* _annotation;
|
||||
size_t data_offset;
|
||||
virtual gpu_data& data() { return *data_instance; }
|
||||
virtual const gpu_data& data() const { return *data_instance; }
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
class alias_tensor_const_instance
|
||||
{
|
||||
public:
|
||||
const tensor& get() const { return inst; }
|
||||
operator const tensor& () { return inst; }
|
||||
|
||||
alias_tensor_const_instance(const alias_tensor_instance& item) : inst(item) {}
|
||||
|
||||
private:
|
||||
alias_tensor_instance inst;
|
||||
|
||||
friend class alias_tensor;
|
||||
alias_tensor_const_instance() {}
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
class alias_tensor
|
||||
{
|
||||
public:
|
||||
|
||||
alias_tensor (
|
||||
) {}
|
||||
|
||||
alias_tensor (
|
||||
long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
|
||||
)
|
||||
{
|
||||
DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0);
|
||||
|
||||
inst.m_n = n_;
|
||||
inst.m_k = k_;
|
||||
inst.m_nr = nr_;
|
||||
inst.m_nc = nc_;
|
||||
inst.m_size = n_*k_*nr_*nc_;
|
||||
}
|
||||
|
||||
long long num_samples(
|
||||
) const { return inst.m_n; }
|
||||
|
||||
long long k(
|
||||
) const { return inst.m_k; }
|
||||
|
||||
long long nr(
|
||||
) const { return inst.m_nr; }
|
||||
|
||||
long long nc(
|
||||
) const { return inst.m_nc; }
|
||||
|
||||
size_t size(
|
||||
) const { return inst.m_size; }
|
||||
|
||||
alias_tensor_instance operator() (
|
||||
tensor& t,
|
||||
size_t offset = 0
|
||||
) const
|
||||
{
|
||||
DLIB_CASSERT(offset+size() <= t.size(),
|
||||
"offset: "<<offset <<"\n"<<
|
||||
"size(): "<<size() <<"\n"<<
|
||||
"t.size(): "<<t.size() <<"\n");
|
||||
|
||||
#ifdef DLIB_USE_CUDA
|
||||
if (!inst.cudnn_descriptor)
|
||||
{
|
||||
inst.cudnn_descriptor = std::make_shared<cuda::tensor_descriptor>();
|
||||
inst.cudnn_descriptor->set_size(inst.m_n, inst.m_k, inst.m_nr, inst.m_nc);
|
||||
}
|
||||
#endif
|
||||
inst.data_instance = &t.data();
|
||||
inst._annotation = &t.annotation();
|
||||
// Note that t might already be an aliasing tensor so we need to take that into
|
||||
// account.
|
||||
inst.data_offset = t.get_alias_offset()+offset;
|
||||
return inst;
|
||||
}
|
||||
|
||||
alias_tensor_const_instance operator() (
|
||||
const tensor& t,
|
||||
size_t offset = 0
|
||||
) const
|
||||
{
|
||||
alias_tensor_const_instance temp;
|
||||
temp.inst = (*this)(const_cast<tensor&>(t),offset);
|
||||
return temp;
|
||||
}
|
||||
|
||||
private:
|
||||
mutable alias_tensor_instance inst;
|
||||
};
|
||||
|
||||
inline void serialize(const alias_tensor& item, std::ostream& out)
|
||||
{
|
||||
int version = 1;
|
||||
serialize(version, out);
|
||||
serialize(item.num_samples(), out);
|
||||
serialize(item.k(), out);
|
||||
serialize(item.nr(), out);
|
||||
serialize(item.nc(), out);
|
||||
}
|
||||
|
||||
// Reads an alias_tensor written by serialize(const alias_tensor&, std::ostream&).
// Throws serialization_error if the stored version number is not 1.
inline void deserialize(alias_tensor& item, std::istream& in)
{
    int version = 0;
    deserialize(version, in);
    if (version != 1)
        throw serialization_error("Unexpected version found while deserializing dlib::alias_tensor.");

    long long num_samples;
    long long k;
    long long nr;
    long long nc;
    deserialize(num_samples, in);
    deserialize(k, in);
    deserialize(nr, in);
    deserialize(nc, in);

    // Rebuild the object from the stored dimensions.
    item = alias_tensor(num_samples, k, nr, nc);
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
}
|
||||
|
||||
#endif // DLIB_DNn_TENSOR_H_
|
||||
|
||||
727
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/tensor_abstract.h
vendored
Normal file
727
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/tensor_abstract.h
vendored
Normal file
@@ -0,0 +1,727 @@
|
||||
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
|
||||
// License: Boost Software License See LICENSE.txt for the full license.
|
||||
#undef DLIB_DNn_TENSOR_ABSTRACT_H_
|
||||
#ifdef DLIB_DNn_TENSOR_ABSTRACT_H_
|
||||
|
||||
#include "../matrix.h"
|
||||
#include "../any/any_abstract.h"
|
||||
|
||||
namespace dlib
|
||||
{
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
class tensor
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This object represents a 4D array of float values, all stored contiguously
|
||||
in memory. Importantly, it keeps two copies of the floats, one on the host
|
||||
CPU side and another on the GPU device side. It automatically performs the
|
||||
necessary host/device transfers to keep these two copies of the data in
|
||||
sync.
|
||||
|
||||
All transfers to the device happen asynchronously with respect to the
|
||||
default CUDA stream so that CUDA kernel computations can overlap with data
|
||||
transfers. However, any transfers from the device to the host happen
|
||||
synchronously in the default CUDA stream. Therefore, you should perform
|
||||
all your CUDA kernel launches on the default stream so that transfers back
|
||||
to the host do not happen before the relevant computations have completed.
|
||||
|
||||
If DLIB_USE_CUDA is not #defined then this object will not use CUDA at all.
|
||||
Instead, it will simply store one host side memory block of floats.
|
||||
|
||||
Finally, the convention in dlib code is to interpret the tensor as a set of
|
||||
num_samples() 3D arrays, each of dimension k() by nr() by nc(). Also,
|
||||
while this class does not specify a memory layout, the convention is to
|
||||
assume that indexing into an element at coordinates (sample,k,r,c) can be
|
||||
accomplished via:
|
||||
host()[((sample*t.k() + k)*t.nr() + r)*t.nc() + c]
|
||||
|
||||
THREAD SAFETY
|
||||
Instances of this object are not thread-safe. So don't touch one from
|
||||
multiple threads at the same time.
|
||||
!*/
|
||||
|
||||
public:
|
||||
|
||||
virtual ~tensor();
|
||||
|
||||
long long num_samples(
|
||||
) const;
|
||||
/*!
|
||||
ensures
|
||||
- returns the number of 3D arrays of dimension k() by nr() by nc() there
|
||||
are in this object.
|
||||
!*/
|
||||
|
||||
long long k(
|
||||
) const;
|
||||
/*!
|
||||
ensures
|
||||
- returns the k dimension of this tensor. Generally, we think of a tensor
|
||||
as containing num_samples() images of nr() by nc() rows and columns, each
|
||||
with k() channels.
|
||||
!*/
|
||||
|
||||
long long nr(
|
||||
) const;
|
||||
/*!
|
||||
ensures
|
||||
- returns the number of rows in this tensor.
|
||||
!*/
|
||||
|
||||
long long nc(
|
||||
) const;
|
||||
/*!
|
||||
ensures
|
||||
- returns the number of columns in this tensor.
|
||||
!*/
|
||||
|
||||
size_t size(
|
||||
) const;
|
||||
/*!
|
||||
ensures
|
||||
- returns num_samples()*k()*nr()*nc()
|
||||
(i.e. the total number of floats in this tensor)
|
||||
!*/
|
||||
|
||||
void async_copy_to_device(
|
||||
) const;
|
||||
/*!
|
||||
ensures
|
||||
- This function does not block.
|
||||
- if (the host version of the data is newer than the device's copy) then
|
||||
- Begins asynchronously copying host data to the device.
|
||||
- A call to device() that happens before the transfer completes will
|
||||
block until the transfer is complete. That is, it is safe to call
|
||||
async_copy_to_device() and then immediately call device().
|
||||
!*/
|
||||
|
||||
typedef float* iterator;
|
||||
typedef const float* const_iterator;
|
||||
iterator begin() { return host(); }
|
||||
const_iterator begin() const { return host(); }
|
||||
iterator end() { return host()+size(); }
|
||||
const_iterator end() const { return host()+size(); }
|
||||
/*!
|
||||
ensures
|
||||
- makes a tensor iterable just like the STL containers.
|
||||
!*/
|
||||
|
||||
virtual const float* host(
|
||||
) const = 0;
|
||||
/*!
|
||||
ensures
|
||||
- returns a pointer to the host memory block of size() contiguous float
|
||||
values or nullptr if size()==0.
|
||||
- if (the host's copy of the data is out of date) then
|
||||
- copies the data from the device to the host, while this is happening
|
||||
the call to host() blocks.
|
||||
!*/
|
||||
|
||||
virtual float* host(
|
||||
) = 0;
|
||||
/*!
|
||||
ensures
|
||||
- returns a pointer to the host memory block of size() contiguous float
|
||||
values or nullptr if size()==0.
|
||||
- if (the host's copy of the data is out of date) then
|
||||
- copies the data from the device to the host, while this is happening
|
||||
the call to host() blocks.
|
||||
- Marks the device side data as out of date so that the next call to
|
||||
device() will perform a host to device transfer. If you want to begin
|
||||
the transfer immediately then you can call async_copy_to_device() after
|
||||
calling host().
|
||||
!*/
|
||||
|
||||
virtual float* host_write_only(
|
||||
) = 0;
|
||||
/*!
|
||||
ensures
|
||||
- This function returns the same pointer as host(), except that it never
|
||||
performs a device to host memory copy. Instead, it immediately marks the
|
||||
device side data as out of date, effectively discarding it. Therefore,
|
||||
the values in the data pointed to by host_write_only() are undefined and
|
||||
you should only call host_write_only() if you are going to assign to
|
||||
every memory location in the returned memory block.
|
||||
!*/
|
||||
|
||||
virtual const float* device(
|
||||
) const = 0;
|
||||
/*!
|
||||
requires
|
||||
- DLIB_USE_CUDA is #defined
|
||||
ensures
|
||||
- returns a pointer to the device memory block of size() contiguous float
|
||||
values or nullptr if size()==0.
|
||||
- if (the device's copy of the data is out of date) then
|
||||
- copies the data from the host to the device, while this is happening
|
||||
the call to device() blocks.
|
||||
!*/
|
||||
|
||||
virtual float* device(
|
||||
) = 0;
|
||||
/*!
|
||||
requires
|
||||
- DLIB_USE_CUDA is #defined
|
||||
ensures
|
||||
- returns a pointer to the device memory block of size() contiguous float
|
||||
values or nullptr if size()==0.
|
||||
- if (the device's copy of the data is out of date) then
|
||||
- copies the data from the host to the device, while this is happening
|
||||
the call to device() blocks.
|
||||
- Marks the host side data as out of date so that the next call to
|
||||
host() will perform a device to host transfer.
|
||||
!*/
|
||||
|
||||
virtual float* device_write_only(
|
||||
) = 0;
|
||||
/*!
|
||||
requires
|
||||
- DLIB_USE_CUDA is #defined
|
||||
ensures
|
||||
- This function returns the same pointer as device(), except that it never
|
||||
performs a host to device memory copy. Instead, it immediately marks the
|
||||
host side data as out of date, effectively discarding it. Therefore, the
|
||||
values in the data pointed to by device_write_only() are undefined and
|
||||
you should only call device_write_only() if you are going to assign to
|
||||
every memory location in the returned memory block.
|
||||
!*/
|
||||
|
||||
virtual const any& annotation(
|
||||
) const = 0;
|
||||
/*!
|
||||
ensures
|
||||
- returns a const reference to the any object in this tensor. The any
|
||||
object can be used to store any additional annotation you like in a
|
||||
tensor. However, it should be noted that the annotation() is ignored by
|
||||
serialize() and therefore not saved when a tensor is serialized.
|
||||
!*/
|
||||
|
||||
virtual any& annotation(
|
||||
) = 0;
|
||||
/*!
|
||||
ensures
|
||||
- returns a non-const reference to the any object in this tensor. The any
|
||||
object can be used to store any additional annotation you like in a
|
||||
tensor. However, it should be noted that the annotation() is ignored by
|
||||
serialize() and therefore not saved when a tensor is serialized.
|
||||
!*/
|
||||
|
||||
int device_id(
|
||||
) const;
|
||||
/*!
|
||||
ensures
|
||||
- returns the ID of the CUDA device that allocated this memory. I.e. the
|
||||
number returned by cudaGetDevice() when the memory was allocated.
|
||||
- If CUDA is not being used then this function always returns 0.
|
||||
!*/
|
||||
|
||||
tensor& operator= (
|
||||
float val
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- sets all elements of this tensor equal to val.
|
||||
- returns *this
|
||||
!*/
|
||||
|
||||
tensor& operator*= (
|
||||
float val
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- pointwise multiplies all elements of *this tensor with val.
|
||||
- returns *this
|
||||
!*/
|
||||
|
||||
tensor& operator/= (
|
||||
float val
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- pointwise divides all elements of *this tensor by val.
|
||||
- returns *this
|
||||
!*/
|
||||
|
||||
template <typename EXP>
|
||||
tensor& operator= (
|
||||
const matrix_exp<EXP>& item
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- num_samples() == item.nr()
|
||||
- k()*nr()*nc() == item.nc()
|
||||
- item contains float values
|
||||
ensures
|
||||
- Assigns item to *this tensor by performing:
|
||||
set_ptrm(host(), num_samples(), k()*nr()*nc()) = item;
|
||||
!*/
|
||||
|
||||
template <typename EXP>
|
||||
tensor& operator+= (
|
||||
const matrix_exp<EXP>& item
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- num_samples() == item.nr()
|
||||
- k()*nr()*nc() == item.nc()
|
||||
- item contains float values
|
||||
ensures
|
||||
- Adds item to *this tensor by performing:
|
||||
set_ptrm(host(), num_samples(), k()*nr()*nc()) += item;
|
||||
!*/
|
||||
|
||||
template <typename EXP>
|
||||
tensor& operator-= (
|
||||
const matrix_exp<EXP>& item
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- num_samples() == item.nr()
|
||||
- k()*nr()*nc() == item.nc()
|
||||
- item contains float values
|
||||
ensures
|
||||
- Subtracts item from *this tensor by performing:
|
||||
set_ptrm(host(), num_samples(), k()*nr()*nc()) -= item;
|
||||
!*/
|
||||
|
||||
template <typename EXP>
|
||||
void set_sample (
|
||||
unsigned long long idx,
|
||||
const matrix_exp<EXP>& item
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- idx < num_samples()
|
||||
- k()*nr()*nc() == item.size()
|
||||
- item contains float values
|
||||
ensures
|
||||
- Assigns item to the idx'th sample in *this by performing:
|
||||
set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) = item;
|
||||
!*/
|
||||
|
||||
|
||||
template <typename EXP>
|
||||
void add_to_sample (
|
||||
unsigned long long idx,
|
||||
const matrix_exp<EXP>& item
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- idx < num_samples()
|
||||
- k()*nr()*nc() == item.size()
|
||||
- item contains float values
|
||||
ensures
|
||||
- Adds item to the idx'th sample in *this by performing:
|
||||
set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) += item;
|
||||
!*/
|
||||
|
||||
protected:
|
||||
|
||||
// You can't move or copy another tensor into *this since that might modify the
|
||||
// tensor's dimensions. If you want to do that sort of thing then use a
|
||||
// resizable_tensor.
|
||||
tensor(const tensor& item);
|
||||
tensor& operator= (const tensor& item);
|
||||
tensor(tensor&& item);
|
||||
tensor& operator=(tensor&& item);
|
||||
};
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
void memcpy (
|
||||
tensor& dest,
|
||||
const tensor& src
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- dest.size() == src.size()
|
||||
ensures
|
||||
- Copies the data in src to dest. If the device data is current on both src
|
||||
and dest then the copy will happen entirely on the device side.
|
||||
- It doesn't matter what GPU device is selected by cudaSetDevice(). You can
|
||||
always copy tensor objects to and from each other regardless.
|
||||
- This function blocks until the copy has completed.
|
||||
!*/
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
bool is_vector (
|
||||
const tensor& t
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- returns true if and only if one of the following is true:
|
||||
- t.size() == t.num_samples()
|
||||
- t.size() == t.k()
|
||||
- t.size() == t.nr()
|
||||
- t.size() == t.nc()
|
||||
!*/
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
const matrix_exp mat (
|
||||
const tensor& t,
|
||||
long long nr,
|
||||
long long nc
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- nr >= 0
|
||||
- nc >= 0
|
||||
- nr*nc == t.size()
|
||||
ensures
|
||||
- returns a matrix M such that:
|
||||
- M.nr() == nr
|
||||
- M.nc() == nc
|
||||
- for all valid r and c:
|
||||
M(r,c) == t.host()[r*nc + c]
|
||||
(i.e. the tensor is interpreted as a matrix laid out in memory
|
||||
in row major order)
|
||||
!*/
|
||||
|
||||
const matrix_exp mat (
|
||||
const tensor& t
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- if (t.size() != 0) then
|
||||
- returns mat(t, t.num_samples(), t.size()/t.num_samples())
|
||||
- else
|
||||
- returns an empty matrix.
|
||||
!*/
|
||||
|
||||
const matrix_exp image_plane (
|
||||
const tensor& t,
|
||||
long long sample = 0,
|
||||
long long k = 0
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- t.size() != 0
|
||||
- 0 <= sample < t.num_samples()
|
||||
- 0 <= k < t.k()
|
||||
ensures
|
||||
- returns the k-th image plane from the sample-th image in t. That is,
|
||||
returns a matrix M such that:
|
||||
- M contains float valued elements.
|
||||
- M.nr() == t.nr()
|
||||
- M.nc() == t.nc()
|
||||
- for all valid r and c:
|
||||
- M(r,c) == t.host()[((sample*t.k() + k)*t.nr() + r)*t.nc() + c]
|
||||
!*/
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
bool have_same_dimensions (
|
||||
const tensor& a,
|
||||
const tensor& b
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- returns true if and only if all of the following are satisfied:
|
||||
- a.num_samples() == b.num_samples()
|
||||
- a.k() == b.k()
|
||||
- a.nr() == b.nr()
|
||||
- a.nc() == b.nc()
|
||||
!*/
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
class resizable_tensor : public tensor
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This object is just a tensor with the additional ability to be resized.
|
||||
!*/
|
||||
|
||||
public:
|
||||
resizable_tensor(
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- #size() == 0
|
||||
- #num_samples() == 0
|
||||
- #k() == 0
|
||||
- #nr() == 0
|
||||
- #nc() == 0
|
||||
- #capacity() == 0
|
||||
!*/
|
||||
|
||||
template <typename EXP>
|
||||
resizable_tensor(
|
||||
const matrix_exp<EXP>& item
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- item contains float values
|
||||
ensures
|
||||
- #num_samples() == item.nr()
|
||||
- #k() == item.nc()
|
||||
- #nr() == 1
|
||||
- #nc() == 1
|
||||
- Assigns item to *this tensor by performing:
|
||||
set_ptrm(host(), num_samples(), k()*nr()*nc()) = item;
|
||||
- #capacity() == size()
|
||||
!*/
|
||||
|
||||
explicit resizable_tensor(
|
||||
long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- n_ >= 0
|
||||
- k_ >= 0
|
||||
- nr_ >= 0
|
||||
- nc_ >= 0
|
||||
ensures
|
||||
- #size() == n_*k_*nr_*nc_
|
||||
- #num_samples() == n_
|
||||
- #k() == k_
|
||||
- #nr() == nr_
|
||||
- #nc() == nc_
|
||||
- #capacity() == size()
|
||||
!*/
|
||||
|
||||
// This object is copyable and movable
|
||||
resizable_tensor(const resizable_tensor&) = default;
|
||||
resizable_tensor(resizable_tensor&&) = default;
|
||||
resizable_tensor& operator= (const resizable_tensor&) = default;
|
||||
resizable_tensor& operator= (resizable_tensor&&) = default;
|
||||
|
||||
size_t capacity (
|
||||
) const;
|
||||
/*!
|
||||
ensures
|
||||
- returns the total number of floats allocated. This might be different
|
||||
from the size() since calls to set_size() that make a tensor smaller
|
||||
don't trigger reallocations. They simply adjust the nominal dimensions
|
||||
while keeping the same allocated memory block. This makes calls to
|
||||
set_size() very fast. If you need to deallocate a tensor then use
|
||||
clear().
|
||||
!*/
|
||||
|
||||
void clear(
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- #size() == 0
|
||||
- #num_samples() == 0
|
||||
- #k() == 0
|
||||
- #nr() == 0
|
||||
- #nc() == 0
|
||||
- #annotation().is_empty() == true
|
||||
- #capacity() == 0
|
||||
!*/
|
||||
|
||||
void copy_size (
|
||||
const tensor& item
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- resizes *this so that: have_same_dimensions(#*this, item)==true
|
||||
!*/
|
||||
|
||||
void set_size(
|
||||
long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- n_ >= 0
|
||||
- k_ >= 0
|
||||
- nr_ >= 0
|
||||
- nc_ >= 0
|
||||
ensures
|
||||
- #size() == n_*k_*nr_*nc_
|
||||
- #num_samples() == n_
|
||||
- #k() == k_
|
||||
- #nr() == nr_
|
||||
- #nc() == nc_
|
||||
- #capacity() == max(#size(), capacity())
|
||||
(i.e. capacity() never goes down when calling set_size().)
|
||||
!*/
|
||||
|
||||
template <typename EXP>
|
||||
resizable_tensor& operator= (
|
||||
const matrix_exp<EXP>& item
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- item contains float values
|
||||
ensures
|
||||
- if (num_samples() == item.nr() && k()*nr()*nc() == item.nc()) then
|
||||
- the dimensions of this tensor are not changed
|
||||
- else
|
||||
- #num_samples() == item.nr()
|
||||
- #k() == item.nc()
|
||||
- #nr() == 1
|
||||
- #nc() == 1
|
||||
- Assigns item to *this tensor by performing:
|
||||
set_ptrm(host(), num_samples(), k()*nr()*nc()) = item;
|
||||
!*/
|
||||
};
|
||||
|
||||
void serialize(const tensor& item, std::ostream& out);
|
||||
void deserialize(resizable_tensor& item, std::istream& in);
|
||||
/*!
|
||||
provides serialization support for tensor and resizable_tensor. Note that you can
|
||||
serialize to/from any combination of tensor and resizable_tensor objects.
|
||||
!*/
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
double dot(
|
||||
const tensor& a,
|
||||
const tensor& b
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- a.size() == b.size()
|
||||
ensures
|
||||
- returns the dot product between a and b when they are both treated as
|
||||
a.size() dimensional vectors. That is, this function pointwise multiplies
|
||||
the vectors together, then sums the result and returns it.
|
||||
|
||||
!*/
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
class alias_tensor_instance : public tensor
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This object is a tensor that aliases another tensor. That is, it doesn't
|
||||
have its own block of memory but instead simply holds pointers to the
|
||||
memory of another tensor object. It therefore allows you to efficiently
|
||||
break a tensor into pieces and pass those pieces into functions.
|
||||
|
||||
An alias_tensor_instance doesn't own the resources it points to in any sense.
|
||||
So it is important to make sure that the underlying owning tensor doesn't get
|
||||
destructed before any alias tensors which point to it are destructed.
|
||||
!*/
|
||||
|
||||
// You can't default initialize this object. You can only get instances of it from
|
||||
// alias_tensor::operator(). (No access specifier precedes the constructor below,
// so it is private by the class default.)
|
||||
alias_tensor_instance(
|
||||
);
|
||||
};
|
||||
|
||||
class alias_tensor_const_instance
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This is essentially a const version of alias_tensor_instance and therefore
|
||||
represents a tensor. However, due to the mechanics of C++, this object
|
||||
can't inherit from tensor. So instead it provides a get() and an implicit
|
||||
conversion to const tensor&.
|
||||
!*/
|
||||
|
||||
public:
|
||||
|
||||
// Non-const alias tensors are convertible to const ones. This constructor is
// not marked explicit, so the conversion from alias_tensor_instance is implicit.
|
||||
alias_tensor_const_instance(const alias_tensor_instance& item);
|
||||
|
||||
// Methods that cast the alias to a tensor. Both yield a const tensor&, so the
// aliased tensor can be read but not modified through this object.
|
||||
const tensor& get() const;
|
||||
// Note: the conversion operator below is not const-qualified, so it cannot be
// applied to a const alias_tensor_const_instance; call get() in that case.
operator const tensor& ();
|
||||
|
||||
private:
|
||||
// You can't default initialize this object. You can only get instances of it from
|
||||
// alias_tensor::operator() (or by converting an alias_tensor_instance via the
// public constructor above).
|
||||
alias_tensor_const_instance();
|
||||
};
|
||||
|
||||
class alias_tensor
|
||||
{
|
||||
/*!
|
||||
WHAT THIS OBJECT REPRESENTS
|
||||
This is a tool for creating tensor objects that alias other tensor objects.
|
||||
That is, it allows you to make a tensor that references the memory space of
|
||||
another tensor object rather than owning its own memory. This allows you
|
||||
to do things like interpret a single tensor in different ways or even as a
|
||||
group of multiple tensors.
|
||||
!*/
|
||||
public:
|
||||
|
||||
alias_tensor (
|
||||
);
|
||||
/*!
|
||||
ensures
|
||||
- #size() == 0
|
||||
- #num_samples() == 0
|
||||
- #k() == 0
|
||||
- #nr() == 0
|
||||
- #nc() == 0
|
||||
!*/
|
||||
|
||||
alias_tensor (
|
||||
long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
|
||||
);
|
||||
/*!
|
||||
requires
|
||||
- n_ >= 0
|
||||
- k_ >= 0
|
||||
- nr_ >= 0
|
||||
- nc_ >= 0
|
||||
ensures
|
||||
- #size() == n_*k_*nr_*nc_
|
||||
- #num_samples() == n_
|
||||
- #k() == k_
|
||||
- #nr() == nr_
|
||||
- #nc() == nc_
|
||||
!*/
|
||||
|
||||
// Dimension accessors. They carry no separate specification here; their values
// after construction are those stated in the ensures clauses of the constructors
// above (size() being n_*k_*nr_*nc_ for the four-argument constructor).
long long num_samples() const;
|
||||
long long k() const;
|
||||
long long nr() const;
|
||||
long long nc() const;
|
||||
size_t size() const;
|
||||
|
||||
alias_tensor_instance operator() (
|
||||
tensor& t,
|
||||
size_t offset = 0
|
||||
) const;
|
||||
/*!
|
||||
requires
|
||||
- offset+size() <= t.size()
|
||||
ensures
|
||||
- Returns a tensor that simply aliases the elements of t beginning with t's
|
||||
offset'th element. Specifically, this function returns an aliasing
|
||||
tensor T such that:
|
||||
- T.size() == size()
|
||||
- T.num_samples() == num_samples()
|
||||
- T.k() == k()
|
||||
- T.nr() == nr()
|
||||
- T.nc() == nc()
|
||||
- T.host() == t.host()+offset
|
||||
- T.device() == t.device()+offset
|
||||
- &T.annotation() == &t.annotation()
|
||||
!*/
|
||||
|
||||
alias_tensor_const_instance operator() (
|
||||
const tensor& t,
|
||||
size_t offset = 0
|
||||
) const;
|
||||
/*!
|
||||
requires
|
||||
- offset+size() <= t.size()
|
||||
ensures
|
||||
- This function is identical to the above version of operator() except that
|
||||
it takes and returns const tensors instead of non-const tensors.
|
||||
!*/
|
||||
};
|
||||
|
||||
void serialize(const alias_tensor& item, std::ostream& out);
|
||||
void deserialize(alias_tensor& item, std::istream& in);
|
||||
/*!
|
||||
provides serialization support for alias_tensor.
|
||||
!*/
|
||||
|
||||
// ----------------------------------------------------------------------------------------
|
||||
|
||||
}
|
||||
|
||||
#endif // DLIB_DNn_TENSOR_ABSTRACT_H_
|
||||
|
||||
|
||||
1711
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/tensor_tools.h
vendored
Normal file
1711
pkg/OpenFace/lib/3rdParty/dlib/include/dlib/cuda/tensor_tools.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user