open source pkg v1

This commit is contained in:
Vijay Yadev
2020-08-04 19:12:31 -04:00
parent bef213dba9
commit c389fc2c47
3708 changed files with 1624220 additions and 1 deletions

View File

@@ -0,0 +1,505 @@
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CPU_H_
#define DLIB_DNN_CPU_H_
// This file contains CPU implementations of the GPU based functions in cuda_dlib.h
// and cudnn_dlibapi.h
#include "tensor.h"
#include "../geometry/rectangle.h"
namespace dlib
{
namespace cpu
{
// -----------------------------------------------------------------------------------
// Tensor arithmetic helpers.  Only the prototypes live in this header; the bodies are
// in the implementation file (which is #included at the bottom of this header when
// NO_MAKEFILE is defined).
// NOTE(review): the per-function notes below are inferred from the parameter names
// only -- confirm the exact semantics against the implementation before relying on them.

// Presumably dest = src1*src2, accumulating into dest instead when add_to is true.
void multiply (
bool add_to,
tensor& dest,
const tensor& src1,
const tensor& src2
);
// Presumably the convolutional-layout variant of multiply() -- TODO confirm.
void multiply_conv (
bool add_to,
tensor& dest,
const tensor& src1,
const tensor& src2
);
// Presumably multiply() for operands of different sizes, with the missing elements
// treated as zero -- TODO confirm.
void multiply_zero_padded (
bool add_to,
tensor& dest,
const tensor& src1,
const tensor& src2
);
// Presumably scales each channel of src by the matching entry of scales -- TODO confirm.
void scale_channels (
bool add_to,
tensor& dest,
const tensor& src,
const tensor& scales
);
// Presumably dest = beta*dest + alpha*src -- TODO confirm.
void add(
float beta,
tensor& dest,
float alpha,
const tensor& src
);
// Writes a bias gradient into grad computed from gradient_input (details not visible here).
void assign_bias_gradient (
tensor& grad,
const tensor& gradient_input
);
// Three-operand overload: presumably dest = src1 + src2 -- TODO confirm.
void add (
tensor& dest,
const tensor& src1,
const tensor& src2
);
// Presumably the convolutional-layout counterpart of assign_bias_gradient() -- TODO confirm.
void assign_conv_bias_gradient (
tensor& grad,
const tensor& gradient_input
);
// -----------------------------------------------------------------------------------
// Affine combinations of one, two or three source tensors with scalar coefficients.
// NOTE(review): the formulas below are inferred from the parameter lists (one
// coefficient per source plus a trailing constant) -- confirm against the implementation.

// Presumably dest = A*src + B.
void affine_transform(
tensor& dest,
const tensor& src,
const float A,
const float B
);
// Presumably dest = A*src1 + B*src2 + C.
void affine_transform(
tensor& dest,
const tensor& src1,
const tensor& src2,
const float A,
const float B,
const float C
);
// Presumably dest = A*src1 + B*src2 + C*src3 + D.
void affine_transform(
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C,
const float D
);
// Three-source form restricted to the element index range given by begin and end.
// NOTE(review): whether end is inclusive or exclusive is not visible here -- confirm.
void affine_transform_range(
size_t begin,
size_t end,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
);
// -----------------------------------------------------------------------------------
// Affine transform whose coefficients A and B are themselves tensors.
// NOTE(review): presumably dest = A*src + B element-wise -- confirm against the implementation.
void affine_transform(
tensor& dest,
const tensor& src,
const tensor& A,
const tensor& B
);
// -----------------------------------------------------------------------------------
// Presumably the convolutional-layout variant of the tensor-coefficient
// affine_transform() above -- TODO confirm.
void affine_transform_conv(
tensor& dest,
const tensor& src,
const tensor& A,
const tensor& B
);
// -----------------------------------------------------------------------------------
// Three-source affine combination limited to the sub-region described by rect.
// NOTE(review): how rect maps onto the tensor dimensions is not visible here -- confirm.
void affine_transform(
const rectangle& rect,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
float A,
float B,
float C
);
// -----------------------------------------------------------------------------------
// Computes an Adam optimizer update over the element range given by begin and end.
// s, m and v are the mutable outputs/state; everything else is read-only.
// NOTE(review): the roles of s (step), m and v (moment estimates) and of t (iteration
// count) are inferred from the names only -- confirm against the implementation.
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
const float t,
const float learning_rate,
const float weight_decay,
const float momentum1,
const float momentum2,
const tensor& params,
const tensor& params_grad
);
// -----------------------------------------------------------------------------------
// Batch normalization: an inference pass, a training pass and a gradient pass, each in
// a plain and a "_conv" flavour.  The training passes take the running statistics by
// non-const reference, while the inference passes only read them.
// NOTE(review): the numerical definition (how eps and averaging_factor enter the
// computation) lives in the implementation file and is not visible here.

// Inference pass: writes to dest using the supplied running statistics.
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_variances
);
// Training pass: besides dest it outputs means and invstds (which the gradient pass
// takes as inputs) and receives the running statistics by mutable reference.
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
);
// Gradient pass: outputs are src_grad, gamma_grad and beta_grad.
void batch_normalize_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
const tensor& src,
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
);
// "_conv" flavour of batch_normalize_inference(); same parameter list.
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_variances
);
// "_conv" flavour of batch_normalize(); same parameter list.
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
);
// "_conv" flavour of batch_normalize_gradient(); same parameter list.
void batch_normalize_conv_gradient (
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
const tensor& src,
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
);
// -----------------------------------------------------------------------------------
// Thresholds data in place against thresh.
// NOTE(review): the values written for elements above/below thresh are not visible here.
void threshold (
tensor& data,
float thresh
);
// Dot product of a and b; the output goes into result, with idx presumably selecting
// the element of result that receives it -- TODO confirm (also whether it overwrites
// or accumulates).
void dot (
const tensor& a,
const tensor& b,
tensor& result,
size_t idx
);
// -----------------------------------------------------------------------------------
// Activation functions and their gradients.  Every forward routine writes dest from
// src.  Every *_gradient routine writes grad and, apart from prelu_gradient, takes the
// forward output (dest) rather than the forward input.
// NOTE(review): whether the gradient routines overwrite grad or accumulate into it is
// not visible from the prototypes -- confirm against the implementation.
void softmax (
tensor& dest,
const tensor& src
);
void softmax_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
// ------------------------------------------------------------------------------------
// NOTE(review): presumably normalizes over a different set of elements than softmax();
// which dimensions are covered is not visible here.
void softmax_all (
tensor& dest,
const tensor& src
);
void softmax_all_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
// ------------------------------------------------------------------------------------
void sigmoid (
tensor& dest,
const tensor& src
);
void sigmoid_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
// ------------------------------------------------------------------------------------
void relu (
tensor& dest,
const tensor& src
);
void relu_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
// ----------------------------------------------------------------------------------------
// Parametric ReLU: param is supplied as a tensor.
void prelu (
tensor& dest,
const tensor& src,
const tensor& param
);
// Unlike the other gradient routines this one takes the forward input (src) and has a
// second output, params_grad, for the gradient with respect to param.
void prelu_gradient (
tensor& grad,
const tensor& src,
const tensor& gradient_input,
const tensor& param,
tensor& params_grad
);
// ------------------------------------------------------------------------------------
void tanh (
tensor& dest,
const tensor& src
);
void tanh_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
// ----------------------------------------------------------------------------------------
// Strided bilinear resize.  The caller supplies the row and channel strides of both
// tensors explicitly; the two-argument inline overloads declared right after these
// prototypes pass nc() as the row stride and nr()*nc() as the channel stride.
void resize_bilinear (
tensor& dest,
long dest_row_stride,
long dest_channel_stride,
const tensor& src,
long src_row_stride,
long src_channel_stride
);
// Gradient counterpart of the strided resize_bilinear(); grad is the output.
// NOTE(review): whether grad is overwritten or accumulated into is not visible here.
void resize_bilinear_gradient (
tensor& grad,
long grad_row_stride,
long grad_channel_stride,
const tensor& gradient_input,
long gradient_input_row_stride,
long gradient_input_channel_stride
);
// Convenience overload: forwards to the strided resize_bilinear() using a row stride
// of nc() and a channel stride of nr()*nc() for both tensors.
inline void resize_bilinear (
    tensor& dest,
    const tensor& src
)
{
    const auto dest_plane_size = dest.nr()*dest.nc();
    const auto src_plane_size = src.nr()*src.nc();
    resize_bilinear(dest, dest.nc(), dest_plane_size, src, src.nc(), src_plane_size);
}
// Convenience overload: forwards to the strided resize_bilinear_gradient() using a row
// stride of nc() and a channel stride of nr()*nc() for both tensors.
inline void resize_bilinear_gradient (
    tensor& grad,
    const tensor& gradient_input
)
{
    const auto grad_plane_size = grad.nr()*grad.nc();
    const auto input_plane_size = gradient_input.nr()*gradient_input.nc();
    resize_bilinear_gradient(grad, grad.nc(), grad_plane_size,
                             gradient_input, gradient_input.nc(), input_plane_size);
}
// -----------------------------------------------------------------------------------
// Pooling operator.  The object stores a window size, stride and padding plus a flag
// selecting max pooling; operator() and get_gradient() are declared here and defined
// in the implementation file.
class pooling
{
public:
// Non-copyable.
pooling(const pooling&) = delete;
pooling& operator=(const pooling&) = delete;
// Defined out of line.  NOTE(review): the data members below have no in-class
// initializers, so this constructor is presumably what gives them defined values -- confirm.
pooling (
);
void clear(
);
// Configure the object for max pooling.
void setup_max_pooling(
int window_height,
int window_width,
int stride_y,
int stride_x,
int padding_y,
int padding_x
);
// Configure the object for average pooling.
void setup_avg_pooling(
int window_height,
int window_width,
int stride_y,
int stride_x,
int padding_y,
int padding_x
);
// Returns the do_max_pooling flag.
bool does_max_pooling(
) const { return do_max_pooling; }
// Forward pass: pools src into dest (dest is resizable, so presumably sized here).
void operator() (
resizable_tensor& dest,
const tensor& src
);
// Backward pass: grad is the output.
// NOTE(review): whether grad is overwritten or accumulated into is not visible here.
void get_gradient(
const tensor& gradient_input,
const tensor& dest,
const tensor& src,
tensor& grad
);
private:
int window_height;
int window_width;
int stride_y;
int stride_x;
int padding_y;
int padding_x;
bool do_max_pooling;
};
// -----------------------------------------------------------------------------------
class tensor_conv
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            Convolution operator for the CPU code path.  setup() validates and records
            the stride and padding; the call operators and the two gradient routines
            are declared here and defined in the implementation file.
    !*/
public:
    tensor_conv() {}

    // Non-copyable.
    tensor_conv(const tensor_conv&) = delete;
    tensor_conv& operator=(const tensor_conv&) = delete;

    // Nothing to release in this implementation.
    void clear() {}

    void setup(
        const tensor& data,    /* not used but required for interface */
        const tensor& filters, /* only read by the padding checks below */
        int stride_y,
        int stride_x,
        int padding_y,
        int padding_x
    )
    /*!
        requires
            - stride_y > 0 && stride_x > 0
            - 0 <= padding_y < filters.nr()
            - 0 <= padding_x < filters.nc()
        ensures
            - remembers the given strides and paddings in the last_* members
    !*/
    {
        (void)data; /* silence compiler */

        DLIB_CASSERT(stride_y > 0 && stride_x > 0);
        DLIB_CASSERT(0 <= padding_y && padding_y < filters.nr());
        DLIB_CASSERT(0 <= padding_x && padding_x < filters.nc());

        last_padding_x = padding_x;
        last_padding_y = padding_y;
        last_stride_x = stride_x;
        last_stride_y = stride_y;
    }

    // Forward convolution into a resizable output.
    void operator() (
        const bool add_to_output,
        resizable_tensor& output,
        const tensor& data,
        const tensor& filters
    );

    // Forward convolution into a plain tensor output.
    void operator() (
        const bool add_to_output,
        tensor& output,
        const tensor& data,
        const tensor& filters
    );

    // Gradient with respect to the input data; data_gradient is the output.
    void get_gradient_for_data (
        const bool add_to_output,
        const tensor& gradient_input,
        const tensor& filters,
        tensor& data_gradient
    );

    // Gradient with respect to the filters; filters_gradient is the output.
    void get_gradient_for_filters (
        const bool add_to_output,
        const tensor& gradient_input,
        const tensor& data,
        tensor& filters_gradient
    );

private:
    // Values recorded by the most recent setup() call (0 until then).
    long last_stride_y = 0;
    long last_stride_x = 0;
    long last_padding_y = 0;
    long last_padding_x = 0;
};
// -----------------------------------------------------------------------------------
// Copies count_k channels ("k" planes) from src, starting at src_k_offset, into dest
// starting at dest_k_offset.
// NOTE(review): the meaning of "k" as the channel dimension and the add_to behaviour
// (presumably accumulate instead of overwrite) are inferred from the names -- confirm.
void copy_tensor(
bool add_to,
tensor& dest,
size_t dest_k_offset,
const tensor& src,
size_t src_k_offset,
size_t count_k
);
// -----------------------------------------------------------------------------------
}
}
#ifdef NO_MAKEFILE
#include "cpu_dlib.cpp"
#endif
#endif // DLIB_DNN_CPU_H_

View File

@@ -0,0 +1,50 @@
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuBLAS_H_
#define DLIB_DNN_CuBLAS_H_
#ifdef DLIB_USE_CUDA
#include "tensor.h"
#include "cuda_errors.h"
namespace dlib
{
namespace cuda
{
// -----------------------------------------------------------------------------------
// General matrix-matrix multiply on tensors viewed as matrices, with optional
// transposition of either operand.  Given the header this lives in, it is presumably
// backed by cuBLAS -- the implementation is not visible here.
void gemm (
float beta,
tensor& dest,
float alpha,
const tensor& lhs,
bool trans_lhs,
const tensor& rhs,
bool trans_rhs
);
/*!
requires
- The dimensions of lhs and rhs must be compatible for matrix
multiplication. In particular:
- Let L == trans_lhs ? trans(mat(lhs)) : mat(lhs)
- Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs)
- Let D == mat(dest)
- D.nr() == L.nr() && D.nc() == R.nc()
(i.e. dest must be preallocated and have the correct output dimensions)
- L.nc() == R.nr()
ensures
- performs: dest = alpha*L*R + beta*mat(dest)
!*/
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuBLAS_H_

View File

@@ -0,0 +1,256 @@
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuDA_DATA_PTR_H_
#define DLIB_DNN_CuDA_DATA_PTR_H_
#ifdef DLIB_USE_CUDA
#include <memory>
#include <vector>
#include "../assert.h"
namespace dlib
{
namespace cuda
{
// ------------------------------------------------------------------------------------
class cuda_data_void_ptr
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is a block of memory on a CUDA device.  Copies of this object share
            ownership of the same block (the storage is held in a std::shared_ptr).
    !*/
public:

    cuda_data_void_ptr() = default;

    cuda_data_void_ptr(size_t n);
    /*!
        ensures
            - This object will allocate a device memory buffer of n bytes.
            - #size() == n
    !*/

    void* data() { return pdata.get(); }
    const void* data() const { return pdata.get(); }
    operator void*() { return pdata.get(); }
    operator const void*() const { return pdata.get(); }

    void reset()
    /*!
        ensures
            - Drops this object's reference to the buffer.
            - #data() == nullptr
            - #size() == 0
    !*/
    {
        pdata.reset();
        // Keep size() consistent with data(): previously num kept its old value, so
        // size() reported bytes that were no longer reachable through this object.
        num = 0;
    }

    size_t size() const { return num; }
    /*!
        ensures
            - returns the length of this buffer, in bytes.
    !*/

    cuda_data_void_ptr operator+ (size_t offset) const
    /*!
        requires
            - offset < size()
        ensures
            - returns a pointer that is offset by the given amount.  The returned
              object shares ownership of the underlying buffer with *this and has
              size() == this->size() - offset.
    !*/
    {
        DLIB_CASSERT(offset < num);
        cuda_data_void_ptr temp;
        temp.num = num-offset;
        // Aliasing constructor: shares pdata's control block but points offset bytes in.
        temp.pdata = std::shared_ptr<void>(pdata, static_cast<char*>(pdata.get())+offset);
        return temp;
    }

private:
    size_t num = 0;
    std::shared_ptr<void> pdata;
};

// Commutative form of cuda_data_void_ptr::operator+.
inline cuda_data_void_ptr operator+(size_t offset, const cuda_data_void_ptr& rhs) { return rhs+offset; }
// ------------------------------------------------------------------------------------
// Device -> host copies.
void memcpy(
void* dest,
const cuda_data_void_ptr& src
);
/*!
requires
- dest == a pointer to at least src.size() bytes on the host machine.
ensures
- copies the GPU data from src into dest.
- This routine is equivalent to performing: memcpy(dest,src,src.size())
!*/
void memcpy(
void* dest,
const cuda_data_void_ptr& src,
const size_t num
);
/*!
requires
- dest == a pointer to at least num bytes on the host machine.
- num <= src.size()
ensures
- copies the GPU data from src into dest. Copies only the first num bytes
of src to dest.
!*/
// ------------------------------------------------------------------------------------
// Host -> device copies.
void memcpy(
cuda_data_void_ptr dest,
const void* src
);
/*!
requires
- src == a pointer to at least dest.size() bytes on the host machine.
ensures
- copies the host data from src to the GPU memory buffer dest.
- This routine is equivalent to performing: memcpy(dest,src,dest.size())
!*/
void memcpy(
cuda_data_void_ptr dest,
const void* src,
const size_t num
);
/*!
requires
- src == a pointer to at least num bytes on the host machine.
- num <= dest.size()
ensures
- copies the host data from src to the GPU memory buffer dest. Copies only
the first num bytes of src to dest.
!*/
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
template <typename T>
class cuda_data_ptr
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is a block of memory on a CUDA device.   It is just a type safe
            version of cuda_data_void_ptr.
    !*/
public:

    static_assert(std::is_standard_layout<T>::value, "You can only create basic standard layout types on the GPU");

    cuda_data_ptr() = default;

    cuda_data_ptr(size_t n) : num(n)
    /*!
        ensures
            - This object will allocate a device memory buffer of n T objects.
            - #size() == n
    !*/
    {
        // Nothing to allocate for an empty buffer; pdata stays null.
        if (n == 0)
            return;
        pdata = cuda_data_void_ptr(n*sizeof(T));
    }

    T* data() { return static_cast<T*>(pdata.data()); }
    const T* data() const { return static_cast<const T*>(pdata.data()); }
    operator T*() { return data(); }
    operator const T*() const { return data(); }

    void reset()
    /*!
        ensures
            - Drops this object's reference to the buffer.
            - #data() == nullptr
            - #size() == 0
    !*/
    {
        pdata.reset();
        // Keep size() consistent with data(); previously num kept its old value.
        num = 0;
    }

    size_t size() const { return num; }
    /*!
        ensures
            - returns the length of this buffer, in T objects.
    !*/

    friend void memcpy(
        std::vector<T>& dest,
        const cuda_data_ptr& src
    )
    /*!
        ensures
            - Device -> host: resizes dest to src.size() elements and copies the
              contents of src into it.
    !*/
    {
        dest.resize(src.size());
        if (src.size() != 0)
            memcpy(dest.data(), src.pdata);
    }

    friend void memcpy(
        cuda_data_ptr& dest,
        const std::vector<T>& src
    )
    /*!
        ensures
            - Host -> device: if dest does not already hold src.size() elements it is
              replaced by a freshly allocated buffer of that size, then the contents
              of src are copied into it.
            - #dest.size() == src.size()
    !*/
    {
        // The parameter names used to be swapped, which made this body assign a
        // cuda_data_ptr to the const std::vector and fail to compile when instantiated.
        if (dest.size() != src.size())
            dest = cuda_data_ptr<T>(src.size());

        if (src.size() != 0)
            memcpy(dest.pdata, src.data());
    }

private:
    size_t num = 0;
    cuda_data_void_ptr pdata;
};
// ------------------------------------------------------------------------------------
class resizable_cuda_buffer
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            A CUDA device memory block that grows on demand: whenever a caller asks
            for more bytes than are currently held, the block is replaced by a new,
            larger allocation.  It never shrinks.
    !*/
public:

    cuda_data_void_ptr get(size_t size)
    /*!
        ensures
            - Returns a buffer of at least the requested size, i.e. the returned
              object satisfies buffer.size() >= size.
            - Client code should not hold the returned cuda_data_void_ptr for long
              durations, but instead should call get() whenever the buffer is
              needed.  Doing so ensures that multiple buffers are not kept around
              in the event of a resize.
    !*/
    {
        // Fast path: the current allocation is already big enough.
        if (size <= buffer.size())
            return buffer;

        // Drop the old block before creating the replacement.
        buffer.reset();
        buffer = cuda_data_void_ptr(size);
        return buffer;
    }

private:
    cuda_data_void_ptr buffer;
};
// ----------------------------------------------------------------------------------------
// Accessor for the shared scratch buffer of the current device and calling thread.
std::shared_ptr<resizable_cuda_buffer> device_global_buffer(
);
/*!
ensures
- Returns a pointer to a globally shared CUDA memory buffer on the
currently selected CUDA device. The buffer is also thread local. So
each host thread will get its own buffer. You can use this global buffer
as scratch space for CUDA computations that all take place on the default
stream. Using it in this way ensures that there aren't any race conditions
involving the use of the buffer.
- The global buffer is deallocated once all references to it are
destructed. It will be reallocated as required. So if you want to avoid
these reallocations then hold a copy of the shared_ptr returned by this
function.
!*/
// ----------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuDA_DATA_PTR_H_

View File

@@ -0,0 +1,530 @@
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuDA_H_
#define DLIB_DNN_CuDA_H_
#include "tensor.h"
#include "../geometry/rectangle.h"
namespace dlib
{
namespace cuda
{
// ----------------------------------------------------------------------------------------
// Device selection and query functions.  These prototypes are declared in both build
// configurations; when DLIB_USE_CUDA is not defined, inline single-device stubs with
// the same names are defined further down in this header.
// Selects the device with index dev.
void set_device (
int dev
);
// Returns the index of the currently selected device.
int get_device (
);
// Returns the number of available devices.
int get_num_devices (
);
// Returns a human-readable name for the given device.
std::string get_device_name (
int device
);
// NOTE(review): presumably switches the current device to blocking synchronization --
// confirm against the implementation.
void set_current_device_blocking_sync(
);
// Peer-access queries, by device id or by the devices two tensors live on.
bool can_access_peer (int device_id, int peer_device_id);
bool can_access_peer (const tensor& device, const tensor& peer_device);
// Synchronizes with the given device, named by id or by a tensor that lives on it.
void device_synchronize (int dev);
void device_synchronize (const tensor& dev);
class raii_set_device
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            Scope guard for the current device: the constructor records the device
            returned by get_device() and switches to the requested one, and the
            destructor switches back to the recorded device.
    !*/
public:
    raii_set_device() = delete;
    raii_set_device(const raii_set_device&) = delete;
    raii_set_device& operator=(const raii_set_device&) = delete;

    // Remember the current device, then select dev.
    raii_set_device(int dev) : prev_dev(get_device())
    {
        set_device(dev);
    }

    // Remember the current device, then select the device dev reports via device_id().
    raii_set_device(const tensor& dev) : prev_dev(get_device())
    {
        set_device(dev.device_id());
    }

    // Switch to another device without changing the device restored on destruction.
    void operator() (int dev)
    {
        set_device(dev);
    }

    void operator() (const tensor& dev)
    {
        set_device(dev.device_id());
    }

    // Declared noexcept(false) so that an exception from set_device() can propagate.
    ~raii_set_device() noexcept(false)
    {
        set_device(prev_dev);
    }

private:
    int prev_dev;
};
#ifdef DLIB_USE_CUDA
// Scope object for peer access between two devices (CUDA build).  The main constructor
// and the destructor are defined in the implementation file.
// NOTE(review): presumably the constructor enables peer access and the destructor
// disables it again when call_disable is set -- confirm against the implementation.
class enable_peer_access
{
public:
enable_peer_access() = delete;
enable_peer_access(const enable_peer_access&) = delete;
enable_peer_access& operator=(const enable_peer_access&) = delete;
enable_peer_access(
int device_id,
int peer_device_id
);
// Convenience overload: delegates to the id-based constructor using the device ids
// reported by the two tensors.
enable_peer_access(
const tensor& device,
const tensor& peer_device
) : enable_peer_access(device.device_id(), peer_device.device_id())
{}
// noexcept(false): the destructor is allowed to throw.
~enable_peer_access() noexcept(false);
private:
bool call_disable;
int device_id;
int peer_device_id;
};
// -----------------------------------------------------------------------------------
// Row/column level linear algebra helpers and element-wise math on tensors.
// NOTE(review): only the prototypes are visible here; the per-function notes are
// inferred from the names and parameter lists -- confirm against the implementation.

// Presumably writes the reciprocal norm of each row of data into invnorms, with eps
// guarding against division by zero.
void inverse_norms (
resizable_tensor& invnorms,
const tensor& data,
const double eps
);
// Presumably row-wise dot products of lhs and rhs; this overload takes a resizable output.
void dot_prods (
resizable_tensor& out,
const tensor& lhs,
const tensor& rhs
);
// Same, into a preallocated tensor, with an add_to flag (presumably accumulate vs. overwrite).
void dot_prods (
bool add_to,
tensor& out,
const tensor& lhs,
const tensor& rhs
);
// Presumably scales each column of m by the matching entry of v.
void scale_columns (
tensor& out,
const tensor& m,
const tensor& v
);
// Presumably scales each row of m by the matching entry of v.
void scale_rows (
tensor& out,
const tensor& m,
const tensor& v
);
// Two-matrix, two-vector variant of scale_rows() with a beta weight on the existing
// contents of out -- exact formula not visible here.
void scale_rows2 (
float beta,
tensor& out,
const tensor& m1,
const tensor& m2,
const tensor& v1,
const tensor& v2
);
// Element-wise exp / log / log10 of src into dest.
void exp (
tensor& dest,
const tensor& src
);
void log (
tensor& dest,
const tensor& src
);
void log10 (
tensor& dest,
const tensor& src
);
// ------------------------------------------------------------------------------------
// NOTE(review): the notes in this group are inferred from names and parameter lists;
// the implementations are not visible in this header.

// Presumably assigns value to every element of t.
void set_tensor (
tensor& t,
float value
);
// Presumably multiplies every element of t by value.
void scale_tensor (
tensor& t,
float value
);
// ------------------------------------------------------------------------------------
// Presumably dest = src1*src2, accumulating into dest instead when add_to is true.
void multiply (
bool add_to,
tensor& dest,
const tensor& src1,
const tensor& src2
);
// Presumably the convolutional-layout variant of multiply().
void multiply_conv (
bool add_to,
tensor& dest,
const tensor& src1,
const tensor& src2
);
// Presumably multiply() for operands of different sizes, with the missing elements
// treated as zero.
void multiply_zero_padded (
bool add_to,
tensor& dest,
const tensor& src1,
const tensor& src2
);
// Presumably scales each channel of src by the matching entry of scales.
void scale_channels (
bool add_to,
tensor& dest,
const tensor& src,
const tensor& scales
);
// Presumably dest = src1 + src2.
void add (
tensor& dest,
const tensor& src1,
const tensor& src2
);
// -----------------------------------------------------------------------------------
// Affine combinations of one, two or three source tensors.
// NOTE(review): the formulas below are inferred from the parameter lists (one
// coefficient per source, plus a trailing constant where an extra scalar is present)
// -- confirm against the implementation.

// Presumably dest = A*src + B.
void affine_transform(
tensor& dest,
const tensor& src,
const float A,
const float B
);
// Presumably dest = A*src.
void affine_transform(
tensor& dest,
const tensor& src,
const float A
);
// Presumably dest = A*src1 + B*src2 + C.
void affine_transform(
tensor& dest,
const tensor& src1,
const tensor& src2,
const float A,
const float B,
const float C
);
// Presumably dest = A*src1 + B*src2.
void affine_transform(
tensor& dest,
const tensor& src1,
const tensor& src2,
const float A,
const float B
);
// Presumably dest = A*src1 + B*src2 + C*src3 + D.
void affine_transform(
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C,
const float D
);
// Three-source form restricted to the element index range given by begin and end.
// NOTE(review): whether end is inclusive or exclusive is not visible here.
void affine_transform_range(
size_t begin,
size_t end,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
const float A,
const float B,
const float C
);
// Three-source form limited to the sub-region described by rect.
void affine_transform(
const rectangle& rect,
tensor& dest,
const tensor& src1,
const tensor& src2,
const tensor& src3,
float A,
float B,
float C
);
// Note that this function isn't in the tt:: namespace because add_scaled() is
// called by cuda::add() so we don't need a tt:: version of add_scaled().
// Presumably dest += scale*src -- TODO confirm.
void add_scaled(
tensor& dest,
const float scale,
const tensor& src
);
// NOTE(review): presumably adds the column vector src, weighted by alpha, to every
// column of dest after weighting dest by beta -- confirm.
void add_cv_to_all_columns(
float beta,
tensor& dest,
float alpha,
const tensor& src
);
// -----------------------------------------------------------------------------------
// Affine transform whose coefficients A and B are themselves tensors.
void affine_transform(
tensor& dest,
const tensor& src,
const tensor& A,
const tensor& B
);
// -----------------------------------------------------------------------------------
// Presumably the convolutional-layout variant of the tensor-coefficient form above.
void affine_transform_conv(
tensor& dest,
const tensor& src,
const tensor& A,
const tensor& B
);
// ----------------------------------------------------------------------------------------
// Computes an Adam optimizer update over the element range given by begin and end.
// s, m and v are the mutable outputs/state; everything else is read-only.
// NOTE(review): the roles of s, m, v and t are inferred from the names -- confirm.
void compute_adam_update (
size_t begin,
size_t end,
tensor& s,
tensor& m,
tensor& v,
const float t,
const float learning_rate,
const float weight_decay,
const float momentum1,
const float momentum2,
const tensor& params,
const tensor& params_grad
);
// -----------------------------------------------------------------------------------
// Writes a bias gradient into grad computed from gradient_input (details not visible here).
void assign_bias_gradient (
tensor& grad,
const tensor& gradient_input
);
// -----------------------------------------------------------------------------------
// Thresholds data in place against thresh (output values not visible here).
void threshold (
tensor& data,
float thresh
);
// ----------------------------------------------------------------------------------------
// Dot product of a and b; the output goes into result, with idx presumably selecting
// the element of result that receives it -- TODO confirm.
void dot (
const tensor& a,
const tensor& b,
tensor& result,
size_t idx
);
// ----------------------------------------------------------------------------------------
// Parametric ReLU forward pass; param is supplied as a tensor.
void prelu (
tensor& dest,
const tensor& src,
const tensor& param
);
// Parametric ReLU backward pass: outputs are grad and params_grad.
void prelu_gradient (
tensor& grad,
const tensor& src,
const tensor& gradient_input,
const tensor& param,
tensor& params_grad
);
// ----------------------------------------------------------------------------------------
// Strided bilinear resize.  The caller supplies the row and channel strides of both
// tensors explicitly; the two-argument inline overloads declared right after these
// prototypes pass nc() as the row stride and nr()*nc() as the channel stride.
void resize_bilinear (
tensor& dest,
long dest_row_stride,
long dest_channel_stride,
const tensor& src,
long src_row_stride,
long src_channel_stride
);
// Gradient counterpart of the strided resize_bilinear(); grad is the output.
// NOTE(review): whether grad is overwritten or accumulated into is not visible here.
void resize_bilinear_gradient (
tensor& grad,
long grad_row_stride,
long grad_channel_stride,
const tensor& gradient_input,
long gradient_input_row_stride,
long gradient_input_channel_stride
);
// Convenience overload: forwards to the strided resize_bilinear() using a row stride
// of nc() and a channel stride of nr()*nc() for both tensors.
inline void resize_bilinear (
    tensor& dest,
    const tensor& src
)
{
    const auto dest_plane_size = dest.nr()*dest.nc();
    const auto src_plane_size = src.nr()*src.nc();
    resize_bilinear(dest, dest.nc(), dest_plane_size, src, src.nc(), src_plane_size);
}
// Convenience overload: forwards to the strided resize_bilinear_gradient() using a row
// stride of nc() and a channel stride of nr()*nc() for both tensors.
inline void resize_bilinear_gradient (
    tensor& grad,
    const tensor& gradient_input
)
{
    const auto grad_plane_size = grad.nr()*grad.nc();
    const auto input_plane_size = gradient_input.nr()*gradient_input.nc();
    resize_bilinear_gradient(grad, grad.nc(), grad_plane_size,
                             gradient_input, gradient_input.nc(), input_plane_size);
}
// ----------------------------------------------------------------------------------------
// Copies count_k channels ("k" planes) from src, starting at src_k_offset, into dest
// starting at dest_k_offset.
// NOTE(review): the meaning of "k" as the channel dimension and the add_to behaviour
// (presumably accumulate instead of overwrite) are inferred from the names -- confirm.
void copy_tensor(
bool add_to,
tensor& dest,
size_t dest_k_offset,
const tensor& src,
size_t src_k_offset,
size_t count_k
);
// ----------------------------------------------------------------------------------------
class compute_loss_multiclass_log_per_pixel
{
/*!
The point of this class is to compute the loss computed by
loss_multiclass_log_per_pixel, but to do so with CUDA.
!*/
public:
compute_loss_multiclass_log_per_pixel(
)
{
work = device_global_buffer();
}
template <
typename const_label_iterator
>
void operator() (
const_label_iterator truth,
const tensor& subnetwork_output,
tensor& gradient,
double& loss
) const
{
const size_t bytes_per_plane = subnetwork_output.nr()*subnetwork_output.nc()*sizeof(uint16_t);
// Allocate a cuda buffer to store all the truth images and also one float
// for the scalar loss output.
cuda_data_void_ptr buf = work->get(subnetwork_output.num_samples()*bytes_per_plane + sizeof(float));
cuda_data_void_ptr loss_buf = buf;
buf = buf+sizeof(float);
// copy the truth data into a cuda buffer.
for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth)
{
const matrix<uint16_t>& t = *truth;
DLIB_ASSERT(t.nr() == subnetwork_output.nr());
DLIB_ASSERT(t.nc() == subnetwork_output.nc());
memcpy(buf + i*bytes_per_plane, &t(0,0), bytes_per_plane);
}
do_work(static_cast<float*>(loss_buf.data()), static_cast<uint16_t*>(buf.data()), subnetwork_output, gradient, loss);
}
private:
static void do_work(
float* loss_cuda_work_buffer,
const uint16_t* truth_buffer,
const tensor& subnetwork_output,
tensor& gradient,
double& loss
);
std::shared_ptr<resizable_cuda_buffer> work;
};
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------
#else // if DLIB_USE_CUDA NOT DEFINED
// Stub used when DLIB_USE_CUDA is not defined: the only accepted device id is 0, and
// "selecting" it does nothing.
inline void set_device(int id)
{
    DLIB_CASSERT(id == 0, "dlib::cuda::set_device(id) called with an invalid device id.");
}
// Stub used when DLIB_USE_CUDA is not defined: the current device is always device 0.
inline int get_device()
{
    return 0;
}

// Stub used when DLIB_USE_CUDA is not defined: exactly one device is reported.
inline int get_num_devices()
{
    return 1;
}
// Stub used when DLIB_USE_CUDA is not defined: the only valid device id is 0, and its
// name is reported as "CUDA_DISABLED".
inline std::string get_device_name (
    int device
)
{
    // The message used to name set_device(), which pointed at the wrong call site.
    DLIB_CASSERT(device == 0, "dlib::cuda::get_device_name(device) called with an invalid device id.");
    return "CUDA_DISABLED";
}
// Stubs used when DLIB_USE_CUDA is not defined.

// No-op.
inline void set_current_device_blocking_sync()
{
}

// Peer access is never reported as available in this configuration.
inline bool can_access_peer(int /*device_id*/, int /*peer_device_id*/)
{
    return false;
}

inline bool can_access_peer(const tensor& /*device*/, const tensor& /*peer_device*/)
{
    return false;
}

// No-ops.
inline void device_synchronize(int /*dev*/)
{
}

inline void device_synchronize(const tensor& /*dev*/)
{
}
// Stub used when DLIB_USE_CUDA is not defined: it offers the same constructors as the
// CUDA version so client code compiles unchanged, but both constructors are empty.
class enable_peer_access
{
public:
enable_peer_access() = delete;
enable_peer_access(const enable_peer_access&) = delete;
enable_peer_access& operator=(const enable_peer_access&) = delete;
enable_peer_access( int, int ){}
enable_peer_access( const tensor&, const tensor& ) {}
};
#endif // DLIB_USE_CUDA
}
}
#endif // DLIB_DNN_CuDA_H_

View File

@@ -0,0 +1,70 @@
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_CUDA_ERRORs_H_
#define DLIB_CUDA_ERRORs_H_
#include "../error.h"
namespace dlib
{
struct cuda_error : public error
{
/*!
WHAT THIS OBJECT REPRESENTS
This is the exception thrown if any calls to the NVIDIA CUDA runtime
returns an error.
!*/
cuda_error(const std::string& message): error(message) {}
};
struct cudnn_error : public cuda_error
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            Exception type thrown when a call into the NVIDIA cuDNN library reports
            an error.
    !*/

    cudnn_error(const std::string& message)
        : cuda_error(message)
    {
    }
};
struct curand_error : public cuda_error
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            Exception type thrown when a call into the NVIDIA cuRAND library reports
            an error.
    !*/

    curand_error(const std::string& message)
        : cuda_error(message)
    {
    }
};
struct cublas_error : public cuda_error
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            Exception type thrown when a call into the NVIDIA cuBLAS library reports
            an error.
    !*/

    cublas_error(const std::string& message)
        : cuda_error(message)
    {
    }
};
struct cusolver_error : public cuda_error
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            Exception type thrown when a call into the NVIDIA cuSolver library
            reports an error.
    !*/

    cusolver_error(const std::string& message)
        : cuda_error(message)
    {
    }
};
}
#endif // DLIB_CUDA_ERRORs_H_

View File

@@ -0,0 +1,413 @@
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_CUDA_UtILS_H_
#define DLIB_CUDA_UtILS_H_
#ifndef DLIB_USE_CUDA
#error "This file shouldn't be #included unless DLIB_USE_CUDA is #defined"
#endif
#include "cuda_errors.h"
#include "../algs.h"
#include <cmath>
#include <cuda_runtime.h>
#include <sstream>
#include <iostream>
#include <memory>
#include <vector>
#include <type_traits>
// Check the return value of a call to the CUDA runtime for an error condition.
//
// The call expression is evaluated exactly once and its result is stored in a local
// named `error`.  If the result is not cudaSuccess, a dlib::cuda_error is thrown whose
// message contains the stringified call, the file and line of the macro use, the
// numeric error code and the text from cudaGetErrorString().
//
// The do{...}while(false) wrapper makes the macro behave like a single statement, so it
// is safe in an unbraced if/else.  Because the local is named `error`, avoid passing a
// call expression that itself refers to a variable of that name.
#define CHECK_CUDA(call) \
do{ \
const cudaError_t error = call; \
if (error != cudaSuccess) \
{ \
std::ostringstream sout; \
sout << "Error while calling " << #call << " in file " << __FILE__ << ":" << __LINE__ << ". ";\
sout << "code: " << error << ", reason: " << cudaGetErrorString(error);\
throw dlib::cuda_error(sout.str()); \
} \
}while(false)
// ----------------------------------------------------------------------------------------
#ifdef __CUDACC__
namespace dlib
{
namespace cuda
{
// ------------------------------------------------------------------------------------
__inline__ __device__ size_t pack_idx (
    size_t dim_size3,
    size_t dim_size2,
    size_t dim_size1,
    size_t idx4,
    size_t idx3,
    size_t idx2,
    size_t idx1
)
/*!
    ensures
        - Flattens a 4D index into a 1D offset for a row major array.  Given
              int ARRAY[anything][dim_size3][dim_size2][dim_size1];
          the element ARRAY[idx4][idx3][idx2][idx1] is the same element as
              ((int*)ARRAY)[pack_idx(dim_size3,dim_size2,dim_size1, idx4,idx3,idx2,idx1)]
!*/
{
    // Horner-style accumulation from the slowest to the fastest varying dimension.
    size_t flat = idx4;
    flat = flat*dim_size3 + idx3;
    flat = flat*dim_size2 + idx2;
    flat = flat*dim_size1 + idx1;
    return flat;
}
__inline__ __device__ void unpack_idx (
    size_t idx,
    size_t dim_size3,
    size_t dim_size2,
    size_t dim_size1,
    size_t& idx4,
    size_t& idx3,
    size_t& idx2,
    size_t& idx1
)
/*!
    ensures
        - Inverse of pack_idx().  That is, if
              PACKED == pack_idx(dim_size3,dim_size2,dim_size1, idx4,idx3,idx2,idx1)
          then unpack_idx(PACKED,dim_size3,dim_size2,dim_size1, IDX4,IDX3,IDX2,IDX1)
          yields:
            - IDX1 == idx1
            - IDX2 == idx2
            - IDX3 == idx3
            - IDX4 == idx4
!*/
{
    // Peel off one dimension at a time, fastest varying first.
    const size_t after_dim1 = idx / dim_size1;
    idx1 = idx % dim_size1;

    const size_t after_dim2 = after_dim1 / dim_size2;
    idx2 = after_dim1 % dim_size2;

    idx3 = after_dim2 % dim_size3;
    idx4 = after_dim2 / dim_size3;
}
// ------------------------------------------------------------------------------------
// Sums val across a warp using shuffle-down steps whose offset halves from warpSize/2
// down to 1.  Taken from the article:
// http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
__inline__ __device__ float warp_reduce_sum(float val)
{
    int offset = warpSize/2;
    while (offset > 0)
    {
        // CUDA 9 and later use the *_sync shuffle, here with a mask naming all 32 lanes.
#if CUDART_VERSION >= 9000
        val += __shfl_down_sync(0xFFFFFFFF,val, offset);
#else
        val += __shfl_down(val, offset);
#endif
        offset /= 2;
    }
    return val;
}
// True when the low bits of threadIdx.x selected by the mask (warpSize - 1) are all zero.
__inline__ __device__ bool is_first_thread_in_warp()
{
    const auto lane = threadIdx.x & (warpSize - 1);
    return lane == 0;
}
__inline__ __device__ void warp_reduce_atomic_add(
    float& out,
    float val
)
/*!
    ensures
        - Atomically adds all the val variables in the current warp to out.
          See this page for an extended discussion:
          http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
!*/
{
    // Every thread takes part in the reduction; only the first thread of the warp
    // then issues the atomicAdd with its reduced value.
    const float warp_total = warp_reduce_sum(val);
    if (!is_first_thread_in_warp())
        return;
    atomicAdd(&out, warp_total);
}
// ------------------------------------------------------------------------------------
// Upper bound on the number of jobs a kernel launch needs, in x and optionally y.
// The constructors are intentionally not explicit, so a plain int converts to a
// one-dimensional max_jobs with num_y == 1.
struct max_jobs
{
    max_jobs(int x) : max_jobs(x, 1) {}
    max_jobs(int x, int y) : num_x(x), num_y(y) {}

    int num_x;
    int num_y = 1;
};
template <typename Kernel, typename... T>
void launch_kernel (
    Kernel K,
    T ...args
)
/*!
    ensures
        - launches the given kernel K(args...).  The point of this function is to
          automatically set the kernel launch parameters to something reasonable
          based on the properties of the kernel and the current GPU card.
    throws
        - dlib::cuda_error if cudaOccupancyMaxPotentialBlockSize() reports a failure
          (via CHECK_CUDA).
!*/
{
    // Ask the runtime for a block count and a block size suited to this kernel.
    int num_blocks;
    int num_threads;
    CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&num_blocks,&num_threads,K));

    K<<<num_blocks,num_threads>>>(args...);
}
template <typename Kernel, typename... T>
void launch_kernel (
    Kernel K,
    max_jobs m,
    T ...args
)
/*!
    ensures
        - This function is just like launch_kernel(K,args...) except that you can
          additionally supply a max_jobs number that tells it how many possible
          total threads could be used.  This is useful when launching potentially
          small jobs that might not need the number of threads suggested by
          launch_kernel().
        - If m.num_x == 0 or m.num_y == 0 then nothing is launched.
        - If m.num_y == 1 then K is launched on a 1D grid.  Otherwise it is
          launched with a single block along x, the remaining blocks along y, and
          a block whose x extent is a multiple of 32.
!*/
{
    if (m.num_x == 0 || m.num_y == 0)
        return;

    int num_blocks, num_threads;
    CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&num_blocks,&num_threads,K));

    // Check if the job is really small and we don't really need to launch a kernel
    // with this many blocks and threads.  The job count is formed in 64 bits since
    // num_x*num_y can exceed the range of int for large 2D jobs (signed overflow is
    // undefined behavior).  The narrowing cast below is safe: inside the if the job
    // count is smaller than num_blocks*num_threads, so the new block count can not
    // exceed the old one.
    const long long total_jobs = static_cast<long long>(m.num_x)*m.num_y;
    if (static_cast<long long>(num_blocks)*num_threads > total_jobs)
        num_blocks = static_cast<int>((total_jobs+num_threads-1)/num_threads);

    if (m.num_y == 1)
    {
        K<<<num_blocks,num_threads>>>(args...);
    }
    else
    {
        /*
            In general, the reason m.num_y!=1 (i.e. the reason you are in this
            code path) is because we are using nested grid-stride loops.  There are
            two important things to note about what we are doing here.  To
            illustrate them we will talk about this little CUDA code snippet:

                // initialize out before we begin.
                for (auto i : grid_stride_range_y(0, nr))
                    for (auto j : grid_stride_range(0, 1))
                        out[i] = 0;

                __syncthreads(); // synchronize threads in block

                // loop over some 2D thing and sum and store things into out.
                for (auto i : grid_stride_range_y(0, nr))
                {
                    float temp = 0;
                    for (auto j : grid_stride_range(0, nc))
                        temp += whatever[i*nc+j];

                    // store the sum into out[i]
                    warp_reduce_atomic_add(out[i], temp);
                }

            First, we make sure the number of x threads is a multiple of 32 so that
            you can use warp_reduce_atomic_add() inside the y loop.

            Second, we put the x block size to 1 so inter-block synchronization is
            easier.  For example, if the number of x blocks wasn't 1 the above code
            would have a race condition in it.  This is because the execution of
            out[i]=0 would be done by blocks with blockIdx.x==0, but then in the
            second set of loops, *all* the x blocks use out[i].  Since
            __syncthreads() doesn't do any synchronization between blocks some of
            the blocks might begin before the out[i]=0 statements finished and that
            would be super bad.
        */

        // Try and make sure that the ratio of x to y threads is reasonable based
        // on the respective size of our loops.
        int x_threads = 32;
        int y_threads = num_threads/32;
        const int ratio = static_cast<int>(std::round(put_in_range(1, y_threads, m.num_x/(double)m.num_y)));
        x_threads *= ratio;
        y_threads /= ratio;

        dim3 blocks(1,num_blocks);
        dim3 threads(x_threads,y_threads);
        K<<<blocks,threads>>>(args...);
    }
}
// ------------------------------------------------------------------------------------
class grid_stride_range
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This is a tool for making a for loop that loops over an entire block of
            memory inside a kernel, but doing so in a way that parallelizes
            appropriately across all the threads in a kernel launch.  For example,
            the following kernel would add the vector a to the vector b and store
            the output in out (assuming all vectors are of dimension n):
                __global__ void add_arrays(
                    const float* a,
                    const float* b,
                    float* out,
                    size_t n
                )
                {
                    for (auto i : grid_stride_range(0, n))
                    {
                        out[i] = a[i]+b[i];
                    }
                }

            Each thread starts at ibegin + (its global x index) and advances by the
            total number of x threads in the launch until it reaches iend.
    !*/
public:
    __device__ grid_stride_range(
        size_t ibegin_,
        size_t iend_
    ) :
        ibegin(ibegin_),
        iend(iend_)
    {}

    class iterator
    {
    public:
        // A default constructed iterator sits at position 0 rather than holding
        // an indeterminate value.
        __device__ iterator() : pos(0) {}
        __device__ iterator(size_t pos_) : pos(pos_) {}

        __device__ size_t operator*() const
        {
            return pos;
        }

        __device__ iterator& operator++()
        {
            // gridDim and blockDim components are 32-bit unsigned, so widen before
            // multiplying.  Otherwise the stride wraps around once a launch has
            // 2^32 or more threads along x.
            pos += static_cast<size_t>(gridDim.x) * blockDim.x;
            return *this;
        }

        // Deliberately a "less than" test: a thread's position can step over iend
        // without ever being equal to it, and the range-for loop must stop then.
        __device__ bool operator!=(const iterator& item) const
        { return pos < item.pos; }

    private:
        size_t pos;
    };

    __device__ iterator begin() const
    {
        // Same widening as in operator++ so the starting index can't wrap.
        return iterator(ibegin + static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x);
    }

    __device__ iterator end() const
    {
        return iterator(iend);
    }

private:
    size_t ibegin;
    size_t iend;
};
// ------------------------------------------------------------------------------------
class grid_stride_range_y
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This object is just like grid_stride_range except that it looks at
            CUDA's y thread index (e.g. threadIdx.y) instead of the x index.
            Therefore, if you launch a cuda kernel with a statement like:
                dim3 blocks(1,10);
                dim3 threads(32,32);  // You need to have x and y not equal to 1 to get parallelism over both loops.
                add_arrays<<<blocks,threads>>>(a,b,out,nr,nc);
            You can perform a nested 2D parallel for loop rather than doing just a
            1D for loop.

            So the code in the kernel would look like this if you wanted to add two
            2D matrices:
                __global__ void add_arrays(
                    const float* a,
                    const float* b,
                    float* out,
                    size_t nr,
                    size_t nc
                )
                {
                    for (auto r : grid_stride_range_y(0, nr))
                    {
                        for (auto c : grid_stride_range(0, nc))
                        {
                            auto i = r*nc+c;
                            out[i] = a[i]+b[i];
                        }
                    }
                }
    !*/
public:
    __device__ grid_stride_range_y(
        size_t ibegin_,
        size_t iend_
    ) :
        ibegin(ibegin_),
        iend(iend_)
    {}

    class iterator
    {
    public:
        // A default constructed iterator sits at position 0 rather than holding
        // an indeterminate value.
        __device__ iterator() : pos(0) {}
        __device__ iterator(size_t pos_) : pos(pos_) {}

        __device__ size_t operator*() const
        {
            return pos;
        }

        __device__ iterator& operator++()
        {
            // gridDim and blockDim components are 32-bit unsigned, so widen before
            // multiplying to keep the stride from wrapping around.
            pos += static_cast<size_t>(gridDim.y) * blockDim.y;
            return *this;
        }

        // Deliberately a "less than" test: a thread's position can step over iend
        // without ever being equal to it, and the range-for loop must stop then.
        __device__ bool operator!=(const iterator& item) const
        { return pos < item.pos; }

    private:
        size_t pos;
    };

    __device__ iterator begin() const
    {
        // Same widening as in operator++ so the starting index can't wrap.
        return iterator(ibegin + static_cast<size_t>(blockDim.y) * blockIdx.y + threadIdx.y);
    }

    __device__ iterator end() const
    {
        return iterator(iend);
    }

private:
    size_t ibegin;
    size_t iend;
};
// ------------------------------------------------------------------------------------
}
}
#endif // __CUDACC__
// ----------------------------------------------------------------------------------------
#endif // DLIB_CUDA_UtILS_H_

View File

@@ -0,0 +1,518 @@
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuDNN_H_
#define DLIB_DNN_CuDNN_H_
#ifdef DLIB_USE_CUDA
#include "cuda_errors.h"
#include <memory>
#include "cuda_data_ptr.h"
namespace dlib
{
class tensor;
class resizable_tensor;
namespace cuda
{
// -----------------------------------------------------------------------------------
class tensor_descriptor
{
/*!
Each tensor object will carry a tensor_descriptor in it when compiled with
CUDA.

The object owns a single opaque handle (a void*).  It can be moved but not
copied.
NOTE(review): the handle is presumably a cuDNN tensor descriptor created and
destroyed by the out-of-line constructor/destructor -- confirm in the
implementation file.
!*/
public:
// not copyable
tensor_descriptor(const tensor_descriptor&) = delete;
tensor_descriptor& operator=(const tensor_descriptor&) = delete;
// but is movable
// The move constructor default-constructs *this and then swaps handles with item.
tensor_descriptor(tensor_descriptor&& item) : tensor_descriptor() { swap(item); }
// Move assignment only swaps handles, so item leaves holding the handle that
// *this held before the assignment.
tensor_descriptor& operator=(tensor_descriptor&& item) { swap(item); return *this; }
tensor_descriptor();
~tensor_descriptor();
void set_size(
int n,
int k,
int nr,
int nc
);
/*!
ensures
- if any of the arguments are 0 then they are all set to 0 in the tensor.
!*/
// Reads the dimensions back through the four output references.
// NOTE(review): presumably returns what was last given to set_size() -- confirm.
void get_size (
int& n,
int& k,
int& nr,
int& nc
) const;
// Returns the stored opaque handle without transferring ownership.
const void* get_handle (
) const { return handle; }
private:
// Exchanges the handles of the two objects; used by both move operations.
void swap(tensor_descriptor& item) { std::swap(handle, item.handle); }
void* handle;
};
// ------------------------------------------------------------------------------------
// Scaled add with broadcasting of src: dest = beta*dest + alpha*src.
void add(
float beta,
tensor& dest,
float alpha,
const tensor& src
);
/*!
requires
- One of the following is true:
    - have_same_dimensions(src, dest)
    - src.num_samples()==1 && src.k()==dest.k() && src.nr()==1 && src.nc()==1
    - src.num_samples()==1 && src.k()==dest.k() && src.nr()==dest.nr() && src.nc()==dest.nc()
    - src.num_samples()==1 && src.k()==1 && src.nr()==dest.nr() && src.nc()==dest.nc()
- is_same_object(src,dest) == false
ensures
- performs: dest = beta*dest + alpha*src
  However, how the addition happens depends on the dimensions of src.  In
  particular, this function adds the scaled values of one src tensor to
  dest.  Each dimension of the src tensor must match the corresponding
  dimension of the dest tensor or must be equal to 1.  In the latter case,
  the same value from the src tensor, for those dimensions, will be used to
  add into the dest tensor.
!*/
// ------------------------------------------------------------------------------------
// Computes the gradient with respect to a per-channel bias (the BIAS tensor
// described below) and writes it to grad.
void assign_conv_bias_gradient (
tensor& grad,
const tensor& gradient_input
);
/*!
requires
- grad.num_samples() == 1
- grad.k() >= 1
- grad.nr() == 1
- grad.nc() == 1
- gradient_input.k() == grad.k()
- gradient_input.size() > 0
- is_same_object(grad,gradient_input) == false
ensures
- let BIAS be a tensor with all dimensions equal to 1 except for k which is >= 1.
- let OUT be the output of add(1,OUT,1,BIAS)
- let f(gradient_input,BIAS) == dot(gradient_input,OUT)
- Then this function computes the gradient of f() with respect to BIAS and
  assigns it to grad.
!*/
// ------------------------------------------------------------------------------------
// NOTE(review): this header gives no written contract for this function.
// dest is the only argument taken by non-const reference; every other tensor
// is read-only.  Presumably this applies batch normalization to src using the
// stored running_means/running_variances -- confirm the exact requirements
// against the implementation or the CPU counterpart before relying on them.
void batch_normalize_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_variances
);
// NOTE(review): this header gives no written contract for this function.
// dest, means, invstds, running_means and running_variances are taken by
// non-const reference, so all five can be modified by the call; src, gamma and
// beta are read-only.  Presumably this is the training-time counterpart of
// batch_normalize_inference() -- confirm against the implementation.
void batch_normalize (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
);
// NOTE(review): this header gives no written contract for this function.
// The three *_grad tensors are the only arguments taken by non-const
// reference.  Presumably means and invstds must be the values produced by the
// matching batch_normalize() call, and whether the outputs are assigned or
// accumulated into is not visible here -- confirm against the implementation.
void batch_normalize_gradient(
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
const tensor& src,
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
);
// ------------------------------------------------------------------------------------
// NOTE(review): this header gives no written contract for this function.  It
// has the same parameter list as batch_normalize_inference(); presumably the
// "conv" variant normalizes per channel rather than per element -- confirm
// against the implementation.
void batch_normalize_conv_inference (
const double eps,
resizable_tensor& dest,
const tensor& src,
const tensor& gamma,
const tensor& beta,
const tensor& running_means,
const tensor& running_variances
);
// NOTE(review): this header gives no written contract for this function.  It
// has the same parameter list as batch_normalize(); presumably the "conv"
// variant normalizes per channel rather than per element -- confirm against
// the implementation.
void batch_normalize_conv (
const double eps,
resizable_tensor& dest,
resizable_tensor& means,
resizable_tensor& invstds,
const double averaging_factor,
resizable_tensor& running_means,
resizable_tensor& running_variances,
const tensor& src,
const tensor& gamma,
const tensor& beta
);
// NOTE(review): this header gives no written contract for this function.  It
// has the same parameter list as batch_normalize_gradient(); presumably it is
// the backward pass of batch_normalize_conv() -- confirm against the
// implementation.
void batch_normalize_conv_gradient(
const double eps,
const tensor& gradient_input,
const tensor& means,
const tensor& invstds,
const tensor& src,
const tensor& gamma,
tensor& src_grad,
tensor& gamma_grad,
tensor& beta_grad
);
// ------------------------------------------------------------------------------------
class tensor_conv
{
/*!
NOTE(review): this class carries no written contract in this header.  What
the declarations show: it is not copyable; it remembers the strides, paddings
and tensor dimensions given to the last setup() call; and it keeps opaque
filter/convolution handles plus an algorithm id and a workspace size for each
of the forward, backward-data and backward-filters passes.
Presumably setup() has to be called before operator() or the
get_gradient_for_*() functions -- confirm against the implementation.
!*/
public:
// not copyable
tensor_conv(const tensor_conv&) = delete;
tensor_conv& operator=(const tensor_conv&) = delete;
tensor_conv();
void clear(
);
~tensor_conv (
);
// Overload writing into an existing tensor.
// NOTE(review): presumably output must already have the right dimensions -- confirm.
void operator() (
const bool add_to_output,
tensor& output,
const tensor& data,
const tensor& filters
);
// Overload writing into a resizable_tensor.
// NOTE(review): presumably this one sizes output itself -- confirm.
void operator() (
const bool add_to_output,
resizable_tensor& output,
const tensor& data,
const tensor& filters
);
// data_gradient is the only argument taken by non-const reference.
void get_gradient_for_data (
const bool add_to_output,
const tensor& gradient_input,
const tensor& filters,
tensor& data_gradient
);
// filters_gradient is the only argument taken by non-const reference.
void get_gradient_for_filters (
const bool add_to_output,
const tensor& gradient_input,
const tensor& data,
tensor& filters_gradient
);
void setup(
const tensor& data,
const tensor& filters,
int stride_y,
int stride_x,
int padding_y,
int padding_x
);
private:
// These variables record the type of data given to the last call to setup().
int stride_y;
int stride_x;
int padding_y;
int padding_x;
long data_num_samples, data_k, data_nr, data_nc;
long filters_num_samples, filters_k, filters_nr, filters_nc;
// Opaque handles; their concrete types are not visible in this header.
void* filter_handle;
void* conv_handle;
// dimensions of the output tensor from operator()
int out_num_samples;
int out_k;
int out_nr;
int out_nc;
// One algorithm id and one workspace size per pass.
int forward_algo;
int backward_data_algo;
int backward_filters_algo;
size_t forward_workspace_size_in_bytes;
size_t backward_data_workspace_size_in_bytes;
size_t backward_filters_workspace_size_in_bytes;
// NOTE(review): presumably the three *_workspace pointers below refer to
// memory inside this shared buffer -- confirm in the implementation.
std::shared_ptr<resizable_cuda_buffer> workspace;
cuda_data_void_ptr forward_workspace;
cuda_data_void_ptr backward_data_workspace;
cuda_data_void_ptr backward_filters_workspace;
};
// ------------------------------------------------------------------------------------
class pooling
{
/*!
NOTE(review): this class carries no written contract in this header.  What
the declarations show: it is not copyable, it is configured for either max or
average pooling through setup_max_pooling()/setup_avg_pooling(), and it stores
the window size, strides and paddings together with one opaque handle.
Presumably one of the setup functions has to be called before operator() or
get_gradient() -- confirm against the implementation.
!*/
public:
// not copyable
pooling(const pooling&) = delete;
pooling& operator=(const pooling&) = delete;
pooling (
);
~pooling(
);
void clear(
);
void setup_max_pooling(
int window_height,
int window_width,
int stride_y,
int stride_x,
int padding_y,
int padding_x
);
void setup_avg_pooling(
int window_height,
int window_width,
int stride_y,
int stride_x,
int padding_y,
int padding_x
);
// Returns the stored do_max_pooling flag.
bool does_max_pooling(
) const { return do_max_pooling; }
// dest is the only argument taken by non-const reference.
void operator() (
resizable_tensor& dest,
const tensor& src
);
// grad is the only argument taken by non-const reference.
// NOTE(review): whether grad is assigned or accumulated into is not visible
// here -- confirm in the implementation.
void get_gradient(
const tensor& gradient_input,
const tensor& dest,
const tensor& src,
tensor& grad
);
private:
// Shared helper taking the same geometry as the public setup functions plus a
// pooling_mode selector.
// NOTE(review): presumably both public setup functions forward to it -- confirm.
void setup(
int window_height,
int window_width,
int stride_y,
int stride_x,
int padding_y,
int padding_x,
int pooling_mode
);
// Opaque handle; its concrete type is not visible in this header.
void* handle;
int window_height;
int window_width;
int stride_y;
int stride_x;
int padding_y;
int padding_x;
bool do_max_pooling;
};
// ------------------------------------------------------------------------------------
// Channel-wise softmax of src written to dest (may be done in place).
void softmax (
tensor& dest,
const tensor& src
);
/*!
requires
- have_same_dimensions(dest, src) == true
ensures
- Note that the softmax function is a vector valued function:
  s(x) == exp(x)/sum(exp(x))
- Computes the softmax function on src and writes the results to dest.  The
  softmax is computed per spatial location across the different channels at
  each location.  That is, softmax() outputs a new tensor, #dest, where
  each of the spatial locations in dest (i.e. image idx, row idx, and
  column idx) contains the output of s() evaluated over the channel values
  at each location.
- This function supports in-place operation, i.e. having
  is_same_object(dest, src)==true
!*/
// Backward pass of softmax(): grad may alias gradient_input but must not
// alias dest.
void softmax_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
/*!
requires
- have_same_dimensions(dest,gradient_input) == true
- have_same_dimensions(dest,grad) == true
- is_same_object(grad, dest)==false
ensures
- We interpret dest as the output of softmax(dest,SRC) for some SRC tensor.
  Then let f(SRC) == dot(gradient_input,dest).  Then this function computes
  the gradient of f() with respect to SRC and assigns it to grad.
- This function supports in-place operation, i.e. having
  is_same_object(grad, gradient_input)==true
!*/
// ------------------------------------------------------------------------------------
// NOTE(review): this header gives no written contract for softmax_all() or
// softmax_all_gradient().  Their parameter lists match softmax() and
// softmax_gradient(); presumably "all" means the softmax runs over every
// element of a sample instead of only across channels -- confirm the exact
// requirements (including aliasing rules) against the implementation.
void softmax_all (
tensor& dest,
const tensor& src
);
void softmax_all_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
// ------------------------------------------------------------------------------------
// Element-wise logistic sigmoid of src written to dest (may be done in place).
void sigmoid (
tensor& dest,
const tensor& src
);
/*!
requires
- have_same_dimensions(dest, src) == true
ensures
- for all valid i:
    - #dest.host()[i] == 1/(1+std::exp(-src.host()[i]))
- This function supports in-place operation, i.e. having
  is_same_object(dest, src)==true
!*/
// Backward pass of sigmoid(): grad may alias gradient_input but must not
// alias dest.
void sigmoid_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
/*!
requires
- have_same_dimensions(dest,gradient_input) == true
- have_same_dimensions(dest,grad) == true
- is_same_object(grad,dest) == false
ensures
- Recalling that dest is the output of sigmoid(dest,SRC) for some SRC tensor,
  let f(SRC) == dot(gradient_input,dest)
- Then this function computes the gradient of f() with respect to SRC and
  assigns it to grad.
- This function supports in-place operation, i.e. having
  is_same_object(grad, gradient_input)==true
!*/
// ------------------------------------------------------------------------------------
// Element-wise rectifier max(0, x) of src written to dest (may be done in place).
void relu (
tensor& dest,
const tensor& src
);
/*!
requires
- have_same_dimensions(dest, src) == true
ensures
- for all valid i:
    - #dest.host()[i] == std::max(0,src.host()[i])
- This function supports in-place operation, i.e. having
  is_same_object(dest, src)==true
!*/
// Backward pass of relu(): grad may alias gradient_input but must not alias dest.
void relu_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
/*!
requires
- have_same_dimensions(dest,gradient_input) == true
- have_same_dimensions(dest,grad) == true
- is_same_object(grad,dest) == false
ensures
- Recalling that dest is the output of relu(dest,SRC) for some SRC tensor,
  let f(SRC) == dot(gradient_input,dest)
- Then this function computes the gradient of f() with respect to SRC and
  assigns it to grad.
- This function supports in-place operation, i.e. having
  is_same_object(grad, gradient_input)==true
!*/
// ------------------------------------------------------------------------------------
// Element-wise hyperbolic tangent of src written to dest (may be done in place).
void tanh (
tensor& dest,
const tensor& src
);
/*!
requires
- have_same_dimensions(dest, src) == true
ensures
- for all valid i:
    - #dest.host()[i] == std::tanh(src.host()[i])
- This function supports in-place operation, i.e. having
  is_same_object(dest, src)==true
!*/
// Backward pass of tanh(): grad may alias gradient_input but must not alias dest.
void tanh_gradient (
tensor& grad,
const tensor& dest,
const tensor& gradient_input
);
/*!
requires
- have_same_dimensions(dest,gradient_input) == true
- have_same_dimensions(dest,grad) == true
- is_same_object(grad,dest) == false
ensures
- Recalling that dest is the output of tanh(dest,SRC) for some SRC tensor,
  let f(SRC) == dot(gradient_input,dest)
- Then this function computes the gradient of f() with respect to SRC and
  assigns it to grad.
- This function supports in-place operation, i.e. having
  is_same_object(grad, gradient_input)==true
!*/
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuDNN_H_

View File

@@ -0,0 +1,75 @@
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuRAND_H_
#define DLIB_DNN_CuRAND_H_
#ifdef DLIB_USE_CUDA
#include "tensor.h"
#include "cuda_errors.h"
#include "cuda_data_ptr.h"
namespace dlib
{
namespace cuda
{
// -----------------------------------------------------------------------------------
class curand_generator
{
/*!
Random number source for CUDA builds.  The object is not copyable and
keeps one opaque handle.
NOTE(review): the handle is presumably a cuRAND generator created by the
constructor and released by the destructor -- confirm in the implementation.
!*/
public:
// not copyable
curand_generator(const curand_generator&) = delete;
curand_generator& operator=(const curand_generator&) = delete;
// Default construction is the same as constructing with a seed of 0.
curand_generator() : curand_generator(0) {}
// Not marked explicit, so an integer converts implicitly to a generator.
curand_generator(unsigned long long seed);
~curand_generator();
void fill (
cuda_data_ptr<unsigned int>& data
);
/*!
ensures
- Fills data with random 32-bit unsigned integers.
!*/
void fill_gaussian (
tensor& data,
float mean = 0,
float stddev = 1
);
/*!
requires
- data.size()%2 == 0
- stddev >= 0
ensures
- Fills data with random numbers drawn from a Gaussian distribution
  with the given mean and standard deviation.
!*/
void fill_uniform (
tensor& data
);
/*!
ensures
- Fills data with uniform random numbers in the range (0.0, 1.0].
!*/
private:
// Opaque handle; its concrete type is not visible in this header.
void* handle;
};
// -----------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuRAND_H_

View File

@@ -0,0 +1,75 @@
// Copyright (C) 2017 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNN_CuSOLVER_H_
#define DLIB_DNN_CuSOLVER_H_
#ifdef DLIB_USE_CUDA
#include "tensor.h"
#include "cuda_errors.h"
#include "cuda_data_ptr.h"
#include "../noncopyable.h"
namespace dlib
{
namespace cuda
{
// -----------------------------------------------------------------------------------
class inv : noncopyable
{
/*!
WHAT THIS OBJECT REPRESENTS
This is a functor for doing matrix inversion on the GPU.  The only
reason it's an object is to avoid the reallocation of some GPU memory
blocks if you want to do a bunch of matrix inversions in a row.
!*/
public:
inv() = default;
~inv();
void operator() (
const tensor& m,
resizable_tensor& out
);
/*!
requires
- m.size() == m.num_samples()*m.num_samples()
  (i.e. mat(m) must be a square matrix)
ensures
- out == inv(mat(m));
!*/
int get_last_status(
);
/*!
ensures
- returns 0 if the last matrix inversion was successful and != 0
  otherwise.
!*/
private:
// NOTE(review): presumably waits for outstanding GPU work when
// did_work_lately is set -- confirm in the implementation.
void sync_if_needed();
// Starts out false in a freshly constructed object.
bool did_work_lately = false;
// Buffers kept between calls so repeated inversions can reuse them (see the
// class comment).  Note this member has the same name as the parameter of
// operator().
// NOTE(review): Ipiv and info look like the pivot-index and status outputs of
// an LU based solver -- confirm in the implementation.
resizable_tensor m;
cuda_data_ptr<float> workspace;
cuda_data_ptr<int> Ipiv;
cuda_data_ptr<int> info;
};
// ------------------------------------------------------------------------------------
}
}
#endif // DLIB_USE_CUDA
#endif // DLIB_DNN_CuSOLVER_H_

View File

@@ -0,0 +1,266 @@
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_GPU_DaTA_H_
#define DLIB_GPU_DaTA_H_
#include "gpu_data_abstract.h"
#include <memory>
#include <cstring>
#include "cuda_errors.h"
#include "../serialize.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
class gpu_data
{
    /*!
        CONVENTION
            - if (size() != 0) then
                - data_host == a pointer to size() floats in CPU memory.
            - if (data_device) then
                - data_device == a pointer to size() floats in device memory.

            - if (there might be an active async transfer from host to device) then
                - have_active_transfer == true

            - We use the host_current and device_current bools to keep track of which
              copy of the data (or both) are most current.  e.g. if the CPU has
              modified the data and it hasn't been copied to the device yet then
              host_current==true and device_current==false.

              Similarly, we use device_in_use==true to indicate that device() has been
              called and no operation to wait for all CUDA kernel completion has been
              executed.  So if device_in_use==true then there might be a CUDA kernel
              executing that is using the device memory block contained in this object.

            - All the flags above describe the memory blocks held by this object, so
              swap() (and therefore the move operations, which are built on it) has
              to exchange every one of them together with the blocks.
    !*/
public:

    // Creates an empty object: size()==0, both copies marked current.
    gpu_data(
    ) : data_size(0), host_current(true), device_current(true),have_active_transfer(false),device_in_use(false), the_device_id(0)
    {
    }

    // Not copyable
    gpu_data(const gpu_data&) = delete;
    gpu_data& operator=(const gpu_data&) = delete;

    // but is movable
    // Move construction starts from an empty object and swaps with item, so item
    // is left empty.  Move assignment swaps, so item leaves with the old state of
    // *this.
    gpu_data(gpu_data&& item) : gpu_data() { swap(item); }
    gpu_data& operator=(gpu_data&& item) { swap(item); return *this; }

    int device_id() const { return the_device_id; }

#ifdef DLIB_USE_CUDA
    void async_copy_to_device() const;
    void set_size(size_t new_size);
#else
    // Note that calls to host() or device() will block until any async transfers are complete.
    void async_copy_to_device() const{}

    // Host-only version.  Resizing to the current size keeps the existing
    // buffer; any other size replaces it with a new, uninitialized one (or with
    // no buffer at all when new_size is 0).
    void set_size(size_t new_size)
    {
        if (new_size == 0)
        {
            data_size = 0;
            host_current = true;
            device_current = true;
            device_in_use = false;
            data_host.reset();
            data_device.reset();
        }
        else if (new_size != data_size)
        {
            data_size = new_size;
            host_current = true;
            device_current = true;
            device_in_use = false;
            // The array deleter is required because shared_ptr<float> would
            // otherwise release the block with a plain delete.
            data_host.reset(new float[new_size], std::default_delete<float[]>());
            data_device.reset();
        }
    }
#endif

    // Read-only host access; brings the host copy up to date first.
    const float* host() const
    {
        copy_to_host();
        return data_host.get();
    }

    // Read-write host access; the device copy is marked stale afterwards.
    float* host()
    {
        copy_to_host();
        device_current = false;
        return data_host.get();
    }

    // Host access without first copying from the device; the device copy is
    // marked stale.
    float* host_write_only()
    {
        host_current = true;
        device_current = false;
        return data_host.get();
    }

    // Read-only device access; asserts when CUDA is not enabled.
    const float* device() const
    {
#ifndef DLIB_USE_CUDA
        DLIB_CASSERT(false, "CUDA NOT ENABLED");
#endif
        copy_to_device();
        device_in_use = true;
        return data_device.get();
    }

    // Read-write device access; the host copy is marked stale afterwards.
    float* device()
    {
#ifndef DLIB_USE_CUDA
        DLIB_CASSERT(false, "CUDA NOT ENABLED");
#endif
        copy_to_device();
        host_current = false;
        device_in_use = true;
        return data_device.get();
    }

    // Device access without first copying from the host.  It still waits for a
    // pending transfer to finish before handing out the pointer.
    float* device_write_only()
    {
#ifndef DLIB_USE_CUDA
        DLIB_CASSERT(false, "CUDA NOT ENABLED");
#endif
        wait_for_transfer_to_finish();
        host_current = false;
        device_current = true;
        device_in_use = true;
        return data_device.get();
    }

    bool host_ready (
    ) const { return host_current; }

    // The device copy only counts as ready once no transfer is in flight.
    bool device_ready (
    ) const { return device_current && !have_active_transfer; }

    size_t size() const { return data_size; }

    // Exchanges the complete state of *this and item.
    void swap (gpu_data& item)
    {
        std::swap(data_size, item.data_size);
        std::swap(host_current, item.host_current);
        std::swap(device_current, item.device_current);
        std::swap(have_active_transfer, item.have_active_transfer);
        // device_in_use describes the device block being exchanged below, so it
        // has to move with it.  Otherwise the object that receives a block which
        // a kernel may still be using would believe the block is idle.
        std::swap(device_in_use, item.device_in_use);
        std::swap(data_host, item.data_host);
        std::swap(data_device, item.data_device);
        std::swap(cuda_stream, item.cuda_stream);
        std::swap(the_device_id, item.the_device_id);
    }

private:

#ifdef DLIB_USE_CUDA
    void copy_to_device() const;
    void copy_to_host() const;
    void wait_for_transfer_to_finish() const;
#else
    // Without CUDA there is only the host copy, so these are no-ops.
    void copy_to_device() const{}
    void copy_to_host() const{}
    void wait_for_transfer_to_finish() const{}
#endif

    size_t data_size;
    // The flags are mutable because the const accessors update them.
    mutable bool host_current;
    mutable bool device_current;
    mutable bool have_active_transfer;
    mutable bool device_in_use;

    std::shared_ptr<float> data_host;
    std::shared_ptr<float> data_device;
    std::shared_ptr<void> cuda_stream;
    int the_device_id;
};
// Writes item to out as: a format version (currently 1), the number of floats,
// and then every float in order.  The values are read through host(), so the
// host copy is brought up to date first.
inline void serialize(const gpu_data& item, std::ostream& out)
{
    int version = 1;
    serialize(version, out);
    serialize(item.size(), out);

    const auto host_values = item.host();
    size_t idx = 0;
    while (idx < item.size())
    {
        serialize(host_values[idx], out);
        ++idx;
    }
}
// Reads back what serialize() wrote: the format version, the number of floats,
// and then each float.  item is resized to the stored count before the values
// are read.
// Throws serialization_error if the stored version is not 1.
inline void deserialize(gpu_data& item, std::istream& in)
{
    int version;
    deserialize(version, in);
    const bool known_version = (version == 1);
    if (!known_version)
        throw serialization_error("Unexpected version found while deserializing dlib::gpu_data.");

    size_t count;
    deserialize(count, in);
    item.set_size(count);

    auto host_values = item.host();
    size_t idx = 0;
    while (idx < item.size())
    {
        deserialize(host_values[idx], in);
        ++idx;
    }
}
#ifdef DLIB_USE_CUDA
// CUDA build: both overloads are only declared here and are defined out of line.
// The host-only versions with the same signatures are defined inline in the
// #else branch below.
// Copies all of src into dest.
void memcpy (gpu_data& dest, const gpu_data& src);
// Copies num floats starting at src_offset in src to dest_offset in dest.
void memcpy (
gpu_data& dest,
size_t dest_offset,
const gpu_data& src,
size_t src_offset,
size_t num
);
#else
// Host-only copy of all of src into dest.  The two objects must have the same
// size.  Copying an empty object, or an object onto itself, does nothing.
inline void memcpy (gpu_data& dest, const gpu_data& src)
{
    DLIB_CASSERT(dest.size() == src.size());

    const bool is_empty = (src.size() == 0);
    const bool is_self_copy = (&dest == &src);
    if (!is_empty && !is_self_copy)
    {
        // Every element of dest is overwritten, so its old contents need not be
        // brought up to date first.
        float* const to = dest.host_write_only();
        const float* const from = src.host();
        std::memcpy(to, from, sizeof(float)*src.size());
    }
}
// Host-only copy of num floats from src (starting at src_offset) to dest
// (starting at dest_offset).
//   - Both ranges must lie inside their objects (checked with DLIB_CASSERT).
//   - num == 0 is a no-op.
//   - If dest and src are the same object and the ranges overlap, the copy is
//     done with memmove semantics (or skipped when the ranges are identical).
inline void memcpy (
    gpu_data& dest,
    size_t dest_offset,
    const gpu_data& src,
    size_t src_offset,
    size_t num
)
{
    // These checks mean "offset + num <= size()", but are written so the
    // unsigned sum can't wrap around and let an out of range request through.
    DLIB_CASSERT(num <= dest.size() && dest_offset <= dest.size() - num);
    DLIB_CASSERT(num <= src.size() && src_offset <= src.size() - num);
    if (num == 0)
        return;

    // Two ranges of equal length overlap exactly when the larger start lies
    // before the end of the range with the smaller start.  The sum below can't
    // wrap because both ranges were just checked to be in bounds.
    if (&dest == &src && std::max(dest_offset, src_offset) < std::min(dest_offset,src_offset)+num)
    {
        // if they perfectly alias each other then there is nothing to do
        if (dest_offset == src_offset)
            return;
        else
            std::memmove(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num);
    }
    else
    {
        // if we write to the entire thing then we can use host_write_only()
        if (dest_offset == 0 && num == dest.size())
            std::memcpy(dest.host_write_only(), src.host()+src_offset, sizeof(float)*num);
        else
            std::memcpy(dest.host()+dest_offset, src.host()+src_offset, sizeof(float)*num);
    }
}
#endif
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_GPU_DaTA_H_

View File

@@ -0,0 +1,266 @@
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_GPU_DaTA_ABSTRACT_H_
#ifdef DLIB_GPU_DaTA_ABSTRACT_H_
#include "cuda_errors.h"
#include "../serialize.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
class gpu_data
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            This object is a block of size() floats, all stored contiguously in memory.
            Importantly, it keeps two copies of the floats, one on the host CPU side
            and another on the GPU device side. It automatically performs the necessary
            host/device transfers to keep these two copies of the data in sync.

            All transfers to the device happen asynchronously with respect to the
            default CUDA stream so that CUDA kernel computations can overlap with data
            transfers.  However, any transfers from the device to the host happen
            synchronously in the default CUDA stream.  Therefore, you should perform
            all your CUDA kernel launches on the default stream so that transfers back
            to the host do not happen before the relevant computations have completed.

            If DLIB_USE_CUDA is not #defined then this object will not use CUDA at all.
            Instead, it will simply store one host side memory block of floats.

        THREAD SAFETY
            Instances of this object are not thread-safe.  So don't touch one from
            multiple threads at the same time.
    !*/
public:

    gpu_data(
    );
    /*!
        ensures
            - #size() == 0
            - #host() == nullptr
            - #device() == nullptr
            - #host_ready() == true
            - #device_ready() == true
            - #device_id() == 0
    !*/

    // This object is not copyable, however, it is movable.
    gpu_data(const gpu_data&) = delete;
    gpu_data& operator=(const gpu_data&) = delete;
    gpu_data(gpu_data&& item);
    gpu_data& operator=(gpu_data&& item);

    int device_id(
    ) const;
    /*!
        ensures
            - returns the ID of the CUDA device that allocated this memory. I.e. the
              number returned by cudaGetDevice() when the memory was allocated.
            - If CUDA is not being used then this function always returns 0.
    !*/

    void async_copy_to_device(
    ) const;
    /*!
        ensures
            - This function is const, matching its declaration in gpu_data.h, so a
              transfer can be started through a const gpu_data.
            - if (!device_ready()) then
                - Begins asynchronously copying host data to the device once it is safe
                  to do so.  I.e. This function will wait until any previously
                  scheduled CUDA kernels, which are using the device() memory block,
                  have completed before transferring the new data to the device.
                - A call to device() that happens before the transfer completes will
                  block until the transfer is complete.  That is, it is safe to call
                  async_copy_to_device() and then immediately call device().
    !*/

    void set_size(
        size_t new_size
    );
    /*!
        ensures
            - #size() == new_size
    !*/

    bool host_ready (
    ) const;
    /*!
        ensures
            - returns true if and only if the host's copy of the data is current.  The
              host's data is current if there aren't any modifications to the data
              which were made on the device side that have yet to be copied to the
              host.
    !*/

    bool device_ready (
    ) const;
    /*!
        ensures
            - returns true if and only if the device's copy of the data is current.
              The device's data is current if there aren't any modifications to the
              data which were made on the host side that have yet to be copied to the
              device.
    !*/

    const float* host(
    ) const;
    /*!
        ensures
            - returns a pointer to the host memory block of size() contiguous float
              values or nullptr if size()==0.
            - if (!host_ready()) then
                - copies the data from the device to the host, while this is happening
                  the call to host() blocks.
            - #host_ready() == true
    !*/

    float* host(
    );
    /*!
        ensures
            - returns a pointer to the host memory block of size() contiguous float
              values or nullptr if size()==0.
            - if (!host_ready()) then
                - copies the data from the device to the host, while this is happening
                  the call to host() blocks.
            - #host_ready() == true
            - #device_ready() == false
              I.e. Marks the device side data as out of date so that the next call to
              device() will perform a host to device transfer.  If you want to begin
              the transfer immediately then you can call async_copy_to_device() after
              calling host().
    !*/

    float* host_write_only(
    );
    /*!
        ensures
            - This function returns the same pointer as host(), except that it never
              performs a device to host memory copy.  Instead, it immediately marks the
              device side data as out of date, effectively discarding it.  Therefore,
              the values in the data pointed to by host_write_only() are undefined and
              you should only call host_write_only() if you are going to assign to
              every memory location in the returned memory block.
            - #host_ready() == true
            - #device_ready() == false
    !*/

    const float* device(
    ) const;
    /*!
        requires
            - DLIB_USE_CUDA is #defined
        ensures
            - returns a pointer to the device memory block of size() contiguous float
              values or nullptr if size()==0.
            - if (!device_ready()) then
                - copies the data from the host to the device, while this is happening
                  the call to device() blocks.
            - #device_ready() == true
    !*/

    float* device(
    );
    /*!
        requires
            - DLIB_USE_CUDA is #defined
        ensures
            - returns a pointer to the device memory block of size() contiguous float
              values or nullptr if size()==0.
            - if (!device_ready()) then
                - copies the data from the host to the device, while this is happening
                  the call to device() blocks.
            - #host_ready() == false
            - #device_ready() == true
    !*/

    float* device_write_only(
    );
    /*!
        requires
            - DLIB_USE_CUDA is #defined
        ensures
            - This function returns the same pointer as device(), except that it never
              performs a host to device memory copy.  Instead, it immediately marks the
              host side data as out of date, effectively discarding it.  Therefore, the
              values in the data pointed to by device_write_only() are undefined and
              you should only call device_write_only() if you are going to assign to
              every memory location in the returned memory block.
            - #host_ready() == false
            - #device_ready() == true
    !*/

    size_t size(
    ) const;
    /*!
        ensures
            - returns the number of floats contained in this object.
    !*/

    void swap (
        gpu_data& item
    );
    /*!
        ensures
            - swaps the state of *this and item
    !*/

};
void serialize(const gpu_data& item, std::ostream& out);
void deserialize(gpu_data& item, std::istream& in);
/*!
provides serialization support

The implementation in gpu_data.h writes a format version number, then
size(), then each float.  deserialize() resizes item to the stored size
and throws serialization_error if the stored version number is unexpected.
!*/
void memcpy (
gpu_data& dest,
const gpu_data& src
);
/*!
requires
- dest.size() == src.size()
ensures
- Copies the data in src to dest.  If the device data is current (i.e.
  device_ready()==true) on both src and dest then the copy will happen entirely
  on the device side.
- If src is empty, or dest and src are the same object, then this function
  does nothing (this is how the host-only implementation in gpu_data.h
  behaves).
- It doesn't matter what GPU device is selected by cudaSetDevice().  You can
  always copy gpu_data objects to and from each other regardless.
- This function blocks until the copy has completed.
!*/
void memcpy (
gpu_data& dest,
size_t dest_offset,
const gpu_data& src,
size_t src_offset,
size_t num
);
/*!
requires
- dest_offset + num <= dest.size()
- src_offset + num <= src.size()
ensures
- Copies the data in src to dest, but only copies data in the range
[src.host()+src_offset, src.host()+src_offset+num) to
[dest.host()+dest_offset, dest.host()+dest_offset+num). Therefore, it is
just like the above memcpy() except that you can specify some subset of data
in a gpu_data object to be copied.
- Like the above version of memcpy(), the copy will happen in the most
efficient way, automatically using the appropriate type of host/device
transfers based on where data is currently resident.
- It doesn't matter what GPU device is selected by cudaSetDevice(). You can
always copy gpu_data objects to and from each other regardless.
- This function blocks until the copy has completed.
!*/
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_GPU_DaTA_ABSTRACT_H_

View File

@@ -0,0 +1,686 @@
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_DNn_TENSOR_H_
#define DLIB_DNn_TENSOR_H_
#include "tensor_abstract.h"
#include <cstring>
#include "../matrix.h"
#include "cudnn_dlibapi.h"
#include "gpu_data.h"
#include "../byte_orderer.h"
#include <memory>
#include "../any.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
class tensor;
// Forward declarations of the two CUDA-side helpers that the tensor class below calls
// from tensor::operator=(float) and tensor::operator*=(float) when DLIB_USE_CUDA is
// #defined.  They are only declared in this header, not defined.
namespace cuda
{
// Called by tensor::operator=(float) to assign value to t when t's device copy is current.
void set_tensor (
tensor& t,
float value
);
// Called by tensor::operator*=(float) to scale t by value.
void scale_tensor (
tensor& t,
float value
);
}
// ----------------------------------------------------------------------------------------
class tensor
{
public:
tensor (
) :
m_n(0), m_k(0), m_nr(0), m_nc(0), m_size(0)
{
}
virtual ~tensor() {}
long long num_samples() const { return m_n; }
long long k() const { return m_k; }
long long nr() const { return m_nr; }
long long nc() const { return m_nc; }
size_t size() const { return m_size; }
typedef float* iterator;
typedef const float* const_iterator;
iterator begin() { return host(); }
const_iterator begin() const { return host(); }
iterator end() { return host()+size(); }
const_iterator end() const { return host()+size(); }
void async_copy_to_device() const
{
data().async_copy_to_device();
}
virtual const float* host() const = 0;
virtual float* host() = 0;
virtual float* host_write_only() = 0;
virtual const float* device() const = 0;
virtual float* device() = 0;
virtual float* device_write_only() = 0;
virtual const any& annotation() const = 0;
virtual any& annotation() = 0;
int device_id() const { return data().device_id(); }
tensor& operator= (float val)
{
#ifdef DLIB_USE_CUDA
// If you are using CUDA then presumably you will be mostly using tensors on
// the GPU. So unless you seem to be actively working with the host side's
// data then we do this initialization on the device side since this avoids a
// host to device transfer that would likely immediately follow.
if (data().device_ready())
{
cuda::set_tensor(*this, val);
return *this;
}
#endif
auto d = host_write_only();
for (size_t i = 0; i < size(); ++i)
d[i] = val;
return *this;
}
tensor& operator*= (float val)
{
#ifdef DLIB_USE_CUDA
cuda::scale_tensor(*this, val);
return *this;
#else
for (auto& d : *this)
d *= val;
return *this;
#endif
}
tensor& operator/= (float val)
{
*this *= 1.0/val;
return *this;
}
template <typename EXP>
tensor& operator= (const matrix_exp<EXP>& item)
{
DLIB_CASSERT(num_samples() == item.nr() &&
nr()*nc()*k() == item.nc());
static_assert((is_same_type<float, typename EXP::type>::value == true),
"To assign a matrix to a tensor the matrix must contain float values");
set_ptrm(host_write_only(), m_n, m_nr*m_nc*m_k) = item;
return *this;
}
template <typename EXP>
tensor& operator+= (const matrix_exp<EXP>& item)
{
DLIB_CASSERT(num_samples() == item.nr() &&
nr()*nc()*k() == item.nc());
static_assert((is_same_type<float, typename EXP::type>::value == true),
"To assign a matrix to a tensor the matrix must contain float values");
set_ptrm(host(), m_n, m_nr*m_nc*m_k) += item;
return *this;
}
template <typename EXP>
tensor& operator-= (const matrix_exp<EXP>& item)
{
DLIB_CASSERT(num_samples() == item.nr() &&
nr()*nc()*k() == item.nc());
static_assert((is_same_type<float, typename EXP::type>::value == true),
"To assign a matrix to a tensor the matrix must contain float values");
set_ptrm(host(), m_n, m_nr*m_nc*m_k) -= item;
return *this;
}
template <typename EXP>
void set_sample (
unsigned long long idx,
const matrix_exp<EXP>& item
)
{
DLIB_CASSERT(idx < (unsigned long long)num_samples());
DLIB_CASSERT(item.size() == nr()*nc()*k());
static_assert((is_same_type<float, typename EXP::type>::value == true),
"To assign a matrix to a tensor the matrix must contain float values");
set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) = item;
}
template <typename EXP>
void add_to_sample (
unsigned long long idx,
const matrix_exp<EXP>& item
)
{
DLIB_CASSERT(idx < (unsigned long long)num_samples());
DLIB_CASSERT(item.size() == nr()*nc()*k());
static_assert((is_same_type<float, typename EXP::type>::value == true),
"To assign a matrix to a tensor the matrix must contain float values");
set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) += item;
}
#ifdef DLIB_USE_CUDA
virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor (
) const = 0;
#endif
friend void memcpy (
tensor& dest,
const tensor& src
)
{
DLIB_CASSERT(dest.size() == src.size());
memcpy(dest.data(), dest.get_alias_offset(),
src.data(), src.get_alias_offset(),
src.size());
}
protected:
friend class alias_tensor;
virtual gpu_data& data() = 0;
virtual const gpu_data& data() const = 0;
virtual size_t get_alias_offset() const { return 0; } // needed by alias_tensor.
long long m_n;
long long m_k;
long long m_nr;
long long m_nc;
long long m_size; // always equal to m_n*m_k*m_nr*m_nc
};
// ----------------------------------------------------------------------------------------
// Returns true when one single dimension of t accounts for all of its elements, i.e.
// t.size() equals num_samples(), k(), nr(), or nc().
inline bool is_vector (
    const tensor& t
)
{
    const size_t total = t.size();
    if (total == static_cast<size_t>(t.num_samples()))
        return true;
    if (total == static_cast<size_t>(t.k()))
        return true;
    if (total == static_cast<size_t>(t.nr()))
        return true;
    return total == static_cast<size_t>(t.nc());
}
// ----------------------------------------------------------------------------------------
// Returns a matrix expression that views t's host memory as an nr x nc matrix.
// Requires nr >= 0, nc >= 0 and nr*nc == t.size().
inline const matrix_op<op_pointer_to_mat<float> > mat (
    const tensor& t,
    long long nr,
    long long nc
)
{
    DLIB_ASSERT(nr >= 0 && nc >= 0 ,
        "\tconst matrix_exp mat(tensor, nr, nc)"
        << "\n\t nr and nc must be >= 0"
        << "\n\t nr: " << nr
        << "\n\t nc: " << nc
        );
    DLIB_ASSERT(nr*nc == (long long)t.size() ,
        "\tconst matrix_exp mat(tensor, nr, nc)"
        << "\n\t The sizes don't match up."
        << "\n\t nr*nc: " << nr*nc
        << "\n\t t.size(): " << t.size()
        );

    using op = op_pointer_to_mat<float>;
    const op view(t.host(),nr,nc);
    return matrix_op<op>(view);
}
// Returns t viewed as a matrix with one row per sample, or an empty matrix when t
// contains no elements.
inline const matrix_op<op_pointer_to_mat<float> > mat (
    const tensor& t
)
{
    // Handle the empty tensor first; this also keeps the division below away from a
    // zero num_samples().
    if (t.size() == 0)
        return mat(static_cast<float*>(nullptr),0,0);

    return mat(t, t.num_samples(), t.size()/t.num_samples());
}
// Returns a t.nr() x t.nc() matrix expression that views channel k of sample `sample`
// inside t's host memory.  Requires a non-empty t and in-range sample and k.
inline const matrix_op<op_pointer_to_mat<float> > image_plane (
    const tensor& t,
    long long sample = 0,
    long long k = 0
)
{
    DLIB_ASSERT(0 <= sample && sample < t.num_samples() &&
                0 <= k && k < t.k() &&
                t.size() != 0,
        "\tconst matrix_exp image_plane(tensor,sample,k)"
        << "\n\t Invalid arguments were given to this function."
        << "\n\t sample: " << sample
        << "\n\t k: " << k
        << "\n\t t.num_samples(): " << t.num_samples()
        << "\n\t t.k(): " << t.k()
        << "\n\t t.size(): " << t.size()
        );

    using op = op_pointer_to_mat<float>;
    // Number of floats that precede the requested plane.
    const long long plane_start = ((sample*t.k() + k)*t.nr())*t.nc();
    return matrix_op<op>(op(t.host() + plane_start,
                            t.nr(),
                            t.nc()));
}
// ----------------------------------------------------------------------------------------
// Returns true if and only if a and b agree in all four dimensions.
inline bool have_same_dimensions (
    const tensor& a,
    const tensor& b
)
{
    if (a.num_samples() != b.num_samples())
        return false;
    if (a.k() != b.k())
        return false;
    if (a.nr() != b.nr())
        return false;
    return a.nc() == b.nc();
}
// ----------------------------------------------------------------------------------------
class resizable_tensor : public tensor
{
    /*
        A tensor that owns its storage (a gpu_data object) and whose dimensions can be
        changed after construction.  Shrinking never releases memory (see set_size());
        use clear() to actually free it.
    */
public:

    // Creates an empty tensor.
    resizable_tensor(
    )
    {}

    // Creates a tensor holding a copy of item, with num_samples()==item.nr(),
    // k()==item.nc() and nr()==nc()==1.
    template <typename EXP>
    resizable_tensor(
        const matrix_exp<EXP>& item
    )
    {
        set_size(item.nr(), item.nc());
        *this = item;
    }

    // Creates a tensor with the given dimensions.  All dimensions must be >= 0.
    explicit resizable_tensor(
        long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
    )
    {
        DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0);
        set_size(n_,k_,nr_,nc_);
    }

    // Copying duplicates the annotation, the dimensions and the float data.
    resizable_tensor(const resizable_tensor& item) : _annotation(item.annotation())
    {
        copy_size(item);
        memcpy(*this, item);
    }
    resizable_tensor(const tensor& item) : _annotation(item.annotation())
    {
        copy_size(item);
        memcpy(*this, item);
    }

    // Moving is a swap with a default constructed (empty) tensor.
    resizable_tensor(resizable_tensor&& item) { swap(item); }
    resizable_tensor& operator=(resizable_tensor&& item) { swap(item); return *this; }

    // Storage accessors; all of them forward to the owned gpu_data object.
    virtual const float* host() const { return data_instance.host(); }
    virtual float* host() { return data_instance.host(); }
    virtual float* host_write_only() { return data_instance.host_write_only(); }
    virtual const float* device() const { return data_instance.device(); }
    virtual float* device() { return data_instance.device(); }
    virtual float* device_write_only() { return data_instance.device_write_only(); }

    virtual const any& annotation() const { return _annotation; }
    virtual any& annotation() { return _annotation; }

    // Returns the number of floats currently allocated.  This can exceed size()
    // because set_size() never shrinks the underlying gpu_data object.
    size_t capacity (
    ) const
    {
        return data_instance.size();
    }

    // Makes the tensor empty, drops the annotation and releases the allocation.
    void clear(
    )
    {
        set_size(0,0,0,0);
        _annotation.clear();
        // set_size() above never shrinks the allocation, so free the underlying
        // memory explicitly.
        data_instance.set_size(0);
    }

    // Resizes *this to have the same dimensions as item.
    void copy_size (
        const tensor& item
    )
    {
        set_size(item.num_samples(), item.k(), item.nr(), item.nc());
    }

    // Sets every element to val.
    resizable_tensor& operator= (float val)
    {
        tensor::operator=(val);
        return *this;
    }

    // Assigns item to *this.  The tensor is resized to item.nr() x item.nc() x 1 x 1
    // only if item does not already fit the current dimensions.
    template <typename EXP>
    resizable_tensor& operator= (
        const matrix_exp<EXP>& item
    )
    {
        if (!(num_samples() == item.nr() && k()*nr()*nc() == item.nc()))
            set_size(item.nr(), item.nc());
        tensor::operator=(item);
        return *this;
    }

    // Changes the dimensions.  All dimensions must be >= 0.  The allocation only ever
    // grows here, so shrinking and regrowing a tensor does not reallocate.
    void set_size(
        long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
    )
    {
        DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0);

        m_n = n_;
        m_k = k_;
        m_nr = nr_;
        m_nc = nc_;
        m_size = n_*k_*nr_*nc_;
        if ((long long)data_instance.size() < m_size)
            data_instance.set_size(m_size);
#ifdef DLIB_USE_CUDA
        cudnn_descriptor.set_size(m_n,m_k,m_nr,m_nc);
#endif
    }

    // Copy assignment: build the copy first, then swap it in.
    resizable_tensor& operator= (const resizable_tensor& item)
    {
        resizable_tensor temp(item);
        temp.swap(*this);
        return *this;
    }

    resizable_tensor& operator= (const tensor& item)
    {
        resizable_tensor temp(item);
        temp.swap(*this);
        return *this;
    }

    // Exchanges dimensions, storage, annotation (and cuDNN descriptor) with item.
    void swap(resizable_tensor& item)
    {
        std::swap(m_n, item.m_n);
        std::swap(m_k, item.m_k);
        std::swap(m_nr, item.m_nr);
        std::swap(m_nc, item.m_nc);
        std::swap(m_size, item.m_size);
        std::swap(data_instance, item.data_instance);
        std::swap(_annotation, item._annotation);
#ifdef DLIB_USE_CUDA
        std::swap(cudnn_descriptor, item.cudnn_descriptor);
#endif
    }

#ifdef DLIB_USE_CUDA
    virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor (
    ) const { return cudnn_descriptor; }
#endif

private:

#ifdef DLIB_USE_CUDA
    cuda::tensor_descriptor cudnn_descriptor;
#endif

    gpu_data data_instance;
    any _annotation;
    virtual gpu_data& data() { return data_instance; }
    virtual const gpu_data& data() const { return data_instance; }
};
inline void serialize(const tensor& item, std::ostream& out)
{
int version = 2;
serialize(version, out);
serialize(item.num_samples(), out);
serialize(item.k(), out);
serialize(item.nr(), out);
serialize(item.nc(), out);
byte_orderer bo;
auto sbuf = out.rdbuf();
for (auto d : item)
{
// Write out our data as 4byte little endian IEEE floats rather than using
// dlib's default float serialization. We do this because it will result in
// more compact outputs. It's slightly less portable but it seems doubtful
// that any CUDA enabled platform isn't going to use IEEE floats. But if one
// does we can just update the serialization code here to handle it if such a
// platform is encountered.
bo.host_to_little(d);
static_assert(sizeof(d)==4, "This serialization code assumes we are writing 4 byte floats");
sbuf->sputn((char*)&d, sizeof(d));
}
}
// Reads a tensor written by serialize(const tensor&, std::ostream&) into item,
// resizing item to the stored dimensions.
//
// Throws serialization_error if the format version is not 2, if any stored dimension
// is negative, or if the stream ends before all the floats have been read (in that
// last case badbit is also set on in).
inline void deserialize(resizable_tensor& item, std::istream& in)
{
    int version = 0;
    deserialize(version, in);
    if (version != 2)
        throw serialization_error("Unexpected version found while deserializing dlib::resizable_tensor.");

    long long num_samples=0, k=0, nr=0, nc=0;
    deserialize(num_samples, in);
    deserialize(k, in);
    deserialize(nr, in);
    deserialize(nc, in);

    // The dimensions come straight from the stream, so they have to be checked before
    // they are used.  A negative value would violate set_size()'s precondition (which is
    // only checked in debug builds) and could make size() wrap around to a huge value,
    // sending the read loop below past the end of the allocated buffer.
    if (num_samples < 0 || k < 0 || nr < 0 || nc < 0)
        throw serialization_error("Invalid negative dimension found while deserializing dlib::resizable_tensor.");

    item.set_size(num_samples, k, nr, nc);

    byte_orderer bo;
    auto sbuf = in.rdbuf();
    for (auto& d : item)
    {
        static_assert(sizeof(d)==4, "This serialization code assumes we are writing 4 byte floats");
        if (sbuf->sgetn((char*)&d,sizeof(d)) != sizeof(d))
        {
            in.setstate(std::ios::badbit);
            throw serialization_error("Error reading data while deserializing dlib::resizable_tensor.");
        }
        // The stream stores little endian floats; convert to the host's byte order.
        bo.little_to_host(d);
    }
}
// ----------------------------------------------------------------------------------------
// Returns the dot product of a and b, treating both as flat vectors of a.size()
// floats.  The products are accumulated in a double.  Requires a.size() == b.size().
inline double dot(
    const tensor& a,
    const tensor& b
)
{
    DLIB_CASSERT(a.size() == b.size());

    const float* lhs = a.host();
    const float* rhs = b.host();
    double total = 0;
    for (size_t idx = 0; idx < a.size(); ++idx)
    {
        total += lhs[idx]*rhs[idx];
    }
    return total;
}
// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
class alias_tensor_instance : public tensor
{
    /*
        A tensor that owns no memory of its own.  It stores pointers to another
        tensor's gpu_data and annotation plus an offset (in floats) into that data, and
        every accessor below is expressed relative to that offset.
    */

    // Private: only the friends declared below can create an instance.  A default
    // constructed instance points at nothing until alias_tensor::operator() fills in
    // the pointers.
    alias_tensor_instance(
    ) : data_instance(nullptr), _annotation(nullptr), data_offset(0) {}

public:
    friend class alias_tensor;
    friend class alias_tensor_const_instance;

    // Sets every aliased element to val.
    alias_tensor_instance& operator= (float val)
    {
        tensor::operator=(val);
        return *this;
    }

    // Assigns item to the aliased elements; see tensor::operator= for requirements.
    template <typename EXP>
    alias_tensor_instance& operator= (const matrix_exp<EXP>& item)
    {
        tensor::operator=(item);
        return *this;
    }

    virtual const float* host() const { return data_instance->host()+data_offset; }
    virtual float* host() { return data_instance->host()+data_offset; }
    // Note that the *_write_only() accessors forward to the plain host()/device()
    // versions -- presumably because the alias may cover only part of the underlying
    // buffer, so the remainder of that buffer must not be discarded.
    virtual float* host_write_only() { return data_instance->host()+data_offset; }

    virtual const float* device() const { return data_instance->device()+data_offset; }
    virtual float* device() { return data_instance->device()+data_offset; }
    virtual float* device_write_only() { return data_instance->device()+data_offset; }

    // The annotation is the aliased tensor's annotation, not a separate object.
    virtual const any& annotation() const { return *_annotation; }
    virtual any& annotation() { return *_annotation; }

#ifdef DLIB_USE_CUDA
    virtual const cuda::tensor_descriptor& get_cudnn_tensor_descriptor (
    ) const { return *cudnn_descriptor; }
#endif

private:

    virtual size_t get_alias_offset() const { return data_offset; }

#ifdef DLIB_USE_CUDA
    std::shared_ptr<cuda::tensor_descriptor> cudnn_descriptor;
#endif

    gpu_data* data_instance;  // storage of the aliased tensor (not owned)
    any* _annotation;         // annotation of the aliased tensor (not owned)
    size_t data_offset;       // index of our first float within *data_instance
    virtual gpu_data& data() { return *data_instance; }
    virtual const gpu_data& data() const { return *data_instance; }
};
// ----------------------------------------------------------------------------------------
class alias_tensor_const_instance
{
    /*
        Read-only wrapper around an alias_tensor_instance.  It only ever hands out
        const tensor references to the wrapped alias.
    */
public:
    // Non-const alias tensors are convertible to const ones.  This copies the alias
    // object (its pointers and offset), not the floats it refers to.
    alias_tensor_const_instance(const alias_tensor_instance& item) : inst(item) {}

    // Both of these expose the alias as a const tensor.
    const tensor& get() const { return inst; }
    operator const tensor& () { return inst; }

private:
    friend class alias_tensor;

    // Default construction is reserved for alias_tensor::operator().
    alias_tensor_const_instance() {}

    alias_tensor_instance inst;
};
// ----------------------------------------------------------------------------------------
class alias_tensor
{
    /*
        Factory for tensors that alias a region of another tensor.  An alias_tensor
        stores only a shape; calling it on a tensor returns an alias_tensor_instance of
        that shape which points into the given tensor's memory.
    */
public:

    // Creates an alias_tensor with an empty shape.
    alias_tensor (
    ) {}

    // Creates an alias_tensor with the given shape.  All dimensions must be >= 0.
    alias_tensor (
        long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
    )
    {
        DLIB_ASSERT( n_ >= 0 && k_ >= 0 && nr_ >= 0 && nc_ >= 0);

        inst.m_n = n_;
        inst.m_k = k_;
        inst.m_nr = nr_;
        inst.m_nc = nc_;
        inst.m_size = n_*k_*nr_*nc_;
    }

    // Shape accessors.
    long long num_samples(
    ) const { return inst.m_n; }

    long long k(
    ) const { return inst.m_k; }

    long long nr(
    ) const { return inst.m_nr; }

    long long nc(
    ) const { return inst.m_nc; }

    size_t size(
    ) const { return inst.m_size; }

    // Returns a tensor of this object's shape that aliases the size() floats of t
    // beginning at index offset.  Requires offset+size() <= t.size().
    alias_tensor_instance operator() (
        tensor& t,
        size_t offset = 0
    ) const
    {
        DLIB_CASSERT(offset+size() <= t.size(),
            "offset: "<<offset <<"\n"<<
            "size(): "<<size() <<"\n"<<
            "t.size(): "<<t.size() <<"\n");

#ifdef DLIB_USE_CUDA
        // The cuDNN descriptor is created on first use and then kept in inst, so every
        // instance returned from here shares it through the shared_ptr.
        if (!inst.cudnn_descriptor)
        {
            inst.cudnn_descriptor = std::make_shared<cuda::tensor_descriptor>();
            inst.cudnn_descriptor->set_size(inst.m_n, inst.m_k, inst.m_nr, inst.m_nc);
        }
#endif
        // t may itself be an aliasing tensor, so our offset is relative to wherever t
        // begins inside the underlying data.
        inst.data_offset = t.get_alias_offset()+offset;
        inst._annotation = &t.annotation();
        inst.data_instance = &t.data();
        return inst;
    }

    // Const overload: same aliasing, but the result only exposes a const tensor.
    alias_tensor_const_instance operator() (
        const tensor& t,
        size_t offset = 0
    ) const
    {
        alias_tensor_const_instance result;
        // The const_cast only serves to reuse the overload above; the returned wrapper
        // hands out nothing but const references.
        result.inst = (*this)(const_cast<tensor&>(t),offset);
        return result;
    }

private:
    // mutable because the const operator() overloads fill in the pointers (and lazily
    // create the cuDNN descriptor) on this cached instance before copying it out.
    mutable alias_tensor_instance inst;
};
// Writes item's shape to out: format version (1) followed by the four dimensions.
// An alias_tensor holds no float data, so nothing else is written.
inline void serialize(const alias_tensor& item, std::ostream& out)
{
    const int version = 1;
    serialize(version, out);

    serialize(item.num_samples(), out);
    serialize(item.k(), out);
    serialize(item.nr(), out);
    serialize(item.nc(), out);
}
// Reads a shape written by serialize(const alias_tensor&, std::ostream&) into item.
//
// Throws serialization_error if the format version is not 1 or if any stored
// dimension is negative.
inline void deserialize(alias_tensor& item, std::istream& in)
{
    int version = 0;
    deserialize(version, in);
    if (version != 1)
        throw serialization_error("Unexpected version found while deserializing dlib::alias_tensor.");

    long long num_samples=0, k=0, nr=0, nc=0;
    deserialize(num_samples, in);
    deserialize(k, in);
    deserialize(nr, in);
    deserialize(nc, in);

    // The dimensions come straight from the stream.  alias_tensor's constructor requires
    // them to be non-negative but only checks that in debug builds, so reject bad input
    // here instead of building an alias_tensor with a nonsensical size().
    if (num_samples < 0 || k < 0 || nr < 0 || nc < 0)
        throw serialization_error("Invalid negative dimension found while deserializing dlib::alias_tensor.");

    item = alias_tensor(num_samples, k, nr, nc);
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_DNn_TENSOR_H_

View File

@@ -0,0 +1,727 @@
// Copyright (C) 2015 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#undef DLIB_DNn_TENSOR_ABSTRACT_H_
#ifdef DLIB_DNn_TENSOR_ABSTRACT_H_
#include "../matrix.h"
#include "../any/any_abstract.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
class tensor
{
/*!
WHAT THIS OBJECT REPRESENTS
This object represents a 4D array of float values, all stored contiguously
in memory. Importantly, it keeps two copies of the floats, one on the host
CPU side and another on the GPU device side. It automatically performs the
necessary host/device transfers to keep these two copies of the data in
sync.
All transfers to the device happen asynchronously with respect to the
default CUDA stream so that CUDA kernel computations can overlap with data
transfers. However, any transfers from the device to the host happen
synchronously in the default CUDA stream. Therefore, you should perform
all your CUDA kernel launches on the default stream so that transfers back
to the host do not happen before the relevant computations have completed.
If DLIB_USE_CUDA is not #defined then this object will not use CUDA at all.
Instead, it will simply store one host side memory block of floats.
Finally, the convention in dlib code is to interpret the tensor as a set of
num_samples() 3D arrays, each of dimension k() by nr() by nc(). Also,
while this class does not specify a memory layout, the convention is to
assume that indexing into an element at coordinates (sample,k,r,c) can be
accomplished via:
host()[((sample*t.k() + k)*t.nr() + r)*t.nc() + c]
THREAD SAFETY
Instances of this object are not thread-safe. So don't touch one from
multiple threads at the same time.
!*/
public:
virtual ~tensor();
long long num_samples(
) const;
/*!
ensures
- returns the number of 3D arrays of dimension k() by nr() by nc() there
are in this object.
!*/
long long k(
) const;
/*!
ensures
- returns the k dimension of this tensor. Generally, we think of a tensor
as containing num_samples() images of nr() by nc() rows and columns, each
with k() channels.
!*/
long long nr(
) const;
/*!
ensures
- returns the number of rows in this tensor.
!*/
long long nc(
) const;
/*!
ensures
- returns the number of columns in this tensor.
!*/
size_t size(
) const;
/*!
ensures
- returns num_samples()*k()*nr()*nc()
(i.e. the total number of floats in this tensor)
!*/
void async_copy_to_device(
) const;
/*!
ensures
- This function does not block.
- if (the host version of the data is newer than the device's copy) then
- Begins asynchronously copying host data to the device.
- A call to device() that happens before the transfer completes will
block until the transfer is complete. That is, it is safe to call
async_copy_to_device() and then immediately call device().
!*/
typedef float* iterator;
typedef const float* const_iterator;
iterator begin() { return host(); }
const_iterator begin() const { return host(); }
iterator end() { return host()+size(); }
const_iterator end() const { return host()+size(); }
/*!
ensures
- makes a tensor iterable just like the STL containers.
!*/
virtual const float* host(
) const = 0;
/*!
ensures
- returns a pointer to the host memory block of size() contiguous float
values or nullptr if size()==0.
- if (the host's copy of the data is out of date) then
- copies the data from the device to the host, while this is happening
the call to host() blocks.
!*/
virtual float* host(
) = 0;
/*!
ensures
- returns a pointer to the host memory block of size() contiguous float
values or nullptr if size()==0.
- if (the host's copy of the data is out of date) then
- copies the data from the device to the host, while this is happening
the call to host() blocks.
- Marks the device side data as out of date so that the next call to
device() will perform a host to device transfer. If you want to begin
the transfer immediately then you can call async_copy_to_device() after
calling host().
!*/
virtual float* host_write_only(
) = 0;
/*!
ensures
- This function returns the same pointer as host(), except that it never
performs a device to host memory copy. Instead, it immediately marks the
device side data as out of date, effectively discarding it. Therefore,
the values in the data pointed to by host_write_only() are undefined and
you should only call host_write_only() if you are going to assign to
every memory location in the returned memory block.
!*/
virtual const float* device(
) const = 0;
/*!
requires
- DLIB_USE_CUDA is #defined
ensures
- returns a pointer to the device memory block of size() contiguous float
values or nullptr if size()==0.
- if (the device's copy of the data is out of date) then
- copies the data from the host to the device, while this is happening
the call to device() blocks.
!*/
virtual float* device(
) = 0;
/*!
requires
- DLIB_USE_CUDA is #defined
ensures
- returns a pointer to the device memory block of size() contiguous float
values or nullptr if size()==0.
- if (the device's copy of the data is out of date) then
- copies the data from the host to the device, while this is happening
the call to device() blocks.
- Marks the host side data as out of date so that the next call to
host() will perform a device to host transfer.
!*/
virtual float* device_write_only(
) = 0;
/*!
requires
- DLIB_USE_CUDA is #defined
ensures
- This function returns the same pointer as device(), except that it never
performs a host to device memory copy. Instead, it immediately marks the
host side data as out of date, effectively discarding it. Therefore, the
values in the data pointed to by device_write_only() are undefined and
you should only call device_write_only() if you are going to assign to
every memory location in the returned memory block.
!*/
virtual const any& annotation(
) const = 0;
/*!
ensures
- returns a const reference to the any object in this tensor. The any
object can be used to store any additional annotation you like in a
tensor. However, it should be noted that the annotation() is ignored by
serialize() and therefore not saved when a tensor is serialized.
!*/
virtual any& annotation(
) = 0;
/*!
ensures
- returns a non-const reference to the any object in this tensor. The any
object can be used to store any additional annotation you like in a
tensor. However, it should be noted that the annotation() is ignored by
serialize() and therefore not saved when a tensor is serialized.
!*/
int device_id(
) const;
/*!
ensures
- returns the ID of the CUDA device that allocated this memory. I.e. the
number returned by cudaGetDevice() when the memory was allocated.
- If CUDA is not being used then this function always returns 0.
!*/
tensor& operator= (
float val
);
/*!
ensures
- sets all elements of this tensor equal to val.
- returns *this
!*/
tensor& operator*= (
float val
);
/*!
ensures
- pointwise multiplies all elements of *this tensor with val.
- returns *this
!*/
tensor& operator/= (
float val
);
/*!
ensures
- pointwise divides all elements of *this tensor by val.
- returns *this
!*/
template <typename EXP>
tensor& operator= (
const matrix_exp<EXP>& item
);
/*!
requires
- num_samples() == item.nr()
- k()*nr()*nc() == item.nc()
- item contains float values
ensures
- Assigns item to *this tensor by performing:
set_ptrm(host(), num_samples(), k()*nr()*nc()) = item;
!*/
template <typename EXP>
tensor& operator+= (
const matrix_exp<EXP>& item
);
/*!
requires
- num_samples() == item.nr()
- k()*nr()*nc() == item.nc()
- item contains float values
ensures
- Adds item to *this tensor by performing:
set_ptrm(host(), num_samples(), k()*nr()*nc()) += item;
!*/
template <typename EXP>
tensor& operator-= (
const matrix_exp<EXP>& item
);
/*!
requires
- num_samples() == item.nr()
- k()*nr()*nc() == item.nc()
- item contains float values
ensures
- Subtracts item from *this tensor by performing:
set_ptrm(host(), num_samples(), k()*nr()*nc()) -= item;
!*/
template <typename EXP>
void set_sample (
unsigned long long idx,
const matrix_exp<EXP>& item
);
/*!
requires
- idx < num_samples()
- k()*nr()*nc() == item.size()
- item contains float values
ensures
- Assigns item to the idx'th sample in *this by performing:
set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) = item;
!*/
template <typename EXP>
void add_to_sample (
unsigned long long idx,
const matrix_exp<EXP>& item
);
/*!
requires
- idx < num_samples()
- k()*nr()*nc() == item.size()
- item contains float values
ensures
- Adds item to the idx'th sample in *this by performing:
set_ptrm(host()+idx*item.size(), item.nr(), item.nc()) += item;
!*/
protected:
// You can't move or copy another tensor into *this since that might modify the
// tensor's dimensions. If you want to do that sort of thing then use a
// resizable_tensor.
tensor(const tensor& item);
tensor& operator= (const tensor& item);
tensor(tensor&& item);
tensor& operator=(tensor&& item);
};
// ----------------------------------------------------------------------------------------
void memcpy (
tensor& dest,
const tensor& src
);
/*!
requires
- dest.size() == src.size()
ensures
- Copies the data in src to dest. If the device data is current on both src
and dest then the copy will happen entirely on the device side.
- It doesn't matter what GPU device is selected by cudaSetDevice(). You can
always copy tensor objects to and from each other regardless.
- This function blocks until the copy has completed.
!*/
// ----------------------------------------------------------------------------------------
bool is_vector (
const tensor& t
);
/*!
ensures
- returns true if and only if one of the following is true:
- t.size() == t.num_samples()
- t.size() == t.k()
- t.size() == t.nr()
- t.size() == t.nc()
!*/
// ----------------------------------------------------------------------------------------
const matrix_exp mat (
const tensor& t,
long long nr,
long long nc
);
/*!
requires
- nr >= 0
- nc >= 0
- nr*nc == t.size()
ensures
- returns a matrix M such that:
- M.nr() == nr
- M.nc() == nc
- for all valid r and c:
M(r,c) == t.host()[r*nc + c]
(i.e. the tensor is interpreted as a matrix laid out in memory
in row major order)
!*/
const matrix_exp mat (
const tensor& t
);
/*!
ensures
- if (t.size() != 0) then
- returns mat(t, t.num_samples(), t.size()/t.num_samples())
- else
- returns an empty matrix.
!*/
const matrix_exp image_plane (
const tensor& t,
long long sample = 0,
long long k = 0
);
/*!
requires
- t.size() != 0
- 0 <= sample < t.num_samples()
- 0 <= k < t.k()
ensures
- returns the k-th image plane from the sample-th image in t. That is,
returns a matrix M such that:
- M contains float valued elements.
- M.nr() == t.nr()
- M.nc() == t.nc()
- for all valid r and c:
- M(r,c) == t.host()[((sample*t.k() + k)*t.nr() + r)*t.nc() + c]
!*/
// ----------------------------------------------------------------------------------------
bool have_same_dimensions (
const tensor& a,
const tensor& b
);
/*!
ensures
- returns true if and only if all of the following are satisfied:
- a.num_samples() == b.num_samples()
- a.k() == b.k()
- a.nr() == b.nr()
- a.nc() == b.nc()
!*/
// ----------------------------------------------------------------------------------------
class resizable_tensor : public tensor
{
/*!
WHAT THIS OBJECT REPRESENTS
This object is just a tensor with the additional ability to be resized.
!*/
public:
resizable_tensor(
);
/*!
ensures
- #size() == 0
- #num_samples() == 0
- #k() == 0
- #nr() == 0
- #nc() == 0
- #capacity() == 0
!*/
template <typename EXP>
resizable_tensor(
const matrix_exp<EXP>& item
);
/*!
requires
- item contains float values
ensures
- #num_samples() == item.nr()
- #k() == item.nc()
- #nr() == 1
- #nc() == 1
- Assigns item to *this tensor by performing:
set_ptrm(host(), num_samples(), k()*nr()*nc()) = item;
- #capacity() == size()
!*/
explicit resizable_tensor(
long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
);
/*!
requires
- n_ >= 0
- k_ >= 0
- nr_ >= 0
- nc_ >= 0
ensures
- #size() == n_*k_*nr_*nc_
- #num_samples() == n_
- #k() == k_
- #nr() == nr_
- #nc() == nc_
- #capacity() == size()
!*/
// This object is copyable and movable
resizable_tensor(const resizable_tensor&) = default;
resizable_tensor(resizable_tensor&&) = default;
resizable_tensor& operator= (const resizable_tensor&) = default;
resizable_tensor& operator= (resizable_tensor&&) = default;
size_t capacity (
) const;
/*!
ensures
- returns the total number of floats allocated. This might be different
from the size() since calls to set_size() that make a tensor smaller
don't trigger reallocations. They simply adjust the nominal dimensions
while keeping the same allocated memory block. This makes calls to
set_size() very fast. If you need to deallocate a tensor then use
clear().
!*/
void clear(
);
/*!
ensures
- #size() == 0
- #num_samples() == 0
- #k() == 0
- #nr() == 0
- #nc() == 0
- #annotation().is_empty() == true
- #capacity() == 0
!*/
void copy_size (
const tensor& item
);
/*!
ensures
- resizes *this so that: have_same_dimensions(#*this, item)==true
!*/
void set_size(
long long n_, long long k_ = 1, long long nr_ = 1, long long nc_ = 1
);
/*!
requires
- n_ >= 0
- k_ >= 0
- nr_ >= 0
- nc_ >= 0
ensures
- #size() == n_*k_*nr_*nc_
- #num_samples() == n_
- #k() == k_
- #nr() == nr_
- #nc() == nc_
- #capacity() == max(#size(), capacity())
(i.e. capacity() never goes down when calling set_size().)
!*/
template <typename EXP>
resizable_tensor& operator= (
const matrix_exp<EXP>& item
);
/*!
requires
- item contains float values
ensures
- if (num_samples() == item.nr() && k()*nr()*nc() == item.nc()) then
- the dimensions of this tensor are not changed
- else
- #num_samples() == item.nr()
- #k() == item.nc()
- #nr() == 1
- #nc() == 1
- Assigns item to *this tensor by performing:
set_ptrm(host(), num_samples(), k()*nr()*nc()) = item;
!*/
};
void serialize(const tensor& item, std::ostream& out);
void deserialize(resizable_tensor& item, std::istream& in);
/*!
provides serialization support for tensor and resizable_tensor. Note that you can
serialize to/from any combination of tensor and resizable_tensor objects.
!*/
// ----------------------------------------------------------------------------------------
double dot(
const tensor& a,
const tensor& b
);
/*!
requires
- a.size() == b.size()
ensures
- returns the dot product between a and b when they are both treated as
a.size() dimensional vectors. That is, this function pointwise multiplies
the vectors together, then sums the result and returns it.
!*/
// ----------------------------------------------------------------------------------------
class alias_tensor_instance : public tensor
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            A tensor that aliases another tensor.  It has no block of memory
            of its own; it only holds pointers into the memory of another
            tensor object.  This makes it cheap to split a tensor into pieces
            and hand those pieces to functions.

            An alias_tensor_instance does not own the resources it points to
            in any sense.  The underlying owning tensor must therefore not be
            destructed before every alias tensor that points to it has been
            destructed.
    !*/

private:
    // Not default constructible by user code.  The only way to obtain an
    // instance is through alias_tensor::operator().
    alias_tensor_instance ();
};
class alias_tensor_const_instance
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            Essentially the const counterpart of alias_tensor_instance, so it
            also represents a tensor.  Due to the mechanics of C++ it cannot
            inherit from tensor; instead it exposes the aliased tensor through
            get() and through an implicit conversion to const tensor.
    !*/

public:

    // A non-const alias tensor can be converted into a const one.
    alias_tensor_const_instance (const alias_tensor_instance& item);

    // Ways of viewing the alias as a tensor.
    // NOTE(review): unlike get(), the conversion operator is not
    // const-qualified, so as declared it cannot be invoked on a const
    // alias_tensor_const_instance -- confirm whether that is intentional.
    const tensor& get () const;
    operator const tensor& ();

private:

    // Not default constructible by user code.  The only way to obtain an
    // instance is through alias_tensor::operator().
    alias_tensor_const_instance ();
};
class alias_tensor
{
    /*!
        WHAT THIS OBJECT REPRESENTS
            A tool for creating tensor objects that alias other tensor
            objects.  The tensors it produces reference the memory space of
            another tensor rather than owning memory themselves.  This lets
            you interpret a single tensor in different ways, or treat it as a
            group of multiple tensors.
    !*/

public:

    alias_tensor ();
    /*!
        ensures
            - #size() == 0
            - #num_samples() == 0
            - #k() == 0
            - #nr() == 0
            - #nc() == 0
    !*/

    alias_tensor (
        long long n_,
        long long k_ = 1,
        long long nr_ = 1,
        long long nc_ = 1
    );
    /*!
        requires
            - n_ >= 0
            - k_ >= 0
            - nr_ >= 0
            - nc_ >= 0
        ensures
            - #size() == n_*k_*nr_*nc_
            - #num_samples() == n_
            - #k() == k_
            - #nr() == nr_
            - #nc() == nc_
    !*/

    // Accessors for the dimensions held by this object.  Their values after
    // construction are given by the constructors' ensures clauses above.
    long long num_samples () const;
    long long k () const;
    long long nr () const;
    long long nc () const;
    size_t size () const;

    alias_tensor_instance operator() (
        tensor& t,
        size_t offset = 0
    ) const;
    /*!
        requires
            - offset+size() <= t.size()
        ensures
            - Returns a tensor that simply aliases the elements of t, starting
              at t's offset'th element.  Specifically, the returned aliasing
              tensor T satisfies:
                - T.size() == size()
                - T.num_samples() == num_samples()
                - T.k() == k()
                - T.nr() == nr()
                - T.nc() == nc()
                - T.host() == t.host()+offset
                - T.device() == t.device()+offset
                - &T.annotation() == &t.annotation()
    !*/

    alias_tensor_const_instance operator() (
        const tensor& t,
        size_t offset = 0
    ) const;
    /*!
        requires
            - offset+size() <= t.size()
        ensures
            - This function is identical to the above version of operator()
              except that it takes and returns const tensors instead of
              non-const tensors.
    !*/
};
void serialize (
    const alias_tensor& item,
    std::ostream& out
);
void deserialize (
    alias_tensor& item,
    std::istream& in
);
/*!
    provides serialization support for alias_tensor.
!*/
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_DNn_TENSOR_ABSTRACT_H_

File diff suppressed because it is too large Load Diff