参考自英伟达官方教程 link
lambda note from zhihu zhihu

CUDA Made Easy

execution space

execution space:where the code execute
partitioned into host(cpu) and device(typically gpu)
by default it runs on host,you need to specify which code to run on device

#include <thrust/execution_policy.h>
#include <thrust/universal_vector.h>
#include <thrust/transform.h>
#include <cstdio>

int main() {
    float k = 0.5;
    float ambient_temp = 20;
    thrust::universal_vector<float> temp{ 42, 24, 50 };
    auto transformation = [=] __host__ __device__ (float temp) { return temp + k * (ambient_temp - temp); };//既可以在host也可以在device

    std::printf("step  temp[0]  temp[1]  temp[2]\n");
    for (int step = 0; step < 3; step++) {
        thrust::transform(thrust::device, temp.begin(), temp.end(), temp.begin(), transformation);//关键看其第一个参数显式指明的设备
        std::printf("%d     %.2f    %.2f    %.2f\n", step, temp[0], temp[1], temp[2]);
    }
}

两个要点:函数定义前的修饰符得要对,执行的时候传进去的设备参数也得对
很多时候就是将std::换为thrust::,但是记得要传设备(thrust::device)
禁止直接从host调用一个device函数(最简单的办法可以是在device function外面包一层host function)

Extending Standard Algorithms

example

正确但低效的代码

#include "ach.h"

float naive_max_change(const thrust::universal_vector<float>& a,
                       const thrust::universal_vector<float>& b)
{
    // allocate vector to store `a - b`
    thrust::universal_vector<float> unnecessarily_materialized_diff(a.size());

    // compute products
    // load 两个数组并放回,这里就是3*n的 memory access
    thrust::transform(thrust::device,
                      a.begin(), a.end(),                       // first input sequence
                      b.begin(),                                // second input sequence
                      unnecessarily_materialized_diff.begin(),  // result
                      []__host__ __device__(float x, float y) { // transformation (abs diff)
                         return abs(x - y);
                      });

    // compute max difference
    // load 进来比大小,又是n memory access,远小于直接for loop
    return thrust::reduce(thrust::device,
                          unnecessarily_materialized_diff.begin(),
                          unnecessarily_materialized_diff.end(),
                          0.0f, thrust::maximum<float>{});
}

int main()
{
    float k = 0.5;
    float ambient_temp = 20;
    thrust::universal_vector<float> temp[] = {{ 42, 24, 50 }, { 0, 0, 0}};
    auto transformation = [=] __host__ __device__ (float temp) { return temp + k * (ambient_temp - temp); };

    std::printf("step  max-change\n");
    for (int step = 0; step < 3; step++) {
        thrust::universal_vector<float> &current = temp[step % 2];
        thrust::universal_vector<float> &next = temp[(step + 1) % 2];

        thrust::transform(thrust::device, current.begin(), current.end(), next.begin(), transformation);
        std::printf("%d     %.2f\n", step, naive_max_change(current, next));
    }
}

iterator

使用迭代器可以认为就是使用指针的泛化
迭代器的思想是建立在cpp运算符重载之上的
通过重载运算符[]可以实现读取时计算(常规我们需要全部计算完了之后再取结果数组,重载取到某个元素的时候同时算)

#include "ach.h"

struct wrapper//定义wrapper,重载是针对自定义数据类型的
{
   int *ptr;

   void operator=(int value) {
      *ptr = value / 2;
   }
};

struct transform_output_iterator
{
  int *a;

  wrapper operator[](int i)
  {
    return {a + i};
  }
};

int main()
{
  std::array<int, 3> a{ 0, 1, 2 };
  transform_output_iterator it{a.data()};

  it[0] = 10;//it[0]返回一个临时对象 wrapper(a,0),ptr就是a的第0个元素,再执行=,将ptr所指更改为一半
  it[1] = 20;

  std::printf("a[0]: %d\n", a[0]); // prints 5
  std::printf("a[1]: %d\n", a[1]); // prints 10
}

cuda iterator

#include "ach.h"

float max_change(const thrust::universal_vector<float>& a,
                 const thrust::universal_vector<float>& b)
{
    auto zip = thrust::make_zip_iterator(a.begin(), b.begin());//make iterator
    auto transform = thrust::make_transform_iterator(zip, []__host__ __device__(thrust::tuple<float, float> t) {
        return abs(thrust::get<0>(t) - thrust::get<1>(t));
    });//attach function to dereference of iterator

    // compute max difference
    return thrust::reduce(thrust::device, transform, transform + a.size(), 0.0f, thrust::maximum<float>{});
}

int main()
{
    float k = 0.5;
    float ambient_temp = 20;
    thrust::universal_vector<float> temp[] = {{ 42, 24, 50 }, { 0, 0, 0}};
    auto transformation = [=] __host__ __device__ (float temp) { return temp + k * (ambient_temp - temp); };

    std::printf("step  max-change\n");
    for (int step = 0; step < 3; step++) {
        thrust::universal_vector<float> &current = temp[step % 2];
        thrust::universal_vector<float> &next = temp[(step + 1) % 2];

        thrust::transform(thrust::device, current.begin(), current.end(), next.begin(), transformation);
        std::printf("%d     %.2f\n", step, max_change(current, next));
    }
}

cpp lambda

三种方法传递可调用对象:function pointer,functor(仿函数),lambda
functor就是对类重载括号运算符
lambda expression
[ capture-list ] ( params ) mutable(optional) exception(optional) attribute(optional) -> ret(optional) { body }
capture-list:[target]:按值捕获 [&target]:按引用捕获 [v = target]:按值捕获target,函数内部叫v
default capture [=] or [&]按值或引用捕获可见范围内所有的局部变量
只要不写mutable，默认就是一个const函数
捕获this指针只能按值捕获
可视为一个类,捕获列表就是成员变量,参数列表就是成员函数参数
const修饰成员函数int C::func() const{},不会修改成员变量

Vocabulary Types

thrust::tabulate对数组每个下标处执行operator然后存在原来的位置(map)
处理多维数据如果手动从一维展平状态去算非常困难,使用std::pair可存2维,cuda::std::mdspan可处理任意维度的坐标
标准cpp库函数直接从device调用不了,cuda::实现的有

#include <cuda/std/mdspan>
#include <cuda/std/array>
#include <cstdio>

//exercise
cuda::std::mdspan temp_in(thrust::raw_pointer_cast(in.data()), height, width);//mdspan takes raw pointer(just in.data() fail)
//tabulate的lambda的参数值是index!!!
thrust::tabulate(thrust::device, out.begin(), out.end(), [temp_in] __host__ __device__(int id) {
  int column = id % temp_in.extent(1);//id%widgh
  int row    = id / temp_in.extent(1);

  if (row > 0 && column > 0 && row < temp_in.extent(0) - 1 && column < temp_in.extent(1) - 1)
  {
    float d2tdx2 = temp_in(row, column - 1) - 2 * temp_in(row, column) + temp_in(row, column + 1);
    float d2tdy2 = temp_in(row - 1, column) - 2 * temp_in(row, column) + temp_in(row + 1, column);

    return temp_in(row, column) + 0.2f * (d2tdx2 + d2tdy2);
  }
  else
  {
    return temp_in(row, column);
  }
});

Serial vs Parallel

for loop是完全的串行的,非常慢
thrust::reduce_by_key reduce segments of values
key 数组两者不同的时候即为新的一个segment

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>

thrust::universal_vector<float> row_temperatures(
    int height, int width,
    thrust::universal_vector<int>& row_ids, // 这个参数在优化后其实可以不用了
    thrust::universal_vector<float>& temp)
{
    thrust::universal_vector<float> sums(height);

    // 1. 创建生成行索引的迭代器：直接根据当前元素的序号 i 计算 i / width,既不用从内存中读数来算,也没有将整个算出来
    auto row_indices_gen = thrust::make_transform_iterator(
        thrust::make_counting_iterator(0),
        [=] __host__ __device__ (int i) { return i / width; }
    );

    // 2. 执行规约
    // row_indices_gen 会生成类似 0,0,0, 1,1,1, 2,2,2... 的序列
    thrust::reduce_by_key(thrust::device,
                          row_indices_gen, row_indices_gen + temp.size(),
                          temp.begin(),
                          thrust::make_discard_iterator(), // 丢弃返回的 keys
                          sums.begin());

    return sums;
}

//make transform iterator:在得到结果之前调用仿函数
struct mean_functor {
    int width;
    __host__ __device__ float operator()(float x) const {
        return x / width;
    }
};
auto means_output =
    thrust::make_transform_output_iterator(
      means.begin(), 
      mean_functor{width});

memory spaces

thrust::universal_vector同时在cpu和gpu的memory space并且有automatic migrates
可能出现的隐式多次拷贝
alt text
为了避免,我们使用显式memory space
thrust::host_vector/thrust::device_vector
使用thrust::copy_n显式复制

在host code中取address of device function is not allowed
所以直接在transform(host code)中给命名函数是不行的
但是lambda表达式可以,因为它实际上是一个函数对象
解释:device function是直接在device地址空间的,cpu code找不到,直接写函数名隐式传递函数指针,写lambda相当于对象值传递

Unlocking the GPU’s Full Potential

Asynchrony

thrust在cpu和gpu之间是同步的(cpu 的thrust launch了gpu task后是空闲的)
可以使用cub库来编写代码,这样cpu launch之后立刻返回,可以overlap
cudaDeviceSynchronize():让cpu等gpu完成

nsight:tool for profile your program
CUDA HW:memory看host和device之间的transfer
IO:writev fclose等系统调用

为了方便看,可以在代码中使用NVTX
nvtx3::scoped_range temp{std::string("copy stage")}//建立一个range,直接放在一个{}块里面就可以标记成功

streams

为了更好的重叠操作以提高性能,需要异步的内存复制
(GPU内的内存复制远比GPU和CPU间的更快,所以在GPU上再开一个buffer用来异步运数据是值得的)
cuda stream可以认为是一个有序的将会在GPU上执行的工作队列(流内有序,流间无序)

#include "ach.h"

int main()
{
  int height = 2048;
  int width = 8192;

  cudaStream_t compute_stream;
  cudaStreamCreate(&compute_stream);

  cudaStream_t copy_stream;
  cudaStreamCreate(&copy_stream);

  thrust::device_vector<float> d_prev = ach::init(height, width);
  thrust::device_vector<float> d_next(height * width);
  thrust::device_vector<float> d_buffer(height * width);

  // TODO: Change code below to allocate host vector in pinned memory
  thrust::host_vector<float> h_prev(height * width);

  const int compute_steps = 750;
  const int write_steps = 3;
  for (int write_step = 0; write_step < write_steps; write_step++)
  {
    cudaMemcpy(thrust::raw_pointer_cast(d_buffer.data()),
               thrust::raw_pointer_cast(d_prev.data()),
               height * width * sizeof(float), cudaMemcpyDeviceToDevice);
    cudaMemcpyAsync(thrust::raw_pointer_cast(h_prev.data()),
                    thrust::raw_pointer_cast(d_buffer.data()),
                    height * width * sizeof(float), cudaMemcpyDeviceToHost,
                    copy_stream);

    for (int compute_step = 0; compute_step < compute_steps; compute_step++)
    {
      ach::simulate(width, height, d_prev, d_next, compute_stream);
      d_prev.swap(d_next);
    }

    cudaStreamSynchronize(copy_stream);
    ach::store(write_step, height, width, h_prev);

    cudaStreamSynchronize(compute_stream);
  }

  cudaStreamDestroy(compute_stream);
  cudaStreamDestroy(copy_stream);
}

pinned memory

直接使用上述方法仍然可能无法同步:
操作系统使用页管理进程的虚拟内存,可能存在页的交换(写回到磁盘中并load新的页)
gpu只能从物理内存复制数据(这意味着复制时的数据是pinned)
从host复制到device,cuda运行时首先将其复制到staging buffer再复制到device,这涉及到cpu,使原本异步变成串行的
为了解决这个问题,我们需要显式声明pinned memory
使用thrust::universal_host_pinned_vector声明buffer即可

Implementing New Algorithms

kernels

一个使用__global__修饰的函数叫kernel:invoked on cpu but run on gpu
to launch a kernelkernel<<<1,1,0,stream>>>()
cuda kernel是异步的
gridDim.x blockDim.x blockIdx.x threadIdx.x 前两个参数是gridsize和blocksize
计算给定块线程数下的块数cuda::ceil_div(width, block_size)
也可以(width+block_size-1)/block_size
debug:使用compute-sanitizer运行程序

histogram&sync

计算数据统计的histogram图的时候会出现data race,要用原子操作cuda::std::atomic_ref
所有线程全部去抢,原子操作虽然正确但是性能低,每个block更新local最后更新性能更好
_syncthreads()同步线程
cuda::std::atomic_ref<int> cuda::atomic_ref<int,cuda::thread_scope_system>
后者有个接口表示这是在什么范围内(system,block等)

shared&cooperative

shared memory:不存在global memory中,而是直接放在一个块的共享内存,跑得更快

__shared__ int block_histogram[num_bins];

if (threadIdx.x < num_bins) 
{
  block_histogram[threadIdx.x] = 0;
}
//并行初始化
__syncthreads();

int cell = blockIdx.x * blockDim.x + threadIdx.x;
int bin = static_cast<int>(temperatures[cell] / bin_width);

cuda::atomic_ref<int, cuda::thread_scope_block> 
  block_ref(block_histogram[bin]);
block_ref.fetch_add(1, cuda::memory_order_relaxed);
__syncthreads();

if (threadIdx.x < num_bins) 
{
  cuda::atomic_ref<int, cuda::thread_scope_device> ref(histogram[threadIdx.x]);
  ref.fetch_add(block_histogram[threadIdx.x], cuda::memory_order_relaxed);
}

sequential algorithm:invoked and executed by a single thread
cooperative algorithm:invoked and executed by multi threads
parallel algorithm:invoked by a single thread but run internally on multiple threads

#include "ach.cuh"
constexpr int block_size = 256;
constexpr int items_per_thread = 1;
constexpr int num_bins = 10;
constexpr float bin_width = 10;

__global__ void histogram_kernel(cuda::std::span<float> temperatures,
                                 cuda::std::span<int> histogram) {
  __shared__ int block_histogram[num_bins];

  int cell = blockIdx.x * blockDim.x + threadIdx.x;
  int bins[items_per_thread] = {
      static_cast<int>(temperatures[cell] / bin_width)};

  using histogram_t =
      cub::BlockHistogram<int, block_size, items_per_thread, num_bins,
                          cub::BlockHistogramAlgorithm::BLOCK_HISTO_ATOMIC>;
                          //定义别名histogram_t是一个模板类
  __shared__ typename histogram_t::TempStorage temp_storage;//用于存计算需要的临时变量
  histogram_t(temp_storage).Histogram(bins, block_histogram);//自动初始化并清零,原子统计,最后同步

  __syncthreads();
  if (threadIdx.x < num_bins) {
    cuda::atomic_ref<int, cuda::thread_scope_device> ref(
        histogram[threadIdx.x]);
    ref.fetch_add(block_histogram[threadIdx.x]);
  }
}

void histogram(cuda::std::span<float> temperatures,
               cuda::std::span<int> histogram, cudaStream_t stream) {
  int grid_size = cuda::ceil_div(temperatures.size(), block_size);
  histogram_kernel<<<grid_size, block_size, 0, stream>>>(temperatures,
                                                         histogram);
}