参考自英伟达官方教程 link
lambda note from zhihu zhihu

CUDA Made Easy

execution space

execution space:where the code execute
partitioned into host(cpu) and device(typically gpu)
by default it runs on host,you need to specify which code to run on device

#include <thrust/execution_policy.h>
#include <thrust/universal_vector.h>
#include <thrust/transform.h>
#include <cstdio>

int main() {
float k = 0.5;
float ambient_temp = 20;
thrust::universal_vector<float> temp{ 42, 24, 50 };
auto transformation = [=] __host__ __device__ (float temp) { return temp + k * (ambient_temp - temp); };//既可以在host也可以在device

std::printf("step temp[0] temp[1] temp[2]\n");
for (int step = 0; step < 3; step++) {
thrust::transform(thrust::device, temp.begin(), temp.end(), temp.begin(), transformation);//关键看其第一个参数显式指明的设备
std::printf("%d %.2f %.2f %.2f\n", step, temp[0], temp[1], temp[2]);
}
}

两个要点:函数定义前的修饰符得要对,执行的时候传进去的设备参数也得对
很多时候就是将std::换为thrust::,但是记得要传设备(thrust::device)
禁止直接从host调用一个device函数(最简单的办法可以是在device function外面包一层host function)

Extending Standard Algorithms

example

正确但低效的代码

#include "ach.h"

float naive_max_change(const thrust::universal_vector<float>& a,
const thrust::universal_vector<float>& b)
{
// allocate vector to store `a - b`
thrust::universal_vector<float> unnecessarily_materialized_diff(a.size());

// compute products
// load 两个数组并放回,这里就是3*n的 memory access
thrust::transform(thrust::device,
a.begin(), a.end(), // first input sequence
b.begin(), // second input sequence
unnecessarily_materialized_diff.begin(), // result
[]__host__ __device__(float x, float y) { // transformation (abs diff)
return abs(x - y);
});

// compute max difference
// load 进来比大小,又是n memory access,远小于直接for loop
return thrust::reduce(thrust::device,
unnecessarily_materialized_diff.begin(),
unnecessarily_materialized_diff.end(),
0.0f, thrust::maximum<float>{});
}

int main()
{
float k = 0.5;
float ambient_temp = 20;
thrust::universal_vector<float> temp[] = {{ 42, 24, 50 }, { 0, 0, 0}};
auto transformation = [=] __host__ __device__ (float temp) { return temp + k * (ambient_temp - temp); };

std::printf("step max-change\n");
for (int step = 0; step < 3; step++) {
thrust::universal_vector<float> &current = temp[step % 2];
thrust::universal_vector<float> &next = temp[(step + 1) % 2];

thrust::transform(thrust::device, current.begin(), current.end(), next.begin(), transformation);
std::printf("%d %.2f\n", step, naive_max_change(current, next));
}
}

iterator

使用迭代器可以认为就是使用指针的泛化
迭代器的思想是建立在cpp运算符重载之上的
通过重载运算符[]可以实现读取时计算(常规我们需要全部计算完了之后再取结果数组,重载取到某个元素的时候同时算)

#include "ach.h"

struct wrapper//定义wrapper,重载是针对自定义数据类型的
{
int *ptr;

void operator=(int value) {
*ptr = value / 2;
}
};

struct transform_output_iterator
{
int *a;

wrapper operator[](int i)
{
return {a + i};
}
};

int main()
{
std::array<int, 3> a{ 0, 1, 2 };
transform_output_iterator it{a.data()};

it[0] = 10;//it[0]返回一个临时对象 wrapper(a,0),ptr就是a的第0个元素,再执行=,将ptr所指更改为一半
it[1] = 20;

std::printf("a[0]: %d\n", a[0]); // prints 5
std::printf("a[1]: %d\n", a[1]); // prints 10
}

cuda iterator

#include "ach.h"

float max_change(const thrust::universal_vector<float>& a,
const thrust::universal_vector<float>& b)
{
auto zip = thrust::make_zip_iterator(a.begin(), b.begin());//make iterator
auto transform = thrust::make_transform_iterator(zip, []__host__ __device__(thrust::tuple<float, float> t) {
return abs(thrust::get<0>(t) - thrust::get<1>(t));
});//attach function to dereference of iterator

// compute max difference
return thrust::reduce(thrust::device, transform, transform + a.size(), 0.0f, thrust::maximum<float>{});
}

int main()
{
float k = 0.5;
float ambient_temp = 20;
thrust::universal_vector<float> temp[] = {{ 42, 24, 50 }, { 0, 0, 0}};
auto transformation = [=] __host__ __device__ (float temp) { return temp + k * (ambient_temp - temp); };

std::printf("step max-change\n");
for (int step = 0; step < 3; step++) {
thrust::universal_vector<float> &current = temp[step % 2];
thrust::universal_vector<float> &next = temp[(step + 1) % 2];

thrust::transform(thrust::device, current.begin(), current.end(), next.begin(), transformation);
std::printf("%d %.2f\n", step, max_change(current, next));
}
}

cpp lambda

三种方法传递可调用对象:function pointer,functor(仿函数),lambda
functor就是对类重载括号运算符
lambda expression
[ capture-list ] ( params ) mutable(optional) exception(optional) attribute(optional) -> ret(optional) { body }
capture-list:[target]:按值捕获[&target]:按引用捕获[v = target]:按值捕获target,函数内部叫v
default capture [=] or [&]按值或引用捕获可见范围内所有的局部变量
只要不写mutable,默认就是一个const函数
捕获this指针只能按值捕获
可视为一个类,捕获列表就是成员变量,参数列表就是成员函数参数
const修饰成员函数int C::func() const{},不会修改成员变量

Vocabulary Types

thrust::tabulate对数组每个下标处执行operator然后存在原来的位置(map)
处理多维数据如果手动从一维展平状态去算非常困难,使用std::pair可存2维,cuda::std::mdspan可处理任意维度的坐标
标准cpp库函数直接从device调用不了,cuda::实现的有

#include <cuda/std/mdspan>
#include <cuda/std/array>
#include <cstdio>

//exercise
cuda::std::mdspan temp_in(thrust::raw_pointer_cast(in.data()), height, width);//mdspan takes raw pointer(just in.data() fail)
//tabulate的lambda的参数值是index!!!
thrust::tabulate(thrust::device, out.begin(), out.end(), [temp_in] __host__ __device__(int id) {
int column = id % temp_in.extent(1);//id%widgh
int row = id / temp_in.extent(1);

if (row > 0 && column > 0 && row < temp_in.extent(0) - 1 && column < temp_in.extent(1) - 1)
{
float d2tdx2 = temp_in(row, column - 1) - 2 * temp_in(row, column) + temp_in(row, column + 1);
float d2tdy2 = temp_in(row - 1, column) - 2 * temp_in(row, column) + temp_in(row + 1, column);

return temp_in(row, column) + 0.2f * (d2tdx2 + d2tdy2);
}
else
{
return temp_in(row, column);
}
});

Serial vs Parallel

for loop是完全的串行的,非常慢
thrust::reduce_by_key reduce segments of values
key 数组两者不同的时候即为新的一个segment

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>

thrust::universal_vector<float> row_temperatures(
int height, int width,
thrust::universal_vector<int>& row_ids, // 这个参数在优化后其实可以不用了
thrust::universal_vector<float>& temp)
{
thrust::universal_vector<float> sums(height);

// 1. 创建生成行索引的迭代器:直接根据当前元素的序号 i 计算 i / width,既不用从内存中读数来算,也没有将整个算出来
auto row_indices_gen = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[=] __host__ __device__ (int i) { return i / width; }
);

// 2. 执行规约
// row_indices_gen 会生成类似 0,0,0, 1,1,1, 2,2,2... 的序列
thrust::reduce_by_key(thrust::device,
row_indices_gen, row_indices_gen + temp.size(),
temp.begin(),
thrust::make_discard_iterator(), // 丢弃返回的 keys
sums.begin());

return sums;
}

//make transform iterator:在得到结果之前调用仿函数
struct mean_functor {
int width;
__host__ __device__ float operator()(float x) const {
return x / width;
}
};
auto means_output =
thrust::make_transform_output_iterator(
means.begin(),
mean_functor{width});

memory spaces

thrust::universal_vector同时在cpu和gpu的memory space并且有automatic migrates
可能出现的隐式多次拷贝
alt text
为了避免,我们使用显式memory space
thrust::host_vector/thrust::device_vector
使用thrust::copy_n显式复制

在host code中取address of device function is not allowed
所以直接在transform(host code)中给命名函数是不行的
但是lambda表达式可以,因为它实际上是一个函数对象
解释:device function是直接在device地址空间的,cpu code找不到,直接写函数名隐式传递函数指针,写lambda相当于对象值传递

Unlocking the GPU’s Full Potential

Asynchrony

thrust在cpu和gpu之间是同步的(cpu 的thrust launch了gpu task后是空闲的)
可以使用cub库来编写代码,这样cpu launch之后立刻返回,可以overlap
cudaDeviceSynchronize():让cpu等gpu完成

nsight:tool for profile your program
CUDA HW:memory看host和device之间的transfer
IO:writev fclose等系统调用

为了方便看,可以在代码中使用NVTX
nvtx3::scoped_range temp{std::string("copy stage")}//建立一个range,直接放在一个{}块里面就可以标记成功

streams

为了更好的重叠操作以提高性能,需要异步的内存复制
(GPU内的内存复制远比GPU和CPU间的更快,所以在GPU上再开一个buffer用来异步运数据是值得的)
cuda stream可以认为是一个有序的将会在GPU上执行的工作队列(流内有序,流间无序)

#include "ach.h"

int main()
{
int height = 2048;
int width = 8192;

cudaStream_t compute_stream;
cudaStreamCreate(&compute_stream);

cudaStream_t copy_stream;
cudaStreamCreate(&copy_stream);

thrust::device_vector<float> d_prev = ach::init(height, width);
thrust::device_vector<float> d_next(height * width);
thrust::device_vector<float> d_buffer(height * width);

// TODO: Change code below to allocate host vector in pinned memory
thrust::host_vector<float> h_prev(height * width);

const int compute_steps = 750;
const int write_steps = 3;
for (int write_step = 0; write_step < write_steps; write_step++)
{
cudaMemcpy(thrust::raw_pointer_cast(d_buffer.data()),
thrust::raw_pointer_cast(d_prev.data()),
height * width * sizeof(float), cudaMemcpyDeviceToDevice);
cudaMemcpyAsync(thrust::raw_pointer_cast(h_prev.data()),
thrust::raw_pointer_cast(d_buffer.data()),
height * width * sizeof(float), cudaMemcpyDeviceToHost,
copy_stream);

for (int compute_step = 0; compute_step < compute_steps; compute_step++)
{
ach::simulate(width, height, d_prev, d_next, compute_stream);
d_prev.swap(d_next);
}

cudaStreamSynchronize(copy_stream);
ach::store(write_step, height, width, h_prev);

cudaStreamSynchronize(compute_stream);
}

cudaStreamDestroy(compute_stream);
cudaStreamDestroy(copy_stream);
}

pinned memory

直接使用上述方法仍然可能无法同步:
操作系统使用页管理进程的虚拟内存,可能存在页的交换(写回到磁盘中并load新的页)
gpu只能从物理内存复制数据(这意味着复制时的数据是pinned)
从host复制到device,cuda运行时首先将其复制到staging buffer再复制到device,这涉及到cpu,使原本异步变成串行的
为了解决这个问题,我们需要显式声明pinned memory
使用thrust::universal_host_pinned_vector声明buffer即可

Implementing New Algorithms

kernels

一个使用__global__修饰的函数叫kernel:invoked on cpu but run on gpu
to launch a kernelkernel<<<1,1,0,stream>>>()
cuda kernel是异步的
gridDim.x blockDim.x blockIdx.x threadIdx.x 前两个参数是gridsize和blocksize
计算给定块线程数下的块数cuda::ceil_div(width, block_size)
也可以(width+block_size-1)/block_size
debug:使用compute-sanitizer运行程序

histogram&sync

计算数据统计的histogram图的时候会出现data race,要用原子操作cuda::std::atomic_ref
所有线程全部去抢,原子操作虽然正确但是性能低,每个block更新local最后更新性能更好
_syncthreads()同步线程
cuda::std::atomic_ref<int> cuda::atomic_ref<int,cuda::thread_scope_system>
后者有个接口表示这是在什么范围内(system,block等)

shared&cooperative

shared memory:不存在global memory中,而是直接放在一个块的共享内存,跑得更快

__shared__ int block_histogram[num_bins];

if (threadIdx.x < num_bins)
{
block_histogram[threadIdx.x] = 0;
}
//并行初始化
__syncthreads();

int cell = blockIdx.x * blockDim.x + threadIdx.x;
int bin = static_cast<int>(temperatures[cell] / bin_width);

cuda::atomic_ref<int, cuda::thread_scope_block>
block_ref(block_histogram[bin]);
block_ref.fetch_add(1, cuda::memory_order_relaxed);
__syncthreads();

if (threadIdx.x < num_bins)
{
cuda::atomic_ref<int, cuda::thread_scope_device> ref(histogram[threadIdx.x]);
ref.fetch_add(block_histogram[threadIdx.x], cuda::memory_order_relaxed);
}
  • sequential algorithm:invoked and executed by a single thread
  • cooperative algorithm:invoked and executed by multi threads
  • parallel algorithm:invoked by a single thread but run internally on multiple threads
#include "ach.cuh"
constexpr int block_size = 256;
constexpr int items_per_thread = 1;
constexpr int num_bins = 10;
constexpr float bin_width = 10;

__global__ void histogram_kernel(cuda::std::span<float> temperatures,
cuda::std::span<int> histogram) {
__shared__ int block_histogram[num_bins];

int cell = blockIdx.x * blockDim.x + threadIdx.x;
int bins[items_per_thread] = {
static_cast<int>(temperatures[cell] / bin_width)};

using histogram_t =
cub::BlockHistogram<int, block_size, items_per_thread, num_bins,
cub::BlockHistogramAlgorithm::BLOCK_HISTO_ATOMIC>;
//定义别名histogram_t是一个模板类
__shared__ typename histogram_t::TempStorage temp_storage;//用于存计算需要的临时变量
histogram_t(temp_storage).Histogram(bins, block_histogram);//自动初始化并清零,原子统计,最后同步

__syncthreads();
if (threadIdx.x < num_bins) {
cuda::atomic_ref<int, cuda::thread_scope_device> ref(
histogram[threadIdx.x]);
ref.fetch_add(block_histogram[threadIdx.x]);
}
}

void histogram(cuda::std::span<float> temperatures,
cuda::std::span<int> histogram, cudaStream_t stream) {
int grid_size = cuda::ceil_div(temperatures.size(), block_size);
histogram_kernel<<<grid_size, block_size, 0, stream>>>(temperatures,
histogram);
}