GPU Architecture & CUDA Programming

history

original target: rendering 3D images
使用三角形描述,给定图片各点的材料,camera,计算投影的样子
early scientific computation:
指定区域,fragment shader function 换成实际要执行的函数
2007之前:gpu只有计算graphics pipeline 的interface
2007之后的架构:
Let’s say a user wants to run a non-graphics program on the GPU’s
programmable cores…

  • Application can allocate buffers in GPU memory and copy data
    to/from buffers
  • Application (via graphics driver) provides GPU a single kernel
    program binary
  • Application tells GPU to run the kernel in an SPMD fashion
    (“run N instances of this kernel”)
    launch(myKernel, N)

CUDA abstraction

programming model:SPMD

cuda thread的thread id 可以是多维的,这样天生更符合多维任务的习惯
一共num_blocks乘threadsperblock个
alt text
实际上host和device的地址空间是分开的,在现代机器上可以直接在gpu code中访问host指针,但实际实现可能涉及总线之类的,很慢
显式复制buffer的方法

float* A = new float[N];  // allocate buffer in host memory

// populate the host buffer through host pointer A
// (fixed: the loop header was missing the ';' after the init clause)
for (int i=0; i<N; i++)
    A[i] = (float)i;

int bytes = sizeof(float) * N;
float* deviceA;              // allocate buffer in
cudaMalloc(&deviceA, bytes); // the device address space

// populate deviceA by copying the host buffer across the bus
cudaMemcpy(deviceA, A, bytes, cudaMemcpyHostToDevice);
  • cuda的内存模型
    alt text
    这使得我们可以更好地利用cache hit
    e.g 1D convolution
// Every thread loads the data its block will need into shared memory
// (no reuse within a single thread, but plenty of reuse across threads).
#define THREADS_PER_BLK 128

// 1D 3-point box filter: output[i] = (input[i] + input[i+1] + input[i+2]) / 3.
// NOTE(review): assumes input has at least N+2 elements so the halo loads of
// the last block stay in bounds — confirm with the caller.
__global__ void convolve(int N, float* input, float* output) {
__shared__ float support[THREADS_PER_BLK+2];
// per-block allocation
int index = blockIdx.x * blockDim.x + threadIdx.x; // thread local variable

// cooperative load: one element per thread, plus a 2-element right halo
// fetched by the first two threads
support[threadIdx.x] = input[index];
if (threadIdx.x < 2) {
support[THREADS_PER_BLK + threadIdx.x] = input[index+THREADS_PER_BLK];
}
__syncthreads(); // all loads must complete before any thread reads support[]

float result = 0.0f; // thread-local variable
for (int i=0; i<3; i++)
result += support[threadIdx.x + i];
output[index] = result / 3.f;
} // fixed: this brace used to sit right after __syncthreads(), which cut the
  // kernel short and left the accumulation code outside the function

cuda semantics

编译好的device binary:不仅有text(code),还有一些metadata(blockthread,local data,shared data)
alt text
alt text
整个gpu每个周期调度4个warp(所有指令都是标量指令,这意味着没有硬编码SIMD的width,硬件改变的时候不用重新编译)
你写的更细更容易出问题(如果有barrier,你写的block有256个线程,只能执行256,会锁住)

Data-Parallel Thinking

sequence: an ordered collection of elements (unlike an array, the program can only access elements through special operations)

  • map:higher order function(将无副作用的函数应用于序列每个元素并返回相同长度的序列)
    因为无副作用应用,所以没有依赖并且无关顺序,有极好的并行性
  • fold:对元素或者积累出的值执行二元操作f(从初始值开始不断和序列中的元素执行操作)
    alt text

scan

scan:接受序列,输出序列,输出序列的第i个值是对输入序列的前i个值执行操作的结果
(exclusive 和inclusive)

  • 方法1:为了并行化执行了更多的操作
    alt text

  • 方法2:虽然常数项略大但是相当高效(缓存不友好)

Up-sweep:
for d = 0 to (log2 n) - 1 do
  forall k = 0 to n-1 by 2^(d+1) do
    a[k + 2^(d+1) - 1] = a[k + 2^d - 1] + a[k + 2^(d+1) - 1]
Down-sweep:
x[n-1] = 0
for d = (log2 n) - 1 down to 0 do
  forall k = 0 to n-1 by 2^(d+1) do
    tmp = a[k + 2^d - 1]
    a[k + 2^d - 1] = a[k + 2^(d+1) - 1]
    a[k + 2^(d+1) - 1] = tmp + a[k + 2^(d+1) - 1]

alt text

  • 方法3:(以两核处理器为例)
    分成两部分先串行处理
    再同时使用两个处理器处理后半部分的部分和
    这样缓存友好,计算次数也少

实际使用cuda等实现的时候,O(nlogn)的方法实际上的指令数会小于O(n)
SIMD利用率太低了(work efficient 的那个方法)

Parallel Segmented Scan

operating on the sequences of sequences:经常有两个层面的并行性
segmented scan:就是在输入序列的不同的分区上单独的scan(使用bool array 直接表示每个nested sequence 起始位置即可)
gather:将原序列中的对应位置处的元素收集起来构成新密集数组
scatter:将原序列中元素按照对应位置分散到稀疏数组
e.g1:
alt text
e.g2:
output[index[i]] op=index[i]常规多线程得加锁,这样设计算法就不用
alt text
(sort之后的数组如果相邻index就可以认为是新的nested sequence)

Efficiently Evaluating DNNs

Overlapping communication and computation increases the on-chip memory footprint(存储占用), since buffers for the data being
processed AND the data being transferred both need to be maintained on chip.(为了实现边读边算,要双缓冲区)

Efficiently implementing convolution layers

approach one:just design new network

approach two:code optimization
矩阵形式的卷积:alt text
alt text
以上为显式:非常快,但是生成的矩阵极其巨大
优化:
1.分块矩阵乘法,为了把矩阵放入缓存中有多级分块(甚至有直接放进寄存器的)
2.SIMD指令,可能有将某个矩阵预先转置以获得更好的性能

Matrix multiplication implementations

  • 隐式GEMM(general matrix multiplication):不显式的生成矩阵,而是在乘的时候根据需要的坐标去原始数据中取
    alt text

  • winograd:alt text

  • fusing
    网络层的操作前后都要load/store,这也是巨大的开销,很多时候可以将多个操作融合成一个(conv,scale,maxpool)

  • 注意力融合
    alt text

asst3

cpu不能直接修改gpu上的内存!!!
解决方法就是直接调用cudaMemset or cudaMemcpy

前缀和&find_repeats

// Up-sweep (reduce) step of the work-efficient scan: for each segment of
// length 2^(d+1), add the left-subtree sum into the segment's last element.
// `num` is the number of segments; extra threads simply exit.
__global__ void calculate_up(int*output,int tow_d,int num){
    int stride = tow_d * 2;                           // 2^(d+1)
    int task = blockIdx.x * blockDim.x + threadIdx.x; // one task per segment
    if (task >= num) return;                          // ragged last block
    int base = task * stride;
    output[base + stride - 1] += output[base + tow_d - 1];
}
// Down-sweep step of the work-efficient scan: swap the left child's partial
// sum with the parent's running prefix, then accumulate into the right child.
// `num` is the number of segments; extra threads simply exit.
__global__ void calculate_down(int*output,int two_d,int num){
    int stride = two_d * 2;                           // 2^(d+1)
    int task = blockIdx.x * blockDim.x + threadIdx.x; // one task per segment
    if (task >= num) return;                          // ragged last block
    int base = task * stride;
    int left = output[base + two_d - 1];
    output[base + two_d - 1] = output[base + stride - 1];
    output[base + stride - 1] += left;
}
// Work-efficient (Blelloch) exclusive prefix sum over the device array
// `input` (N ints), writing the result into the device array `result`.
// Precondition: both device buffers have room for nextPow2(N) ints.
// Supports in-place operation (input == result).
void exclusive_scan(int* input, int N, int* result)
{
    int paddedN = nextPow2(N); // the tree algorithm needs a power-of-two length

    // Fixed: the original never copied input into result, so any call with
    // input != result scanned whatever stale data sat in result.
    if (input != result) {
        cudaMemcpy(result, input, sizeof(int) * N, cudaMemcpyDeviceToDevice);
    }
    // Zero the padding so the extra slots behave as additive identities.
    if (paddedN > N) {
        cudaMemset(&result[N], 0, sizeof(int) * (paddedN - N));
    }

    // Up-sweep: build partial sums bottom-up.
    for (int two_d = 1; two_d <= paddedN / 2; two_d *= 2) {
        int two_dplus1 = 2 * two_d;
        int tasknum = paddedN / two_dplus1;
        int blocknum = (tasknum + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
        calculate_up<<<blocknum, THREADS_PER_BLOCK>>>(result, two_d, tasknum);
    }

    // Clear the root (exclusive-scan identity), then push prefixes back down.
    cudaMemset(&result[paddedN - 1], 0, sizeof(int));
    for (int two_d = paddedN / 2; two_d >= 1; two_d /= 2) {
        int two_dplus1 = 2 * two_d;
        int tasknum = paddedN / two_dplus1;
        int blocknum = (tasknum + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
        calculate_down<<<blocknum, THREADS_PER_BLOCK>>>(result, two_d, tasknum);
    }
}
//计算方法,先标记再前缀和求下标
// Flag repeated elements: temp[i] = 1 iff array[i] == array[i+1],
// for i in [0, formal-2]. (Mark first, then prefix-sum to get output slots.)
__global__ void setone(int*array,int formal,int*temp){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < formal - 1) {
        temp[i] = (array[i] == array[i + 1]) ? 1 : 0;
    }
}
// Scatter pass: temp holds the exclusive scan of the repeat flags, so a step
// between temp[i] and temp[i+1] means position i was flagged, and temp[i] is
// its compact output slot. temp[formal-1] equals the total repeat count.
__global__ void cal(int*temp,int formal,int*output,int*size){
    // every thread stores the same value here, so the race is benign
    *size = temp[formal - 1];
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < formal - 1 && temp[i] != temp[i + 1]) {
        output[temp[i]] = i;
    }
}
// Finds every index i with device_input[i] == device_input[i+1], writes those
// indices (compacted) to device_output, and returns how many were found.
// CUDA kernels must return void, so the count travels back through d_size.
int find_repeats(int* device_input, int length, int* device_output) {
    int rounded = nextPow2(length); // scan scratch must be a power-of-two size
    int *temp = nullptr;
    int *d_size = nullptr;
    int size = 0;

    cudaMalloc((void**)&d_size, sizeof(int));
    cudaMalloc((void**)&temp, rounded * sizeof(int));
    // Fixed: zero the whole scratch buffer so the padding beyond `length`
    // holds well-defined flags for the scan (it was left uninitialized).
    cudaMemset(temp, 0, rounded * sizeof(int));

    int blocksize = (rounded + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    // Fixed: pass `length` (not `rounded`) so setone never reads past the end
    // of device_input.
    setone<<<blocksize, THREADS_PER_BLOCK>>>(device_input, length, temp);
    exclusive_scan(temp, rounded, temp); // in-place exclusive prefix sum
    cal<<<blocksize, THREADS_PER_BLOCK>>>(temp, length, device_output, d_size);

    // cudaMemcpy on the default stream blocks until the kernels above finish.
    cudaMemcpy(&size, d_size, sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(temp);
    cudaFree(d_size);
    return size;
}

render

warning 也得看!!!(变量是在host or device(__constant__表示device特殊位置))

cuda debug

// Wrap every CUDA runtime call with this macro to report failures with
// file/line context, e.g. cudaCheckError(cudaMemcpy(...)).
#define DEBUG

#ifdef DEBUG
// do { } while (0) makes the macro expand to a single statement, so it is
// safe inside un-braced if/else bodies (the bare { } form was not).
#define cudaCheckError(ans) do { cudaAssert((ans), __FILE__, __LINE__); } while (0)
// Prints the CUDA error string with its location; terminates by default.
inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "CUDA Error: %s at %s:%d\n",
cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#else
// Release builds: still evaluate the call, just skip the check.
#define cudaCheckError(ans) ans
#endif

算法流程

Clear image
for each circle
update position and velocity
for each circle
compute screen bounding box
for all pixels in bounding box
compute pixel center point
if center point is within the circle
compute color of circle at point
blend contribution of circle into image for this pixel

为了保证圆的有序,很自然的想到应该对像素并行
以下代码来源blog

namespace Solution3 {

constexpr int BLOCK_DIM = 16; // width/height (in pixels) of each block's tile
constexpr int BLOCK_SIZE = BLOCK_DIM * BLOCK_DIM; // total threads per block

#define SCAN_BLOCK_DIM BLOCK_SIZE
#include "exclusiveScan.cu_inl" // parallel exclusive-scan implementation

// Renders all circles, one BLOCK_DIM x BLOCK_DIM pixel tile per thread block.
// Circles are processed in batches of BLOCK_SIZE: each thread tests one
// circle against the tile, a shared-memory exclusive scan compacts the hits
// into inBoxCircles, and each pixel then blends those circles in ascending
// circle-ID order (parallelizing over pixels preserves circle ordering).
__global__ void kernelRenderCircles() {
// keep per-batch data in shared memory to avoid repeated global-memory access
__shared__ uint circleIsInBox[BLOCK_SIZE]; // 1 if this thread's circle overlaps the tile
__shared__ uint circleIndex[BLOCK_SIZE]; // exclusive scan of circleIsInBox
__shared__ uint scratch[2 * BLOCK_SIZE]; // temporary space for the scan
__shared__ int inBoxCircles[BLOCK_SIZE]; // global IDs of circles overlapping the tile

// pixel-space bounds of the tile this block is responsible for
int boxL = blockIdx.x * BLOCK_DIM;
int boxB = blockIdx.y * BLOCK_DIM;
int boxR = min(boxL + BLOCK_DIM, cuConstRendererParams.imageWidth);
int boxT = min(boxB + BLOCK_DIM, cuConstRendererParams.imageHeight);

// normalized coordinates for the circle-in-box test
float boxLNorm = boxL * cuConstRendererParams.invWidth;
float boxRNorm = boxR * cuConstRendererParams.invWidth;
float boxTNorm = boxT * cuConstRendererParams.invHeight;
float boxBNorm = boxB * cuConstRendererParams.invHeight;

// flat thread index within the block, and the pixel this thread shades
int index = threadIdx.y * BLOCK_DIM + threadIdx.x;
int pixelX = boxL + threadIdx.x;
int pixelY = boxB + threadIdx.y;
int pixelId = pixelY * cuConstRendererParams.imageWidth + pixelX;

// process all circles in batches of BLOCK_SIZE
for (int i = 0; i < cuConstRendererParams.numCircles; i += BLOCK_SIZE) {
int circleId = i + index;
if (circleId < cuConstRendererParams.numCircles) {
// fetch this circle's center position
float3 p = *reinterpret_cast<float3 *>(
&cuConstRendererParams.position[3 * circleId]);
// does this circle overlap the block's tile?
circleIsInBox[index] =
circleInBox(p.x, p.y, cuConstRendererParams.radius[circleId],
boxLNorm, boxRNorm, boxTNorm, boxBNorm);
} else {
circleIsInBox[index] = 0; // threads past the last circle contribute nothing
}
__syncthreads(); // all circle-in-box results must be written before the scan

// exclusive scan assigns each hit a compact slot in inBoxCircles
sharedMemExclusiveScan(index, circleIsInBox, circleIndex, scratch,
BLOCK_SIZE);

// compact: store the global IDs of overlapping circles, in ascending order
if (circleIsInBox[index]) {
inBoxCircles[circleIndex[index]] = circleId;
}
__syncthreads(); // all circle IDs must be in shared memory before rendering

// batch hit count (scan is exclusive, so add the last element's own flag)
int numCirclesInBox =
circleIndex[BLOCK_SIZE - 1] + circleIsInBox[BLOCK_SIZE - 1];
__syncthreads();

// shade this thread's pixel with every overlapping circle, in order
if (pixelX < boxR && pixelY < boxT) {
float4 *imgPtr = reinterpret_cast<float4 *>(
&cuConstRendererParams.imageData[4 * pixelId]);
for (int j = 0; j < numCirclesInBox; j++) {
circleId = inBoxCircles[j];
// core shading routine: blends the circle's color into this pixel,
// sampling at the pixel center (hence the +0.5)
shadePixel(
circleId,
make_float2((pixelX + 0.5) * cuConstRendererParams.invWidth,
(pixelY + 0.5) * cuConstRendererParams.invHeight),
*reinterpret_cast<float3 *>(
&cuConstRendererParams.position[3 * circleId]),
imgPtr);
}
}
}
}

// Host-side entry point: launches one thread block per image tile.
void renderCircles(int width, int height) {
kernelRenderCircles<<<dim3((width + BLOCK_DIM - 1) / BLOCK_DIM,
(height + BLOCK_DIM - 1) / BLOCK_DIM),
dim3(BLOCK_DIM, BLOCK_DIM)>>>();
cudaCheckError(cudaDeviceSynchronize()); // wait for the GPU and surface errors
}
} // namespace Solution3