Here is a small benchmark using ordinary global memory. The in-place version that processes 512*4 elements per block (kernel 2) is around ten times faster on an A100 than the version using one thread per row (kernel 1). You could try to adapt this approach to cudaSurface.
shiftRowsKernel 19.2051 ms. 65.0868GB/s
shiftRowsKernel2 1.8135 ms. 689.273GB/s
shiftRowsKernel3 1.84934 ms. 675.915GB/s
#include <cassert>
#include <cstdlib>
#include <iostream>
__global__
void shiftRowsKernel(uchar4* data, int width, int height){
// Baseline variant: one thread owns one entire row. Each row is shifted
// left by one pixel in place and the rightmost pixel is cleared.
// Grid-stride loop over rows, so any launch configuration is valid.
const unsigned int firstRow = threadIdx.x + blockIdx.x * blockDim.x;
const unsigned int rowStride = blockDim.x * gridDim.x;
for(unsigned int row = firstRow; row < height; row += rowStride)
{
uchar4* rowPtr = data + (size_t)row * width;
// Sequential left shift within the row; safe because only this
// thread touches this row.
for(unsigned int col = 0; col + 1 < width; col += 1)
{
rowPtr[col] = rowPtr[col + 1];
}
rowPtr[width - 1] = make_uchar4(0,0,0,0);
}
}
__global__
void shiftRowsKernel2(uchar4* data, int width, int height){
// In-place row shift: each block stages a whole row in registers,
// synchronizes, then writes every pixel back one position to the left
// and clears the rightmost pixel.
// Expected launch: <<<numBlocks, 512>>>, one block per row (grid-stride
// over rows). Each thread holds pixelsPerThread pixels, so a block
// covers pixelsPerThread * blockDim.x pixels of a row.
constexpr int pixelsPerThread = 4; // blocksize 512 -> 512*4 = 2048 pixels per block
// Precondition: the whole row must fit into the block's registers,
// otherwise the tail of the row would silently be left unshifted.
assert(width <= pixelsPerThread * (int)blockDim.x);
for(unsigned int i = blockIdx.x; i < height; i += gridDim.x)
{
uchar4 color[pixelsPerThread];
// Stage pixels 1..width-1 into registers; pixel 0 is shifted out
// and never read.
for(int k = 0; k < pixelsPerThread; k++){
const int x = k * blockDim.x + threadIdx.x;
if(0 < x && x < width){
color[k] = data[i * width + x];
}
}
// All reads of this row must complete before any thread overwrites
// it; this barrier is uniform (the outer loop bound does not depend
// on threadIdx).
__syncthreads();
for(int k = 0; k < pixelsPerThread; k++){
const int x = k * blockDim.x + threadIdx.x;
if(0 < x && x < width){
data[i * width + x-1] = color[k];
}
}
// The vacated rightmost pixel becomes transparent black.
if(threadIdx.x == 0){
data[i * width + width-1] = make_uchar4(0,0,0,0);
}
}
}
__global__
void shiftRowsKernel3(uchar4* __restrict__ outdata, const uchar4* __restrict__ indata, int width, int height){
// Out-of-place row shift: reads a row from indata shifted one pixel to
// the left and writes it to outdata, clearing the rightmost pixel.
// Grid-stride over rows (one block per row at launch <<<height, B>>>);
// each block strides across the row with blockDim.x threads.
// Generalized from the original hard-coded bounds (2048 / 512), which
// were only correct for width == 2048 and blocksize == 512: any other
// blocksize skipped pixels and any larger width left the tail unshifted.
for(unsigned int i = blockIdx.x; i < height; i += gridDim.x)
{
for(int x = threadIdx.x; x < width; x += (int)blockDim.x){
if(x < width - 1){
outdata[i * width + x] = indata[i * width + x + 1];
}else{ // x == width - 1: the vacated last pixel
outdata[i * width + x] = make_uchar4(0,0,0,0);
}
}
}
}
__global__
void check(uchar4* data, int width, int height, int numIterations){
// Asserts the image state after numIterations left shifts: every pixel
// left of (width - numIterations) still carries the fill value 42 in
// all four channels, and the rightmost numIterations pixels are zero.
// Launch: grid-stride over rows, block-stride over columns.
for(unsigned int row = blockIdx.x; row < height; row += gridDim.x)
{
const uchar4* rowPtr = data + row * (size_t)width;
const int boundary = width - numIterations;
for(int x = threadIdx.x; x < width; x += blockDim.x){
const uchar4 pixel = rowPtr[x];
// Pixels past the boundary have been shifted in as zeros.
const unsigned char expected = (x < boundary) ? 42 : 0;
assert(pixel.x == expected);
assert(pixel.y == expected);
assert(pixel.z == expected);
assert(pixel.w == expected);
}
}
}
// Abort with a readable message if a CUDA API call failed. Without this,
// a failed allocation or launch silently yields garbage timings.
static void checkCuda(cudaError_t status, const char* what){
if(status != cudaSuccess){
std::cerr << what << " failed: " << cudaGetErrorString(status) << "\n";
std::abort();
}
}

// Benchmarks three row-shift kernels on a width x height uchar4 image:
// kernel 1 (one thread per row, in place), kernel 2 (one block per row,
// staged in registers, in place), kernel 3 (out of place, ping-pong
// between two buffers). Each result is verified with the check kernel.
int main(){
const int width = 2048;
const int height = 16*1024;
const int numIterations = 10;
const size_t bytes = sizeof(uchar4) * width * height;

uchar4* d_data; checkCuda(cudaMalloc(&d_data, bytes), "cudaMalloc d_data");
uchar4* d_data2; checkCuda(cudaMalloc(&d_data2, bytes), "cudaMalloc d_data2");
cudaEvent_t start; checkCuda(cudaEventCreate(&start), "cudaEventCreate start");
cudaEvent_t stop; checkCuda(cudaEventCreate(&stop), "cudaEventCreate stop");

// All three variants move the same number of bytes per iteration, so the
// reported bandwidth uses one shared volume figure (GiB, matching the
// original 1024^3 divisor).
const double numGB = double(bytes) * numIterations / 1024. / 1024. / 1024.;
auto report = [&](const char* name){
float timingMS = 0;
checkCuda(cudaEventElapsedTime(&timingMS, start, stop), "cudaEventElapsedTime");
const double GBperSecond = numGB / (timingMS / 1000);
std::cout << name << " " << timingMS << " ms. " << GBperSecond << "GB/s\n";
};

// --- kernel 1: one thread per row, in place ---
checkCuda(cudaMemset(d_data, 42, bytes), "cudaMemset");
check<<<height, 512>>>(d_data, width, height, 0); // sanity-check the fill
checkCuda(cudaDeviceSynchronize(), "check after memset");
checkCuda(cudaEventRecord(start), "cudaEventRecord start");
for(int iter = 0; iter < numIterations; iter++){
shiftRowsKernel<<<((height) + 512 - 1) / 512, 512>>>(d_data, width, height);
}
checkCuda(cudaGetLastError(), "shiftRowsKernel launch");
checkCuda(cudaEventRecord(stop), "cudaEventRecord stop");
checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize");
check<<<height, 512>>>(d_data, width, height, numIterations);
checkCuda(cudaDeviceSynchronize(), "check shiftRowsKernel");
report("shiftRowsKernel");

// --- kernel 2: one block per row, staged in registers, in place ---
checkCuda(cudaMemset(d_data, 42, bytes), "cudaMemset");
checkCuda(cudaEventRecord(start), "cudaEventRecord start");
for(int iter = 0; iter < numIterations; iter++){
shiftRowsKernel2<<<height, 512>>>(d_data, width, height);
}
checkCuda(cudaGetLastError(), "shiftRowsKernel2 launch");
checkCuda(cudaEventRecord(stop), "cudaEventRecord stop");
checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize");
check<<<height, 512>>>(d_data, width, height, numIterations);
checkCuda(cudaDeviceSynchronize(), "check shiftRowsKernel2");
report("shiftRowsKernel2");

// --- kernel 3: out of place, ping-pong between the two buffers ---
checkCuda(cudaMemset(d_data, 42, bytes), "cudaMemset");
checkCuda(cudaEventRecord(start), "cudaEventRecord start");
for(int iter = 0; iter < numIterations; iter++){
const uchar4* input = (iter % 2 == 0) ? d_data : d_data2;
uchar4* output = (iter % 2 == 0) ? d_data2 : d_data;
shiftRowsKernel3<<<height, 512>>>(output, input, width, height);
}
checkCuda(cudaGetLastError(), "shiftRowsKernel3 launch");
checkCuda(cudaEventRecord(stop), "cudaEventRecord stop");
checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize");
// After the last iteration the result lives in the buffer written by
// that iteration's parity.
uchar4* result = ((numIterations-1) % 2 == 0) ? d_data2 : d_data;
check<<<height, 512>>>(result, width, height, numIterations);
checkCuda(cudaDeviceSynchronize(), "check shiftRowsKernel3");
report("shiftRowsKernel3");

// Release device resources and timing events (the original leaked both).
checkCuda(cudaEventDestroy(start), "cudaEventDestroy");
checkCuda(cudaEventDestroy(stop), "cudaEventDestroy");
checkCuda(cudaFree(d_data), "cudaFree");
checkCuda(cudaFree(d_data2), "cudaFree");
}