Hi all,
I use the code listed below to split a source image into two half size images where one destination image contains the even rows and the other destination image contains the odd rows.
It’s implemented as a sliding window in the vertical direction because it is part of a larger project which requires it to be a sliding window. While playing around with the block dimensions,
however, I noticed there is a large drop in performance when going from a (32,4) block to a (16,8) block. I know that there’s a performance hit when using blocks with an x-dimension less
than 16, but I was under the impression that 16 ints are coalesced in a single 64-bit memory transaction within a half warp.
Could anybody tell me why the full-warp (32,4) block is almost twice as fast as the half-warp (16,8) block?
Cheers,
Nico
Here’s the output generated on a Quadro FX 1600M (in all tests, the total number of threads as well as the required amount of shared memory is the same):
Uploading input data to GPU memory…
Running GPU test…
Array size : 2048x2048 (int)
blockDim : 128x1
gridDim : 16x1
Average GPU time : 1.945023 ms
Checking result…
Test: PASSED
Running GPU test…
Array size : 2048x2048 (int)
blockDim : 64x2
gridDim : 32x1
Average GPU time : 1.967922 ms
Checking result…
Test: PASSED
Running GPU test…
Array size : 2048x2048 (int)
blockDim : 32x4
gridDim : 64x1
Average GPU time : 2.053766 ms
Checking result…
Test: PASSED
Running GPU test…
Array size : 2048x2048 (int)
blockDim : 16x8
gridDim : 128x1
Average GPU time : 4.024047 ms
Checking result…
Test: PASSED
Running GPU test…
Array size : 2048x2048 (int)
blockDim : 8x16
gridDim : 256x1
Average GPU time : 17.548977 ms
Checking result…
Test: PASSED
Shutting down…
Press ENTER to exit…
[codebox]#include <cutil_inline.h>
#define NUM_ITERATIONS 128
// Splits a w x h image into two half-height images: dst1 receives the even
// source rows, dst2 the odd ones. Works as a vertical sliding window: each
// iteration stages a band of 2*blockDim.y rows through shared memory and
// de-interleaves it.
// Launch expectations (from the host code): gridDim = (w/blockDim.x, 1),
// dynamic shared memory = 2*blockDim.x*blockDim.y*sizeof(int) bytes, and h
// divisible by 2*blockDim.y.
// Fix 1: restored the __global__ qualifier (posted code had it garbled to
// "global", which does not compile).
__global__ void splitKernel(int* src, int* dst1, int* dst2, const int w, const int h) {
extern __shared__ int shared[];                      // 2*blockDim.x*blockDim.y ints
const unsigned int stride = blockDim.y*w;            // elements per row band
unsigned int write = threadIdx.y*w+blockIdx.x*blockDim.x+threadIdx.x;
unsigned int read = threadIdx.y*w+blockIdx.x*blockDim.x+threadIdx.x;
for (unsigned int i=0;i<h/(2*blockDim.y);++i) {
// Stage two consecutive row bands of the source into shared memory.
shared[(0*blockDim.y+threadIdx.y)*blockDim.x+threadIdx.x] = src[read];
read+=stride;
shared[(1*blockDim.y+threadIdx.y)*blockDim.x+threadIdx.x] = src[read];
read+=stride;
__syncthreads();  // staging must be complete before any thread reads it
// De-interleave: even rows of the window go to dst1, odd rows to dst2.
dst1[write] = shared[(2*threadIdx.y+0)*blockDim.x+threadIdx.x];
dst2[write] = shared[(2*threadIdx.y+1)*blockDim.x+threadIdx.x];
write+=stride;
// Fix 2: barrier before the next iteration overwrites the staging buffer.
// Without it, threads that loop around early clobber shared[] while slower
// threads are still reading the previous window (a data race).
__syncthreads();
}
}
// Host driver: allocates pinned host buffers and device buffers, fills the
// source image with random ints, then benchmarks splitKernel over five block
// shapes (128x1, 64x2, 32x4, 16x8, 8x16 — always 128 threads/block) and
// verifies each result against the source on the CPU.
int main(int argc, char **argv) {
    // Device selection: honour an explicit "device" flag, otherwise pick the
    // highest-GFLOPS device available.
    if ( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    unsigned int hTimer;
    cutilCheckError( cutCreateTimer(&hTimer) );

    const unsigned int width  = 2048;
    const unsigned int height = 2048;
    const size_t numBytes = width*height*sizeof(int);

    // Pinned host memory for the source image and the readback copy, plus
    // matching device buffers (d_split holds both output halves end-to-end).
    int* h_src   = 0;
    int* h_check = 0;
    int* d_src   = 0;
    int* d_split = 0;
    cudaMallocHost((void**)&h_src,   numBytes);
    cudaMallocHost((void**)&h_check, numBytes);

    // Populate the source image with pseudo-random data.
    for (unsigned int n = 0; n < width*height; ++n)
        h_src[n] = rand();

    cudaMalloc((void **)&d_src,   numBytes);
    cudaMalloc((void **)&d_split, numBytes);

    fprintf(stderr,"Uploading input data to GPU memory...\n");
    cudaMemcpy(d_src, h_src, numBytes, cudaMemcpyHostToDevice);

    for (unsigned int shape = 0; shape < 5; ++shape) {
        // Halve the x-dimension and double the y-dimension each round, so the
        // total thread count (and shared memory footprint) stays constant.
        dim3 bdim(128>>shape , 1<<shape);
        dim3 gdim(width/bdim.x,1);
        fprintf(stderr,"\nRunning GPU test...\nArray size : %dx%d (int)\nblockDim : %dx%d\ngridDim : %dx%d\n",width,height,bdim.x,bdim.y,gdim.x,gdim.y);

        cutilSafeCall( cudaThreadSynchronize() );
        cutilCheckError( cutResetTimer(hTimer) );
        cutilCheckError( cutStartTimer(hTimer) );
        const size_t smemBytes = 2*bdim.x*bdim.y*sizeof(int);
        for (int iter = 0; iter < NUM_ITERATIONS; ++iter) {
            // Even rows land in the first half of d_split, odd rows in the second.
            splitKernel<<<gdim, bdim , smemBytes>>>(d_src, &d_split[0], &d_split[width*height/2], width, height);
            cutilCheckMsg("dwtKernel() execution failed\n");
        }
        cutilSafeCall( cudaThreadSynchronize() );
        cutilCheckError( cutStopTimer(hTimer) );

        double gpuTime = cutGetTimerValue(hTimer) / NUM_ITERATIONS;
        fprintf(stderr,"Average GPU time : %f ms\n", gpuTime);

        // CPU verification: source row y must appear at row y/2 of half (y&1).
        cudaMemcpy(h_check, d_split, numBytes, cudaMemcpyDeviceToHost);
        fprintf(stderr,"Checking result...\n");
        bool passed = true;
        for (unsigned int y = 0; y < height; ++y) {
            for (unsigned int x = 0; x < width; ++x) {
                if (h_src[y*width+x] != h_check[ (y&0x1)*width*(height>>1)+(y>>1)*width+x])
                    passed = false;
            }
        }
        fprintf(stderr,"Test: %s\n", (passed ? "PASSED" : "FAILED"));
    }

    fprintf(stderr,"Shutting down...\n");
    cudaFree(d_src);
    cudaFree(d_split);
    cudaFreeHost(h_src);
    cudaFreeHost(h_check);
    cutilCheckError( cutDeleteTimer(hTimer) );
    cutilExit(argc, argv);
    cudaThreadExit();
}
[/codebox]