Hi all,
I am trying to write a mini benchmark to measure coalesced global memory access time, and I am getting results I did not expect.
What I am doing: I define a float array x with 32M elements, then copy it into a second allocated global memory array y.
The memory accesses are strided, i.e. each thread copies one element at a time and then advances to the next stride, with stride width = block size.
I use only one block of threads, and I vary the block size from 16, 32, 64, … up to 512.
According to the CUDA Programming Guide 2.3, these accesses are coalesced: every 16 requests from a half-warp are combined into one memory transaction. For example, with a block of 64 threads, threads 0-15 read x[0..15] (one aligned 64-byte segment, hence one transaction), threads 16-31 read x[16..31], and so on.
Since the array size is fixed, the total number of transactions should be constant, 32M/16 = 2M, regardless of block size.
But the running times on a GTX 285 are as follows (I observed similar results on a Tesla C1060):
block size = 16, t=8846.04395 ms
block size = 32, t=4460.80420 ms
block size = 64, t=2227.35669 ms
block size = 128, t=1111.58301 ms
block size = 256, t=603.39233 ms
block size = 512, t=374.28571 ms
We can see that each time I double the block size, the running time is almost halved. So is it possible that the requests from the first 16 threads
overlap with the requests from the second 16 threads, and so on? Or is there some caching effect
in the coalesced accesses? (A variant I plan to try to test the overlap idea is sketched after the code.)
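To put the numbers in perspective, effective bandwidth can be computed from the measured times. A small helper along these lines (my own hypothetical addition, not part of the benchmark; it assumes the kernel below, which makes 10 passes, each reading and writing the full 128 MB array):
[codebox]/* Hypothetical helper: effective bandwidth implied by a measured time,
   assuming 10 passes that each read and write 32M floats (128 MB). */
double effective_GBps(double msec)
{
    double bytes = 10.0 * 2.0 * 32.0 * 1024.0 * 1024.0 * sizeof(float);
    /* dividing bytes by (msec * 1e6) converts ms to s and B to GB in one step */
    return bytes / (msec * 1.0e6);
}
[/codebox]
For block size 512 this works out to roughly 2.7 GB / 0.374 s ≈ 7 GB/s, far below the GTX 285's theoretical peak memory bandwidth (~159 GB/s), so a single block is nowhere near saturating the bus.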
My code is as follows, compiled with nvcc 2.3 without any optimization flags.
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>

#define DIVIDE_INTO(x,y) ((x + y - 1)/y)

#define CUDA_SAFE_CALL_NO_SYNC( call) do { \
    cudaError err = call; \
    if( cudaSuccess != err) { \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                __FILE__, __LINE__, cudaGetErrorString( err) ); \
        exit(EXIT_FAILURE); \
    } } while (0)

#define CUDA_SAFE_CALL( call) do { \
    CUDA_SAFE_CALL_NO_SYNC(call); \
    cudaError err = cudaThreadSynchronize(); \
    if( cudaSuccess != err) { \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                __FILE__, __LINE__, cudaGetErrorString( err) ); \
        exit(EXIT_FAILURE); \
    } } while (0)
// RAII timer based on CUDA events: starts on construction,
// reports elapsed time on demand.
class timer
{
    cudaEvent_t start;
    cudaEvent_t end;
public:
    timer()
    {
        CUDA_SAFE_CALL(cudaEventCreate(&start));
        CUDA_SAFE_CALL(cudaEventCreate(&end));
        CUDA_SAFE_CALL(cudaEventRecord(start,0));
    }
    ~timer()
    {
        CUDA_SAFE_CALL(cudaEventDestroy(start));
        CUDA_SAFE_CALL(cudaEventDestroy(end));
    }
    float milliseconds_elapsed()
    {
        float elapsed_time;
        CUDA_SAFE_CALL(cudaEventRecord(end, 0));
        CUDA_SAFE_CALL(cudaEventSynchronize(end));
        CUDA_SAFE_CALL(cudaEventElapsedTime(&elapsed_time, start, end));
        return elapsed_time;
    }
    float seconds_elapsed()
    {
        // convert milliseconds to seconds
        return milliseconds_elapsed() / 1000.0f;
    }
};
// Copy src to trg 10 times; each thread handles elements
// threadIdx.x, threadIdx.x + blockDim.x, ... so every half-warp
// touches one contiguous, aligned 64-byte segment per step.
__global__ void memMove(float *src, float *trg, int N){
    int i,j;
    for(i=0;i<10;i++){
        for(j=threadIdx.x;j<N;j+=blockDim.x){
            trg[j] = src[j];
        }
    }
}
int main(int argc, char ** argv){
    float *x, *y, *dx, *dy;
    int N = 1024*1024*32; // array size: 32M floats (128 MB)
    int j;
    size_t memSize;
    int blocksize;

    x = (float *)malloc(sizeof(float)*N);
    y = (float *)malloc(sizeof(float)*N);
    if(!x || !y){
        printf("Error: mem alloc x,y\n");
        return 1;
    }
    for(j=0;j<N;j++){
        x[j] = 1.0f;
        y[j] = 0.0f;
    }

    memSize = sizeof(float)*N;
    cudaMalloc((void**)&dx,memSize);
    cudaMemcpy(dx,x,memSize,cudaMemcpyHostToDevice);
    cudaMalloc((void**)&dy,memSize);
    cudaMemcpy(dy,y,memSize,cudaMemcpyHostToDevice);

    // time the single-block kernel for block sizes 16, 32, ..., 512
    blocksize = 16;
    while(blocksize<=512){
        timer t;
        memMove<<<1,blocksize>>>(dx,dy,N);
        cudaThreadSynchronize();
        double msec = t.milliseconds_elapsed();
        printf("block size = %d, t=%.5f ms\n",blocksize,msec);
        blocksize *= 2;
    }

    free(x);
    free(y);
    cudaFree(dx);
    cudaFree(dy);
    return 0;
}
[/codebox]
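For reference, this is the kind of multi-block control experiment I have in mind (just a sketch, not yet in the benchmark; memMoveGrid and the launch configuration are my own guesses): if the scaling above comes from overlapping outstanding memory requests, a grid-strided copy with many blocks should run much faster and stop improving once enough threads are in flight.
[codebox]/* Sketch of a control experiment: grid-strided copy with many blocks.
   Accesses stay coalesced (each half-warp still reads/writes one
   aligned 64-byte segment). */
__global__ void memMoveGrid(float *src, float *trg, int N){
    int stride = blockDim.x * gridDim.x;
    for(int j = blockIdx.x*blockDim.x + threadIdx.x; j < N; j += stride){
        trg[j] = src[j];
    }
}
/* e.g. memMoveGrid<<<120,256>>>(dx,dy,N); // 4 blocks per SM on a GTX 285 */
[/codebox]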