Hallo everyone!
I have a strange problem and I am hoping, that someone might help me.
I played around with some CUDA code to test the effective bandwith with my “Quaddro 4000” card (peak bandwidth about 90GB/s).
I wrote a kernel, that simply copys data from global-mem to global-mem and was hoping to get the maximal badwidth with I read from the Compute Visual Profiler!
So, when I do so… copying data with a datatype “double” I get the maximal bandwidth of about 77 GB/s… OK so far I think!
BUT when I copy the same kernel and now FLOAT data my bandwith goes down and I get only about 60 GB/s!
Why that?! I played around with the threads per block and also copied more or less data, but no better value! What am I doing wrong?
Thanks for any help!!
Jester
Here is the code I am using:
#define CU_ERR_HANDLE( err ) (HandleError( err, __FILE__, __LINE__ ))
static void HandleError(cudaError_t err,const char *file,int line)
{
if (err != cudaSuccess)
{
printf("%s in %s at line %d\n", cudaGetErrorString( err ),file, line);
//exit(EXIT_FAILURE);
}
}
void init_rand(float* data, int n)
{
for (int i = 0; i < n; ++i)
data[i] = rand() / (float)RAND_MAX ;
}
__global__ void strided_copy(float* A, float* C, int M)
{
int i = (blockDim.x * blockIdx.x + threadIdx.x);
if (i < M)
{
C[i] = A[i];
}
}
int main()
{
int N = 10000000;
int size = N * sizeof(float);
int threadsPerBlock = 512;
int numOfBlocks = (N+threadsPerBlock-1)/threadsPerBlock;
float* h_A;
float* h_C;
float* d_A;
float* d_C;
h_A = (float*)malloc(size);
h_C = (float*)malloc(size);
init_rand(h_A, N);
// Data to Device
CU_ERR_HANDLE(cudaMalloc((void**)&d_A, size));
CU_ERR_HANDLE(cudaMalloc((void**)&d_C, size));
CU_ERR_HANDLE(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
// KERNEL
strided_copy<<<numOfBlocks,threadsPerBlock> > >(d_A,d_C,N);
// Data back to Host
CU_ERR_HANDLE(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));
cudaFree(d_A);
cudaFree(d_C);
free(h_A);
free(h_C);
CU_ERR_HANDLE(cudaThreadExit());
return 0;
}
#endif