Nice find VanDammage. I can confirm your experiment on my hardware (gentoo linux w/ 8800 GTX, CUDA 1.1, 169.09 drivers):
"Average time: 0.913098 ms
Bandwidth: 4.278020 GiB/s"
I include my test application below so anyone can try.
This clearly seems to be a performance bug (I don’t recall anything in the release notes about it…). I’m 90% certain that the last time I used cudaMemcpyToArray for a device-to-device copy (way back in CUDA 0.8) I was getting ~70 GiB/s, so the bug must have been introduced since then.
#include <stdio.h>
#include <stdlib.h>
/*
 * CUDA_SAFE_CALL(call)
 *
 * Evaluates `call` (an expression yielding a CUDA runtime status code) exactly
 * once. If the result is not cudaSuccess, prints the file, line and
 * cudaGetErrorString() text to stderr and terminates the process with
 * EXIT_FAILURE.
 *
 * The argument is parenthesized so that any expression can be passed safely,
 * and the status is stored as cudaError_t to match CUT_CHECK_ERROR.
 * Wrapped in do { } while (0) so it behaves as a single statement.
 */
#define CUDA_SAFE_CALL(call)                                                 \
    do {                                                                     \
        cudaError_t err = (call);                                            \
        if (cudaSuccess != err) {                                            \
            fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",    \
                    __FILE__, __LINE__, cudaGetErrorString(err));            \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)
/*
 * CUT_CHECK_ERROR(errorMessage)
 *
 * When NDEBUG is not defined: calls cudaThreadSynchronize(), then reads
 * cudaGetLastError(). If that is not cudaSuccess, prints `errorMessage`
 * together with the file, line and cudaGetErrorString() text to stderr and
 * exits with EXIT_FAILURE.
 *
 * When NDEBUG is defined the macro expands to nothing.
 */
#ifndef NDEBUG
#define CUT_CHECK_ERROR(errorMessage)                                          \
    do {                                                                       \
        cudaThreadSynchronize();                                               \
        cudaError_t lastErr = cudaGetLastError();                              \
        if (lastErr != cudaSuccess) {                                          \
            fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",  \
                    errorMessage, __FILE__, __LINE__,                          \
                    cudaGetErrorString(lastErr));                              \
            exit(EXIT_FAILURE);                                                \
        }                                                                      \
    } while (0)
#else
#define CUT_CHECK_ERROR(errorMessage)
#endif
/*
 * Benchmark: device-to-device cudaMemcpyToArray throughput.
 *
 * Allocates a 512x512 float4 linear device buffer and a cudaArray of the same
 * extent, performs one untimed copy, then times N back-to-back copies with
 * CUDA events and prints the average time per copy and the resulting
 * bandwidth (bytes copied per second, in GiB/s).
 *
 * Every CUDA runtime call, including the copies inside the timed loop, is
 * checked with CUDA_SAFE_CALL so a failing copy aborts instead of silently
 * producing a meaningless timing. All resources are released before exit.
 */
int main()
{
    const int width = 512;
    const int len = width * width;
    const size_t numBytes = sizeof(float4) * len;
    const int N = 100;  // number of timed copies

    // Linear source buffer in device memory.
    float4 *d_data;
    CUDA_SAFE_CALL( cudaMalloc((void**)&d_data, numBytes) );
    // Give the source defined contents; the values themselves are irrelevant
    // to the measurement.
    CUDA_SAFE_CALL( cudaMemset(d_data, 0, numBytes) );

    // Destination cudaArray with a float4 channel format.
    cudaArray *d_array;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();
    CUDA_SAFE_CALL( cudaMallocArray(&d_array, &channelDesc, width, width) );

    // One untimed copy before measuring.
    CUDA_SAFE_CALL( cudaMemcpyToArray(d_array, 0, 0, (void*)d_data, numBytes,
                                      cudaMemcpyDeviceToDevice) );

    cudaEvent_t start, end;
    CUDA_SAFE_CALL( cudaEventCreate(&start) );
    CUDA_SAFE_CALL( cudaEventCreate(&end) );

    // Timed region: N identical device-to-device copies.
    CUDA_SAFE_CALL( cudaEventRecord(start, 0) );
    for (int i = 0; i < N; ++i)
    {
        CUDA_SAFE_CALL( cudaMemcpyToArray(d_array, 0, 0, (void*)d_data, numBytes,
                                          cudaMemcpyDeviceToDevice) );
    }
    CUDA_SAFE_CALL( cudaEventRecord(end, 0) );
    CUDA_SAFE_CALL( cudaEventSynchronize(end) );

    // cudaEventElapsedTime reports milliseconds; convert to a per-copy average.
    float runTime;
    CUDA_SAFE_CALL( cudaEventElapsedTime(&runTime, start, end) );
    runTime /= float(N);

    printf("Average time: %f ms\n", runTime);
    printf("Bandwidth: %f GiB/s\n\n", numBytes / (runTime * 1.0e-3 * 1024*1024*1024));

    // Release everything that was created above.
    CUDA_SAFE_CALL( cudaEventDestroy(start) );
    CUDA_SAFE_CALL( cudaEventDestroy(end) );
    CUDA_SAFE_CALL( cudaFreeArray(d_array) );
    CUDA_SAFE_CALL( cudaFree(d_data) );

    return 0;
}