Possible bug in cudaMemcpy with cudaMemcpyDeviceToDevice?

Either I am misunderstanding how that function is meant to be used, or it does not behave as expected. What I find is that, for some parameters, it copies a few more elements than intended. In the enclosed test case I want to copy positions 4444-4702 from the array “dBuffer” to “dData”. What actually happens is that positions 4444-4706 are copied. I tested this with a GeForce 9800 GT 1024MB on Vista 64-bit with CUDA 2.3 and driver version 190.38, and with a Tesla C1060 on openSUSE 11.1 with CUDA 2.3 and driver version 190.18 Beta.

EDIT: whoops, meant to post this in Programming and Development forums.

#include <stdio.h>

// Test case demonstrating unexpected/wrong(?) behaviour of cudaMemcpy/cudaMemcpyDeviceToDevice.

  // Reports a failed CUDA runtime call on stderr.
  // Returns true when `status` is cudaSuccess, false otherwise.
  static bool cudaOk(cudaError_t status, const char *what)
  {
	if (status == cudaSuccess)
	  return true;
	fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
	return false;
  }

  // Prints `headLabel`, then elements [4440, 4450) of `values`, then `tailLabel`,
  // then elements [4700, 4715) of `values`. These two windows straddle the start
  // and the end of the copied range so that any over-copy is visible.
  static void printWindows(const char *headLabel, const char *tailLabel, const unsigned int *values)
  {
	printf("%s", headLabel);
	for (int j = 4440; j < 4450; j++)
	  printf("%u  ", values[j]);
	printf("%s", tailLabel);
	for (int j = 4700; j < 4715; j++)
	  printf("%u  ", values[j]);
  }

  // Test case for cudaMemcpy with cudaMemcpyDeviceToDevice.
  //
  // Fills two host arrays with distinct marker values in three regions, uploads
  // them, copies elements 4444-4702 from dBuffer to dData on the device, and
  // prints the elements around both ends of that range before and after the copy.
  // Returns 0 when every CUDA call succeeded, 1 otherwise.
  int main()
  {
	const int size = 9000;
	const int copyBegin = 4444;                  // first element to copy
	const int copyEnd = 4703;                    // one past the last element to copy (4702)
	const int copyCount = copyEnd - copyBegin;   // 259 elements
	const size_t bytes = size * sizeof(unsigned int);

	// The host arrays must hold `size` elements: they are filled up to index
	// size-1 below and are the source/destination of `bytes`-sized copies.
	unsigned int *data = new unsigned int[size];
	unsigned int *buffer = new unsigned int[size];
	unsigned int *dData = 0;
	unsigned int *dBuffer = 0;

	// Region markers: data = 0 / 1 / 3 and buffer = 4 / 2 / 1 for the ranges
	// before, inside, and after [copyBegin, copyEnd).
	for (int i = 0; i < size; i++)
	{
	  if (i < copyBegin)
	  {
		data[i] = 0;
		buffer[i] = 4;
	  }
	  else if (i < copyEnd)
	  {
		data[i] = 1;
		buffer[i] = 2;
	  }
	  else
	  {
		data[i] = 3;
		buffer[i] = 1;
	  }
	}

	bool ok = cudaOk(cudaMalloc((void**)&dData, bytes), "cudaMalloc(dData)");
	ok = ok && cudaOk(cudaMalloc((void**)&dBuffer, bytes), "cudaMalloc(dBuffer)");

	// Round-trip both arrays through the device so the "before" printout shows
	// what is actually stored in device memory.
	ok = ok && cudaOk(cudaMemcpy(dData, data, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dData <- data)");
	ok = ok && cudaOk(cudaMemcpy(dBuffer, buffer, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dBuffer <- buffer)");
	ok = ok && cudaOk(cudaMemcpy(data, dData, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(data <- dData)");
	ok = ok && cudaOk(cudaMemcpy(buffer, dBuffer, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(buffer <- dBuffer)");

	if (ok)
	{
	  printWindows("\n(input 4440 - 4450) -> ", "... 4700 - 4714: ", data);
	  printWindows("\n(buffer 4440 - 4450) -> ", "... 4700 - 4714: ", buffer);
	}

	// Intending to copy 4444 - 4702 from buffer to data; the report accompanying
	// this test case observed 4 more elements being copied.
	ok = ok && cudaOk(cudaMemcpy(dData + copyBegin, dBuffer + copyBegin,
	                             copyCount * sizeof(unsigned int), cudaMemcpyDeviceToDevice),
	                  "cudaMemcpy(dData <- dBuffer, device to device)");
	if (ok)
	  printf("\nCopied buffer 4444-4702 to data 4444-4702..");

	ok = ok && cudaOk(cudaMemcpy(data, dData, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(data <- dData)");
	ok = ok && cudaOk(cudaMemcpy(buffer, dBuffer, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(buffer <- dBuffer)");

	if (ok)
	{
	  printWindows("\n(input 4440 - 4449: ) -> ", "... 4700 - 4714", data);
	  printWindows("\n(buffer 4440 - 4449) -> ", "... 4700 - 4714: ", buffer);
	}

	// Release everything on both the success and the failure path.
	delete [] data;
	delete [] buffer;
	ok = cudaOk(cudaFree(dData), "cudaFree(dData)") && ok;
	ok = cudaOk(cudaFree(dBuffer), "cudaFree(dBuffer)") && ok;

	return ok ? 0 : 1;
  }

For my algorithm I replaced the memcpy with a kernel that does the same thing, which was in any case what I wanted to do eventually because it replaces several calls to memcpy and might be faster. Still, I’d be interested to hear if anyone can reproduce the behaviour I describe and if it is indeed a bug, or just me not using the function in the way it is supposed to be used.

I think we actually found this bug today independently…

Nice to hear :)