Either I am misunderstanding the usage of cudaMemcpy with cudaMemcpyDeviceToDevice, or it does not behave as expected. What I find is that, for some parameters, it copies a few more bytes than intended. In the enclosed test case I want to copy positions 4444-4702 from the array “dBuffer” to “dData”. What actually happens is that positions 4444-4706 are copied. I tested this with a GeForce 9800GT 1024MB on Vista 64-bit with CUDA 2.3 and driver version 190.38, and with a Tesla C1060 on openSUSE 11.1 with CUDA 2.3 and driver version 190.18 Beta.
EDIT: Whoops, I meant to post this in the Programming and Development forum.
#include <stdio.h>
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "\nCUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Prints `head`, then elements [4440, 4450) of `values`, then `tail`,
// then elements [4700, 4715) of `values`. `values` is a host array that
// must hold at least 4715 elements.
static void printWindows(const char *head, const char *tail, const unsigned int *values)
{
    printf("%s", head);
    for (int j = 4440; j < 4450; j++)
        printf("%u ", values[j]);
    printf("%s", tail);
    for (int j = 4700; j < 4715; j++)
        printf("%u ", values[j]);
}

// Test case demonstrating unexpected/wrong(?) behaviour of cudaMemcpy/cudaMemcpyDeviceToDevice.
//
// Host arrays are filled with three distinct regions:
//   index range      data  buffer
//   [0, 4444)         0      4
//   [4444, 4703)      1      2
//   [4703, 9000)      3      1
// Both are uploaded, then elements [4444, 4703) are copied device-to-device
// from dBuffer into dData. If exactly that range is copied, data[4703] and
// onwards still read 3 afterwards; any 1 showing up there came from dBuffer,
// i.e. more elements were copied than requested.
//
// Returns 0 when every CUDA call succeeded, 1 otherwise.
int main()
{
    const int size = 9000;
    const size_t bytes = size * sizeof(unsigned int);
    const int copyBegin = 4444; // first element of the copied range
    const int copyEnd = 4703;   // one past the last element of the copied range

    // Allocate full arrays: a scalar `new unsigned int` would give a single
    // element and every access below would overrun the heap.
    unsigned int *data = new unsigned int[size];
    unsigned int *buffer = new unsigned int[size];
    unsigned int *dData = 0;
    unsigned int *dBuffer = 0;

    for (int i = 0; i < copyBegin; i++)
    {
        data[i] = 0;
        buffer[i] = 4;
    }
    for (int i = copyBegin; i < copyEnd; i++)
    {
        data[i] = 1;
        buffer[i] = 2;
    }
    for (int i = copyEnd; i < size; i++)
    {
        data[i] = 3;
        buffer[i] = 1;
    }

    // Each step runs only if all previous CUDA calls succeeded, so a failure
    // is reported once and cannot be mistaken for the behaviour under test.
    bool ok = cudaOk(cudaMalloc((void**)&dData, bytes), "cudaMalloc(dData)")
           && cudaOk(cudaMalloc((void**)&dBuffer, bytes), "cudaMalloc(dBuffer)")
           && cudaOk(cudaMemcpy(dData, data, bytes, cudaMemcpyHostToDevice), "upload of data")
           && cudaOk(cudaMemcpy(dBuffer, buffer, bytes, cudaMemcpyHostToDevice), "upload of buffer")
           // Round-trip so the "before" printout shows what is actually on the device.
           && cudaOk(cudaMemcpy(data, dData, bytes, cudaMemcpyDeviceToHost), "download of data")
           && cudaOk(cudaMemcpy(buffer, dBuffer, bytes, cudaMemcpyDeviceToHost), "download of buffer");

    if (ok)
    {
        printWindows("\n(input 4440 - 4450) -> ", "... 4700 - 4714: ", data);
        printWindows("\n(buffer 4440 - 4450) -> ", "... 4700 - 4714: ", buffer);
    }

    // Intending to copy 4444 - 4702 from buffer to data, but actually copies 4 more elements?
    ok = ok && cudaOk(cudaMemcpy(dData + copyBegin, dBuffer + copyBegin,
                                 (copyEnd - copyBegin) * sizeof(unsigned int),
                                 cudaMemcpyDeviceToDevice),
                      "device-to-device copy");
    if (ok)
        printf("\nCopied buffer 4444-4702 to data 4444-4702..");

    ok = ok
      && cudaOk(cudaMemcpy(data, dData, bytes, cudaMemcpyDeviceToHost), "download of data")
      && cudaOk(cudaMemcpy(buffer, dBuffer, bytes, cudaMemcpyDeviceToHost), "download of buffer");

    if (ok)
    {
        printWindows("\n(input 4440 - 4449: ) -> ", "... 4700 - 4714", data);
        printWindows("\n(buffer 4440 - 4449) -> ", "... 4700 - 4714: ", buffer);
    }

    // Cleanup always runs; the device pointers are still 0 here if their
    // cudaMalloc never succeeded.
    delete [] data;
    delete [] buffer;
    ok = cudaOk(cudaFree(dData), "cudaFree(dData)") && ok;
    ok = cudaOk(cudaFree(dBuffer), "cudaFree(dBuffer)") && ok;
    return ok ? 0 : 1;
}