So I have some optimized device-to-device memcpy routines along the lines of this:
// Cooperative device-to-device memcpy, templated on the scope of threads
// that share the work: a single warp (MemcpyWarp), a CTA/block (MemcpyCta),
// or the whole grid (MemcpyBase).
// NOTE(review): INTRA_WARP_ID / WARP_SIZE / THREAD_ID / CTA_DIMENSION /
// GLOBAL_ID / TOTAL_THREADS are project macros defined elsewhere; presumably
// `index` is this thread's rank within the chosen group and `stride` is the
// group size, with all threads of the group calling with identical
// arguments -- confirm against the macro definitions.
template< MemcpyType type >
__device__ void _memcpy(void* _destination,
const void* _source, size_t length)
{
// Per-thread work distribution: each thread handles elements index,
// index + stride, index + 2*stride, ...
int index;
int stride;
char* destination = (char*) _destination;
char* source = (char*) _source;
// type is a compile-time template parameter, so only one case survives
// compilation for a given instantiation.
switch( type )
{
case MemcpyWarp:
{
index = INTRA_WARP_ID();
stride = WARP_SIZE();
break;
}
case MemcpyCta:
{
index = THREAD_ID();
stride = CTA_DIMENSION();
break;
}
case MemcpyBase:
{
index = GLOBAL_ID();
stride = TOTAL_THREADS();
break;
}
}
// int2/uint2 accesses need 8-byte natural alignment; if either base
// pointer is misaligned, fall back to a strided byte-by-byte copy.
if( (size_t)destination % sizeof(uint2) != 0
|| (size_t)source % sizeof(uint2) != 0 )
{
for( unsigned int i = index; i < length; i += stride )
{
destination[i] = source[i];
}
}
else
{
// Number of whole 8-byte (int2) units in the copy.
int steps = length/sizeof(int2);
int doubleStride = stride * 2;
int i;
// Transfer bulk: two int2 transfers per thread per iteration for a bit
// of instruction-level parallelism.  Relies on signed arithmetic: when
// steps < doubleStride the condition's right side is negative and the
// loop is skipped.
for(i= 0; i < steps - doubleStride; i += doubleStride )
{
int2 tempA = ((int2*)source)[ i + index + stride * 0 ];
int2 tempB = ((int2*)source)[ i + index + stride * 1 ];
((int2*)destination)[ i + index + stride * 0 ] = tempA;
((int2*)destination)[ i + index + stride * 1 ] = tempB;
}
// Transfer remainder: one int2 per thread per iteration, bounds-checked
// because steps need not be a multiple of the stride.
for( ; i< steps; i += stride)
{
if( (i + index) < steps )
{
((int2*)destination)[ i + index ] =
((int2*)source)[ i + index ];
}
}
// Transfer last few bytes (length % 8).  Every thread copies the entire
// tail; the stores are redundant but all write identical byte values.
for(i= length - length % sizeof(int2); i< length; i++)
{
destination[ i ] = source[ i ];
}
}
}
I find that this code gives incorrect results if either _source or _destination is not aligned to an 8-byte boundary. It seems as though the GPU silently aligns all of these accesses down to 8-byte boundaries, loading/storing the wrong data. Adding the following alignment check (falling back to a byte-wise copy when either pointer is misaligned) fixes the problem, but I am wondering why this is necessary.
// Cooperative device-to-device memcpy, templated on the scope of threads
// that share the work: a single warp (MemcpyWarp), a CTA/block (MemcpyCta),
// or the whole grid (MemcpyBase).  All threads of the selected group are
// expected to call this with identical arguments.
//
// When both pointers are 8-byte aligned the bulk of the transfer uses int2
// (8-byte) vector loads/stores; otherwise it falls back to a byte copy,
// because vector accesses must be naturally aligned -- a misaligned int2
// access has its low address bits silently dropped, corrupting the copy.
template< MemcpyType type >
__device__ void _memcpy(void* _destination,
const void* _source, size_t length)
{
    // size_t (not int) so copies larger than 2 GB / 4 GB cannot overflow
    // the index arithmetic.
    size_t index;   // this thread's rank within the cooperating group
    size_t stride;  // number of threads cooperating on the copy
    char* destination = (char*) _destination;
    // Keep the source buffer const-qualified: it is only ever read.
    const char* source = (const char*) _source;
    switch( type )
    {
        case MemcpyWarp:
        {
            index  = INTRA_WARP_ID();
            stride = WARP_SIZE();
            break;
        }
        case MemcpyCta:
        {
            index  = THREAD_ID();
            stride = CTA_DIMENSION();
            break;
        }
        case MemcpyBase:
        {
            index  = GLOBAL_ID();
            stride = TOTAL_THREADS();
            break;
        }
        default:
        {
            // Defensive: an invalid MemcpyType would otherwise leave
            // index/stride uninitialized.  Copy nothing instead.
            return;
        }
    }
    if( (size_t)destination % sizeof(uint2) != 0
        || (size_t)source % sizeof(uint2) != 0 )
    {
        // Misaligned case: strided byte-by-byte copy.
        for( size_t i = index; i < length; i += stride )
        {
            destination[i] = source[i];
        }
    }
    else
    {
        // Aligned case: copy in whole 8-byte (int2) units.
        size_t steps        = length / sizeof(int2);
        size_t doubleStride = stride * 2;
        size_t i = 0;
        // Transfer bulk: two independent int2 transfers per thread per
        // iteration for some instruction-level parallelism.  The condition
        // is written as i + doubleStride < steps (rather than
        // i < steps - doubleStride) so the subtraction cannot wrap when
        // steps < doubleStride.
        for( ; i + doubleStride < steps; i += doubleStride )
        {
            int2 tempA = ((const int2*)source)[ i + index ];
            int2 tempB = ((const int2*)source)[ i + index + stride ];
            ((int2*)destination)[ i + index ]          = tempA;
            ((int2*)destination)[ i + index + stride ] = tempB;
        }
        // Transfer remainder: one int2 per thread per iteration,
        // bounds-checked because steps need not be a multiple of stride.
        for( ; i < steps; i += stride )
        {
            if( (i + index) < steps )
            {
                ((int2*)destination)[ i + index ] =
                    ((const int2*)source)[ i + index ];
            }
        }
        // Transfer the final length % 8 bytes.  Strided by thread rank:
        // previously every thread redundantly copied the whole tail, which
        // produced the same bytes but wasted memory traffic.
        for( size_t j = length - length % sizeof(int2) + index;
             j < length; j += stride )
        {
            destination[ j ] = source[ j ];
        }
    }
}
Any ideas? I would prefer that the GPU raise an error here rather than silently loading/storing the wrong data…