I had a simple memcpy kernel, templated on the type being copied, that worked well for basic data types like ints and floats, and I also used it on more complex types such as classes. However, I recently noticed that performance takes roughly a 10x hit on classes with more than one data member, so I reworked the kernel into the version below:
[codebox]template< typename T >
__global__ void memcpy( T* out, const T* in, unsigned int size )
{
    // one thread per element
    if( GLOBAL_ID() >= size )
    {
        return;
    }

    switch( sizeof( T ) )
    {
        // 8-byte types: copy as a single uint2 per element
        case 8:
        {
            uint2* _out = ( uint2* ) out;
            const uint2* _in = ( const uint2* ) in;
            _out[ GLOBAL_ID() ] = _in[ GLOBAL_ID() ];
            break;
        }
        // 16-byte types: copy as a single uint4 per element
        case 16:
        {
            uint4* _out = ( uint4* ) out;
            const uint4* _in = ( const uint4* ) in;
            _out[ GLOBAL_ID() ] = _in[ GLOBAL_ID() ];
            break;
        }
        // everything else: pick the widest word the size divides evenly into
        default:
        {
            const unsigned int mod4  = sizeof( T ) % 4;
            const unsigned int mod8  = sizeof( T ) % 8;
            const unsigned int mod16 = sizeof( T ) % 16;
            if( mod16 == 0 )
            {
                // multiple of 16 bytes: strided uint4 copies
                const unsigned int div16 = sizeof( T ) / 16;
                uint4* _out = ( uint4* ) out;
                const uint4* _in = ( const uint4* ) in;
                for( unsigned int i = 0, offset = 0; i < div16;
                     ++i, offset += TOTAL_THREADS() )
                {
                    _out[ GLOBAL_ID() + offset ] = _in[ GLOBAL_ID() + offset ];
                }
            }
            else if( mod8 == 0 )
            {
                // multiple of 8 bytes: strided uint2 copies
                const unsigned int div8 = sizeof( T ) / 8;
                uint2* _out = ( uint2* ) out;
                const uint2* _in = ( const uint2* ) in;
                for( unsigned int i = 0, offset = 0; i < div8;
                     ++i, offset += TOTAL_THREADS() )
                {
                    _out[ GLOBAL_ID() + offset ] = _in[ GLOBAL_ID() + offset ];
                }
            }
            else if( mod4 == 0 )
            {
                // multiple of 4 bytes: strided 32-bit copies
                const unsigned int div4 = sizeof( T ) / 4;
                unsigned int* _out = ( unsigned int* ) out;
                const unsigned int* _in = ( const unsigned int* ) in;
                for( unsigned int i = 0, offset = 0; i < div4;
                     ++i, offset += TOTAL_THREADS() )
                {
                    _out[ GLOBAL_ID() + offset ] = _in[ GLOBAL_ID() + offset ];
                }
            }
            else
            {
                // odd sizes: fall back to a byte-by-byte copy
                unsigned char* _out = ( unsigned char* ) out;
                const unsigned char* _in = ( const unsigned char* ) in;
                for( unsigned int i = 0, offset = 0; i < sizeof( T );
                     ++i, offset += TOTAL_THREADS() )
                {
                    _out[ GLOBAL_ID() + offset ] = _in[ GLOBAL_ID() + offset ];
                }
            }
            break;
        }
    }
}
[/codebox]
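For reference, GLOBAL_ID() and TOTAL_THREADS() are just the usual 1D global-index helpers, and I launch one thread per element. Roughly like this (assuming the obvious definitions; copyOnDevice is just a made-up wrapper name):
[codebox]// Assumed definitions of the helpers used by the kernel above
// (1D grid of 1D blocks)
#define GLOBAL_ID()     ( blockIdx.x * blockDim.x + threadIdx.x )
#define TOTAL_THREADS() ( gridDim.x * blockDim.x )

// Typical launch: one thread per element, grid rounded up to whole blocks
template< typename T >
void copyOnDevice( T* d_out, const T* d_in, unsigned int size )
{
    dim3 block( 256 );
    dim3 grid( ( size + block.x - 1 ) / block.x );
    memcpy< T ><<< grid, block >>>( d_out, d_in, size );
}
[/codebox]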
The kernel seems to get 85-90% of the throughput of the built-in uint2 copy on irregular classes. Has anyone looked at this in more detail, or maybe come up with an even more optimized version that gets closer to the maximum bandwidth on any data type?
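The only other direction I have thought about so far is to ignore the per-element structure completely and copy the buffer as a flat array of 32-bit words with a grid-stride loop, so the accesses stay fully coalesced regardless of the type. A rough, untested sketch of what I mean (copyWords and flatCopy are just made-up names, and it only handles sizes that are a multiple of 4 bytes):
[codebox]// Grid-stride loop over 32-bit words; every access is coalesced
__global__ void copyWords( unsigned int* out, const unsigned int* in,
                           unsigned int numWords )
{
    for( unsigned int w = blockIdx.x * blockDim.x + threadIdx.x;
         w < numWords;
         w += gridDim.x * blockDim.x )
    {
        out[ w ] = in[ w ];
    }
}

template< typename T >
void flatCopy( T* out, const T* in, unsigned int size )
{
    // only valid when sizeof( T ) is a multiple of 4; cudaMalloc'd
    // pointers are always sufficiently aligned
    const unsigned int wordsPerElem = ( unsigned int )( sizeof( T ) / 4 );
    const unsigned int numWords     = size * wordsPerElem;
    const unsigned int blockSize    = 256;
    unsigned int gridSize = ( numWords + blockSize - 1 ) / blockSize;
    if( gridSize > 65535 )
    {
        gridSize = 65535;   // the grid-stride loop covers the rest
    }
    copyWords<<< gridSize, blockSize >>>( ( unsigned int* ) out,
                                          ( const unsigned int* ) in,
                                          numWords );
}
[/codebox]
I have not measured whether that actually beats the switch-based kernel, though, and it gives up the wider uint2/uint4 loads unless the word type is bumped up when the size allows.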