Hi! I’m trying to realize MD5 algorithm using CUDA as first programm. On common processor under Windows XP this algorithm working properly. But on GPU it is not working. Please look at my code. What mistakes have i made in it?

```
#define MESSAGE_LENGTH 64
__device__ void AllRounds( unsigned int& A, unsigned int& B, unsigned int& C, unsigned int& D, unsigned int* X );
__global__ void DevEncodeMD5( char* szData, unsigned int* uiAHost, unsigned int* uiBHost, unsigned int* uiCHost, unsigned int* uiDHost );
extern "C" void EncodeMD5( char* szData, unsigned int uiCount, unsigned int* uiResA, unsigned int* uiResB, unsigned int* uiResC, unsigned int* uiResD )
{
//every password- max 56 bytes + 8 bytes ( length )
char* szDeviceData;
unsigned int uiDeviceDataLength = uiCount * MESSAGE_LENGTH;
unsigned int uiSize = sizeof( unsigned int ) * uiCount;
unsigned int *uiDeviceA, *uiDeviceB, *uiDeviceC, *uiDeviceD;
cudaMalloc( ( void** )&szDeviceData, uiDeviceDataLength );
cudaMemcpy( szDeviceData, szData, uiDeviceDataLength, cudaMemcpyHostToDevice );
cudaMalloc( ( void** )&uiDeviceA, uiSize );
cudaMalloc( ( void** )&uiDeviceB, uiSize );
cudaMalloc( ( void** )&uiDeviceC, uiSize );
cudaMalloc( ( void** )&uiDeviceD, uiSize );
DevEncodeMD5<<<1, uiCount>>>( szDeviceData, uiDeviceA, uiDeviceB, uiDeviceC, uiDeviceD );
cudaMemcpy( uiResA, uiDeviceA, uiSize, cudaMemcpyDeviceToHost );
cudaMemcpy( uiResB, uiDeviceB, uiSize, cudaMemcpyDeviceToHost );
cudaMemcpy( uiResC, uiDeviceC, uiSize, cudaMemcpyDeviceToHost );
cudaMemcpy( uiResD, uiDeviceD, uiSize, cudaMemcpyDeviceToHost );
cudaFree( szDeviceData );
cudaFree( uiDeviceA );
cudaFree( uiDeviceB );
cudaFree( uiDeviceC );
cudaFree( uiDeviceD );
}
__global__ void DevEncodeMD5( char* szData, unsigned int* uiAHost, unsigned int* uiBHost, unsigned int* uiCHost, unsigned int* uiDHost )
{
unsigned int X[ 16 ];
unsigned int uiA = 0x67452301,
uiB = 0xEFCDAB89,
uiC = 0x98BADCFE,
uiD = 0x10325476;
unsigned int uiAA, uiBB, uiCC, uiDD;
unsigned int uiBasePos = threadIdx.x * MESSAGE_LENGTH;
for ( unsigned int i = 0; i < MESSAGE_LENGTH; i += 64 )
{
for ( unsigned int j = 0; j < 64; j += 4 )
{
X[ j / 4 ] = *reinterpret_cast< unsigned int* >( &szData[ uiBasePos + i + j ] );
}
uiAA = uiA;
uiBB = uiB;
uiCC = uiC;
uiDD = uiD;
AllRounds( uiA, uiB, uiC, uiD, X );
uiA += uiAA;
uiB += uiBB;
uiC += uiCC;
uiD += uiDD;
}
uiAHost[ threadIdx.x ] = uiA;
uiBHost[ threadIdx.x ] = uiB;
uiCHost[ threadIdx.x ] = uiC;
uiDHost[ threadIdx.x ] = uiD;
}
__device__ void RotateLeft( unsigned int x, unsigned int n, unsigned int& uiResult )
{
uiResult = ( x << n ) | ( x >> ( 32 - n ) );
}
__device__ void Round1Func( unsigned int& a, unsigned int b, unsigned int c, unsigned int d, unsigned int s, unsigned int T, unsigned int X )
{
unsigned int uiResult;
RotateLeft( a + ( b & c | ~b & d ) + X + T, s, uiResult );
a = b + uiResult;
}
__device__ void Round2Func( unsigned int& a, unsigned int b, unsigned int c, unsigned int d, unsigned int s, unsigned int T, unsigned int X )
{
unsigned int uiResult;
RotateLeft( a + ( b & d | c & ~d ) + X + T, s, uiResult );
a = b + uiResult;
}
__device__ void Round3Func( unsigned int& a, unsigned int b, unsigned int c, unsigned int d, unsigned int s, unsigned int T, unsigned int X )
{
unsigned int uiResult;
RotateLeft( a + ( b ^ c ^ d ) + X + T, s, uiResult );
a = b + uiResult;
}
__device__ void Round4Func( unsigned int& a, unsigned int b, unsigned int c, unsigned int d, unsigned int s, unsigned int T, unsigned int X )
{
unsigned int uiResult;
RotateLeft( a + ( c ^ ( b | ~d ) ) + X + T, s, uiResult );
a = b + uiResult;
}
__device__ void AllRounds( unsigned int& A, unsigned int& B, unsigned int& C, unsigned int& D, unsigned int* X )
{
/*Round 1*/
Round1Func( A, B, C, D, 7, 0xd76aa478, X[ 0 ] );
Round1Func( D, A, B, C, 12, 0xe8c7b756, X[ 1 ] );
Round1Func( C, D, A, B, 17, 0x242070db, X[ 2 ] );
Round1Func( B, C, D, A, 22, 0xc1bdceee, X[ 3 ] );
Round1Func( A, B, C, D, 7, 0xf57c0faf, X[ 4 ] );
Round1Func( D, A, B, C, 12, 0x4787c62a, X[ 5 ] );
Round1Func( C, D, A, B, 17, 0xa8304613, X[ 6 ] );
Round1Func( B, C, D, A, 22, 0xfd469501, X[ 7 ] );
Round1Func( A, B, C, D, 7, 0x698098d8, X[ 8 ] );
Round1Func( D, A, B, C, 12, 0x8b44f7af, X[ 9 ] );
Round1Func( C, D, A, B, 17, 0xffff5bb1, X[ 10 ] );
Round1Func( B, C, D, A, 22, 0x895cd7be, X[ 11 ] );
Round1Func( A, B, C, D, 7, 0x6b901122, X[ 12 ] );
Round1Func( D, A, B, C, 12, 0xfd987193, X[ 13 ] );
Round1Func( C, D, A, B, 17, 0xa679438e, X[ 14 ] );
Round1Func( B, C, D, A, 22, 0x49b40821, X[ 15 ] );
/*Round 2*/
Round2Func( A, B, C, D, 5, 0xf61e2562, X[ 1 ] );
Round2Func( D, A, B, C, 9, 0xc040b340, X[ 6 ] );
Round2Func( C, D, A, B, 14, 0x265e5a51, X[ 11 ] );
Round2Func( B, C, D, A, 20, 0xe9b6c7aa, X[ 0 ] );
Round2Func( A, B, C, D, 5, 0xd62f105d, X[ 5 ] );
Round2Func( D, A, B, C, 9, 0x2441453, X[ 10 ] );
Round2Func( C, D, A, B, 14, 0xd8a1e681, X[ 15 ] );
Round2Func( B, C, D, A, 20, 0xe7d3fbc8, X[ 4 ] );
Round2Func( A, B, C, D, 5, 0x21e1cde6, X[ 9 ] );
Round2Func( D, A, B, C, 9, 0xc33707d6, X[ 14 ] );
Round2Func( C, D, A, B, 14, 0xf4d50d87, X[ 3 ] );
Round2Func( B, C, D, A, 20, 0x455a14ed, X[ 8 ] );
Round2Func( A, B, C, D, 5, 0xa9e3e905, X[ 13 ] );
Round2Func( D, A, B, C, 9, 0xfcefa3f8, X[ 2 ] );
Round2Func( C, D, A, B, 14, 0x676f02d9, X[ 7 ] );
Round2Func( B, C, D, A, 20, 0x8d2a4c8a, X[ 12 ] );
/*Round 3*/
Round3Func( A, B, C, D, 4, 0xfffa3942, X[ 5 ] );
Round3Func( D, A, B, C, 11, 0x8771f681, X[ 8 ] );
Round3Func( C, D, A, B, 16, 0x6d9d6122, X[ 11 ] );
Round3Func( B, C, D, A, 23, 0xfde5380c, X[ 14 ] );
Round3Func( A, B, C, D, 4, 0xa4beea44, X[ 1 ] );
Round3Func( D, A, B, C, 11, 0x4bdecfa9, X[ 4 ] );
Round3Func( C, D, A, B, 16, 0xf6bb4b60, X[ 7 ] );
Round3Func( B, C, D, A, 23, 0xbebfbc70, X[ 10 ] );
Round3Func( A, B, C, D, 4, 0x289b7ec6, X[ 13 ] );
Round3Func( D, A, B, C, 11, 0xeaa127fa, X[ 0 ] );
Round3Func( C, D, A, B, 16, 0xd4ef3085, X[ 3 ] );
Round3Func( B, C, D, A, 23, 0x4881d05, X[ 6 ] );
Round3Func( A, B, C, D, 4, 0xd9d4d039, X[ 9 ] );
Round3Func( D, A, B, C, 11, 0xe6db99e5, X[ 12 ] );
Round3Func( C, D, A, B, 16, 0x1fa27cf8, X[ 15 ] );
Round3Func( B, C, D, A, 23, 0xc4ac5665, X[ 2 ] );
/*Round 4*/
Round4Func( A, B, C, D, 6, 0xf4292244, X[ 0 ] );
Round4Func( D, A, B, C, 10, 0x432aff97, X[ 7 ] );
Round4Func( C, D, A, B, 15, 0xab9423a7, X[ 14 ] );
Round4Func( B, C, D, A, 21, 0xfc93a039, X[ 5 ] );
Round4Func( A, B, C, D, 6, 0x655b59c3, X[ 12 ] );
Round4Func( D, A, B, C, 10, 0x8f0ccc92, X[ 3 ] );
Round4Func( C, D, A, B, 15, 0xffeff47d, X[ 10 ] );
Round4Func( B, C, D, A, 21, 0x85845dd1, X[ 1 ] );
Round4Func( A, B, C, D, 6, 0x6fa87e4f, X[ 8 ] );
Round4Func( D, A, B, C, 10, 0xfe2ce6e0, X[ 15 ] );
Round4Func( C, D, A, B, 15, 0xa3014314, X[ 6 ] );
Round4Func( B, C, D, A, 21, 0x4e0811a1, X[ 13 ] );
Round4Func( A, B, C, D, 6, 0xf7537e82, X[ 4 ] );
Round4Func( D, A, B, C, 10, 0xbd3af235, X[ 11 ] );
Round4Func( C, D, A, B, 15, 0x2ad7d2bb, X[ 2 ] );
Round4Func( B, C, D, A, 21, 0xeb86d391, X[ 9 ] );
}
```