New to CUDA Some troubles with computing MD5

Hi! I’m trying to realize MD5 algorithm using CUDA as first programm. On common processor under Windows XP this algorithm working properly. But on GPU it is not working. Please look at my code. What mistakes have i made in it?

#define MESSAGE_LENGTH 64

__device__	void	AllRounds( unsigned int& A, unsigned int& B, unsigned int& C, unsigned int& D, unsigned int* X );

__global__	void	DevEncodeMD5(	char* szData, unsigned int* uiAHost, unsigned int* uiBHost, unsigned int* uiCHost, unsigned int* uiDHost );

extern "C" void EncodeMD5(	char* szData, unsigned int uiCount, unsigned int* uiResA, unsigned int* uiResB, unsigned int* uiResC, unsigned int* uiResD )

{

	//every password- max 56 bytes + 8 bytes ( length )

	char*			szDeviceData;

	unsigned int	uiDeviceDataLength = uiCount * MESSAGE_LENGTH;

	unsigned int	uiSize = sizeof( unsigned int ) * uiCount;

	unsigned int	*uiDeviceA, *uiDeviceB,	*uiDeviceC,	*uiDeviceD;

	cudaMalloc( ( void** )&szDeviceData, uiDeviceDataLength );

	cudaMemcpy( szDeviceData, szData, uiDeviceDataLength, cudaMemcpyHostToDevice );

			

	cudaMalloc( ( void** )&uiDeviceA, uiSize );

	cudaMalloc( ( void** )&uiDeviceB, uiSize );

	cudaMalloc( ( void** )&uiDeviceC, uiSize );

	cudaMalloc( ( void** )&uiDeviceD, uiSize );

		

	DevEncodeMD5<<<1, uiCount>>>( szDeviceData, uiDeviceA, uiDeviceB, uiDeviceC, uiDeviceD );

	cudaMemcpy( uiResA, uiDeviceA, uiSize, cudaMemcpyDeviceToHost );

	cudaMemcpy( uiResB, uiDeviceB, uiSize, cudaMemcpyDeviceToHost );

	cudaMemcpy( uiResC, uiDeviceC, uiSize, cudaMemcpyDeviceToHost );

	cudaMemcpy( uiResD, uiDeviceD, uiSize, cudaMemcpyDeviceToHost );

			

	cudaFree( szDeviceData );

	cudaFree( uiDeviceA );

	cudaFree( uiDeviceB );

	cudaFree( uiDeviceC );

	cudaFree( uiDeviceD );

}

__global__ void DevEncodeMD5( char* szData, unsigned int* uiAHost, unsigned int* uiBHost, unsigned int* uiCHost, unsigned int* uiDHost )

{	

	unsigned int	X[ 16 ];

	

	unsigned int	uiA = 0x67452301,

					uiB = 0xEFCDAB89,

					uiC = 0x98BADCFE,

					uiD = 0x10325476;

	unsigned int	uiAA, uiBB, uiCC, uiDD;

	

	unsigned int	uiBasePos = threadIdx.x * MESSAGE_LENGTH;

	for ( unsigned int i = 0; i < MESSAGE_LENGTH; i += 64 )

	{

		for ( unsigned int j = 0; j < 64; j += 4 )

		{

			X[ j / 4 ] = *reinterpret_cast< unsigned int* >( &szData[ uiBasePos + i + j ] );

		}

		uiAA = uiA;

		uiBB = uiB;

		uiCC = uiC;

		uiDD = uiD;

		AllRounds( uiA, uiB, uiC, uiD, X );

		uiA += uiAA;

		uiB += uiBB;

		uiC += uiCC;

		uiD += uiDD;

	}

	uiAHost[ threadIdx.x ] = uiA;

	uiBHost[ threadIdx.x ] = uiB;

	uiCHost[ threadIdx.x ] = uiC;

	uiDHost[ threadIdx.x ] = uiD;

}

__device__ void RotateLeft( unsigned int x, unsigned int n, unsigned int& uiResult )

{

	uiResult = ( x << n ) | ( x >> ( 32 - n ) );

}

__device__ void Round1Func( unsigned int& a, unsigned int b, unsigned int c, unsigned int d, unsigned int s, unsigned int T, unsigned int X )

{

	unsigned int uiResult;

	RotateLeft( a + ( b & c | ~b & d ) + X + T, s, uiResult );

	a = b + uiResult;

}

__device__ void Round2Func( unsigned int& a, unsigned int b, unsigned int c, unsigned int d, unsigned int s, unsigned int T, unsigned int X )

{

	unsigned int uiResult;

	RotateLeft( a + ( b & d | c & ~d ) + X + T, s, uiResult );

	a = b + uiResult;

}

__device__ void Round3Func( unsigned int& a, unsigned int b, unsigned int c, unsigned int d, unsigned int s, unsigned int T, unsigned int X )

{

	unsigned int uiResult;

	RotateLeft( a + ( b ^ c ^ d ) + X + T, s, uiResult );

	a = b + uiResult;

}

__device__ void Round4Func( unsigned int& a, unsigned int b, unsigned int c, unsigned int d, unsigned int s, unsigned int T, unsigned int X )

{

	unsigned int uiResult;

	RotateLeft( a + ( c ^ ( b | ~d ) ) + X + T, s, uiResult );

	a = b + uiResult;

}

__device__ void AllRounds( unsigned int& A, unsigned int& B, unsigned int& C, unsigned int& D, unsigned int* X )

{	

 	/*Round 1*/

 	Round1Func( A, B, C, D, 7,	0xd76aa478, X[ 0 ] );	

	Round1Func( D, A, B, C, 12, 0xe8c7b756, X[ 1 ] );

 	Round1Func( C, D, A, B, 17, 0x242070db, X[ 2 ] );	

	Round1Func( B, C, D, A, 22, 0xc1bdceee, X[ 3 ] );

 	Round1Func( A, B, C, D, 7,	0xf57c0faf, X[ 4 ] );	

	Round1Func( D, A, B, C, 12, 0x4787c62a, X[ 5 ] );

 	Round1Func( C, D, A, B, 17,	0xa8304613, X[ 6 ] );	

	Round1Func( B, C, D, A, 22, 0xfd469501, X[ 7 ] );

 	Round1Func( A, B, C, D, 7,	0x698098d8, X[ 8 ] );	

	Round1Func( D, A, B, C, 12, 0x8b44f7af, X[ 9 ] );

 	Round1Func( C, D, A, B, 17, 0xffff5bb1, X[ 10 ] );	

	Round1Func( B, C, D, A, 22, 0x895cd7be, X[ 11 ] );

 	Round1Func( A, B, C, D, 7,	0x6b901122, X[ 12 ] );	

	Round1Func( D, A, B, C, 12, 0xfd987193, X[ 13 ] );

 	Round1Func( C, D, A, B, 17, 0xa679438e, X[ 14 ] );	

	Round1Func( B, C, D, A, 22, 0x49b40821, X[ 15 ] );

 	/*Round 2*/

 	Round2Func( A, B, C, D, 5,	0xf61e2562, X[ 1 ] );	

	Round2Func( D, A, B, C, 9,	0xc040b340, X[ 6 ] );

 	Round2Func( C, D, A, B, 14, 0x265e5a51, X[ 11 ] );	

	Round2Func( B, C, D, A, 20, 0xe9b6c7aa, X[ 0 ] );

 	Round2Func( A, B, C, D, 5,	0xd62f105d, X[ 5 ] ); 	

	Round2Func( D, A, B, C, 9,	0x2441453,	X[ 10 ] );

 	Round2Func( C, D, A, B, 14, 0xd8a1e681, X[ 15 ] ); 	

	Round2Func( B, C, D, A, 20, 0xe7d3fbc8, X[ 4 ] );

 	Round2Func( A, B, C, D, 5,	0x21e1cde6, X[ 9 ] ); 	

	Round2Func( D, A, B, C, 9,	0xc33707d6, X[ 14 ] );

 	Round2Func( C, D, A, B, 14, 0xf4d50d87, X[ 3 ] ); 	

	Round2Func( B, C, D, A, 20, 0x455a14ed, X[ 8 ] );

 	Round2Func( A, B, C, D, 5,	0xa9e3e905, X[ 13 ] ); 	

	Round2Func( D, A, B, C, 9,	0xfcefa3f8, X[ 2 ] );

 	Round2Func( C, D, A, B, 14, 0x676f02d9, X[ 7 ] ); 	

	Round2Func( B, C, D, A, 20, 0x8d2a4c8a, X[ 12 ] );

 	/*Round 3*/

 	Round3Func( A, B, C, D,	4,	0xfffa3942, X[ 5 ] ); 	

	Round3Func( D, A, B, C, 11, 0x8771f681, X[ 8 ] );

 	Round3Func( C, D, A, B, 16, 0x6d9d6122, X[ 11 ] ); 	

	Round3Func( B, C, D, A, 23, 0xfde5380c, X[ 14 ] );

 	Round3Func( A, B, C, D, 4,	0xa4beea44, X[ 1 ] ); 	

	Round3Func( D, A, B, C, 11, 0x4bdecfa9, X[ 4 ] );

 	Round3Func( C, D, A, B, 16, 0xf6bb4b60, X[ 7 ] ); 	

	Round3Func( B, C, D, A, 23, 0xbebfbc70, X[ 10 ] );

 	Round3Func( A, B, C, D, 4,	0x289b7ec6, X[ 13 ] ); 	

	Round3Func( D, A, B, C, 11, 0xeaa127fa, X[ 0 ] );

 	Round3Func( C, D, A, B, 16, 0xd4ef3085, X[ 3 ] ); 	

	Round3Func( B, C, D, A, 23, 0x4881d05,	X[ 6 ] );

 	Round3Func( A, B, C, D, 4,	0xd9d4d039, X[ 9 ] ); 	

	Round3Func( D, A, B, C, 11, 0xe6db99e5, X[ 12 ] );

 	Round3Func( C, D, A, B, 16, 0x1fa27cf8, X[ 15 ] ); 	

	Round3Func( B, C, D, A, 23, 0xc4ac5665, X[ 2 ] );

 	/*Round 4*/

 	Round4Func( A, B, C, D,	6,	0xf4292244, X[ 0 ] ); 	

	Round4Func( D, A, B, C, 10, 0x432aff97, X[ 7 ] );

 	Round4Func( C, D, A, B, 15, 0xab9423a7, X[ 14 ] ); 	

	Round4Func( B, C, D, A, 21, 0xfc93a039, X[ 5 ] );

 	Round4Func( A, B, C, D, 6,	0x655b59c3, X[ 12 ] ); 	

	Round4Func( D, A, B, C, 10, 0x8f0ccc92, X[ 3 ] );

 	Round4Func( C, D, A, B, 15, 0xffeff47d, X[ 10 ] ); 	

	Round4Func( B, C, D, A, 21, 0x85845dd1, X[ 1 ] );

 	Round4Func( A, B, C, D, 6,	0x6fa87e4f, X[ 8 ] ); 	

	Round4Func( D, A, B, C, 10, 0xfe2ce6e0, X[ 15 ] );

 	Round4Func( C, D, A, B, 15, 0xa3014314, X[ 6 ] ); 	

	Round4Func( B, C, D, A, 21, 0x4e0811a1, X[ 13 ] );

 	Round4Func( A, B, C, D, 6,	0xf7537e82, X[ 4 ] ); 	

	Round4Func( D, A, B, C, 10, 0xbd3af235, X[ 11 ] );

 	Round4Func( C, D, A, B, 15, 0x2ad7d2bb, X[ 2 ] ); 	

	Round4Func( B, C, D, A, 21, 0xeb86d391, X[ 9 ] );

}

I’m not pretty sure what’s the problem with your code… What do you mean by “not working”? It produces incorrect results? Or it fails to launch? Or something else?

Not sure if this helps, but here’s the link to proof-of-concept MD5 code. [link]

Yes, produces incorrect results.

May be there is any well-known mistakes of newbies?

It’s very strange, but after installing CUDA SDK 2.1 beta and corresponding drivers evrything are going OK. Programm working.

People have encountered a number of bugs with integers in CUDA. Maybe some of those got solved in 2.1?

There is still some problems: when i calling

DevEncodeMD5<<<1, uiCount>>>( szDeviceData, uiDeviceA, uiDeviceB, uiDeviceC, uiDeviceD );

with uiCount more than 320 and szDeviceData - more than 320 samples by 64 bytes, function produses very strange results. When 320 or less - everything is OK.

GPU - GeForce 8800 GT.

The error messages and the error reporting system really needs to get fixed.

Your error is putting too many threads in a block and running out of registers. When the kernel fails to run, you just get garbage data. (You’ll also get a perplexing error message if you insert the error checking macro from the SDK samples, as you should be doing after all kernel calls.)

But in documentation written:

Specifications for Compute Capability 1.0

The maximum number of threads per block is 512; ( not 320 )

The number of registers per multiprocessor is 8192;

is this false for 8800 GT?

This is true for 8800GT.

Maximum number of threads per block is 512 but this is only achievable if your kernel uses very little registers and/or shared memory.

Maximum number of threads per block for particular kernel is determined by min( 512, 8192/Nreg) where Nreg – number of registers used by your kernel.

How can i calculate, how many registers i using?

In documentation said that 8800GT have 14 multiprocessors. I think that i now using only one multiprocessor ( 1 block by 320 threads ). How to use other 13?

P.S.:

Может все-таки перейдем на русский? :)))

No of registers can be calculated using the CUDA occupancy calculator , check CUDA website to download it.

Divide your workspace into grid of blocks and threads and then you can use multiple blocks.

Thanks,

Nittin

No, number of registers is something you can find in the cubin file, or as output from nvcc with a command line option. The occupancy calculator needs the number of register as input.

Using the other multiprocessors is a matter of requesting more blocks.

of registers (as well as amount of lmem and smem – all CRITICAL parameters) is reported by adding ’ --ptxas-options=-v ’ to nvcc build line.

This is something else that should get fixed in CUDA to be automatic, not something you have to visit google or the forums to figure out.

Ok. Output is: “ptxas info : Used 23 registers, 36+32 bytes smem, 16 bytes cmem[1]”

What means smem and cmem? I can’t find it in docs :(.

Shared memory and constant memory.

Some strange things happenings - when the code part

A = ui4ABCD.x;

B = ui4ABCD.y;

C = ui4ABCD.z;

D = ui4ABCD.w;

present - used 23 registers, else if it not - 4 registers. Why it can be?

uint4 ui4ABCD = make_uint4( A, B, C, D );

	/*Round 1*/

 	Round1Func( ui4ABCD.x, ui4ABCD.y, ui4ABCD.z, ui4ABCD.w, 7,	0xd76aa478, X[ 0 ] );	

	...

	  /*Round 2*/

	  Round2Func( ui4ABCD.x, ui4ABCD.y, ui4ABCD.z, ui4ABCD.w, 5,	0xf61e2562, X[ 1 ] );	

 	...

	  /*Round 3*/

	  Round3Func( ui4ABCD.x, ui4ABCD.y, ui4ABCD.z, ui4ABCD.w,	4,	0xfffa3942, X[ 5 ] ); 	

 	...

	  /*Round 4*/

	  Round4Func( ui4ABCD.x, ui4ABCD.y, ui4ABCD.z, ui4ABCD.w,	6,	0xf4292244, X[ 0 ] ); 	

 	...

// 	A = ui4ABCD.x;

// 	B = ui4ABCD.y;

// 	C = ui4ABCD.z;

// 	D = ui4ABCD.w;

when i do so - too 23 registers.

__device__ void AllRounds( uint4& ui4ABCD, unsigned int* X )

{		

 	/*Round 1*/

 	Round1Func( ui4ABCD.x, ui4ABCD.y, ui4ABCD.z, ui4ABCD.w, 7,	0xd76aa478, X[ 0 ] );	

	..

	  /*Round 2*/

	  Round2Func( ui4ABCD.x, ui4ABCD.y, ui4ABCD.z, ui4ABCD.w, 5,	0xf61e2562, X[ 1 ] );	

 	...;

	  /*Round 3*/

	  Round3Func( ui4ABCD.x, ui4ABCD.y, ui4ABCD.z, ui4ABCD.w,	4,	0xfffa3942, X[ 5 ] ); 	

 	...

	  /*Round 4*/

	  Round4Func( ui4ABCD.x, ui4ABCD.y, ui4ABCD.z, ui4ABCD.w,	6,	0xf4292244, X[ 0 ] ); 	

		...

Because the compiler deletes code that it thinks doesn’t affect anything. Which in this case ends up being everything.

How to do it?

I tried to do so:

...

DevEncodeMD5<<< 14, 320 >>>( szDeviceData, uiDeviceA, uiDeviceB, uiDeviceC, uiDeviceD );

...

__global__ void DevEncodeMD5( char* szData, unsigned int* uiAHost, unsigned int* uiBHost, unsigned int* uiCHost, unsigned int* uiDHost )

{

	...

	unsigned int	uiBasePos = blockIdx.x * MESSAGE_LENGTH * 320 + threadIdx.x * MESSAGE_LENGTH;

	for ( unsigned int j = 0; j < 64; j += 4 )

	{

		 X[ j / 4 ] = *reinterpret_cast< unsigned int* >( &szData[ uiBasePos + j ] );

	}

	...

}

But code produces correct resylts only for first 320 samples.

Something like this should work. Note that 14 blocks is not a lot (a GTX280 would have <50% of the multiprocessors doing something)

[codebox]DevEncodeMD5<<< 14, 320 >>>( szDeviceData, uiDeviceA, uiDeviceB, uiDeviceC, uiDeviceD );

global void DevEncodeMD5( char* szData, unsigned int* uiAHost, unsigned int* uiBHost, unsigned int* uiCHost, unsigned int* uiDHost )

{

...

unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;

unsigned int uiBasePos = index * MESSAGE_LENGTH;

#pragma unroll 16

for ( unsigned int j = 0; j < 16; j ++ )

{

     X[ j ] = *reinterpret_cast< unsigned int* >( &szData[ uiBasePos + j*16 ] );

}

...

uiAHost[index] = uiA;

uiBHost[index] = uiB;

uiCHost[index] = uiC;

uiDHost[index] = uiD;

}[/codebox]

No, it doesn’t. It produces same results as my version.

In debug version it produses 0xBADFOOD :) instead of incorrect results.

I’m now writing for 8800 GT - that have 14 mp. But unfortunately working only one :(((