Bruteforce with CUDA

Hello everybody

For my degree dissertation I coded a bruteforce NTML (windows-hash) cracker in C. Now I would like to transform this code on CUDA to be able to make a performance analysis. However I’m stuck…

Attached you can find my code (please bear in mind that it is only a code for testing purpose, so currently all user inputs are ignored). What is wrong?

I coded the cu-file with the thought that all my threads will come and execute the global function. So I could use threadIdx.x as a counter. But at the end I think this thought isn’t right.

Thanks in advance!

#include <string.h>  //espinosa

#include <windows.h>

#include <stdio.h>	//standard IO

#include <math.h>	//f¸r pow

#include <conio.h>  // f¸r _getch()

#include <ctype.h>  // f¸r toupper() und tolower() 

#include "I:\Program Files\NVIDIA Corporation\NVIDIA CUDA SDK\common\inc\cutil.h"	//cutil funtions

#include <time.h>	//f¸r clock

//This is the MD4 compress function

__device__ void ntlm_crypt(int* nt_buffer, int* output)

{

	unsigned int INIT_A = 0x67452301;

	unsigned int INIT_B = 0xefcdab89;

	unsigned int INIT_C = 0x98badcfe;

	unsigned int INIT_D = 0x10325476;

	unsigned int a = INIT_A;

	unsigned int b = INIT_B;

	unsigned int c = INIT_C;

	unsigned int d = INIT_D;

	

	unsigned int SQRT_2 = 0x5a827999;

	unsigned int SQRT_3 = 0x6ed9eba1;

  

	/* Round 1 */

	a += (d ^ (b & (c ^ d)))  +  nt_buffer[0] ;a = (a << 3 ) | (a >> 29);

	d += (c ^ (a & (b ^ c)))  +  nt_buffer[1] ;d = (d << 7 ) | (d >> 25);

	c += (b ^ (d & (a ^ b)))  +  nt_buffer[2] ;c = (c << 11) | (c >> 21);

	b += (a ^ (c & (d ^ a)))  +  nt_buffer[3] ;b = (b << 19) | (b >> 13);

  	

	a += (d ^ (b & (c ^ d)))  +  nt_buffer[4] ;a = (a << 3 ) | (a >> 29);

	d += (c ^ (a & (b ^ c)))  +  nt_buffer[5] ;d = (d << 7 ) | (d >> 25);

	c += (b ^ (d & (a ^ b)))  +  nt_buffer[6] ;c = (c << 11) | (c >> 21);

	b += (a ^ (c & (d ^ a)))  +  nt_buffer[7] ;b = (b << 19) | (b >> 13);

  	

	a += (d ^ (b & (c ^ d)))  +  nt_buffer[8] ;a = (a << 3 ) | (a >> 29);

	d += (c ^ (a & (b ^ c)))  +  nt_buffer[9] ;d = (d << 7 ) | (d >> 25);

	c += (b ^ (d & (a ^ b)))  +  nt_buffer[10];c = (c << 11) | (c >> 21);

	b += (a ^ (c & (d ^ a)))  +  nt_buffer[11];b = (b << 19) | (b >> 13);

  	

	a += (d ^ (b & (c ^ d)))  +  nt_buffer[12];a = (a << 3 ) | (a >> 29);

	d += (c ^ (a & (b ^ c)))  +  nt_buffer[13];d = (d << 7 ) | (d >> 25);

	c += (b ^ (d & (a ^ b)))  +  nt_buffer[14];c = (c << 11) | (c >> 21);

	b += (a ^ (c & (d ^ a)))  +  nt_buffer[15];b = (b << 19) | (b >> 13);

  	

	/* Round 2 */

	a += ((b & (c | d)) | (c & d)) + nt_buffer[0] +SQRT_2; a = (a<<3 ) | (a>>29);

	d += ((a & (b | c)) | (b & c)) + nt_buffer[4] +SQRT_2; d = (d<<5 ) | (d>>27);

	c += ((d & (a | b)) | (a & b)) + nt_buffer[8] +SQRT_2; c = (c<<9 ) | (c>>23);

	b += ((c & (d | a)) | (d & a)) + nt_buffer[12]+SQRT_2; b = (b<<13) | (b>>19);

  	

	a += ((b & (c | d)) | (c & d)) + nt_buffer[1] +SQRT_2; a = (a<<3 ) | (a>>29);

	d += ((a & (b | c)) | (b & c)) + nt_buffer[5] +SQRT_2; d = (d<<5 ) | (d>>27);

	c += ((d & (a | b)) | (a & b)) + nt_buffer[9] +SQRT_2; c = (c<<9 ) | (c>>23);

	b += ((c & (d | a)) | (d & a)) + nt_buffer[13]+SQRT_2; b = (b<<13) | (b>>19);

  	

	a += ((b & (c | d)) | (c & d)) + nt_buffer[2] +SQRT_2; a = (a<<3 ) | (a>>29);

	d += ((a & (b | c)) | (b & c)) + nt_buffer[6] +SQRT_2; d = (d<<5 ) | (d>>27);

	c += ((d & (a | b)) | (a & b)) + nt_buffer[10]+SQRT_2; c = (c<<9 ) | (c>>23);

	b += ((c & (d | a)) | (d & a)) + nt_buffer[14]+SQRT_2; b = (b<<13) | (b>>19);

  	

	a += ((b & (c | d)) | (c & d)) + nt_buffer[3] +SQRT_2; a = (a<<3 ) | (a>>29);

	d += ((a & (b | c)) | (b & c)) + nt_buffer[7] +SQRT_2; d = (d<<5 ) | (d>>27);

	c += ((d & (a | b)) | (a & b)) + nt_buffer[11]+SQRT_2; c = (c<<9 ) | (c>>23);

	b += ((c & (d | a)) | (d & a)) + nt_buffer[15]+SQRT_2; b = (b<<13) | (b>>19);

  	

	/* Round 3 */

	a += (d ^ c ^ b) + nt_buffer[0]  +  SQRT_3; a = (a << 3 ) | (a >> 29);

	d += (c ^ b ^ a) + nt_buffer[8]  +  SQRT_3; d = (d << 9 ) | (d >> 23);

	c += (b ^ a ^ d) + nt_buffer[4]  +  SQRT_3; c = (c << 11) | (c >> 21);

	b += (a ^ d ^ c) + nt_buffer[12] +  SQRT_3; b = (b << 15) | (b >> 17);

  

	a += (d ^ c ^ b) + nt_buffer[2]  +  SQRT_3; a = (a << 3 ) | (a >> 29);

	d += (c ^ b ^ a) + nt_buffer[10] +  SQRT_3; d = (d << 9 ) | (d >> 23);

	c += (b ^ a ^ d) + nt_buffer[6]  +  SQRT_3; c = (c << 11) | (c >> 21);

	b += (a ^ d ^ c) + nt_buffer[14] +  SQRT_3; b = (b << 15) | (b >> 17);

  

	a += (d ^ c ^ b) + nt_buffer[1]  +  SQRT_3; a = (a << 3 ) | (a >> 29);

	d += (c ^ b ^ a) + nt_buffer[9] +  SQRT_3; d = (d << 9 ) | (d >> 23);

	c += (b ^ a ^ d) + nt_buffer[5]  +  SQRT_3; c = (c << 11) | (c >> 21);

	b += (a ^ d ^ c) + nt_buffer[13] +  SQRT_3; b = (b << 15) | (b >> 17);

	a += (d ^ c ^ b) + nt_buffer[3]  +  SQRT_3; a = (a << 3 ) | (a >> 29);

	d += (c ^ b ^ a) + nt_buffer[11] +  SQRT_3; d = (d << 9 ) | (d >> 23);

	c += (b ^ a ^ d) + nt_buffer[7]  +  SQRT_3; c = (c << 11) | (c >> 21);

	b += (a ^ d ^ c) + nt_buffer[15] +  SQRT_3; b = (b << 15) | (b >> 17);

  

	output[0] = a + INIT_A;

	output[1] = b + INIT_B;

	output[2] = c + INIT_C;

	output[3] = d + INIT_D;

}	

//This include the unicode conversion and the padding

__device__ void prepare_key(char* key, int length, int* nt_buffer)

{

	int i=0;

	memset(nt_buffer,0,16*4);

	//The length of key need to be <= 27

	for(;i<length/2;i++)	

  nt_buffer[i] = key[2*i] | (key[2*i+1]<<16);

  

	//padding

	if(length%2==1)

  nt_buffer[i] = key[length-1] | 0x800000;

	else

  nt_buffer[i]=0x80;

	//put the length

	nt_buffer[14] = length << 4;

}

//This convert the output to hexadecimal form

__device__ void convert_hex(int* output, char* hex_format)

{

	char itoa16[17] = "0123456789abcdef";

	int i=0;

	//Iterate the integer

	for(;i<4;i++)

	{

  int j=0;

  unsigned int n=output[i];

  //iterate the bytes of the integer  

  for(;j<4;j++)

  {

  	unsigned int convert=n%256;

  	hex_format[i*8+j*2+1]=itoa16[convert%16];

  	convert=convert/16;

  	hex_format[i*8+j*2+0]=itoa16[convert%16];

  	n=n/256;

  }	

	}

	//null terminate the string

	hex_format[33]=0;

}

__device__ int adistrcmp (const char * src, const char * dst)

{

        int ret = 0;

       while( ! (ret = *(unsigned char *)src - *(unsigned char *)dst) && *dst)

                ++src, ++dst;

       if ( ret < 0 )

                ret = -1;

        else if ( ret > 0 )

                ret = 1;

       return( ret );

}

void gotolower(char *str){

	int i=0;

	while (str[i]){

  // Hier kann ich char = char, weil ich mich auf eine Referenz (*) beziehe.

  str[i] = tolower(str[i]);

  i++;

	}

}

__global__ void bruteforce(char* g_charset, char* g_hash_hex, unsigned int* g_counter, char* g_pstorer, bool* g_found, int maxPwLength){

	unsigned int* nt_buffer[16];  //memory zuordnen und von global ¸bergeben

	unsigned int* output[4];  //memory zuordnen und von global ¸bergeben

	char* hex_format[33];

	*g_counter = *g_counter+2;

	int position = 0;

	unsigned int decounter;

 decounter = blockIdx.x * blockDim.x + threadIdx.x;

 //storer reinigen

  

  for (int i = 0; i < maxPwLength; i++) {

  	g_pstorer[i] = '

#include <string.h>  //espinosa

#include <windows.h>

#include <stdio.h> //standard IO

#include <math.h> //f¸r pow

#include <conio.h>  // f¸r _getch()

#include <ctype.h>  // f¸r toupper() und tolower()

#include “I:\Program Files\NVIDIA Corporation\NVIDIA CUDA SDK\common\inc\cutil.h” //cutil funtions

#include <time.h> //f¸r clock

//This is the MD4 compress function

device void ntlm_crypt(int* nt_buffer, int* output)

{

unsigned int INIT_A = 0x67452301;

unsigned int INIT_B = 0xefcdab89;

unsigned int INIT_C = 0x98badcfe;

unsigned int INIT_D = 0x10325476;

unsigned int a = INIT_A;

unsigned int b = INIT_B;

unsigned int c = INIT_C;

unsigned int d = INIT_D;



unsigned int SQRT_2 = 0x5a827999;

unsigned int SQRT_3 = 0x6ed9eba1;

/* Round 1 */

a += (d ^ (b & (c ^ d)))  +  nt_buffer[0] ;a = (a << 3 ) | (a >> 29);

d += (c ^ (a & (b ^ c)))  +  nt_buffer[1] ;d = (d << 7 ) | (d >> 25);

c += (b ^ (d & (a ^ b)))  +  nt_buffer[2] ;c = (c << 11) | (c >> 21);

b += (a ^ (c & (d ^ a)))  +  nt_buffer[3] ;b = (b << 19) | (b >> 13);

a += (d ^ (b & (c ^ d)))  +  nt_buffer[4] ;a = (a << 3 ) | (a >> 29);

d += (c ^ (a & (b ^ c)))  +  nt_buffer[5] ;d = (d << 7 ) | (d >> 25);

c += (b ^ (d & (a ^ b)))  +  nt_buffer[6] ;c = (c << 11) | (c >> 21);

b += (a ^ (c & (d ^ a)))  +  nt_buffer[7] ;b = (b << 19) | (b >> 13);

a += (d ^ (b & (c ^ d)))  +  nt_buffer[8] ;a = (a << 3 ) | (a >> 29);

d += (c ^ (a & (b ^ c)))  +  nt_buffer[9] ;d = (d << 7 ) | (d >> 25);

c += (b ^ (d & (a ^ b)))  +  nt_buffer[10];c = (c << 11) | (c >> 21);

b += (a ^ (c & (d ^ a)))  +  nt_buffer[11];b = (b << 19) | (b >> 13);

a += (d ^ (b & (c ^ d)))  +  nt_buffer[12];a = (a << 3 ) | (a >> 29);

d += (c ^ (a & (b ^ c)))  +  nt_buffer[13];d = (d << 7 ) | (d >> 25);

c += (b ^ (d & (a ^ b)))  +  nt_buffer[14];c = (c << 11) | (c >> 21);

b += (a ^ (c & (d ^ a)))  +  nt_buffer[15];b = (b << 19) | (b >> 13);

/* Round 2 */

a += ((b & (c | d)) | (c & d)) + nt_buffer[0] +SQRT_2; a = (a<<3 ) | (a>>29);

d += ((a & (b | c)) | (b & c)) + nt_buffer[4] +SQRT_2; d = (d<<5 ) | (d>>27);

c += ((d & (a | b)) | (a & b)) + nt_buffer[8] +SQRT_2; c = (c<<9 ) | (c>>23);

b += ((c & (d | a)) | (d & a)) + nt_buffer[12]+SQRT_2; b = (b<<13) | (b>>19);

a += ((b & (c | d)) | (c & d)) + nt_buffer[1] +SQRT_2; a = (a<<3 ) | (a>>29);

d += ((a & (b | c)) | (b & c)) + nt_buffer[5] +SQRT_2; d = (d<<5 ) | (d>>27);

c += ((d & (a | b)) | (a & b)) + nt_buffer[9] +SQRT_2; c = (c<<9 ) | (c>>23);

b += ((c & (d | a)) | (d & a)) + nt_buffer[13]+SQRT_2; b = (b<<13) | (b>>19);

a += ((b & (c | d)) | (c & d)) + nt_buffer[2] +SQRT_2; a = (a<<3 ) | (a>>29);

d += ((a & (b | c)) | (b & c)) + nt_buffer[6] +SQRT_2; d = (d<<5 ) | (d>>27);

c += ((d & (a | b)) | (a & b)) + nt_buffer[10]+SQRT_2; c = (c<<9 ) | (c>>23);

b += ((c & (d | a)) | (d & a)) + nt_buffer[14]+SQRT_2; b = (b<<13) | (b>>19);

a += ((b & (c | d)) | (c & d)) + nt_buffer[3] +SQRT_2; a = (a<<3 ) | (a>>29);

d += ((a & (b | c)) | (b & c)) + nt_buffer[7] +SQRT_2; d = (d<<5 ) | (d>>27);

c += ((d & (a | b)) | (a & b)) + nt_buffer[11]+SQRT_2; c = (c<<9 ) | (c>>23);

b += ((c & (d | a)) | (d & a)) + nt_buffer[15]+SQRT_2; b = (b<<13) | (b>>19);

/* Round 3 */

a += (d ^ c ^ b) + nt_buffer[0]  +  SQRT_3; a = (a << 3 ) | (a >> 29);

d += (c ^ b ^ a) + nt_buffer[8]  +  SQRT_3; d = (d << 9 ) | (d >> 23);

c += (b ^ a ^ d) + nt_buffer[4]  +  SQRT_3; c = (c << 11) | (c >> 21);

b += (a ^ d ^ c) + nt_buffer[12] +  SQRT_3; b = (b << 15) | (b >> 17);

a += (d ^ c ^ b) + nt_buffer[2]  +  SQRT_3; a = (a << 3 ) | (a >> 29);

d += (c ^ b ^ a) + nt_buffer[10] +  SQRT_3; d = (d << 9 ) | (d >> 23);

c += (b ^ a ^ d) + nt_buffer[6]  +  SQRT_3; c = (c << 11) | (c >> 21);

b += (a ^ d ^ c) + nt_buffer[14] +  SQRT_3; b = (b << 15) | (b >> 17);

a += (d ^ c ^ b) + nt_buffer[1]  +  SQRT_3; a = (a << 3 ) | (a >> 29);

d += (c ^ b ^ a) + nt_buffer[9] +  SQRT_3; d = (d << 9 ) | (d >> 23);

c += (b ^ a ^ d) + nt_buffer[5]  +  SQRT_3; c = (c << 11) | (c >> 21);

b += (a ^ d ^ c) + nt_buffer[13] +  SQRT_3; b = (b << 15) | (b >> 17);

a += (d ^ c ^ b) + nt_buffer[3]  +  SQRT_3; a = (a << 3 ) | (a >> 29);

d += (c ^ b ^ a) + nt_buffer[11] +  SQRT_3; d = (d << 9 ) | (d >> 23);

c += (b ^ a ^ d) + nt_buffer[7]  +  SQRT_3; c = (c << 11) | (c >> 21);

b += (a ^ d ^ c) + nt_buffer[15] +  SQRT_3; b = (b << 15) | (b >> 17);

output[0] = a + INIT_A;

output[1] = b + INIT_B;

output[2] = c + INIT_C;

output[3] = d + INIT_D;

}

//This include the unicode conversion and the padding

device void prepare_key(char* key, int length, int* nt_buffer)

{

int i=0;

memset(nt_buffer,0,16*4);

//The length of key need to be <= 27

for(;i<length/2;i++)	

nt_buffer[i] = key[2i] | (key[2i+1]<<16);

//padding

if(length%2==1)

nt_buffer[i] = key[length-1] | 0x800000;

else

nt_buffer[i]=0x80;

//put the length

nt_buffer[14] = length << 4;

}

//This convert the output to hexadecimal form

device void convert_hex(int* output, char* hex_format)

{

char itoa16[17] = "0123456789abcdef";

int i=0;

//Iterate the integer

for(;i<4;i++)

{

int j=0;

unsigned int n=output[i];

//iterate the bytes of the integer

for(;j<4;j++)

{

unsigned int convert=n%256;

hex_format[i*8+j*2+1]=itoa16[convert%16];

convert=convert/16;

hex_format[i*8+j*2+0]=itoa16[convert%16];

n=n/256;

}

}

//null terminate the string

hex_format[33]=0;

}

device int adistrcmp (const char * src, const char * dst)

{

int ret = 0;

while( ! (ret = *(unsigned char *)src - *(unsigned char *)dst) && *dst)

++src, ++dst;

if ( ret < 0 )

ret = -1;

else if ( ret > 0 )

ret = 1;

return( ret );

}

void gotolower(char *str){

int i=0;

while (str[i]){

// Hier kann ich char = char, weil ich mich auf eine Referenz (*) beziehe.

str[i] = tolower(str[i]);

i++;

}

}

global void bruteforce(char* g_charset, char* g_hash_hex, unsigned int* g_counter, char* g_pstorer, bool* g_found, int maxPwLength){

unsigned int* nt_buffer[16];  //memory zuordnen und von global ¸bergeben

unsigned int* output[4];  //memory zuordnen und von global ¸bergeben

char* hex_format[33];

*g_counter = *g_counter+2;

int position = 0;

unsigned int decounter;

decounter = blockIdx.x * blockDim.x + threadIdx.x;

//storer reinigen

for (int i = 0; i < maxPwLength; i++) {

g_pstorer[i] = ’

#include <string.h>  //espinosa

#include <windows.h>

#include <stdio.h>	//standard IO

#include <math.h>	//f¸r pow

#include <conio.h>  // f¸r _getch()

#include <ctype.h>  // f¸r toupper() und tolower() 

#include "I:\Program Files\NVIDIA Corporation\NVIDIA CUDA SDK\common\inc\cutil.h"	//cutil funtions

#include <time.h>	//f¸r clock

//This is the MD4 compress function

__device__ void ntlm_crypt(int* nt_buffer, int* output)

{

	unsigned int INIT_A = 0x67452301;

	unsigned int INIT_B = 0xefcdab89;

	unsigned int INIT_C = 0x98badcfe;

	unsigned int INIT_D = 0x10325476;

	unsigned int a = INIT_A;

	unsigned int b = INIT_B;

	unsigned int c = INIT_C;

	unsigned int d = INIT_D;

	

	unsigned int SQRT_2 = 0x5a827999;

	unsigned int SQRT_3 = 0x6ed9eba1;

  

	/* Round 1 */

	a += (d ^ (b & (c ^ d)))  +  nt_buffer[0] ;a = (a << 3 ) | (a >> 29);

	d += (c ^ (a & (b ^ c)))  +  nt_buffer[1] ;d = (d << 7 ) | (d >> 25);

	c += (b ^ (d & (a ^ b)))  +  nt_buffer[2] ;c = (c << 11) | (c >> 21);

	b += (a ^ (c & (d ^ a)))  +  nt_buffer[3] ;b = (b << 19) | (b >> 13);

  	

	a += (d ^ (b & (c ^ d)))  +  nt_buffer[4] ;a = (a << 3 ) | (a >> 29);

	d += (c ^ (a & (b ^ c)))  +  nt_buffer[5] ;d = (d << 7 ) | (d >> 25);

	c += (b ^ (d & (a ^ b)))  +  nt_buffer[6] ;c = (c << 11) | (c >> 21);

	b += (a ^ (c & (d ^ a)))  +  nt_buffer[7] ;b = (b << 19) | (b >> 13);

  	

	a += (d ^ (b & (c ^ d)))  +  nt_buffer[8] ;a = (a << 3 ) | (a >> 29);

	d += (c ^ (a & (b ^ c)))  +  nt_buffer[9] ;d = (d << 7 ) | (d >> 25);

	c += (b ^ (d & (a ^ b)))  +  nt_buffer[10];c = (c << 11) | (c >> 21);

	b += (a ^ (c & (d ^ a)))  +  nt_buffer[11];b = (b << 19) | (b >> 13);

  	

	a += (d ^ (b & (c ^ d)))  +  nt_buffer[12];a = (a << 3 ) | (a >> 29);

	d += (c ^ (a & (b ^ c)))  +  nt_buffer[13];d = (d << 7 ) | (d >> 25);

	c += (b ^ (d & (a ^ b)))  +  nt_buffer[14];c = (c << 11) | (c >> 21);

	b += (a ^ (c & (d ^ a)))  +  nt_buffer[15];b = (b << 19) | (b >> 13);

  	

	/* Round 2 */

	a += ((b & (c | d)) | (c & d)) + nt_buffer[0] +SQRT_2; a = (a<<3 ) | (a>>29);

	d += ((a & (b | c)) | (b & c)) + nt_buffer[4] +SQRT_2; d = (d<<5 ) | (d>>27);

	c += ((d & (a | b)) | (a & b)) + nt_buffer[8] +SQRT_2; c = (c<<9 ) | (c>>23);

	b += ((c & (d | a)) | (d & a)) + nt_buffer[12]+SQRT_2; b = (b<<13) | (b>>19);

  	

	a += ((b & (c | d)) | (c & d)) + nt_buffer[1] +SQRT_2; a = (a<<3 ) | (a>>29);

	d += ((a & (b | c)) | (b & c)) + nt_buffer[5] +SQRT_2; d = (d<<5 ) | (d>>27);

	c += ((d & (a | b)) | (a & b)) + nt_buffer[9] +SQRT_2; c = (c<<9 ) | (c>>23);

	b += ((c & (d | a)) | (d & a)) + nt_buffer[13]+SQRT_2; b = (b<<13) | (b>>19);

  	

	a += ((b & (c | d)) | (c & d)) + nt_buffer[2] +SQRT_2; a = (a<<3 ) | (a>>29);

	d += ((a & (b | c)) | (b & c)) + nt_buffer[6] +SQRT_2; d = (d<<5 ) | (d>>27);

	c += ((d & (a | b)) | (a & b)) + nt_buffer[10]+SQRT_2; c = (c<<9 ) | (c>>23);

	b += ((c & (d | a)) | (d & a)) + nt_buffer[14]+SQRT_2; b = (b<<13) | (b>>19);

  	

	a += ((b & (c | d)) | (c & d)) + nt_buffer[3] +SQRT_2; a = (a<<3 ) | (a>>29);

	d += ((a & (b | c)) | (b & c)) + nt_buffer[7] +SQRT_2; d = (d<<5 ) | (d>>27);

	c += ((d & (a | b)) | (a & b)) + nt_buffer[11]+SQRT_2; c = (c<<9 ) | (c>>23);

	b += ((c & (d | a)) | (d & a)) + nt_buffer[15]+SQRT_2; b = (b<<13) | (b>>19);

  	

	/* Round 3 */

	a += (d ^ c ^ b) + nt_buffer[0]  +  SQRT_3; a = (a << 3 ) | (a >> 29);

	d += (c ^ b ^ a) + nt_buffer[8]  +  SQRT_3; d = (d << 9 ) | (d >> 23);

	c += (b ^ a ^ d) + nt_buffer[4]  +  SQRT_3; c = (c << 11) | (c >> 21);

	b += (a ^ d ^ c) + nt_buffer[12] +  SQRT_3; b = (b << 15) | (b >> 17);

  

	a += (d ^ c ^ b) + nt_buffer[2]  +  SQRT_3; a = (a << 3 ) | (a >> 29);

	d += (c ^ b ^ a) + nt_buffer[10] +  SQRT_3; d = (d << 9 ) | (d >> 23);

	c += (b ^ a ^ d) + nt_buffer[6]  +  SQRT_3; c = (c << 11) | (c >> 21);

	b += (a ^ d ^ c) + nt_buffer[14] +  SQRT_3; b = (b << 15) | (b >> 17);

  

	a += (d ^ c ^ b) + nt_buffer[1]  +  SQRT_3; a = (a << 3 ) | (a >> 29);

	d += (c ^ b ^ a) + nt_buffer[9] +  SQRT_3; d = (d << 9 ) | (d >> 23);

	c += (b ^ a ^ d) + nt_buffer[5]  +  SQRT_3; c = (c << 11) | (c >> 21);

	b += (a ^ d ^ c) + nt_buffer[13] +  SQRT_3; b = (b << 15) | (b >> 17);

	a += (d ^ c ^ b) + nt_buffer[3]  +  SQRT_3; a = (a << 3 ) | (a >> 29);

	d += (c ^ b ^ a) + nt_buffer[11] +  SQRT_3; d = (d << 9 ) | (d >> 23);

	c += (b ^ a ^ d) + nt_buffer[7]  +  SQRT_3; c = (c << 11) | (c >> 21);

	b += (a ^ d ^ c) + nt_buffer[15] +  SQRT_3; b = (b << 15) | (b >> 17);

  

	output[0] = a + INIT_A;

	output[1] = b + INIT_B;

	output[2] = c + INIT_C;

	output[3] = d + INIT_D;

}	

//This include the unicode conversion and the padding

__device__ void prepare_key(char* key, int length, int* nt_buffer)

{

	int i=0;

	memset(nt_buffer,0,16*4);

	//The length of key need to be <= 27

	for(;i<length/2;i++)	

  nt_buffer[i] = key[2*i] | (key[2*i+1]<<16);

  

	//padding

	if(length%2==1)

  nt_buffer[i] = key[length-1] | 0x800000;

	else

  nt_buffer[i]=0x80;

	//put the length

	nt_buffer[14] = length << 4;

}

//This convert the output to hexadecimal form

__device__ void convert_hex(int* output, char* hex_format)

{

	char itoa16[17] = "0123456789abcdef";

	int i=0;

	//Iterate the integer

	for(;i<4;i++)

	{

  int j=0;

  unsigned int n=output[i];

  //iterate the bytes of the integer  

  for(;j<4;j++)

  {

  	unsigned int convert=n%256;

  	hex_format[i*8+j*2+1]=itoa16[convert%16];

  	convert=convert/16;

  	hex_format[i*8+j*2+0]=itoa16[convert%16];

  	n=n/256;

  }	

	}

	//null terminate the string

	hex_format[33]=0;

}

__device__ int adistrcmp (const char * src, const char * dst)

{

        int ret = 0;

       while( ! (ret = *(unsigned char *)src - *(unsigned char *)dst) && *dst)

                ++src, ++dst;

       if ( ret < 0 )

                ret = -1;

        else if ( ret > 0 )

                ret = 1;

       return( ret );

}

void gotolower(char *str){

	int i=0;

	while (str[i]){

  // Hier kann ich char = char, weil ich mich auf eine Referenz (*) beziehe.

  str[i] = tolower(str[i]);

  i++;

	}

}

__global__ void bruteforce(char* g_charset, char* g_hash_hex, unsigned int* g_counter, char* g_pstorer, bool* g_found, int maxPwLength){

	unsigned int* nt_buffer[16];  //memory zuordnen und von global ¸bergeben

	unsigned int* output[4];  //memory zuordnen und von global ¸bergeben

	char* hex_format[33];

	*g_counter = *g_counter+2;

	int position = 0;

	unsigned int decounter;

 decounter = blockIdx.x * blockDim.x + threadIdx.x;

 //storer reinigen

  

  for (int i = 0; i < maxPwLength; i++) {

  	g_pstorer[i] = '\0';

  }

 do{

  	g_pstorer[position] = g_charset[decounter%((int)(sizeof(g_charset)/sizeof(char)))];

  	decounter = decounter/((int)(sizeof(g_charset)/sizeof(char)));

  	position++;

 } while (decounter != 0);

 prepare_key(g_pstorer, position, (int*)nt_buffer);

  __syncthreads();

  ntlm_crypt((int*)nt_buffer, (int*)output);

  __syncthreads();

  convert_hex((int*)output, (char*)hex_format);

  __syncthreads();

 //Vergleich machen...

  if (adistrcmp((char*)hex_format,g_hash_hex) == 0){

  	*g_found = true;

  }

	

	__syncthreads();

}

void print_device_information(){

	int deviceCount;

    CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));

    if (deviceCount == 0)

        printf("There is no device supporting CUDA\n");

    int dev;

    for (dev = 0; dev < deviceCount; ++dev) {

        cudaDeviceProp deviceProp;

        CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, dev));

        if (dev == 0) {

            if (deviceProp.major == 9999 && deviceProp.minor == 9999)

                printf("There is no device supporting CUDA.\n");

            else if (deviceCount == 1)

                printf("There is 1 device supporting CUDA\n");

            else

                printf("There are %d devices supporting CUDA\n", deviceCount);

        }

        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

        printf("  Major revision number:                         %d\n",

               deviceProp.major);

        printf("  Minor revision number:                         %d\n",

               deviceProp.minor);

        printf("  Total amount of global memory:                 %u bytes\n",

               deviceProp.totalGlobalMem);

    #if CUDART_VERSION >= 2000

        printf("  Number of multiprocessors:                     %d\n",

               deviceProp.multiProcessorCount);

        printf("  Number of cores:                               %d\n",

               8 * deviceProp.multiProcessorCount);

    #endif

        printf("  Total amount of constant memory:               %u bytes\n",

               deviceProp.totalConstMem); 

        printf("  Total amount of shared memory per block:       %u bytes\n",

               deviceProp.sharedMemPerBlock);

        printf("  Total number of registers available per block: %d\n",

               deviceProp.regsPerBlock);

        printf("  Warp size:                                     %d\n",

               deviceProp.warpSize);

        printf("  Maximum number of threads per block:           %d\n",

               deviceProp.maxThreadsPerBlock);

        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",

               deviceProp.maxThreadsDim[0],

               deviceProp.maxThreadsDim[1],

               deviceProp.maxThreadsDim[2]);

        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",

               deviceProp.maxGridSize[0],

               deviceProp.maxGridSize[1],

               deviceProp.maxGridSize[2]);

        printf("  Maximum memory pitch:                          %u bytes\n",

               deviceProp.memPitch);

        printf("  Texture alignment:                             %u bytes\n",

               deviceProp.textureAlignment);

        printf("  Clock rate:                                    %.2f GHz\n",

               deviceProp.clockRate * 1e-6f);

    #if CUDART_VERSION >= 2000

        printf("  Concurrent copy and execution:                 %s\n",

               deviceProp.deviceOverlap ? "Yes" : "No");

    #endif

    }

}

int main(int argc, char* argv[])

{

	const int maxPwLength = 10;

	unsigned int start = 0;

	unsigned int ende = 1000;

	char charset[63] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";

	char hash_hex[33];

	

	unsigned int counter;

	char pstorer[maxPwLength];

	bool found = false;

	

	//nur main

	char switcher[100];

	int pwLength;

	// -> header

    printf("                                                                         \n");

	printf("      #################################################################  \n");

    printf("     #                                                                 # \n");

	printf("     #           NTLM Brutforce Attack by Adrian Schwalller            # \n");

	printf("     #                   Algorithm by Alain Espinosa                   # \n");

	printf("     #                                                                 # \n");

	printf("      #################################################################  \n");

	printf("                                                                         \n");

	printf("                                                                         \n");

	printf("Press any key to start the programm...\n");

	// _getch() iest ein Zeichen (kˆnnte es auch zur¸ckgeben)

	_getch();

	

	

	//print device information

	print_device_information();

	printf("Please insert the HTML-Hash-Value:\n");

	scanf ("%s",hash_hex); 

	gotolower(hash_hex);

	printf("Would you like to insert a maxPwLength then insert 'pw'\n. Else you work with the counter.\n");

	scanf ("%s",switcher); 

	if (strcmp(switcher, "pw") == 0){

  printf("\nPlease insert the max passwort length:\n");

  scanf ("%d",&pwLength);	

  start = 0;

  ende = (int)pow((double)strlen(charset), pwLength);

	}else{

 printf("\nPlease initialise the counter:\n");

  scanf ("%d",&start); 

 printf("\nPlease insert the maxCount for the counter:\n");

  scanf ("%d",&ende);

 pwLength = 1;

  for(unsigned int i = ende; i > (unsigned int)strlen(charset); i = (unsigned int) ceil((double)i/62)){

  	pwLength++;

  }

	}

	printf("PwLength: %d\n", pwLength);

	printf("Start: %d\n", start);

	printf("Ende: %d\n", ende);

	

	

	//Variablen verschieben

	

	char* g_charset;

	char* g_hash_hex;

	unsigned int* g_counter;

	char* g_pstorer;

	bool* g_found;

	

	CUDA_SAFE_CALL(cudaMalloc((void**) &g_charset, sizeof(char)*strlen(charset)));

	CUDA_SAFE_CALL(cudaMalloc((void**) &g_hash_hex, sizeof(char)*strlen(hash_hex)));

	CUDA_SAFE_CALL(cudaMalloc((void**) &g_counter, sizeof(int)));

	CUDA_SAFE_CALL(cudaMalloc((void**) &g_pstorer, sizeof(char)*maxPwLength));

	CUDA_SAFE_CALL(cudaMalloc((void**) &g_found, sizeof(bool)));

	printf("I'am here0");

	CUDA_SAFE_CALL(cudaMemcpy(g_charset, charset, sizeof(char)*strlen(charset), cudaMemcpyHostToDevice));

	CUDA_SAFE_CALL(cudaMemcpy(g_hash_hex, hash_hex, sizeof(char)*strlen(hash_hex), cudaMemcpyHostToDevice));

	//CUDA_SAFE_CALL(cudaMemcpy(g_counter, counter, sizeof(int), cudaMemcpyHostToDevice));

	CUDA_SAFE_CALL(cudaMemcpy(g_pstorer, pstorer, sizeof(char)*maxPwLength, cudaMemcpyHostToDevice));

	//CUDA_SAFE_CALL(cudaMemcpy(g_found, found, sizeof(bool), cudaMemcpyHostToDevice));

	

	int difference = ende - start;

	//int words_per_thread = 10;

	const int threads_per_block = 512;

	int threads = 512;

	int blocks = 65535; //65535*512=33553920

	dim3 grid(blocks,1);

	dim3 threadblock(threads,1,1);

	printf("I'am here1");

	clock_t t1,t2;

	t1=clock();

	bruteforce<<< grid, threadblock >>>(g_charset, g_hash_hex, g_counter, g_pstorer, g_found, maxPwLength);

	t2=clock();

	double time=((float)(t2-t1)/CLOCKS_PER_SEC);

	

	printf("I'am here2");

	CUDA_SAFE_CALL(cudaMemcpy(&counter, g_counter, sizeof(int), cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaMemcpy(pstorer, g_pstorer, maxPwLength, cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaMemcpy(&found, g_found, sizeof(bool), cudaMemcpyDeviceToHost));

	if(found){

  printf("\nThe Password is: %s\n", pstorer);

	}else{

  printf("\nNo Password found in the list.\n");

	}

	printf("\n\n + Done. \n\n");

	printf("%d words checked\n\n", counter);

	printf("Rechenzeit: %f\n\n", time);

	getchar();  //equivalent zu _getch()

	return 0;

}

';

}

do{

g_pstorer[position] = g_charset[decounter%((int)(sizeof(g_charset)/sizeof(char)))];

decounter = decounter/((int)(sizeof(g_charset)/sizeof(char)));

position++;

} while (decounter != 0);

prepare_key(g_pstorer, position, (int*)nt_buffer);

__syncthreads();

ntlm_crypt((int*)nt_buffer, (int*)output);

__syncthreads();

convert_hex((int*)output, (char*)hex_format);

__syncthreads();

//Vergleich machen…

if (adistrcmp((char*)hex_format,g_hash_hex) == 0){

*g_found = true;

}

__syncthreads();

}

void print_device_information(){

int deviceCount;

CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));

if (deviceCount == 0)

printf(“There is no device supporting CUDA\n”);

int dev;

for (dev = 0; dev < deviceCount; ++dev) {

cudaDeviceProp deviceProp;

CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, dev));

if (dev == 0) {

if (deviceProp.major == 9999 && deviceProp.minor == 9999)

printf(“There is no device supporting CUDA.\n”);

else if (deviceCount == 1)

printf(“There is 1 device supporting CUDA\n”);

else

printf(“There are %d devices supporting CUDA\n”, deviceCount);

}

printf(“\nDevice %d: "%s"\n”, dev, deviceProp.name);

printf("  Major revision number:                         %d\n",

deviceProp.major);

printf("  Minor revision number:                         %d\n",

deviceProp.minor);

printf("  Total amount of global memory:                 %u bytes\n",

deviceProp.totalGlobalMem);

#if CUDART_VERSION >= 2000

printf("  Number of multiprocessors:                     %d\n",

deviceProp.multiProcessorCount);

printf("  Number of cores:                               %d\n",

8 * deviceProp.multiProcessorCount);

#endif

printf("  Total amount of constant memory:               %u bytes\n",

deviceProp.totalConstMem);

printf("  Total amount of shared memory per block:       %u bytes\n",

deviceProp.sharedMemPerBlock);

printf("  Total number of registers available per block: %d\n",

deviceProp.regsPerBlock);

printf("  Warp size:                                     %d\n",

deviceProp.warpSize);

printf("  Maximum number of threads per block:           %d\n",

deviceProp.maxThreadsPerBlock);

printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",

deviceProp.maxThreadsDim[0],

deviceProp.maxThreadsDim[1],

deviceProp.maxThreadsDim[2]);

printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",

deviceProp.maxGridSize[0],

deviceProp.maxGridSize[1],

deviceProp.maxGridSize[2]);

printf("  Maximum memory pitch:                          %u bytes\n",

deviceProp.memPitch);

printf("  Texture alignment:                             %u bytes\n",

deviceProp.textureAlignment);

printf("  Clock rate:                                    %.2f GHz\n",

deviceProp.clockRate * 1e-6f);

#if CUDART_VERSION >= 2000

printf("  Concurrent copy and execution:                 %s\n",

deviceProp.deviceOverlap ? “Yes” : “No”);

#endif

}

}

int main(int argc, char* argv)

{

const int maxPwLength = 10;

unsigned int start = 0;

unsigned int ende = 1000;

char charset[63] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";

char hash_hex[33];



unsigned int counter;

char pstorer[maxPwLength];

bool found = false;



//nur main

char switcher[100];

int pwLength;

// -> header

printf("                                                                         \n");

printf("      #################################################################  \n");

printf("     #                                                                 # \n");

printf("     #           NTLM Brutforce Attack by Adrian Schwalller            # \n");

printf("     #                   Algorithm by Alain Espinosa                   # \n");

printf("     #                                                                 # \n");

printf("      #################################################################  \n");

printf("                                                                         \n");

printf("                                                                         \n");

printf("Press any key to start the programm...\n");

// _getch() iest ein Zeichen (kˆnnte es auch zur¸ckgeben)

_getch();





//print device information

print_device_information();

printf("Please insert the HTML-Hash-Value:\n");

scanf ("%s",hash_hex); 

gotolower(hash_hex);

printf("Would you like to insert a maxPwLength then insert 'pw'\n. Else you work with the counter.\n");

scanf ("%s",switcher); 

if (strcmp(switcher, "pw") == 0){

printf(“\nPlease insert the max passwort length:\n”);

scanf (“%d”,&pwLength);

start = 0;

ende = (int)pow((double)strlen(charset), pwLength);

}else{

printf(“\nPlease initialise the counter:\n”);

scanf (“%d”,&start);

printf(“\nPlease insert the maxCount for the counter:\n”);

scanf (“%d”,&ende);

pwLength = 1;

for(unsigned int i = ende; i > (unsigned int)strlen(charset); i = (unsigned int) ceil((double)i/62)){

pwLength++;

}

}

printf("PwLength: %d\n", pwLength);

printf("Start: %d\n", start);

printf("Ende: %d\n", ende);





//Variablen verschieben



char* g_charset;

char* g_hash_hex;

unsigned int* g_counter;

char* g_pstorer;

bool* g_found;



CUDA_SAFE_CALL(cudaMalloc((void**) &g_charset, sizeof(char)*strlen(charset)));

CUDA_SAFE_CALL(cudaMalloc((void**) &g_hash_hex, sizeof(char)*strlen(hash_hex)));

CUDA_SAFE_CALL(cudaMalloc((void**) &g_counter, sizeof(int)));

CUDA_SAFE_CALL(cudaMalloc((void**) &g_pstorer, sizeof(char)*maxPwLength));

CUDA_SAFE_CALL(cudaMalloc((void**) &g_found, sizeof(bool)));

printf("I'am here0");

CUDA_SAFE_CALL(cudaMemcpy(g_charset, charset, sizeof(char)*strlen(charset), cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpy(g_hash_hex, hash_hex, sizeof(char)*strlen(hash_hex), cudaMemcpyHostToDevice));

//CUDA_SAFE_CALL(cudaMemcpy(g_counter, counter, sizeof(int), cudaMemcpyHostToDevice));

CUDA_SAFE_CALL(cudaMemcpy(g_pstorer, pstorer, sizeof(char)*maxPwLength, cudaMemcpyHostToDevice));

//CUDA_SAFE_CALL(cudaMemcpy(g_found, found, sizeof(bool), cudaMemcpyHostToDevice));



int difference = ende - start;

//int words_per_thread = 10;

const int threads_per_block = 512;

int threads = 512;

int blocks = 65535; //65535*512=33553920

dim3 grid(blocks,1);

dim3 threadblock(threads,1,1);

printf("I'am here1");

clock_t t1,t2;

t1=clock();

bruteforce<<< grid, threadblock >>>(g_charset, g_hash_hex, g_counter, g_pstorer, g_found, maxPwLength);

t2=clock();

double time=((float)(t2-t1)/CLOCKS_PER_SEC);



printf("I'am here2");

CUDA_SAFE_CALL(cudaMemcpy(&counter, g_counter, sizeof(int), cudaMemcpyDeviceToHost));

CUDA_SAFE_CALL(cudaMemcpy(pstorer, g_pstorer, maxPwLength, cudaMemcpyDeviceToHost));

CUDA_SAFE_CALL(cudaMemcpy(&found, g_found, sizeof(bool), cudaMemcpyDeviceToHost));

if(found){

printf(“\nThe Password is: %s\n”, pstorer);

}else{

printf(“\nNo Password found in the list.\n”);

}

printf("\n\n + Done. \n\n");

printf("%d words checked\n\n", counter);

printf("Rechenzeit: %f\n\n", time);

getchar();  //equivalent zu _getch()

return 0;

}

';

  }

 do{

  	g_pstorer[position] = g_charset[decounter%((int)(sizeof(g_charset)/sizeof(char)))];

  	decounter = decounter/((int)(sizeof(g_charset)/sizeof(char)));

  	position++;

 } while (decounter != 0);

 prepare_key(g_pstorer, position, (int*)nt_buffer);

  __syncthreads();

  ntlm_crypt((int*)nt_buffer, (int*)output);

  __syncthreads();

  convert_hex((int*)output, (char*)hex_format);

  __syncthreads();

 //Vergleich machen...

  if (adistrcmp((char*)hex_format,g_hash_hex) == 0){

  	*g_found = true;

  }

	

	__syncthreads();

}

void print_device_information(){

	int deviceCount;

    CUDA_SAFE_CALL(cudaGetDeviceCount(&deviceCount));

    if (deviceCount == 0)

        printf("There is no device supporting CUDA\n");

    int dev;

    for (dev = 0; dev < deviceCount; ++dev) {

        cudaDeviceProp deviceProp;

        CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, dev));

        if (dev == 0) {

            if (deviceProp.major == 9999 && deviceProp.minor == 9999)

                printf("There is no device supporting CUDA.\n");

            else if (deviceCount == 1)

                printf("There is 1 device supporting CUDA\n");

            else

                printf("There are %d devices supporting CUDA\n", deviceCount);

        }

        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

        printf("  Major revision number:                         %d\n",

               deviceProp.major);

        printf("  Minor revision number:                         %d\n",

               deviceProp.minor);

        printf("  Total amount of global memory:                 %u bytes\n",

               deviceProp.totalGlobalMem);

    #if CUDART_VERSION >= 2000

        printf("  Number of multiprocessors:                     %d\n",

               deviceProp.multiProcessorCount);

        printf("  Number of cores:                               %d\n",

               8 * deviceProp.multiProcessorCount);

    #endif

        printf("  Total amount of constant memory:               %u bytes\n",

               deviceProp.totalConstMem); 

        printf("  Total amount of shared memory per block:       %u bytes\n",

               deviceProp.sharedMemPerBlock);

        printf("  Total number of registers available per block: %d\n",

               deviceProp.regsPerBlock);

        printf("  Warp size:                                     %d\n",

               deviceProp.warpSize);

        printf("  Maximum number of threads per block:           %d\n",

               deviceProp.maxThreadsPerBlock);

        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",

               deviceProp.maxThreadsDim[0],

               deviceProp.maxThreadsDim[1],

               deviceProp.maxThreadsDim[2]);

        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",

               deviceProp.maxGridSize[0],

               deviceProp.maxGridSize[1],

               deviceProp.maxGridSize[2]);

        printf("  Maximum memory pitch:                          %u bytes\n",

               deviceProp.memPitch);

        printf("  Texture alignment:                             %u bytes\n",

               deviceProp.textureAlignment);

        printf("  Clock rate:                                    %.2f GHz\n",

               deviceProp.clockRate * 1e-6f);

    #if CUDART_VERSION >= 2000

        printf("  Concurrent copy and execution:                 %s\n",

               deviceProp.deviceOverlap ? "Yes" : "No");

    #endif

    }

}

int main(int argc, char* argv[])

{

	const int maxPwLength = 10;

	unsigned int start = 0;

	unsigned int ende = 1000;

	char charset[63] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";

	char hash_hex[33];

	

	unsigned int counter;

	char pstorer[maxPwLength];

	bool found = false;

	

	//nur main

	char switcher[100];

	int pwLength;

	// -> header

    printf("                                                                         \n");

	printf("      #################################################################  \n");

    printf("     #                                                                 # \n");

	printf("     #           NTLM Brutforce Attack by Adrian Schwalller            # \n");

	printf("     #                   Algorithm by Alain Espinosa                   # \n");

	printf("     #                                                                 # \n");

	printf("      #################################################################  \n");

	printf("                                                                         \n");

	printf("                                                                         \n");

	printf("Press any key to start the programm...\n");

	// _getch() iest ein Zeichen (kˆnnte es auch zur¸ckgeben)

	_getch();

	

	

	//print device information

	print_device_information();

	printf("Please insert the HTML-Hash-Value:\n");

	scanf ("%s",hash_hex); 

	gotolower(hash_hex);

	printf("Would you like to insert a maxPwLength then insert 'pw'\n. Else you work with the counter.\n");

	scanf ("%s",switcher); 

	if (strcmp(switcher, "pw") == 0){

  printf("\nPlease insert the max passwort length:\n");

  scanf ("%d",&pwLength);	

  start = 0;

  ende = (int)pow((double)strlen(charset), pwLength);

	}else{

 printf("\nPlease initialise the counter:\n");

  scanf ("%d",&start); 

 printf("\nPlease insert the maxCount for the counter:\n");

  scanf ("%d",&ende);

 pwLength = 1;

  for(unsigned int i = ende; i > (unsigned int)strlen(charset); i = (unsigned int) ceil((double)i/62)){

  	pwLength++;

  }

	}

	printf("PwLength: %d\n", pwLength);

	printf("Start: %d\n", start);

	printf("Ende: %d\n", ende);

	

	

	//Variablen verschieben

	

	char* g_charset;

	char* g_hash_hex;

	unsigned int* g_counter;

	char* g_pstorer;

	bool* g_found;

	

	CUDA_SAFE_CALL(cudaMalloc((void**) &g_charset, sizeof(char)*strlen(charset)));

	CUDA_SAFE_CALL(cudaMalloc((void**) &g_hash_hex, sizeof(char)*strlen(hash_hex)));

	CUDA_SAFE_CALL(cudaMalloc((void**) &g_counter, sizeof(int)));

	CUDA_SAFE_CALL(cudaMalloc((void**) &g_pstorer, sizeof(char)*maxPwLength));

	CUDA_SAFE_CALL(cudaMalloc((void**) &g_found, sizeof(bool)));

	printf("I'am here0");

	CUDA_SAFE_CALL(cudaMemcpy(g_charset, charset, sizeof(char)*strlen(charset), cudaMemcpyHostToDevice));

	CUDA_SAFE_CALL(cudaMemcpy(g_hash_hex, hash_hex, sizeof(char)*strlen(hash_hex), cudaMemcpyHostToDevice));

	//CUDA_SAFE_CALL(cudaMemcpy(g_counter, counter, sizeof(int), cudaMemcpyHostToDevice));

	CUDA_SAFE_CALL(cudaMemcpy(g_pstorer, pstorer, sizeof(char)*maxPwLength, cudaMemcpyHostToDevice));

	//CUDA_SAFE_CALL(cudaMemcpy(g_found, found, sizeof(bool), cudaMemcpyHostToDevice));

	

	int difference = ende - start;

	//int words_per_thread = 10;

	const int threads_per_block = 512;

	int threads = 512;

	int blocks = 65535; //65535*512=33553920

	dim3 grid(blocks,1);

	dim3 threadblock(threads,1,1);

	printf("I'am here1");

	clock_t t1,t2;

	t1=clock();

	bruteforce<<< grid, threadblock >>>(g_charset, g_hash_hex, g_counter, g_pstorer, g_found, maxPwLength);

	t2=clock();

	double time=((float)(t2-t1)/CLOCKS_PER_SEC);

	

	printf("I'am here2");

	CUDA_SAFE_CALL(cudaMemcpy(&counter, g_counter, sizeof(int), cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaMemcpy(pstorer, g_pstorer, maxPwLength, cudaMemcpyDeviceToHost));

	CUDA_SAFE_CALL(cudaMemcpy(&found, g_found, sizeof(bool), cudaMemcpyDeviceToHost));

	if(found){

  printf("\nThe Password is: %s\n", pstorer);

	}else{

  printf("\nNo Password found in the list.\n");

	}

	printf("\n\n + Done. \n\n");

	printf("%d words checked\n\n", counter);

	printf("Rechenzeit: %f\n\n", time);

	getchar();  //equivalent zu _getch()

	return 0;

}

hex_format is in bruteforce() declared incorrectly, you have declared an array of 33 pointers to objects of type char.

convert_hex() writes one too far into hex_format passed from bruteforce().

Many strings are not zero terminated; zero termination should be applied consistently, and copied with each string.

g_counter is not initialized before use; you have not copied anything into it before starting the kernel; same for g_found.

You are not using shared memory, so you should not need calls to __syncthreads() in the kernel.

The first line in the do-loop of bruteforce() hits global memory twice both p_charset and g_pstorer. This is performance unfriendly.

g_charset is constant so would probably be faster if stored in constant memory.

g_pstorer is where you expect to read the result when the kernel terminates; unfortunately each thread also uses this memory as scratch pad to create the string to hash. When a warp worth of threads are all trying to write this buffer at the same time, you are hashing garbage.

64k-1 blocks of 512 threads is only sufficient to find a password of 4 characters or less.