Hey guys I finally finished my SHA Bruteforcer. But it’s slow as hell. I tried to use a lot of makros so that the code wont contain any memory difficaulties, but can’t even BF a three char long hash before

```
The launch timed our and was terminated
```

what is 5 seconds, google told me. Is constant memory better than working with makros? It’s cached then isn’t it? Is there any way to write inline functions in cuda?

A reason for that is, that I can’t terminate the other threads if one found the searched hash. I tried with a flag (have a look at the code) but then I get an

```
unspecified launch failure
```

What else can be improved?

```
#include "sha_function.h"
#include <time.h>
#define word unsigned int
/* f1 to f4 */
__device__ inline word f1( word x, word y, word z) { return ( ( x & y ) | ( ~x & z ) ); }
__device__ inline word f2( word x, word y, word z) { return ( x ^ y ^ z ); }
__device__ inline word f3( word x, word y, word z) { return ( ( x & y ) | ( x & z ) | ( y & z ) ); }
__device__ inline word f4( word x, word y, word z) { return ( x ^ y ^ z ); }
/* SHA init values */
__constant__ word I1 = 0x67452301L;
__constant__ word I2 = 0xEFCDAB89L;
__constant__ word I3 = 0x98BADCFEL;
__constant__ word I4 = 0x10325476L;
__constant__ word I5 = 0xC3D2E1F0L;
/* SHA constants */
__constant__ word C1 = 0x5A827999L;
__constant__ word C2 = 0x6Ed9EBA1L;
__constant__ word C3 = 0x8F1BBCDCL;
__constant__ word C4 = 0xCA62C1D6L;
/* 32-bit rotate */
__device__ inline word ROT(word x,int n){ return ( ( x << n ) | ( x >> ( 32 - n ) ) ); }
/* main function */
#define CALC(n,i) temp = ROT ( A , 5 ) + f##n( B , C, D ) + W[i] + E + C##n ; E = D; D = C; C = ROT ( B , 30 ); B = A; A = temp
void start(unsigned int * hash_tmp, int length, unsigned char * res)
{
unsigned char * buffer = 0;
unsigned int * hash = 0;
//unsigned int * debug = 0;
//debug = (unsigned int*) malloc(5 * sizeof(unsigned int));
cutilSafeCall ( cudaMalloc((void** ) &buffer, 10 * sizeof(unsigned char)) );
cutilSafeCall ( cudaMalloc((void** ) &hash, 5 * sizeof(unsigned int)) );
unsigned char * buffer_fill[10];
for(int i = 0; i <10; i++)
buffer_fill[i] = 0x0;
cudaMemcpy (hash, hash_tmp, 5 * sizeof(unsigned int), cudaMemcpyHostToDevice);
cudaMemcpy (buffer, buffer_fill, 10 * sizeof(unsigned char), cudaMemcpyHostToDevice);
// KERNEL EXECUTION
smash<<<9025,95>>>(length, buffer, hash);
cudaMemcpy(res, buffer, 10 * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cudaMemcpy(debug, hash, 5 * sizeof(unsigned int), cudaMemcpyDeviceToHost);
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
printf( "Cuda error: %s.\n", cudaGetErrorString( err) );
cudaFree(buffer);
cudaFree(hash);
}
__global__ void smash(int length, unsigned char * buffer, unsigned int * hash)
{
int higher = 127;
int lower = 32;
unsigned char input_cpy[10];
int carry = 1;
for(int i = 0; i <10; i++)
input_cpy[i] = lower;
// init input_cpy
input_cpy[0] = threadIdx.x + lower;
input_cpy[1] = (blockIdx.x / (higher-lower)) + lower;
input_cpy[2] = (blockIdx.x % (higher-lower)) + lower;
// value @length as a flag.
// if != 0 break
for(int i = 3; i < 10; i++)
if(i >= length)
input_cpy[i] = 0;
unsigned int W[80],A,B,C,D,E,temp;
// calculate all possible charsets with the
// given threadId, blockId and length
while(input_cpy[length] == 0 && buffer[0] == 0) //@TODO || flag)
//for(int j = 0; j < 1000; j++)
{
// Calculate sha for given input.
// DO THE SHA ------------------------------------------------------
memInit(W, input_cpy, length);
for(int i = 16; i < 80; i++)
W[i] = ROT( ( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16] ) , 1 );
A = I1; B = I2; C = I3; D = I4; E = I5;
CALC(1,0); CALC(1,1); CALC(1,2); CALC(1,3); CALC(1,4);
CALC(1,5); CALC(1,6); CALC(1,7); CALC(1,8); CALC(1,9);
CALC(1,10); CALC(1,11); CALC(1,12); CALC(1,13); CALC(1,14);
CALC(1,15); CALC(1,16); CALC(1,17); CALC(1,18); CALC(1,19);
CALC(2,20); CALC(2,21); CALC(2,22); CALC(2,23); CALC(2,24);
CALC(2,25); CALC(2,26); CALC(2,27); CALC(2,28); CALC(2,29);
CALC(2,30); CALC(2,31); CALC(2,32); CALC(2,33); CALC(2,34);
CALC(2,35); CALC(2,36); CALC(2,37); CALC(2,38); CALC(2,39);
CALC(3,40); CALC(3,41); CALC(3,42); CALC(3,43); CALC(3,44);
CALC(3,45); CALC(3,46); CALC(3,47); CALC(3,48); CALC(3,49);
CALC(3,50); CALC(3,51); CALC(3,52); CALC(3,53); CALC(3,54);
CALC(3,55); CALC(3,56); CALC(3,57); CALC(3,58); CALC(3,59);
CALC(4,60); CALC(4,61); CALC(4,62); CALC(4,63); CALC(4,64);
CALC(4,65); CALC(4,66); CALC(4,67); CALC(4,68); CALC(4,69);
CALC(4,70); CALC(4,71); CALC(4,72); CALC(4,73); CALC(4,74);
CALC(4,75); CALC(4,76); CALC(4,77); CALC(4,78); CALC(4,79);
// That needs to be done, == with like (A + I1) =0 hash[0]
// is wrong all the time?!
unsigned int tmp1, tmp2, tmp3, tmp4, tmp5;
tmp1 = A + I1;
tmp2 = B + I2;
tmp3 = C + I3;
tmp4 = D + I4;
tmp5 = E + I5;
// if result was found, cpy to buffer
if( tmp1 == hash[0] &&
tmp2 == hash[1] &&
tmp3 == hash[2] &&
tmp4 == hash[3] &&
tmp5 == hash[4] )
{
buffer[0] = input_cpy[0];
buffer[1] = input_cpy[1];
buffer[2] = input_cpy[2];
buffer[3] = input_cpy[3];
buffer[4] = input_cpy[4];
buffer[5] = input_cpy[5];
buffer[6] = input_cpy[6];
buffer[7] = input_cpy[7];
buffer[8] = input_cpy[8];
buffer[9] = input_cpy[9];
break;
}
// adding new value
// DO THE ADDITION ----------------------------------------------
for(int i = 3; i < 10; i++)
{
if(carry)
{
input_cpy[i] = input_cpy[i]+ 1;
if(input_cpy[i] > higher)
{
input_cpy[i] = lower;
carry = 1;
} else
carry = 0;
} else
break;
}
carry = 1;
}
}
__device__ void memInit(unsigned int * tmp, unsigned char input[], int length)
{
int stop = 0;
// reseting tmp
for(int i = 0; i < 80; i++) tmp[i] = 0;
// fill tmp like: message char c0,c1,c2,...,cn,10000000,00...000
for(int i = 0; i < length; i+=4)
{
for(int j = 0; j < 4; j++)
if(i + j < length)
tmp[i/4] |= input[i+j] << (24-j * 8);
else
{
stop = 1;
break;
}
if(stop)
break;
}
tmp[length/4] |= 0x80 << (24-(length%4) * 8); // Append 1 then zeros
// Adding length as last value
tmp[15] |= length * 8;
}
261,1 Bot
```