Speeding SHA up

Hey guys I finally finished my SHA Bruteforcer. But it’s slow as hell. I tried to use a lot of makros so that the code wont contain any memory difficaulties, but :/ can’t even BF a three char long hash before

The launch timed our and was terminated

what is 5 seconds, google told me. Is constant memory better than working with makros? It’s cached then isn’t it? Is there any way to write inline functions in cuda?

A reason for that is, that I can’t terminate the other threads if one found the searched hash. I tried with a flag (have a look at the code) but then I get an

unspecified launch failure

What else can be improved?

#include "sha_function.h"

#include <time.h>

#define word    unsigned int

/* f1 to f4 */

__device__ inline word f1( word x, word y, word z) { return ( ( x & y ) | ( ~x & z ) ); }

__device__ inline word f2( word x, word y, word z) { return ( x ^ y ^ z ); }

__device__ inline word f3( word x, word y, word z) { return ( ( x & y ) | ( x & z ) | ( y & z ) ); }

__device__ inline word f4( word x, word y, word z) { return ( x ^ y ^ z ); }

/* SHA init values */

__constant__ word I1 = 0x67452301L;

__constant__ word I2 = 0xEFCDAB89L;

__constant__ word I3 = 0x98BADCFEL;

__constant__ word I4 = 0x10325476L;

__constant__ word I5 = 0xC3D2E1F0L;

/* SHA constants */

__constant__ word C1 = 0x5A827999L;

__constant__ word C2 = 0x6Ed9EBA1L;

__constant__ word C3 = 0x8F1BBCDCL;

__constant__ word C4 = 0xCA62C1D6L;

/* 32-bit rotate */

__device__ inline word ROT(word x,int n){ return ( ( x << n ) | ( x >> ( 32 - n ) ) ); }

/* main function */

#define CALC(n,i) temp =  ROT ( A , 5 ) + f##n( B , C, D ) +  W[i] + E + C##n  ; E = D; D = C; C = ROT ( B , 30 ); B = A; A = temp

void start(unsigned int * hash_tmp,  int length, unsigned char * res)

{

unsigned char * buffer = 0;

    unsigned int  * hash = 0;

    //unsigned int * debug = 0;

//debug = (unsigned int*) malloc(5 * sizeof(unsigned int));

    cutilSafeCall ( cudaMalloc((void** ) &buffer, 10 * sizeof(unsigned char)) );

    cutilSafeCall ( cudaMalloc((void** ) &hash, 5 * sizeof(unsigned int)) );

unsigned char * buffer_fill[10];

    for(int i = 0; i <10; i++)

        buffer_fill[i] = 0x0;

cudaMemcpy (hash, hash_tmp, 5 * sizeof(unsigned int), cudaMemcpyHostToDevice);

    cudaMemcpy (buffer, buffer_fill, 10 * sizeof(unsigned char), cudaMemcpyHostToDevice);

// KERNEL EXECUTION

    smash<<<9025,95>>>(length, buffer, hash);

cudaMemcpy(res, buffer, 10 * sizeof(unsigned char), cudaMemcpyDeviceToHost);

    cudaMemcpy(debug, hash, 5 * sizeof(unsigned int), cudaMemcpyDeviceToHost);

cudaError_t err = cudaGetLastError();

    if( cudaSuccess != err)

        printf( "Cuda error: %s.\n",  cudaGetErrorString( err) );

cudaFree(buffer);

    cudaFree(hash);

}

__global__ void smash(int length, unsigned char * buffer, unsigned int * hash)

{

int higher = 127;

    int lower = 32;

    unsigned char input_cpy[10];

    int carry = 1;

for(int i = 0; i <10; i++)

        input_cpy[i] = lower;

// init input_cpy

    input_cpy[0] = threadIdx.x + lower;

    input_cpy[1] = (blockIdx.x / (higher-lower)) + lower;

    input_cpy[2] = (blockIdx.x % (higher-lower)) + lower;

// value @length as a flag.

    // if != 0 break

    for(int i = 3; i < 10; i++)

        if(i >= length)

            input_cpy[i] = 0;

unsigned int W[80],A,B,C,D,E,temp;

// calculate all possible charsets with the

    // given threadId, blockId and length

    while(input_cpy[length] == 0 && buffer[0] == 0) //@TODO || flag) 

    //for(int j = 0; j < 1000; j++)

    {

        // Calculate sha for given input.

        // DO THE SHA ------------------------------------------------------

memInit(W, input_cpy, length);

for(int i = 16; i < 80; i++)

            W[i] = ROT( ( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16] ) , 1 );

A = I1;    B = I2;    C = I3;    D = I4;    E = I5;

CALC(1,0);  CALC(1,1);  CALC(1,2);  CALC(1,3);  CALC(1,4);

        CALC(1,5);  CALC(1,6);  CALC(1,7);  CALC(1,8);  CALC(1,9);

        CALC(1,10); CALC(1,11); CALC(1,12); CALC(1,13); CALC(1,14);

        CALC(1,15); CALC(1,16); CALC(1,17); CALC(1,18); CALC(1,19);

        CALC(2,20); CALC(2,21); CALC(2,22); CALC(2,23); CALC(2,24);

        CALC(2,25); CALC(2,26); CALC(2,27); CALC(2,28); CALC(2,29);

        CALC(2,30); CALC(2,31); CALC(2,32); CALC(2,33); CALC(2,34);

        CALC(2,35); CALC(2,36); CALC(2,37); CALC(2,38); CALC(2,39);

        CALC(3,40); CALC(3,41); CALC(3,42); CALC(3,43); CALC(3,44);

        CALC(3,45); CALC(3,46); CALC(3,47); CALC(3,48); CALC(3,49);

        CALC(3,50); CALC(3,51); CALC(3,52); CALC(3,53); CALC(3,54);

        CALC(3,55); CALC(3,56); CALC(3,57); CALC(3,58); CALC(3,59);

        CALC(4,60); CALC(4,61); CALC(4,62); CALC(4,63); CALC(4,64);

        CALC(4,65); CALC(4,66); CALC(4,67); CALC(4,68); CALC(4,69);

        CALC(4,70); CALC(4,71); CALC(4,72); CALC(4,73); CALC(4,74);

        CALC(4,75); CALC(4,76); CALC(4,77); CALC(4,78); CALC(4,79);

// That needs to be done, == with like (A + I1) =0 hash[0] 

        // is wrong all the time?!

        unsigned int tmp1, tmp2, tmp3, tmp4, tmp5;

tmp1 = A + I1;

        tmp2 = B + I2;

        tmp3 = C + I3;

        tmp4 = D + I4;

        tmp5 = E + I5;

// if result was found, cpy to buffer

        if( tmp1 == hash[0] &&

            tmp2 == hash[1] &&

            tmp3 == hash[2] &&

            tmp4 == hash[3] &&

            tmp5 == hash[4] )

        {

            buffer[0] = input_cpy[0];

            buffer[1] = input_cpy[1];

            buffer[2] = input_cpy[2];

            buffer[3] = input_cpy[3];

            buffer[4] = input_cpy[4];

            buffer[5] = input_cpy[5];

            buffer[6] = input_cpy[6];

            buffer[7] = input_cpy[7];

            buffer[8] = input_cpy[8];

            buffer[9] = input_cpy[9];

break;

        }

// adding new value

        // DO THE ADDITION ----------------------------------------------

for(int i = 3; i < 10; i++)

        {

            if(carry)

            {

                input_cpy[i] = input_cpy[i]+ 1;

                if(input_cpy[i] > higher)

                {

                    input_cpy[i] = lower;

                    carry = 1;

                } else

                    carry = 0;

            } else

                break;

        }

        carry = 1;

    }

}

__device__ void memInit(unsigned int * tmp, unsigned char input[], int length)

{

int stop = 0;

    // reseting tmp

    for(int i = 0; i < 80; i++) tmp[i] = 0;

// fill tmp like: message char c0,c1,c2,...,cn,10000000,00...000

    for(int i = 0; i < length; i+=4)

    {

        for(int j = 0; j < 4; j++)

            if(i + j < length)

                tmp[i/4] |= input[i+j] << (24-j * 8);

            else

            {

stop = 1;

                break;

            }

        if(stop)

            break;

    }

    tmp[length/4] |= 0x80 << (24-(length%4) * 8);     // Append 1 then zeros

    // Adding length as last value

    tmp[15] |= length * 8;

}

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   261,1         Bot

I edited the code above, removed some unnecessary calculations, removed some macros for inline function for better readability (no perfomance differences measured.)

Now I can bruteforce a 4 char password. 5 char passwort (with a 95 charset) still throws “The launch timed out and was terminated”.

A question to you guys: I read that array can’t be kept in registers, would it be a significant speedup if I move input_cpy (which is used for word initialisation each time and the addition with the carry flag) to “normal” unsigned int vars, and maybe (what would be alot of writing stuff) the W[80] array to? How many registers are available?

You have to add a “persistence” logic into your CUDA code:

  • bootstrap from global memory saved state [and init code if you don’t want to pre-save initial condition from CPU thread before the first launch]
  • global memory updated status information (to know if computation is finished or not)
  • global memory timeout information, maintained by…
  • watchdog inside the GPU code to maintain the global memory timeout information
  • for each SM, you will need a warp that willjust poll the global memory timeout, and store it on local memory
  • when timeout condition is flagged, each thread must save it’s current state (ie: in end of their main loop) to be able to be restarted later, and end

The idea is to use allocated global memory, that will stay untouched between 2 kernel launch, and launch repetitively your kernel until it’s status is “finished” with a result, using a task persistence mechanism as well as a software watchdog on the GPU, and communication from global memory to local memory (that should avoid to use too much bandwidth, a trick is to do it on a separate warp, and slowdown things on this warp using dumb computation or just waiting for a synchronisation with the other warps)

This is a quite awesome mechanism that will sadly cost a lot of speed.
I managed to kill the X-server. Then the timeout-error isn’t thrown.
Using that, I was abled to crack a 6 char long string within ~2 hours (Quadro 6000).

Thank iAPX for the useful technique. I think I will use it in an other context once!

It’s wothwhile to have that in your CUDA libraries, because it solves many problems caused by OS timeouts when using your GPU as a display, and it’s not costly in terms of time, because the kernel is already compiled when you relaunch it, and you don’t have to read and write the full GPU memory, only a single flag to know if it needs to be relaunched, keeping current state into the GPU global memory!

In any case, you might have up to 48KB x SM number to write and read into the GPU global memory, and it’s really fast!

Ty for your advice! Is there any paper or how-to to implement this in a fast way, or do you have any example code? That would be awesome!