Cuda SHA empty result problem

Hey guys, I want to do SHA-1 on Cuda (little bruteforce project, but at first I want to make it work).

#include "sha_function.h"

/* f1 to f4 */

#define f1(x,y,z)       ( ( x & y ) | ( ~x & z ) )

#define f2(x,y,z)       ( x ^ y ^ z )

#define f3(x,y,z)       ( ( x & y ) | ( x & z ) | ( y & z ) )

#define f4(x,y,z)       ( x ^ y ^ z )

/* SHA init values */

#define I1      0x67452301L

#define I2      0xEFCDAB89L

#define I3      0x98BADCFEL

#define I4      0x10325476L

#define I5      0xC3D2E1F0L

/* SHA constants */

#define C1      0x5a827999L

#define C2      0x6ed9eba1L

#define C3      0x8f1bbcdcL

#define C4      0xca62c1d6L

/* 32-bit rotate */

#define ROT(x,n)        ( ( x << n ) | ( x >> ( 32 - n ) ) )

/* main function */

#define CALC(n,i) temp = ( ROT ( A , 5 ) + f##n( B , C, D ) ) + ( W[i] + ( E + C##n ) ); E = D; D = C; C = ROT ( B , 30 ); B = A; A = temp

int main()

{

        unsigned long * hash = (unsigned long *) malloc(4);

        int length = 6;

        start(hash, length);

        return 1;

}

void start(unsigned long * hash,  int length)

{

        unsigned long * buffer = 0;

        unsigned long * res = 0;

res = (unsigned long*) malloc(5 * sizeof(unsigned long));

        cudaMalloc((void** ) &buffer, 5 * sizeof(unsigned long));

unsigned char input[length];

input[0] = 'a';

        input[1] = 'a';

        input[2] = 'a';

        input[3] = 'a';

        input[4] = 'a';

        input[5] = 'a';

doSHA<<<1,1>>>(input, length, buffer);

        cudaMemcpy(res, buffer, 5 * sizeof(unsigned long), cudaMemcpyDeviceToHost);

printf("%X\n", res[0]);

}

The printf at the end of the function prints 0 :( not what it should do. I think it’s a very stupid fault again. Ignore the variable “hash” that is passed to the function, it’s for the bruteforcer later. The rest of the code looks like that:

__global__ void doSHA(unsigned char * input, int length, unsigned long * buffer)

{

        unsigned long W[80],A,B,C,D,E,temp;

        memInit(W, input, length);

for(int i = 16; i < 80; i++)

            W[i] = ( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16] );

A = I1; B = I2; C = I3; D = I4; E = I5;

CALC(1,0);  CALC(1,1);  CALC(1,2);  CALC(1,3);  CALC(1,4);

        CALC(1,5);  CALC(1,6);  CALC(1,7);  CALC(1,8);  CALC(1,9);

        CALC(1,10); CALC(1,11); CALC(1,12); CALC(1,13); CALC(1,14);

        CALC(1,15); CALC(1,16); CALC(1,17); CALC(1,18); CALC(1,19);

CALC(2,20); CALC(2,21); CALC(2,22); CALC(2,23); CALC(2,24);

        CALC(2,25); CALC(2,26); CALC(2,27); CALC(2,28); CALC(2,29);

        CALC(2,30); CALC(2,31); CALC(2,32); CALC(2,33); CALC(2,34);

        CALC(2,55); CALC(2,36); CALC(2,37); CALC(2,38); CALC(2,39);

CALC(3,40); CALC(3,41); CALC(3,42); CALC(3,43); CALC(3,44);

        CALC(3,45); CALC(3,46); CALC(3,47); CALC(3,48); CALC(3,49);

        CALC(3,50); CALC(3,51); CALC(3,52); CALC(3,53); CALC(3,54);

        CALC(3,55); CALC(3,56); CALC(3,57); CALC(3,58); CALC(3,59);

CALC(4,60); CALC(4,61); CALC(4,62); CALC(4,63); CALC(4,64);

        CALC(4,65); CALC(4,66); CALC(4,67); CALC(4,68); CALC(4,69);

        CALC(4,70); CALC(4,71); CALC(4,72); CALC(4,73); CALC(4,74);

        CALC(4,75); CALC(4,76); CALC(4,77); CALC(4,78); CALC(4,79);

buffer[0] = A;

        buffer[1] = B;

        buffer[2] = C;

        buffer[3] = D;

        buffer[4] = E;

}

__device__ void memInit(unsigned long * tmp, unsigned char * input, int length)

{

        int stop = 0;

        // reseting tmp

        for(int i = 0; i < 80; i++) tmp[i] = 0;

// fill tmp like: message char c0,c1,c2,...,cn,10000000,00...000

        for(int i = 0; i < length; i+=4)

        {

            for(int j = 0; j < 4; j++)

                if(i + j < length)

                    tmp[i/4] |= input[i+j] << (24 - j * 8);

                else

                {

                    tmp[i/4] |= 0x80;   // Append 1 then zeros

                    stop = 1;

                    break;

                }

        if(stop)

            break;

        }

// Adding length as last value

        tmp[15] |= length;

}

Ty for your help guy, once more!

General comment: You should check the return codes from CUDA functions, because they will tell you when your code isn’t running because of an error.

In this case, I believe the error is because you are passing a host pointer, the input array, to the device code. You need cudaMalloc() some device memory to hold the input, cudaMemcpy your input array into it, and pass that device pointer to your kernel.

Solved