SHA but ... doesn't calculate SHA :(

I’m not really sure if this is a CUDA Problem but I strictly followed this guide and the result isn’t what I expect. There is only one message block because it will be a bruteforcer sooner or later -.- I compared the results with a working sha-function but it is wrong.

I’m pretty sure the memInit-function does the right thing. With the input ‘aaaaaa’ it precomputes:

61616161 61618000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000030

Here is the code. Maybe it is a problem with some type length, endians or something.

#include "sha_function.h"

/* f1 to f4 */

#define f1(x,y,z)       ( ( x & y ) | ( ~x & z ) )

#define f2(x,y,z)       ( x ^ y ^ z )

#define f3(x,y,z)       ( ( x & y ) | ( x & z ) | ( y & z ) )

#define f4(x,y,z)       ( x ^ y ^ z )

/* SHA init values */

#define I1      0x67452301L

#define I2      0xEFCDAB89L

#define I3      0x98BADCFEL

#define I4      0x10325476L

#define I5      0xC3D2E1F0L

/* SHA constants */

#define C1      0x5a827999L

#define C2      0x6ed9eba1L

#define C3      0x8f1bbcdcL

#define C4      0xca62c1d6L

/* 32-bit rotate */

#define ROT(x,n)        ( ( x << n ) | ( x >> ( 32 - n ) ) )

/* main function */

#define CALC(n,i) temp =  ROT ( A , 5 ) + f##n( B , C, D ) +  W[i] + E + C##n  ; E = D; D = C; C = ROT ( B , 30 ); B = A; A = temp

int main()

{

        unsigned long * hash = (unsigned long *) malloc(4);

        int length = 6;

        start(hash, length);

        return 1;

}

void start(unsigned long * hash,  int length)

{

        unsigned long * buffer = 0;

        unsigned long * res = 0;

        unsigned char * input;

res = (unsigned long*) malloc(5 * sizeof(unsigned long));

        cudaMalloc((void** ) &buffer, 5 * sizeof(unsigned long));

        cudaMalloc((void** ) &input, length * sizeof(unsigned char));

unsigned char input_tmp[length];

input_tmp[0] = 'a';

        input_tmp[1] = 'a';

        input_tmp[2] = 'a';

        input_tmp[3] = 'a';

        input_tmp[4] = 'a';

        input_tmp[5] = 'a';

cudaMemcpy(input, input_tmp, length * sizeof(unsigned char), cudaMemcpyHostToDevice);

        doSHA<<<1,1>>>(input, length, buffer);

        cudaMemcpy(res, buffer, 5 * sizeof(unsigned long), cudaMemcpyDeviceToHost);

cudaError_t err = cudaGetLastError();

        if( cudaSuccess != err)

                printf( "Cuda error: %s.\n",  cudaGetErrorString( err) );

printf("%X\n", res[0]);

}

__global__ void doSHA(unsigned char * input, int length, unsigned long * buffer)

{

        unsigned long W[80],A,B,C,D,E,temp;

        memInit(W, input, length);

//      buffer[0] = W[1];

for(int i = 16; i < 80; i++)

                W[i] = ROT( ( W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16] ) , 1 );

A = I1; B = I2; C = I3; D = I4; E = I5;

int i = 0;

for (i = 0; i < 20; ++i)

                CALC(1,i);

for (i = 20; i < 40; ++i)

                CALC(2,i);

for (i = 40; i < 60; ++i)

                CALC(3,i);

for (i = 60; i < 80; ++i)

                CALC(4,i);

buffer[0] = A + I1;

        buffer[1] = B + I2;

        buffer[2] = C + I3;

        buffer[3] = D + I4;

        buffer[4] = E + I5;

}

__device__ void memInit(unsigned long * tmp, unsigned char * input, int length)

{

int stop = 0;

        // reseting tmp

        for(int i = 0; i < 80; i++) tmp[i] = 0;

// fill tmp like: message char c0,c1,c2,...,cn,10000000,00...000

        for(int i = 0; i < length; i+=4)

        {

                for(int j = 0; j < 4; j++)

                        if(i + j < length)

                                tmp[i/4] |= input[i+j] << (24-j * 8);

                        else

                        {

                                tmp[i/4] |= 0x80 << (24-j * 8);         // Append 1 then zeros

                                stop = 1;

                                break;

                        }

                if(stop)

                        break;

        }

        // Adding length as last value

        tmp[15] |= length * 8;

}

Thank you for helping me, very appreciated!

Okay sry for bothering you, I solved the problem.
OFC long is 64-bit not 32-bit, rotation didn’t work so. I use uint now :)