I’m pretty new to cuda,
maybe somebody can give me a helping hand?
Source:
https://github.com/grocid/gpusha1
The kernel file:
// Modified version of
// https://github.com/B-Con/crypto-algorithms/blob/master/sha1.c
#include <stdio.h>
__device__ __constant__ unsigned int threadMax;
#define ROTLEFT(a, b) ((a << b) | (a >> (32 - b)))
#define SHA1_BLOCK_SIZE 20
#define TRAIL 24
typedef unsigned char BYTE; // 8-bit byte
typedef unsigned int WORD; // 32-bit word, change to "long" for 16-bit machines
typedef struct {
BYTE data[64];
WORD datalen;
unsigned long long bitlen;
WORD state[5];
WORD k[4];
} SHA1_CTX;
__device__ void sha1_transform(SHA1_CTX *ctx, const BYTE data[])
{
WORD a, b, c, d, e, i, j, t, m[80];
for (i = 0, j = 0; i < 16; ++i, j += 4)
m[i] = (data[j] << 24) + (data[j + 1] << 16) + (data[j + 2] << 8) + (data[j + 3]);
for ( ; i < 80; ++i) {
m[i] = (m[i - 3] ^ m[i - 8] ^ m[i - 14] ^ m[i - 16]);
m[i] = (m[i] << 1) | (m[i] >> 31);
}
a = ctx->state[0];
b = ctx->state[1];
c = ctx->state[2];
d = ctx->state[3];
e = ctx->state[4];
for (i = 0; i < 20; ++i)
{
t = ROTLEFT(a, 5) + ((b & c) ^ (~b & d)) + e + ctx->k[0] + m[i];
e = d;
d = c;
c = ROTLEFT(b, 30);
b = a;
a = t;
}
for ( ; i < 40; ++i)
{
t = ROTLEFT(a, 5) + (b ^ c ^ d) + e + ctx->k[1] + m[i];
e = d;
d = c;
c = ROTLEFT(b, 30);
b = a;
a = t;
}
for ( ; i < 60; ++i)
{
t = ROTLEFT(a, 5) + ((b & c) ^ (b & d) ^ (c & d)) + e + ctx->k[2] + m[i];
e = d;
d = c;
c = ROTLEFT(b, 30);
b = a;
a = t;
}
for ( ; i < 80; ++i)
{
t = ROTLEFT(a, 5) + (b ^ c ^ d) + e + ctx->k[3] + m[i];
e = d;
d = c;
c = ROTLEFT(b, 30);
b = a;
a = t;
}
ctx->state[0] += a;
ctx->state[1] += b;
ctx->state[2] += c;
ctx->state[3] += d;
ctx->state[4] += e;
}
__device__ void sha1_init(SHA1_CTX *ctx)
{
ctx->datalen = 0;
ctx->bitlen = 0;
ctx->state[0] = 0x67452301;
ctx->state[1] = 0xEFCDAB89;
ctx->state[2] = 0x98BADCFE;
ctx->state[3] = 0x10325476;
ctx->state[4] = 0xc3d2e1f0;
ctx->k[0] = 0x5a827999;
ctx->k[1] = 0x6ed9eba1;
ctx->k[2] = 0x8f1bbcdc;
ctx->k[3] = 0xca62c1d6;
}
__device__ void sha1_update(SHA1_CTX *ctx, const BYTE data[], size_t len)
{
size_t i;
for (i = 0; i < len; ++i)
{
ctx->data[ctx->datalen] = data[i];
ctx->datalen++;
if (ctx->datalen == 64)
{
sha1_transform(ctx, ctx->data);
ctx->bitlen += 512;
ctx->datalen = 0;
}
}
}
__device__ void sha1_final(SHA1_CTX *ctx, BYTE hash[])
{
WORD i;
i = ctx->datalen;
// Pad whatever data is left in the buffer.
if (ctx->datalen < 56)
{
ctx->data[i++] = 0x80;
while (i < 56)
ctx->data[i++] = 0x00;
}
else
{
ctx->data[i++] = 0x80;
while (i < 64)
ctx->data[i++] = 0x00;
sha1_transform(ctx, ctx->data);
memset(ctx->data, 0, 56);
}
// Append to the padding the total message's length in bits and transform.
ctx->bitlen += ctx->datalen * 8;
ctx->data[63] = ctx->bitlen;
ctx->data[62] = ctx->bitlen >> 8;
ctx->data[61] = ctx->bitlen >> 16;
ctx->data[60] = ctx->bitlen >> 24;
ctx->data[59] = ctx->bitlen >> 32;
ctx->data[58] = ctx->bitlen >> 40;
ctx->data[57] = ctx->bitlen >> 48;
ctx->data[56] = ctx->bitlen >> 56;
sha1_transform(ctx, ctx->data);
// Since this implementation uses little endian byte ordering and MD uses big endian,
// reverse all the bytes when copying the final state to the output hash.
for (i = 0; i < 4; ++i)
{
hash[i] = (ctx->state[0] >> (24 - i * 8)) & 0x000000ff;
hash[i + 4] = (ctx->state[1] >> (24 - i * 8)) & 0x000000ff;
hash[i + 8] = (ctx->state[2] >> (24 - i * 8)) & 0x000000ff;
hash[i + 12] = (ctx->state[3] >> (24 - i * 8)) & 0x000000ff;
hash[i + 16] = (ctx->state[4] >> (24 - i * 8)) & 0x000000ff;
}
}
__device__ bool check_trail(BYTE *ptr)
{
return (ptr[19-0] == 0xff) && (ptr[19-1] == 0xff) && (ptr[19-2] == 0xff);
}
__global__ void run()
{
int nnum = blockIdx.x * 1024 + threadIdx.x;
int num;
SHA1_CTX ctx;
BYTE text1[] = {
__PREFIX__
};
BYTE buf[SHA1_BLOCK_SIZE];
BYTE alphabet[] = {
__ALPHABET__
};
int k, r;
do
{
r = 0;
num = nnum;
for (k = 0; k < __LENGTH__; k++)
{
r = num & __MASK__;
text1[__PREFIXLEN__ + k] = alphabet[r];
num = (num >> __LOG2ALPHABET__);
}
sha1_init(&ctx);
sha1_update(&ctx, text1, __PREFIXLEN__+__LENGTH__);
sha1_final(&ctx, buf);
nnum += 256 * 1024;
} while ((__INVMASKFUNC__) && nnum < __MAX_ITERATIONS__);
if (__MASKFUNC__)
{
printf("%s\n", text1);/*
printf("%.2x %.2x %.2x %.2x %.2x %.2x %.2x %.2x %.2x %.2x %.2x %.2x %.2x %.2x %.2x %.2x %.2x %.2x %.2x %.2x\n",
buf[0], buf[1], buf[2], buf[3],
buf[4], buf[5], buf[6], buf[7],
buf[8], buf[9], buf[10], buf[11],
buf[12], buf[13], buf[14], buf[15],
buf[16], buf[17], buf[18], buf[19]
);*/
}
}
thepython execution file:
#!/usr/bin/env python
import os, sys
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import math
class SHAgpu:
cuda_inited = False
threadMax = 1024
blockMax = 1024
cuda_buf_size = threadMax
def __init__(self,
alphabet='abcdefghijklmnopqrstuvwxyzABCDEF', length=6, set_bits=26, set_bits_val=1,
threadMax = 1024, blockMax = 1024):
self.threadMax = threadMax
self.blockMax = blockMax
self.alphabet = alphabet
self.alphalen = len(self.alphabet)
self.length = length
self.loglen = int(math.log(len(self.alphabet)) / math.log(2))
self.set_bits = set_bits
self.set_bits_val = set_bits_val
def hex_encode(self, str):
return ''.join(hex(ord(c)) + ',' for c in str)[:-1]
def generate_mask_function(self):
full = self.set_bits / 8
rem = self.set_bits % 8
L = ['(buf[' + str(19-i) + '] == 0xff)' for i in range(0, full)]
R = ['(buf[' + str(19-full) + '] == ' + hex(2**rem - 1) + ')']
return '&&'.join(L + R)
def inv_generate_mask_function(self):
full = self.set_bits / 8
rem = self.set_bits % 8
L = ['(buf[' + str(19-i) + '] != 0xff)' for i in range(0, full)]
R = ['(buf[' + str(19-full) + '] != ' + hex(2**rem - 1) + ')']
return '||'.join(L + R)
def init_cuda(self, prefix):
if self.cuda_inited:
return
f = open('sha1.c')
cuda_kernel = f.read()
cuda_kernel = cuda_kernel.replace('__MAX_ITERATIONS__',
str(len(self.alphabet) ** self.length))
cuda_kernel = cuda_kernel.replace('__ALPHABET__',
self.hex_encode(self.alphabet))
cuda_kernel = cuda_kernel.replace('__LENGTH__', str(self.length))
cuda_kernel = cuda_kernel.replace('__LOG2ALPHABET__', str(self.loglen))
cuda_kernel = cuda_kernel.replace('__MASK__', str(self.alphalen - 1))
cuda_kernel = cuda_kernel.replace('__PREFIX__',
self.hex_encode(prefix + '\x00' * (self.length + 1)))
cuda_kernel = cuda_kernel.replace('__PREFIXLEN__', str(len(prefix)))
cuda_kernel = cuda_kernel.replace('__MASKFUNC__', self.generate_mask_function())
cuda_kernel = cuda_kernel.replace('__INVMASKFUNC__', self.inv_generate_mask_function())
f.close()
self.mod = SourceModule(cuda_kernel)
self.cuda_buf = cuda.mem_alloc(self.cuda_buf_size)
self.cuda_inited = True
def cuda_run(self, prefix, silent):
if not silent:
print 'Running grocollv1 on 2^%s candidates...' % str(self.loglen * self.length)
print '-' * 30
self.init_cuda(prefix)
run = self.mod.get_function("run");
run(self.cuda_buf, block = (1024, 1, 1), grid = (256, 1))
if __name__ == "__main__":
silent = False
if len(sys.argv) == 1:
prefix = 'grocid'
if len(sys.argv) == 2:
prefix = sys.argv[1]
if len(sys.argv) == 3:
prefix = sys.argv[1]
if sys.argv[2] == '-s':
silent = True
hasher = SHAgpu()
hasher.cuda_run(prefix, silent)
I am getting following errors:
Running grocollv1 on 2^30 candidates...
------------------------------
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: misaligned address
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuModuleUnload failed: misaligned address
I really got no idea, what’s wrong, any help would be appreciated.