I have a kernel
__global__ void sha1_kernel_global (unsigned char *data, sha1_gpu_context *ctx, int total_threads, unsigned long *extended)
{
int thread_index = threadIdx.x + blockDim.x * blockIdx.x;
int e_index = thread_index * 80;
int block_index = thread_index * 64;
unsigned long temp, t;
if (thread_index > total_threads -1)
return;
GET_UINT32_BE( extended[e_index ], data + block_index, 0 );
GET_UINT32_BE( extended[e_index + 1], data + block_index, 4 );
GET_UINT32_BE( extended[e_index + 2], data + block_index, 8 );
GET_UINT32_BE( extended[e_index + 3], data + block_index, 12 );
GET_UINT32_BE( extended[e_index + 4], data + block_index, 16 );
GET_UINT32_BE( extended[e_index + 5], data + block_index, 20 );
GET_UINT32_BE( extended[e_index + 6], data + block_index, 24 );
GET_UINT32_BE( extended[e_index + 7], data + block_index, 28 );
GET_UINT32_BE( extended[e_index + 8], data + block_index, 32 );
GET_UINT32_BE( extended[e_index + 9], data + block_index, 36 );
GET_UINT32_BE( extended[e_index +10], data + block_index, 40 );
GET_UINT32_BE( extended[e_index +11], data + block_index, 44 );
GET_UINT32_BE( extended[e_index +12], data + block_index, 48 );
GET_UINT32_BE( extended[e_index +13], data + block_index, 52 );
GET_UINT32_BE( extended[e_index +14], data + block_index, 56 );
GET_UINT32_BE( extended[e_index +15], data + block_index, 60 );
for (t = 16; t < 80; t++) {
temp = extended[e_index + t - 3] ^ extended[e_index + t - 8] ^
extended[e_index + t - 14] ^ extended[e_index + t - 16];
extended[e_index + t] = S(temp,1);
}
__syncthreads();
if (thread_index == total_threads - 1) {
for (t = 0; t < total_threads; t++)
sha1_gpu_process (ctx, (unsigned long*)&extended[t * 80]);
}
}
And it is executed something like this:
if (k - 1 > 0) {
for (i = 0; i < k; i++) {
// printf ("offset: %d\n", total_threads * i * 64);
sha1_kernel_global <<<blocks_per_grid, threads_per_block>>>(d_message + threads_per_block * i * 64, d_ctx, threads_per_block, d_extended);
//CUDA_SAFE_CALL (cudaThreadSynchronize());
}
}
I use only one block per grid. If i process the data with 128 or less threads I get the results as I want, but the computation is really slow. If thread number is greater than 128 the algorithm finishes extremely fast, however the results are wrong. I just don’t understand how this can happen. I guess there is something with syncthreads ? Or maybe I am wrong. Any hints? Thank you.
Tadas