Generate all combinations of a char array inside of a CUDA __device__ kernel

Hi,

I need help please. I started to program a common brute forcer / password guesser with CUDA (2.3 / 3.0beta). I tried different ways to generate all possible plain text “candidates” of a defined ASCII char set.

In this sample code I want to generate all 74^4 possible combinations (and just output the result back to host/stdout)

$ ./combinations

Total number of combinations	: 29986576

Maximum output length   : 4

ASCII charset length	: 74

ASCII charset   : 0x30 - 0x7a

 "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxy"

CUDA code (compiled with 2.3 and 3.0b - sm_10) - combinaions.cu:

#include <stdio.h>

#include <cuda.h>

__device__ uchar4 charset_global = {0x30, 0x30, 0x30, 0x30};

__shared__ __device__ uchar4 charset[128];

__global__ void combo_kernel(uchar4 * result_d, unsigned int N)

{

 int totalThreads = blockDim.x * gridDim.x;

 int tasksPerThread = (N % totalThreads) == 0 ? N / totalThreads : N/totalThreads + 1;

 int myThreadIdx = blockIdx.x * blockDim.x + threadIdx.x;

 int endIdx = myThreadIdx + totalThreads * tasksPerThread;

 if( endIdx > N) endIdx = N;

const unsigned int m = 74 + 0x30;

for(int idx = myThreadIdx; idx < endIdx; idx += totalThreads) {

  charset[threadIdx.x].x = charset_global.x;

  charset[threadIdx.x].y = charset_global.y;

  charset[threadIdx.x].z = charset_global.z;

  charset[threadIdx.x].w = charset_global.w;

  __threadfence();

if(charset[threadIdx.x].x < m) {

   charset[threadIdx.x].x++;

} else if(charset[threadIdx.x].y < m) {

   charset[threadIdx.x].x = 0x30; // = 0

   charset[threadIdx.x].y++;

} else if(charset[threadIdx.x].z < m) {

   charset[threadIdx.x].y = 0x30; // = 0

   charset[threadIdx.x].z++;

} else if(charset[threadIdx.x].w < m) {

   charset[threadIdx.x].z = 0x30;

   charset[threadIdx.x].w++;; // = 0

  }

charset_global.x = charset[threadIdx.x].x;

  charset_global.y = charset[threadIdx.x].y;

  charset_global.z = charset[threadIdx.x].z;

  charset_global.w = charset[threadIdx.x].w;

result_d[idx].x = charset_global.x;

  result_d[idx].y = charset_global.y;

  result_d[idx].z = charset_global.z;

  result_d[idx].w = charset_global.w;

 }

}

#define BLOCKS 65535

#define THREADS 128

int main(int argc, char **argv)

{

 const int ascii_chars = 74;

 const int max_len = 4;

 const unsigned int N = pow((float)ascii_chars, max_len);

 size_t size = N * sizeof(uchar4);

uchar4 *result_d, *result_h;

 result_h = (uchar4 *)malloc(size );

 cudaMalloc((void **)&result_d, size );

 cudaMemset(result_d, 0, size);

printf("Total number of combinations\t: %d\n\n", N); 

 printf("Maximum output length\t: %d\n", max_len);

 printf("ASCII charset length\t: %d\n\n", ascii_chars);

printf("ASCII charset\t: 0x30 - 0x%02x\n ", 0x30 + ascii_chars);

 for(int i=0; i < ascii_chars; i++)

  printf("%c",i + 0x30);

 printf("\n\n");

combo_kernel <<< BLOCKS, THREADS >>> (result_d, N);

 cudaThreadSynchronize();

printf("CUDA kernel done\n");

 printf("hit key to continue...\n");

 getchar();

cudaMemcpy(result_h, result_d, size, cudaMemcpyDeviceToHost);

for (unsigned int i=0; i<N; i++)

  printf("result[%06u]\t%c%c%c%c\n",i, result_h[i].x, result_h[i].y, result_h[i].z, result_h[i].w);

free(result_h);

 cudaFree(result_d);

}

The code should compile without any problems but the output is not what i expected.

On emulation mode:

CUDA kernel done hit key to continue...

	result[000000]  1000 

...

	result[000128]  5000

On release mode:

CUDA kernel done hit key to continue...

	result[000000]  1000 

...

	result[012288]  5000

if possible I want to generate everything inside of the kernel function . I also tried “pre” generating of possible plain text candidates inside host main function and memcpy to device, this works only with a very limited charset size (because of limited device memory).

any idea about the output, why the repeating (even with __threadfence() or __syncthreads()) ?

any other method to generate plain text (candidates) inside CUDA kernel fast :-) (~75^8) - maybe with “offset” constant values ?

thanks a million