Salsa 20 Ransomware Brute Force program

Hi,

I’m trying to optimize my cuda kernel but currently I’m out of ideas.

My C/C++ program which I’ve parallelized on the CPU with 10 threads gave me an execution time of 13 years which was so ridiculous that I tried to get in to CUDA to solve the problem in a reasonable amount of time.

First of all a couple of questions concerning CUDA:

1st. Is it possible to run functions from the GPU? The compiler was always complaining so there was no other opportunity left for me then expanding the code into one big function and work with this. I thought that inlining would be an option but I didn’t had success with this either.

2nd. The program is currently split up into two chunks. First chunk is running until either a number of passwords have been tried or the password is found. I needed this because otherwise Windows or OSX seem to raise the watchdog for the Gfx Card driver and terminate my program. Is there a way around this? On my Macbook I thought I could just use the dedicated Gfx card and use the 750M for the Cuda calculations but I couldn’t do this. Either I switched to the Intel Graphics, then no Nvidia Card was found by Cuda or if I switched to the GPU and then I had the watchdog issue.
On my Windows machine I had no solution to switch off the watchdog.

3rd: As you can see from the Code below each thread works independently. Can you see further optimization that I can do to make this quicker. The ETA to try all these keys is 96 years currently. Why is this so bloody slow compared to the same CPU implementation which just uses 10 threads?

the whole project is on github currently, so if you want to have a further look to get the full picture:
https://github.com/procrash/petya-green-multicore

You need boost cmake and cuda to compile it.

The main GPU Core program looks like this:

global void gpu_crypt_and_validate(uint8_t *keys,

                        uint8_t nonce[8],
                        uint32_t si,
                        uint8_t *buf,
                        uint32_t buflen,
                        bool *isValid,
						int nrTotal,
						unsigned long nrKeysToCalculatePerThread,
						char *keyChars,
						int *keyToIndexMap
						)

{

int threadNr = blockDim.x * blockIdx.x + threadIdx.x;

if (threadNr>=nrTotal) return;

uint8_t key = keys + (threadNr(KEY_SIZE));

while (nrKeysToCalculatePerThread>0) {
(isValid)[threadNr] = false;

  uint8_t keystream[64];
  uint8_t n[16] = { 0 };
  uint32_t i;

  for (i = 0; i < 8; ++i)
	n[i] = nonce[i];

  if (si % 64 != 0) {
	// s20_rev_littleendian(n+8, si / 64);
	(n+8)[0] = (si / 64);
	(n+8)[1] = (si / 64)>>8;
	(n+8)[2] = (si / 64)>>16;
	(n+8)[3] = (si / 64)>>24;

	// --------------------------------
	// s20_expand16(key, n, keystream);
	// --------------------------------

	  int i, j;
  uint8_t t[4][4] = {
	{ 'e', 'x', 'p', 'a' },
	{ 'n', 'd', ' ', '1' },
	{ '6', '-', 'b', 'y' },
	{ 't', 'e', ' ', 'k' }
  };

  for (i = 0; i < 64; i += 20)
	for (j = 0; j < 4; ++j)
	  keystream[i + j] = t[i / 20][j];
  
  
  

  for (i = 0; i < 16; ++i) {
	keystream[4+i]  = key[i];
	keystream[44+i] = key[i];
	keystream[24+i] = n[i];
  }

  // ____________________
  // s20_hash(keystream);
  // --------------------

//    int i;
  uint32_t x[16];
  uint32_t z[16];

  for (i = 0; i < 16; ++i) {

	// s20_littleendian
	uint8_t* result = keystream + (4 * i);
	x[i] = z[i] = (int16_t)(result[0]+(result[1]<<8)); //  s20_littleendian(seq + (4 * i));
  }

  for (i = 0; i < 10; ++i) {
//    s20_doubleround(z);

  // ColumnRound
  // s20_quarterround(&x[0], &x[4], &x[8], &x[12]);

  z[4] =  z[4]  ^ ROTL(z[0]  + z[12], 7);
  z[8] =  z[8]  ^ ROTL(z[4]  + z[0], 9);
  z[12] = z[12] ^ ROTL(z[8]  + z[4], 13);
  z[0] =  z[0]  ^ ROTL(z[12] + z[8], 18);

  // s20_quarterround(&x[5], &x[9], &x[13], &x[1]);
  z[9] =  z[9]  ^ ROTL(z[5]  + z[1], 7);
  z[13] = z[13] ^ ROTL(z[9]  + z[5], 9);
  z[1] =  z[1]  ^ ROTL(z[13] + z[9], 13);
  z[5] =  z[5]  ^ ROTL(z[1]  + z[13], 18);

  // s20_quarterround(&x[10], &x[14], &x[2], &x[6]);
  z[14]=  z[14] ^ ROTL(z[10] + z[6], 7);
  z[2] =  z[2]  ^ ROTL(z[14] + z[10], 9);
  z[6] =  z[6]  ^ ROTL(z[2]  + z[14], 13);
  z[10] = z[10] ^ ROTL(z[6]  + z[2], 18);

  // s20_quarterround(&x[15], &x[3], &x[7], &x[11]);
  z[3] =  z[3]  ^ ROTL(z[15] + z[11], 7);
  z[7] =  z[7]  ^ ROTL(z[3]  + z[15], 9);
  z[11] = z[11] ^ ROTL(z[7]  + z[3], 13);
  z[15] = z[15] ^ ROTL(z[11] + z[7], 18);

  // Rowround
  // s20_quarterround(&y[0], &y[1], &y[2], &y[3]);
  z[1] = z[1] ^ ROTL(z[0]+  z[3], 7);
  z[2] = z[2] ^ ROTL(z[1] + z[0], 9);
  z[3] = z[3] ^ ROTL(z[2] + z[1], 13);
  z[0] = z[0] ^ ROTL(z[3] + z[2], 18);

  // s20_quarterround(&y[5], &y[6], &y[7], &y[4]);
  z[6] = z[6] ^ ROTL(z[5] + z[4], 7);
  z[7] = z[7] ^ ROTL(z[6] + z[5], 9);
  z[4] = z[4] ^ ROTL(z[7] + z[6], 13);
  z[5] = z[5] ^ ROTL(z[4] + z[7], 18);

  // s20_quarterround(&y[10], &y[11], &y[8], &y[9]);
  z[11] = z[11] ^ ROTL(z[10] + z[9], 7);
  z[8] =  z[8]  ^ ROTL(z[11] + z[10], 9);
  z[9] =  z[9]  ^ ROTL(z[8] +  z[11], 13);
  z[10] = z[10] ^ ROTL(z[9] +  z[8], 18);
  
  // s20_quarterround(&y[15], &y[12], &y[12], &y[14]);
  z[12] = z[12] ^ ROTL(z[15] + z[14], 7);
  z[12] = z[12] ^ ROTL(z[12] + z[15], 9);
  z[14] = z[14] ^ ROTL(z[12] + z[12], 13);
  z[15] = z[15] ^ ROTL(z[14] + z[12], 18);
  }

  for (i = 0; i < 16; ++i) {
	z[i] += x[i];
	// s20_rev_littleendian(seq + (4 * i), z[i]);
	  (keystream + (4 * i))[0] = z[i];
	  (keystream + (4 * i))[1] = z[i] >> 8;
	  (keystream + (4 * i))[2] = z[i] >> 16;
	  (keystream + (4 * i))[3] = z[i] >> 24;
  }
  


  }

  for (int bufPos = 0; bufPos < buflen; ++bufPos) {
	if ((si + bufPos) % 64 == 0) {
	  //s20_rev_littleendian(n+8, ((si + i) / 64));
	  (n+8)[0] = ((si + bufPos) / 64);
	  (n+8)[1] = ((si + bufPos) / 64)>>8;
	  (n+8)[2] = ((si + bufPos) / 64)>>16;
	  (n+8)[3] = ((si + bufPos) / 64)>>24;

	  // s20_expand16(key, n, keystream);

	  int i, j;
	  uint8_t t[4][4] = {
		{ 'e', 'x', 'p', 'a' },
		{ 'n', 'd', ' ', '1' },
		{ '6', '-', 'b', 'y' },
		{ 't', 'e', ' ', 'k' }
	  };

	  for (i = 0; i < 64; i += 20)
		for (j = 0; j < 4; ++j)
		  keystream[i + j] = t[i / 20][j];

	  for (i = 0; i < 16; ++i) {
		keystream[4+i]  = key[i];
		keystream[44+i] = key[i];
		keystream[24+i] = n[i];
	  }

  // ____________________
  // s20_hash(keystream);
  // --------------------

	//    int i;
	  uint32_t x[16];
	  uint32_t z[16];

	  for (i = 0; i < 16; ++i) {

		// s20_littleendian
		uint8_t* result = keystream + (4 * i);
		x[i] = z[i] = (int16_t)(result[0]+(result[1]<<8)); //  s20_littleendian(seq + (4 * i));
	  }

	  for (i = 0; i < 10; ++i) {
			//    s20_doubleround(z);

			  // ColumnRound
			  // s20_quarterround(&x[0], &x[4], &x[8], &x[12]);

			  z[4] =  z[4]  ^ ROTL(z[0]  + z[12], 7);
			  z[8] =  z[8]  ^ ROTL(z[4]  + z[0], 9);
			  z[12] = z[12] ^ ROTL(z[8]  + z[4], 13);
			  z[0] =  z[0]  ^ ROTL(z[12] + z[8], 18);

			  // s20_quarterround(&x[5], &x[9], &x[13], &x[1]);
			  z[9] =  z[9]  ^ ROTL(z[5]  + z[1], 7);
			  z[13] = z[13] ^ ROTL(z[9]  + z[5], 9);
			  z[1] =  z[1]  ^ ROTL(z[13] + z[9], 13);
			  z[5] =  z[5]  ^ ROTL(z[1]  + z[13], 18);

			  // s20_quarterround(&x[10], &x[14], &x[2], &x[6]);
			  z[14]=  z[14] ^ ROTL(z[10] + z[6], 7);
			  z[2] =  z[2]  ^ ROTL(z[14] + z[10], 9);
			  z[6] =  z[6]  ^ ROTL(z[2]  + z[14], 13);
			  z[10] = z[10] ^ ROTL(z[6]  + z[2], 18);

			  // s20_quarterround(&x[15], &x[3], &x[7], &x[11]);
			  z[3] =  z[3]  ^ ROTL(z[15] + z[11], 7);
			  z[7] =  z[7]  ^ ROTL(z[3]  + z[15], 9);
			  z[11] = z[11] ^ ROTL(z[7]  + z[3], 13);
			  z[15] = z[15] ^ ROTL(z[11] + z[7], 18);

			  // Rowround
			  // s20_quarterround(&y[0], &y[1], &y[2], &y[3]);
			  z[1] = z[1] ^ ROTL(z[0]+  z[3], 7);
			  z[2] = z[2] ^ ROTL(z[1] + z[0], 9);
			  z[3] = z[3] ^ ROTL(z[2] + z[1], 13);
			  z[0] = z[0] ^ ROTL(z[3] + z[2], 18);

			  // s20_quarterround(&y[5], &y[6], &y[7], &y[4]);
			  z[6] = z[6] ^ ROTL(z[5] + z[4], 7);
			  z[7] = z[7] ^ ROTL(z[6] + z[5], 9);
			  z[4] = z[4] ^ ROTL(z[7] + z[6], 13);
			  z[5] = z[5] ^ ROTL(z[4] + z[7], 18);

			  // s20_quarterround(&y[10], &y[11], &y[8], &y[9]);
			  z[11] = z[11] ^ ROTL(z[10] + z[9], 7);
			  z[8] =  z[8]  ^ ROTL(z[11] + z[10], 9);
			  z[9] =  z[9]  ^ ROTL(z[8] +  z[11], 13);
			  z[10] = z[10] ^ ROTL(z[9] +  z[8], 18);

			  // s20_quarterround(&y[15], &y[12], &y[12], &y[14]);
			  z[12] = z[12] ^ ROTL(z[15] + z[14], 7);
			  z[12] = z[12] ^ ROTL(z[12] + z[15], 9);
			  z[14] = z[14] ^ ROTL(z[12] + z[12], 13);
			  z[15] = z[15] ^ ROTL(z[14] + z[12], 18);
		  }

		  for (i = 0; i < 16; ++i) {
			z[i] += x[i];
			// s20_rev_littleendian(seq + (4 * i), z[i]);
			  (keystream + (4 * i))[0] = z[i];
			  (keystream + (4 * i))[1] = z[i] >> 8;
			  (keystream + (4 * i))[2] = z[i] >> 16;
			  (keystream + (4 * i))[3] = z[i] >> 24;
		  }

		}
		buf[bufPos] ^= keystream[(si + bufPos) % 64];
	  }

  	  (isValid)[threadNr] = true; // Assume we found the key

  	  
	  // Validate Crypto Result
	  for (size_t bufPos = 0; bufPos < VERIBUF_SIZE; bufPos++) {
		 if (buf[bufPos] != VERIFICATION_CHAR) {
			(isValid)[threadNr] = false; // We didn't
			
			// Calculate next key to try...
			int posToKey[] = {13,12,9,8,5,4,1,0};

			for (int i=0; i<8; i++) {
				int idx = keyToIndexMap[key[posToKey[i]]];
				idx++;
				idx %=sizeof(keyChars);
				key[posToKey[i]] = keyChars[idx];

				if (idx!=0) break;
			}				
			break;
		}
		
	  }

	  nrKeysToCalculatePerThread--;
  }

}

just a quick hint - if any program will become automagically fatser just by recompiling for GPU, you will be seen GPUs used everywhere - from solitaire to jvm

unfortunetely, gpu optimization require learning gpu architecture and use it in optimal way. but since you are looking for pretty popular crypto-algo, check google for existing implementations, in particluar in cryptominers

just for example: http://binary.cr.yp.to/gpuasm-20120313.pdf