I created a simulation program in VS2022 with CUDA that runs solely in one __device__ function and generates 200 billion 8-character passwords (lowercase, uppercase, digits) in about 500 ms (without printing the last 3 passwords, rows 73-79). I then added PBKDF2 and HMAC-SHA1 functions to the loop to search for a password from a known PMKID.
For my NVIDIA GeForce GTX 960M I use:
# define blocks 4
# define threads 992
# define characters 8
When I run the program I get an error:
Kernel launch failed: too many resources requested for launch
Is there any way to run it for free in the cloud on a GPU with 8-12 GB of memory?
Program for generating 200 billion 8-character passwords (62^8):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <chrono>
using namespace std;
using namespace std::chrono;
// NVIDIA GeForce GTX 960M (compute capability 5.0): 5 multiprocessors * 128 CUDA cores/MP = 640 CUDA cores; a block may hold at most 1024 threads
//# define blocks 4
//# define threads 961
//# define characters 4
//# define blocks 4
//# define threads 961
//# define characters 5
//# define blocks 4
//# define threads 961
//# define characters 6
//# define blocks 4
//# define threads 992
//# define characters 7
// Launch configuration and password length currently in use.
#define blocks 4        // thread blocks in the grid
#define threads 992     // threads per block
#define characters 8    // length of each generated password

// Most recent CUDA runtime status, shared by the host-side checks.
cudaError_t cudaStatus;

// Candidate alphabet in constant memory: 26 lowercase letters, 26 uppercase
// letters and 10 digits (62 symbols in total), in that order.
__constant__ uint8_t charset[62] = {
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
};
/*
 * blackcat: enumerates every `characters`-long password over `charset`.
 *
 * Launch layout: 1-D grid of 1-D blocks, at most `threads` threads per block
 * (enforced by __launch_bounds__). No shared memory is used.
 *
 * The 62^characters candidates are numbered 0 .. 62^characters - 1 and split
 * into contiguous ranges, one per thread of the WHOLE grid, so different blocks
 * search different ranges and the remainder of the division is covered too.
 * All arithmetic is done in 64-bit integers; 62^characters must fit in a
 * uint64_t, which holds for characters <= 10.
 */
__global__ void __launch_bounds__(threads) blackcat(void) {
    // __launch_bounds__(threads) tells the compiler the largest block size this
    // kernel is launched with, so it keeps per-thread register usage low enough
    // for such a block to be launchable.
    const uint64_t radix = 62;

    // Size of the search space, computed exactly (no floating-point pow()).
    uint64_t total = 1;
    for (int i = 0; i < characters; i++) {
        total *= radix;
    }

    // Flat index of this thread within the grid and number of threads in the grid.
    const uint64_t workers = (uint64_t)gridDim.x * blockDim.x;
    const uint64_t tid = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;

    // Every thread gets `share` candidates; the first `extra` threads get one more,
    // so that all `total` candidates are visited exactly once.
    const uint64_t share = total / workers;
    const uint64_t extra = total % workers;
    uint64_t first = tid * share + (tid < extra ? tid : extra); // index of first candidate
    uint64_t n = share + (tid < extra ? 1 : 0);                 // candidates left for this thread

    uint8_t password[characters];
    uint8_t counters[characters];

    // Decompose the starting index into base-62 digits; the last position is
    // the least significant digit.
    for (int i = characters - 1; i >= 0; i--) {
        counters[i] = (uint8_t)(first % radix);
        first /= radix;
    }

    while (n > 0) {
        // Materialise the current candidate.
        for (int i = 0; i < characters; i++) {
            password[i] = charset[counters[i]];
        }

        // Advance the counters to the next candidate: add one to the last
        // digit and propagate the carry towards the front.
        for (int i = characters - 1; i >= 0; i--) {
            if (++counters[i] < radix) {
                break;
            }
            counters[i] = 0;
        }

        /* Test whether we found the password here (e.g. derive the PMKID from
           `password` and compare); if yes, print the password, signal the other
           threads to stop and return early. It might be a good idea to print
           the run time occasionally so we know the program is still running.
           NOTE(review): until something reads `password`, an optimizing
           compiler may remove this loop entirely, so timings of the empty
           loop are not meaningful. */
        n--;
    }
}
/*
 * Host entry point: selects device 0, launches blackcat with `blocks` blocks of
 * `threads` threads, waits for it to finish and prints the elapsed wall-clock
 * time in milliseconds.
 *
 * Returns 0 on success and 1 as soon as any CUDA call or the kernel fails.
 * The measured time starts before cudaSetDevice, so it includes everything
 * from device selection to the end of the kernel.
 */
int main() {
    auto start = high_resolution_clock::now();

    // Use the status returned by the call itself instead of a later cudaGetLastError().
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed? (%s)\n",
                cudaGetErrorString(cudaStatus));
        return 1;
    }

    /*blackcat<<<1, threads>>>();*/
    blackcat<<<blocks, threads>>>();

    // A kernel launch returns nothing; configuration errors such as
    // "too many resources requested for launch" are reported here.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return 1;
    }

    // Errors raised while the kernel is running are reported by the synchronizing call.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d (%s) after launching blackcat!\n",
                (int)cudaStatus, cudaGetErrorString(cudaStatus));
        return 1;
    }

    auto stop = high_resolution_clock::now();
    // Report in the unit the label announces (milliseconds), in decimal.
    auto duration = duration_cast<milliseconds>(stop - start);
    printf("\nTime ms = %lld\n", (long long)duration.count());
    return 0;
}