Advice on how to optimize code speed

The program generates all possible combinations from the specified character set. For 6 characters it runs on my NVIDIA GeForce GTX 960M graphics card for 21.5 seconds compiled as Release (Visual Studio 2022) generating 2 637 155 942 password/s.

How to speed up the program by changing the number of blocks and threads to get the most out of the GPU?

CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: “NVIDIA GeForce GTX 960M”
CUDA Driver Version / Runtime Version 11.8 / 11.8
CUDA Capability Major/Minor version number: 5.0
Total amount of global memory: 4096 MBytes (4294836224 bytes)
(005) Multiprocessors, (128) CUDA Cores/MP: 640 CUDA Cores
GPU Max Clock rate: 1176 MHz (1.18 GHz)
Memory Clock rate: 2505 Mhz
Memory Bus Width: 128-bit
L2 Cache Size: 2097152 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total shared memory per multiprocessor: 65536 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>
#include <chrono>

using namespace std;
using namespace std::chrono;

// NVIDIA GeForce GTX 960M v NB has 5 Multiprocessors * 32 threads * 128 CUDA Cores/MP = 20 480
//# define threads 3 844 = 4 * 961
# define blocks 4
# define threads 961
# define characters 6

cudaError_t cudaStatus;

// "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
__constant__ char1 charset[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39 };

__global__ void blackcat(void) {

	//printf("Hi from kernel\n");

	char1 password[characters];

	uint8_t counters[characters];
	uint64_t n = (pow(62, characters) / threads);	// Number of search cycles per thread

	for (int i = characters - 1; i >= 0; i--) {
		counters[i] = (n * threadIdx.x / (uint64_t)pow(62, characters - 1 - i) % 62);
	}

	while (n > 0) {

			bool flag = false;
			for (int i = characters - 1; i >= 0; i--) {
				password[i] = charset[counters[i]];
				if (i == characters - 1) {
					counters[i]++;
					if (counters[i] > 61) {
						counters[i] = (uint8_t)0;
						flag = true;
					}
				}
				else {
					if (flag) {
						counters[i]++;
						if (counters[i] > 61) {
							counters[i] = (uint8_t)0;
						}
						else {
							flag = false;
						}
					}
				}
			}
			// 960
			if (threadIdx.x == threads - 1 && blockIdx.x == blocks - 1 && n < 4) {
				printf("Thread[%d]",threadIdx.x);
				for (int i = 0; i < characters; i++) {
					printf(" %c", password[i]);
				}
				printf("\n");
			}

		/* Additional test */ 

		n--;
	}
}

int main() {

	auto start = high_resolution_clock::now();
	cudaSetDevice(0);
	cudaStatus = cudaGetLastError();
	    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    }

	blackcat << <blocks, threads >> > ();
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
	}

	cudaDeviceSynchronize();
	cudaStatus = cudaGetLastError();
	    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
    }

	auto stop = high_resolution_clock::now();
	auto duration = duration_cast<microseconds>(stop - start);
	printf("\nTime  = %llx (HEX)\n", duration.count());

	return 0;
}

Do not use pow for integers, it can - despite suboptimal performance - cause rounding errors. You should try to coalesce memory accesses and try to avoid indirect indexing into local arrays as it can lead the compiler to generate accesses to local memory, which will get slow, if it is not suitably cached. Registers cannot be indexed, so either use shared memory or program in a way that the loops can be unrolled and the indices can be determined at compile time. Use Nvidia Nsight to identify bottlenecks.

each block is doing exactly the same thing (apart from printout). That is probably not what you want.