When I use blockIdx.x and threadIdx.x to index a specific thread, I find that the block index never reaches the number of blocks (grid size = 128, block size = 256).
But it works when I set the grid size to 64.
My kernel function:
// Brute-force key search. Each thread tests candidate keys starting at
// `begin + <global thread index>` and advancing by the total number of
// launched threads, until the key reaches `limit`, a match is found, or
// *d_done is observed to be set.
//
// Launch layout: only the .x components of blockIdx/blockDim/gridDim are used,
// so a 1D grid of 1D blocks is expected.
//
// Parameters:
//   message    - value passed to des_encrypt_56 together with each candidate key.
//   cipher     - result that des_encrypt_56(key, message) must equal for a match.
//   begin      - first candidate key of the search range.
//   limit      - exclusive upper bound compared directly against the candidate
//                KEY VALUE (it is not a per-thread iteration count). A thread
//                whose first key is already >= limit does no work at all.
//   d_done     - flag set to true by the thread that finds a match; polled by
//                every thread at the top of each iteration.
//   d_key      - receives the matching key.
//   d_counters - written once per warp by the lane with threadIdx.x % warpSize
//                == 0, at index <global thread index> / warpSize, with that
//                lane's iteration count multiplied by warpSize (i.e. it
//                assumes the other lanes did the same number of iterations).
__global__ void des_gpu_crack_kernel(uint64_t message, uint64_t cipher,
uint64_t begin, uint64_t limit, bool* d_done, uint64_t* d_key, int* d_counters)
{
    // Widen before multiplying: blockIdx.x * blockDim.x is otherwise a 32-bit
    // product that silently wraps for large launches.
    const uint64_t thread_id = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
    const uint64_t stride = (uint64_t)gridDim.x * blockDim.x;

    // Access the flag through a volatile pointer so every iteration really
    // re-reads memory; a plain load may be hoisted out of the loop and then
    // never observes another thread's write.
    volatile bool* done = d_done;

    uint64_t key = begin + thread_id;
    uint32_t count = 0;
    while (key < limit && !(*done))
    {
        uint64_t encrypted = des_encrypt_56(key, message);
        count++;
        // Debugging hint: a printf placed here only runs for threads that
        // entered the loop, so blocks whose starting keys are all >= limit
        // will never show up in its output.
        //printf("blockid x: %d, gridDimd x: %d, threadid: %d\n", blockIdx.x, gridDim.x, threadIdx.x);
        if (encrypted == cipher)
        {
            *d_key = key;
            // Make the key visible before the flag, so anyone who sees
            // *d_done == true also sees the key that was found.
            __threadfence();
            *done = true;
            break;
        }
        key += stride;
    }
    if (threadIdx.x % warpSize == 0)
    {
        size_t index = thread_id / warpSize;
        d_counters[index] = count * warpSize;
    }
}
My main function:
// Run kernel.
// NOTE(review): the launch configuration is hard-coded as <<<128, 256>>> while
// the limit argument below is derived from num_of_blocks * block_size — confirm
// that these two describe the same number of threads.
// NOTE(review): the kernel compares its `limit` parameter directly against the
// candidate key (`key < limit`, with key starting at begin + global thread
// index), but the value passed here is limit divided by the thread count. Any
// thread whose starting key is already >= that value never enters the search
// loop, so a printf inside the loop would not show the higher block indices.
// Confirm whether the undivided upper key bound was intended here.
printf("running kernel... \n");
des_gpu_crack_kernel <<<128, 256>>> (message, cipher, begin, limit / (num_of_blocks * block_size) + 1, d_done, d_key, d_counters);
// A kernel launch does not report errors by itself and normally returns before
// the kernel has finished; no cudaGetLastError() or synchronization is visible
// in this snippet, so the message below does not mean the kernel completed.
printf("back to cpu... \n");
When I print blockIdx.x (in my kernel), it only reaches about 70 or 80.
Nvidia-smi output:
Tue Sep 20 12:03:33 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 512.15 Driver Version: 512.15 CUDA Version: 11.6 |
|-------------------------------+----------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... WDDM | 00000000:07:00.0 On | N/A |
| 0% 54C P2 99W / 200W | 1220MiB / 8192MiB | 100% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+