Under a certain condition, I need to set a global variable that every thread checks from time to time; each thread exits when it sees the global variable set, and when all threads have exited, the kernel will “return”.
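A minimal sketch of the flag-based exit as I picture it (the names, the polling interval, and the search condition here are hypothetical; I'm assuming the flag is raised by whichever thread satisfies the condition, while the other threads poll it occasionally):

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>

// Hypothetical global flag; volatile so every poll re-reads it from memory.
__device__ volatile int found = 0;

__global__ void search(uint64_t target) {
    const uint64_t stride = (uint64_t)gridDim.x * blockDim.x;
    uint64_t polls = 0;
    for (uint64_t i = (uint64_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < 218340105584896ull; i += stride) {
        if (i == target) {                   // stand-in for the real search condition
            found = 1;                       // raise the flag for every thread
            __threadfence();                 // push the write out so other blocks see it sooner
            printf("found at %llu\n", (unsigned long long)i);
        }
        if ((++polls & 0xFFFFull) == 0 && found) {
            return;                          // this thread exits; the kernel returns once all threads have
        }
    }
}

int main() {
    search<<<2, 1024>>>(123456789ull);       // same launch shape as below, hypothetical target value
    cudaDeviceSynchronize();
    return 0;
}

The current code, without the flag: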
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <time.h>
// NVIDIA GeForce GTX 960M (compute/sm 50, 5x SM)
# define blocks 2
# define threads 1024
cudaError_t cudaStatus;
__global__ void blackcat(void) {
    //uint64_t n = 218340105584896ull / threads; // Number of search cycles per thread: 1,073,741,824 for 1,024 threads
    uint64_t n = 1000000000ull;
    uint64_t a = 0;
    while (n-- > 0) {
        a++;
        if (threadIdx.x == 0 && blockIdx.x == 0 && a == 968000000ull) {
            printf("%llu\n", a); // a is unsigned 64-bit, so %llu rather than %lld
        }
    }
}
int main() {
    cudaEvent_t start, stop; // CUDA event timing
    float time;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaSetDevice(0);
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
    }

    cudaEventRecord(start, 0);
    blackcat<<<blocks, threads>>>();
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    printf("CUDA time is %f ms\n", time); // cudaEventElapsedTime reports milliseconds; pass the value, not its address

    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    }
    cudaDeviceSynchronize();
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching blackcat!\n", cudaStatus);
    }
    return 0;
}
Also, your kernel launch error checking is no longer trustworthy; I suggest following the recommendations carefully. You can’t just put that “Kernel launch failed” error check anywhere you like and expect it to give you useful, reliable information. However, that is not the cause of the issue.
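For reference, the usual recommendation is to read cudaGetLastError() immediately after the launch statement (that reports launch/configuration errors, such as an illegal block size) and then to check the value returned by cudaDeviceSynchronize() (that reports errors raised while the kernel was actually running). A sketch, as a fragment of your main() keeping your kernel and launch parameters:

blackcat<<<blocks, threads>>>();

cudaStatus = cudaGetLastError();              // launch / configuration errors
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
}

cudaStatus = cudaDeviceSynchronize();         // errors that occur while the kernel executes
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "Kernel execution failed: %s\n", cudaGetErrorString(cudaStatus));
}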
Since the GPU is doing work in parallel, that particular calculation scheme provides very little insight for me. I don’t know what value a calculation like that provides unless you use it in some comparative way (and even then, carefully). The GPU is not calculating anything in such a way that any useful, measurable, or noticeable work is done in 5 picoseconds.
The actual clock cycles of GPUs are much longer than that, on the order of 500 to 1,000 picoseconds. Even then, it’s nearly impossible to identify exactly what has transpired on a GPU in any single clock cycle.
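If my guess about where the 5 picoseconds came from is right (kernel time divided by the iteration count summed over all threads), the arithmetic looks like the sketch below. The elapsed time in it is a made-up placeholder, not your measurement; the point is that only the per-thread figure relates to clock periods, because thousands of iterations are in flight at once.

#include <stdio.h>

int main() {
    /* Illustrative placeholders only, not real measurements. */
    double elapsed_s     = 10.0;             /* assumed kernel time */
    double iters_per_thr = 1000000000.0;     /* n in the posted kernel */
    double total_threads = 2.0 * 1024.0;     /* blocks * threads */

    double aggregate  = elapsed_s / (iters_per_thr * total_threads); /* ~5e-12 s: not a latency */
    double per_thread = elapsed_s / iters_per_thr;                   /* ~1e-8 s: many clock cycles */

    printf("aggregate  : %g s per iteration (parallel work folded together, not a latency)\n", aggregate);
    printf("per thread : %g s per iteration (~%g cycles at a 1 ns clock period)\n",
           per_thread, per_thread / 1e-9);
    return 0;
}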