Kernel time measurements return 0 after some time

What am I doing wrong?

968000000
CUDA time is 0.000000 seconds

Under a certain condition, I need to set a global variable that every thread checks from time to time; a thread exits when it sees the global variable set. When all threads have exited, the kernel will “return”.


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>
#include <time.h>

// NVIDIA GeForce GTX 960M (compute/sm 50, 5x SM)
// Launch configuration used by main(): `blocks` is the grid size and `threads`
// the block size passed to the blackcat<<<blocks, threads>>> launch.
# define blocks 2
# define threads 1024

// Receives the result of each cudaGetLastError() call made in main().
cudaError_t cudaStatus;

// Spin-loop kernel used for the timing experiment.
//
// Every thread counts from 1 up to 1,000,000,000. Thread 0 of block 0 prints
// the counter once, when it reaches 968,000,000. The kernel takes no arguments
// and does not read or write global memory.
__global__ void blackcat(void) {

	//uint64_t n = 218340105584896ull / threads;	// alternative per-thread cycle count
	const uint64_t n = 1000000000ull;	// loop iterations per thread

	// Only one thread in the whole grid reports progress.
	const bool reporter = (threadIdx.x == 0 && blockIdx.x == 0);

	for (uint64_t a = 1; a <= n; ++a) {
		if (reporter && a == 968000000ull) {
			// uint64_t is unsigned and need not be `long long`, so "%lld" was a
			// mismatched specifier; print as unsigned long long via "%llu".
			printf("%llu\n", (unsigned long long)a);
		}
	}
}

// Host entry point: launches blackcat once, measures the launch with a pair of
// CUDA events, and prints the measured value. Always returns 0; CUDA failures
// are only reported on stderr.
int main() {

	cudaEvent_t start, stop;	// CUDA time
	float time;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	// The return value of cudaSetDevice is discarded; the status is fetched
	// afterwards through cudaGetLastError().
	cudaSetDevice(0);
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
	}

	// Bracket the kernel launch with the two events, both recorded on stream 0.
	cudaEventRecord(start, 0);
	blackcat << <blocks, threads >> > ();
	cudaEventRecord(stop, 0);
	// Block the host until the stop event has completed.
	cudaEventSynchronize(stop);

	// cudaEventElapsedTime reports the interval in milliseconds.
	cudaEventElapsedTime(&time, start, stop);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	// NOTE(review): BUG - `&time` passes the address of the float where "%f"
	// expects its value; this is the reported cause of the "0.000000" output.
	// The argument should be `time`. The label also says "seconds" although the
	// value is in milliseconds.
	printf("CUDA time is %f seconds\n", &time);

	// NOTE(review): this launch check runs only after several other runtime
	// calls, so it cannot reliably attribute an error to the kernel launch.
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
	}
	// The return value of cudaDeviceSynchronize is discarded as well; the
	// message below names "addKernel" although the kernel launched is blackcat.
	cudaDeviceSynchronize();
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
	}

	return 0;
}

This is incorrect:

printf("CUDA time is %f seconds\n", &time);
                                    ^

Also, your kernel launch error checking is no longer trustworthy. I suggest following the recommendations carefully. You can’t just put that “Kernel launch failed” error check anywhere you want, and expect it to give you useful and reliable information. However that is not the cause of the issue.

Thank you for your advice. I fixed it, but now the result is:

printf("CUDA time is %f seconds\n", time);

968000000
CUDA time is 10712.498047 seconds

while a measurement with my watch shows just 10 seconds. Just got it: the value is reported in milliseconds…

Does it mean that 1 cycle takes 5.231 E-12 s? I.e. 10.712498047 s / (1024 threads × 2 blocks × 1E9 loop iterations) = 5.231 E-12 s/cycle?

Since the GPU is doing work in parallel, that particular calculation scheme provides very little insight for me. I don’t know what value a calculation like that provides unless you use it in some comparative way (and even then, carefully). The GPU is not calculating anything in such a way that any useful, measurable, or noticeable work is done in 5 picoseconds.

The actual clock cycles of GPUs are much longer than that, on the order of 500 to 1000 picoseconds, approximately. Even then, it's nearly impossible to identify what exactly has transpired on a GPU in a given, single clock cycle.

With only one block launched, I think this makes all threads exit on a condition, using a flag variable in shared memory:


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <iostream>
#include <time.h>


// NVIDIA GeForce GTX 960M (compute/sm 50, 5x SM)
// Launch configuration used by main(): `blocks` is the grid size and `threads`
// the block size passed to the blackcat<<<blocks, threads>>> launch. The
// kernel's exit flag lives in __shared__ memory, which is per block, hence the
// single-block configuration; the previous two-block setting is kept below.
//# define blocks 2
//# define threads 1024
# define blocks 1
# define threads 1024

// Receives the result of each cudaGetLastError() call made in main().
cudaError_t cudaStatus;

// Spin-loop kernel with a block-wide early exit.
//
// Every thread counts up to 10,000,000 and polls a flag in shared memory on
// each iteration. When thread 512 reaches 1,000,000 it prints the counter and
// raises the flag; every thread that subsequently observes the flag leaves its
// loop. The flag is per block (__shared__), so the early exit only spans one
// block. If the block has fewer than 513 threads nobody raises the flag and
// all threads run the full count.
__global__ void blackcat(void) {

	const uint64_t n = 10000000ull;	// 10E6 iterations per thread

	// volatile: forces a real shared-memory read on every poll, so the compiler
	// cannot keep a stale copy of the flag in a register.
	__shared__ volatile bool flag;

	// One thread initialises the flag and the barrier publishes it before any
	// thread starts polling; otherwise a late-starting thread could overwrite
	// a flag that had already been raised.
	if (threadIdx.x == 0) {
		flag = false;
	}
	__syncthreads();

	for (uint64_t a = 1; a <= n; ++a) {
		if (flag) {
			break;
		}
		if (threadIdx.x == 512 && a == 1000000ull) {	// 1E6, just 1 block
			// uint64_t is unsigned and need not be `long long`, so "%lld" was a
			// mismatched specifier; print as unsigned long long via "%llu".
			printf("%llu\n", (unsigned long long)a);
			flag = true;
		}
	}
}

// Records `status` in the file-level cudaStatus and reports a failure of the
// runtime call described by `what` on stderr.
// Returns true when `status` is cudaSuccess.
static bool cudaOk(cudaError_t status, const char* what) {
	cudaStatus = status;
	if (status == cudaSuccess) {
		return true;
	}
	fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
	return false;
}

// Host entry point: launches blackcat once, times it with a pair of CUDA
// events, and prints the elapsed time in seconds.
// Returns 0 on success and 1 if any CUDA runtime call or the kernel fails.
int main() {

	// Select the device first so the events are created on it.
	cudaStatus = cudaSetDevice(0);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?\n");
		return 1;
	}

	cudaEvent_t start, stop;	// CUDA time
	if (!cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)")) {
		return 1;
	}
	if (!cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)")) {
		cudaEventDestroy(start);
		return 1;
	}

	float elapsedMs = 0.0f;

	bool ok = cudaOk(cudaEventRecord(start, 0), "cudaEventRecord(start)");
	if (ok) {
		blackcat<<<blocks, threads>>>();
		// Launch-configuration errors must be read immediately after the
		// launch, before any other runtime call can overwrite them.
		ok = cudaOk(cudaGetLastError(), "Kernel launch");
	}
	ok = ok && cudaOk(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
	// Waits for the stop event; errors raised while the kernel was running
	// surface here.
	ok = ok && cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
	ok = ok && cudaOk(cudaEventElapsedTime(&elapsedMs, start, stop), "cudaEventElapsedTime");

	if (ok) {
		// cudaEventElapsedTime reports milliseconds; convert to seconds.
		printf("CUDA time is %f s\n", elapsedMs / 1000);
	}

	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	// Final device-wide sync so any remaining asynchronous error is reported.
	if (!cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize after blackcat")) {
		ok = false;
	}

	return ok ? 0 : 1;
}