Unspecified launch failure error

Hi guys, I'm relatively new to CUDA computing.
I ran my kernel with a certain number of blocks (159) and threads per block (768), and I get the expected results.

But when I increase the thread count by 1 (i.e. 769), the whole thing goes berserk.

And when I increase the block count by 1 (i.e. 160), it throws an "unspecified launch failure" (error code 4).

I've posted my code below.
Please help me out.

//100000 rounds (i.e. 1000*100) = 1700 ms
//max threads = 768 ............ if exceeded, cnt value is reset to 0
//max blocks = 159 ............. if exceeded, unspecified launch failure (error code 4)

//Implemented using atomicAdd on global memory

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <ctime>
// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include "helper_functions.h"
#include "helper_cuda.h"
#include "curand.h"
#include <curand_kernel.h>

#define ul unsigned int
#define ull unsigned long long 

#define BIASMAXLIMIT 0.51
#define BIASMINLIMIT 0.49

// NOTE: seeding a fresh curandState on every call is expensive; typical
// curand usage initializes one persistent state per thread instead
// (a sketch of that pattern follows this listing).
__device__ double cudaRand(int id)
{
    curandState state;
    curand_init((unsigned long long)clock(), id, 0, &state);

    double d_out = curand_uniform_double(&state);
    return d_out;
}

__device__ void initializeR(ul *x){
	
	int l = threadIdx.x + blockIdx.x * blockDim.x;
	#define myrand32 ((ul) (4294967296.0*((double)cudaRand(l))))

	int i;
	for (i = 0; i < 16; i++)
		x[i] = myrand32;
	
	// Salsa20 diagonal constants ("expand 32-byte k")
	x[0] = 0x61707865;
	x[5] = 0x3320646e;
	x[10] = 0x79622d32;
	x[15] = 0x6b206574;
}

__device__ void copystate(ul *x1, ul *x){
	int i;
	for (i = 0; i < 16; i++)
		x1[i] = x[i];
}

__device__ void print(ul *x){
	int i;
	for (i = 0; i < 16; i++){
		printf("%8x ", x[i]);
		if (i > 0 && i%4 == 3)
			printf("\n");
	}
	printf("\n");
}

// Salsa20 quarter-round
__device__ void qr(ul *x0, ul *x1, ul *x2, ul *x3){
	#define rotateleft(x, n) (((x) << (n)) ^ ((x) >> (32-(n))))

	#define update(a, b, c, n) ((a)^(rotateleft(((b)+(c)),(n))))

	ul z0, z1, z2, z3;
	z1 = update(*x1, *x3, *x0, 7);
	z2 = update(*x2, *x0, z1, 9);
	z3 = update(*x3, z1, z2, 13);
	z0 = update(*x0, z2, z3, 18);
	*x0 = z0; *x1 = z1; *x2 = z2; *x3 = z3;
}

__device__ void transpose(ul *x){
  	ul temp;
  	temp=x[1]; x[1]=x[4]; x[4]=temp;
  	temp=x[2]; x[2]=x[8]; x[8]=temp;
  	temp=x[3]; x[3]=x[12]; x[12]=temp;
  	temp=x[6]; x[6]=x[9]; x[9]=temp;
  	temp=x[7]; x[7]=x[13]; x[13]=temp;
  	temp=x[11]; x[11]=x[14]; x[14]=temp;
}

// one Salsa20 column round; the transpose makes the next call act on rows
__device__ void rounds(ul *x){
	qr(&(x[0]), &(x[4]), &(x[8]),&(x[12]));
	qr(&(x[5]), &(x[9]), &(x[13]),&(x[1]));
	qr(&(x[10]), &(x[14]), &(x[2]),&(x[6]));
	qr(&(x[15]), &(x[3]), &(x[7]),&(x[11]));
	transpose(x);
}

// Each thread builds a random Salsa20 state, flips bit 31 of word 7 in a copy,
// runs 4 rounds on both, and counts per output bit how often the XOR
// difference bit is zero.
__global__ void gen (unsigned int *d_cnt)
{
	ul x[16], x1[16], pattern;
	initializeR(x);	
	
	ull pt = 0x80000000;   
    copystate(x1, x);
	x1[7] = x[7] ^ pt;
	
	for (int i = 0; i < 4; i++)
    { rounds(x); rounds(x1); }

	for (int k = 0; k < 16; k++)
    {
	    pattern = 0x80000000;
	    for (int j = 31; j >= 0; j--)
        {
			if (((x[k] ^ x1[k]) & pattern) == 0) 
                {atomicAdd(&d_cnt[k*32+j],1);}
		    
			pattern = pattern >> 1;
		}
	}
}

int main()
{
	int k, kmin = 0, jmin = 0, kmax = 0, jmax = 0;
	double val, max, min;
	FILE *fp;

	
    
    // Allocate host memory for matrix cnt
    unsigned int size_A = 16 * 32;
    unsigned int mem_size_A = sizeof(unsigned int) * size_A;
    unsigned int *h_cnt = (unsigned int *)malloc(mem_size_A);

    for (int i = 0; i < size_A; ++i)
    {
        h_cnt[i] = 0;
    }
    // Allocate device memory
    unsigned int *d_cnt;

    cudaError_t error;

    error = cudaMalloc((void **) &d_cnt, mem_size_A);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_cnt returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // copy host memory to device
    error = cudaMemcpy(d_cnt, h_cnt, mem_size_A, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_A,h_A) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // Setup execution parameters

	dim3 dimGrid(159,1,1);
	dim3 dimBlock(768,1,1); 
 
    // Create and start timer
    printf("Computing result using CUDA Kernel...\n");

    // Execute the kernel
    int x = 0;
    long starttime = clock();
    while(x<64)
    {	
       	 gen<<<dimGrid,dimBlock>>>(d_cnt);
        
        error = cudaMemcpy(h_cnt, d_cnt, mem_size_A, cudaMemcpyDeviceToHost);

	    if (error != cudaSuccess)
	    {
	        printf("cudaMemcpy (h_cnt,d_cnt) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
	        exit(EXIT_FAILURE);
	    }

	    for (int p = 0; p < 16; p++)
	    {
		    for (int q = 0; q < 32; q++)
	        {
				printf(" %u",h_cnt[p*32+q]);
			}
			printf("\n");
		}

        fp = fopen("ask1.dat", "w");
		fprintf(fp, "Itr %d\n",x+1);
		printf("Itr %d\n",x+1);
		max = min = 0.5;
		x++;
		
		for (k = 0; k < 16; k++)
        {                                   		
            fprintf(fp, "For index %d\n ", k);
	        for (int j = 0; j < 32; j++)
            {
				val = (double)h_cnt[k*32+j]/((double)x*dimBlock.x*dimBlock.y*dimGrid.x);

				if (val > max) {max = val; kmax = k;jmax = j;}
				
				if (val < min) {min = val; kmin = k;jmin = j;}
				
				fprintf(fp, "(%2d %lf)", j, val);
				
				if (val >= BIASMAXLIMIT || val <=BIASMINLIMIT) 					
					fprintf(fp, "* ");
				
				else 
                    fprintf(fp, "  ");

				if (j > 0 && j%8 == 7)
					fprintf(fp, "\n");
	        }

	        fprintf(fp, "-------------\n");
	    }
        
        fprintf(fp, "%d %d %.20lf %d %d%.20lf\n", kmin, jmin, min, kmax, jmax, max);
    	fclose(fp);
        printf("%d %d %.20lf %d %d %.20lf\n",kmin, jmin, min, kmax, jmax, max);
        long finishtime=clock();
        printf("%ld\n", finishtime-starttime );
        starttime=clock();
        
    }

    // Clean up memory
    free(h_cnt);
    cudaFree(d_cnt);
	
}
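A side note on the cudaRand helper in the listing above: it re-seeds a brand-new curandState on every call, which is slow and statistically questionable. The usual curand pattern keeps one persistent state per thread; here is a minimal sketch of that pattern (the names setupStates and nextUniform are illustrative, not from the posted code):

    // call once at startup, with one curandState allocated per thread
    __global__ void setupStates(curandState *states, unsigned long long seed)
    {
        int id = threadIdx.x + blockIdx.x * blockDim.x;
        // same seed for all threads, distinct subsequence per thread
        curand_init(seed, id, 0, &states[id]);
    }

    // each generator call then just advances the persistent state
    __device__ double nextUniform(curandState *states, int id)
    {
        return curand_uniform_double(&states[id]);
    }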

I modified your code to only run 2 loops instead of 64. And I modified the block count from 159 to 160. The code ran with no errors for me. You may simply be running into a kernel timeout for that case.

What is your GPU? Is the GPU hosting a display? Are you running on windows? It may also be useful to know if you are compiling with the debug switch (-G) or not.

Any time you are having trouble with a CUDA code, in addition to the error checking you have done, I recommend running your code with cuda-memcheck. If it reports an error such as an illegal memory access, you can follow the method described here:

https://stackoverflow.com/questions/27277365/unspecified-launch-failure-on-memcpy/27278218#27278218

to localize that kind of error to a single line of your kernel code.
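For example, compiling with -lineinfo lets cuda-memcheck attribute device-side errors to source lines (a sketch, assuming nvcc and a source file named ask1.cu):

    $ nvcc -lineinfo -o a ask1.cu
    $ cuda-memcheck ./a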

Regarding the error associated with increasing the threadblock size from 768 to 769, I can reproduce that error, and cuda-memcheck immediately points it out. Your error checking around the kernel launch is incomplete/incorrect, which is why you are not seeing that error. To do proper CUDA error checking around your kernel launch, add this:

gen<<<dimGrid,dimBlock>>>(d_cnt);
        error = cudaGetLastError();   // add this line, and check the error code
        // check error code here
        error = cudaMemcpy(h_cnt, d_cnt, mem_size_A, cudaMemcpyDeviceToHost);
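Since the posted code already includes helper_cuda.h from the CUDA samples, the getLastCudaError() convenience wrapper from that header could do the same check in one line (a sketch, assuming the samples' helper headers are in use):

    gen<<<dimGrid,dimBlock>>>(d_cnt);
    getLastCudaError("gen kernel launch failed");  // prints the error and exits on failure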

Thanks @txbob for your reply.
To answer your questions…

  1. I run a GeForce 940MX.
  2. My Intel processor also has an Intel UHD 620 integrated graphics processor, so I guess that hosts the display (not sure how to check :P).
  3. Yes, I run Windows 10 Home edition.
  4. Until now I hadn't heard of that "-G debug switch" thing, so no.

PS: I need to run the loop 64 times, each time launching a kernel with 1024*1024 threads (which is what I intended), hence I was experimenting with the different block and thread combinations…

PPS: Since I'm new at this, I'm prone to asking many doubts (which may seem silly), please do bear with me :P

If you are building a debug project in Visual Studio, the -G switch is included in your compilation, whether you have heard of it or not.

Since you are on Windows, I think the unspecified launch failure is likely a kernel time-out, as I already mentioned. Google "cuda wddm tdr timeout".
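You can also check programmatically whether the runtime watchdog applies to your GPU. A minimal sketch using the standard runtime API (device 0 assumed):

    #include <cstdio>
    #include <cuda_runtime.h>

    int main()
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, 0);
        printf("%s: kernel timeout %s\n", prop.name,
               prop.kernelExecTimeoutEnabled ? "enabled (display watchdog active)"
                                             : "disabled");
        return 0;
    }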

I did the following:

  • I changed the TdrDelay value from 2 to 15 seconds.
  • I compiled with the "-G switch". But when I ran it, it returned "too many resources requested for launch". This only happens when the number of blocks/threads is above a certain limit (which is still way below what my GPU should handle). I thought my GPU could handle 1024 threads per block and a high number of blocks too. I also calculated the total register and global memory usage, and both are lower than what my GPU can handle.
  • Then I also ran the program under cuda-memcheck. It showed this:
    $ cuda-memcheck ./a
    Computing result using CUDA Kernel...
    cudaMemcpy (h_cnt,d_cnt) returned error unspecified launch failure (code 4), line(196)
    ========= CUDA-MEMCHECK
    ========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaMemcpy.
    =========     Saved host backtrace up to driver entry point at error
    =========     Host Frame:C:\windows\system32\nvcuda.dll (cuTexRefSetAddress + 0x2cb589) [0x2d8bdb]
    =========     Host Frame:C:\cygwin64\Already tested\a.exe [0x17b4]
    =========     Host Frame:C:\cygwin64\Already tested\a.exe [0xe9a5]
    =========     Host Frame:C:\cygwin64\Already tested\a.exe [0xf709]
    =========     Host Frame:C:\windows\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x12774]
    =========     Host Frame:C:\windows\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x70d51]
    =========
    ========= Program hit CUDA_ERROR_LAUNCH_FAILED (error 719) due to "unspecified launch failure" on CUDA API call to cuModuleUnload.
    =========     Saved host backtrace up to driver entry point at error
    =========     Host Frame:C:\windows\system32\nvcuda.dll (cuTexRefSetAddress + 0x1bbcac) [0x1c92fe]
    =========     Host Frame:C:\cygwin64\Already tested\a.exe [0x8c20]
    =========     Host Frame:C:\cygwin64\Already tested\a.exe [0x9ba8]
    =========     Host Frame:C:\cygwin64\Already tested\a.exe [0xb288]
    =========     Host Frame:C:\cygwin64\Already tested\a.exe [0x5c8f]
    =========     Host Frame:C:\cygwin64\Already tested\a.exe [0x61b6]
    =========     Host Frame:C:\cygwin64\Already tested\a.exe [0x18f53]
    =========     Host Frame:C:\cygwin64\Already tested\a.exe [0x191f9]
    =========     Host Frame:C:\cygwin64\Already tested\a.exe [0x158ac]
    =========     Host Frame:C:\cygwin64\Already tested\a.exe [0xe9dd]
    =========     Host Frame:C:\cygwin64\Already tested\a.exe [0xf709]
    =========     Host Frame:C:\windows\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x12774]
    =========     Host Frame:C:\windows\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x70d51]
    =========
    ========= ERROR SUMMARY: 2 errors
    


I posted on another forum, where a friend said it could be due to the GPU.
https://stackoverflow.com/questions/48042013/unspecified-launch-error-on-cuda

I guess I've found the problem.
Thank you for your patience in addressing my doubts :)

Don’t compile with -G. Don’t build a debug project.

Increase the tdrdelay value.

If you want to run with 1024 threads per block, compile with this additional switch to work around the registers per thread issue:

-maxrregcount 63
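An alternative to the global -maxrregcount switch is a per-kernel launch bounds qualifier, which limits register usage only for the kernel that needs it. A sketch applied to the gen kernel:

    // promise the compiler this kernel launches with at most 1024 threads
    // per block, so it must fit the register budget for that configuration
    __global__ void __launch_bounds__(1024) gen(unsigned int *d_cnt)
    {
        // ... kernel body unchanged ...
    }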

Thanks @txbob, I got my program to execute by using -maxrregcount 63.
Also, I've modified my program a little.

//Implemented using atomicAdd on global memory

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <ctime>
// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include "helper_functions.h"
#include "helper_cuda.h"
#include "curand.h"
#include <curand_kernel.h>

#define hrotateleft(x, n) (((x) << (n)) ^ ((x) >> (32-(n))))
#define hupdate(a, b, c, n) ((a)^(hrotateleft(((b)+(c)),(n))))
#define hmyrand32 ((ul) (4294967296.0*((double)rand()/(RAND_MAX))))

#define ul unsigned int
#define ull unsigned long long 

#define BIASMAXLIMIT 0.51
#define BIASMINLIMIT 0.49

__device__ double cudaRand(int id)
{
    curandState state;
    curand_init((unsigned long long)clock(), id, 0, &state);

    double d_out = curand_uniform_double(&state);
    return d_out;
}

__device__ void initializeR(ul *x){
	
	int l = threadIdx.x + blockIdx.x * blockDim.x;
	#define myrand32 ((ul) (4294967296.0*((double)cudaRand(l))))

	int i;
	for (i = 0; i < 16; i++)
		x[i] = myrand32;
	
	x[0] = 0x61707865;
	x[5] = 0x3320646e;
	x[10] = 0x79622d32;
	x[15] = 0x6b206574;
}

__device__ void copystate(ul *x1, ul *x){
	int i;
	for (i = 0; i < 16; i++)
		x1[i] = x[i];
}

__device__ void print(ul *x){
	int i;
	for (i = 0; i < 16; i++){
		printf("%8x ", x[i]);
		if (i > 0 && i%4 == 3)
			printf("\n");
	}
	printf("\n");
}


__device__ void qr(ul *x0, ul *x1, ul *x2, ul *x3){
	#define rotateleft(x, n) (((x) << (n)) ^((x) >> (32-n)))

	#define update(a, b, c, n) ((a)^(rotateleft(((b)+(c)),(n))))

	ul z0, z1, z2, z3;
	z1 = update(*x1, *x3, *x0, 7);
	z2 = update(*x2, *x0, z1, 9);
	z3 = update(*x3, z1, z2, 13);
	z0 = update(*x0, z2, z3, 18);
	*x0 = z0; *x1 = z1; *x2 = z2; *x3 = z3;
}

__device__ void transpose(ul *x){
  	ul temp;
  	temp=x[1]; x[1]=x[4]; x[4]=temp;
  	temp=x[2]; x[2]=x[8]; x[8]=temp;
  	temp=x[3]; x[3]=x[12]; x[12]=temp;
  	temp=x[6]; x[6]=x[9]; x[9]=temp;
  	temp=x[7]; x[7]=x[13]; x[13]=temp;
  	temp=x[11]; x[11]=x[14]; x[14]=temp;
}

__device__ void rounds(ul *x){
	qr(&(x[0]), &(x[4]), &(x[8]),&(x[12]));
	qr(&(x[5]), &(x[9]), &(x[13]),&(x[1]));
	qr(&(x[10]), &(x[14]), &(x[2]),&(x[6]));
	qr(&(x[15]), &(x[3]), &(x[7]),&(x[11]));
	transpose(x);
}

void hinitializeR(ul *x){
	int i;
	for (i = 0; i < 16; i++)
		x[i] = hmyrand32;

	x[0] = 0x61707865;
	x[5] = 0x3320646e;
	x[10] = 0x79622d32;
	x[15] = 0x6b206574;
}

void hcopystate(ul *x1, ul *x){
	int i;
	for (i = 0; i < 16; i++)
		x1[i] = x[i];
}

void hprint(ul *x){
	int i;
	for (i = 0; i < 16; i++){
		printf("%8x ", x[i]);
		if (i > 0 && i%4 == 3)
			printf("\n");
	}
	printf("\n");
}


void hqr(ul *x0, ul *x1, ul *x2, ul *x3){
	ul z0, z1, z2, z3;
	z1 = hupdate(*x1, *x3, *x0, 7);
	z2 = hupdate(*x2, *x0, z1, 9);
	z3 = hupdate(*x3, z1, z2, 13);
	z0 = hupdate(*x0, z2, z3, 18);
	*x0 = z0; *x1 = z1; *x2 = z2; *x3 = z3;
}

void htranspose(ul *x){
  	ul temp;
  	temp=x[1]; x[1]=x[4]; x[4]=temp;
  	temp=x[2]; x[2]=x[8]; x[8]=temp;
  	temp=x[3]; x[3]=x[12]; x[12]=temp;
  	temp=x[6]; x[6]=x[9]; x[9]=temp;
  	temp=x[7]; x[7]=x[13]; x[13]=temp;
  	temp=x[11]; x[11]=x[14]; x[14]=temp;
}

void hrounds(ul *x){
	hqr(&(x[0]), &(x[4]), &(x[8]),&(x[12]));
	hqr(&(x[5]), &(x[9]), &(x[13]),&(x[1]));
	hqr(&(x[10]), &(x[14]), &(x[2]),&(x[6]));
	hqr(&(x[15]), &(x[3]), &(x[7]),&(x[11]));
	htranspose(x);
}

__global__ void gen (int *d_cnt)
{	
	ul x[16], x1[16], pattern;
	initializeR(x);	
	
	ull pt = 0x80000000;   
    copystate(x1, x);
	x1[7] = x[7] ^ pt;
	
	for (int i = 0; i < 4; i++)
    { rounds(x); rounds(x1); }

    pattern = 0x04000000;   // test bit 26 of word 6
    
	if (((x[6] ^ x1[6]) & pattern) == 0) 
        atomicAdd(d_cnt,1);	    
}

int main()
{
	int i;
	double val=0;
	FILE *fp;

	ul x[16], x1[16], pattern, pt=0x80000000;
	unsigned int cnt=0;
	
    // Allocate host memory for matrix cnt
    int *h_cnt=(int *)malloc(sizeof(int));
    h_cnt[0] = 0;

    // Allocate device memory
    int *d_cnt=NULL;

    cudaError_t error;

    error = cudaMalloc((void **)&d_cnt, sizeof(int) );

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_cnt returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // copy host memory to device
    error = cudaMemcpy(d_cnt, h_cnt, sizeof(int), cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_cnt,h_cnt) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // Setup execution parameters

	dim3 dimGrid(100,1,1);
	dim3 dimBlock(992,1,1); 
 
    // Create and start timer
    printf("Computing result using CUDA Kernel...\n");

    // Execute the kernel
    int h=0;
	long starttime=clock();
    
    while(h<64)
    {	

    	//printf("%ld\n",clock());
       	
       	gen<<<dimGrid,dimBlock>>>(d_cnt);	

       	//printf("%ld\n",clock());

        error = cudaGetLastError();
        if (error != cudaSuccess)
	    {
	        printf("launch returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
	        free(h_cnt);
    		cudaFree(d_cnt);
	        exit(EXIT_FAILURE);
	    }
		
	    long starttimet=clock();
        //printf("%ld\n",clock());

		// NOTE: 100*992 = 99200 GPU threads plus 946176 CPU iterations gives
		// 1045376 trials per pass, not the 1024*1024 used as the denominator below
		for(unsigned int g=0;g<946176;g++)
	       	{
	       		hinitializeR(x);	
		    
			    hcopystate(x1, x);
				x1[7] = x[7] ^ pt;
				
				for (i = 0; i < 4; i++)
		        { hrounds(x); hrounds(x1); }

				pattern = 0x04000000;   // same bit as the kernel: bit 26 of word 6

				if (((x[6] ^ x1[6]) & pattern) == 0)
					cnt = cnt + 1;   // cnt is an unsigned int; add 1, not 1.0
			}
        //printf("%ld\n",clock());

		long finishtimet=clock();
        printf("TIME: %f s \n", (double)(finishtimet-starttimet)/CLOCKS_PER_SEC );
        starttimet=clock();

        printf("%ld\n",clock());

        error = cudaMemcpy(h_cnt, d_cnt, sizeof(int), cudaMemcpyDeviceToHost);

        printf("%ld\n",clock());


	    if (error != cudaSuccess)
	    {
	        printf("cudaMemcpy (h_cnt,d_cnt) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
	        free(h_cnt);
    		cudaFree(d_cnt);
	        exit(EXIT_FAILURE);
	    }

        fp = fopen("ask1.dat", "w");
		fprintf(fp, "Itr %d\n",h+1);
		printf("Itr %d\n",h+1);
	
		h++;
	
		val = (double)(cnt + h_cnt[0])/(h*1024*1024);
				
        fprintf(fp, "6 26 %.20lf\n", val);
    	fclose(fp);
        printf("6 26 %.20lf\n", val);
        long finishtime=clock();
        printf("TIME: %f s \n", (double)(finishtime-starttime)/CLOCKS_PER_SEC );
        starttime=clock();
        
    }

    // Clean up memory
    free(h_cnt);
    cudaFree(d_cnt);
	
}

When I profiled this using nvprof, I got this:

$ nvprof ./a
Computing result using CUDA Kernel...
1524
1528
1528
2242
TIME: 0.714000 s
2242
3865
Itr 1
6 26 0.59503936767578125000
TIME: 2.341000 s
==10380== NVPROF is profiling process 10380, command: ./a
==10380== Profiling application: ./a
==10380== Warning: Found 67 invalid records in the result.
==10380== Warning: This can happen if device ran out of memory or if a device kernel was stopped due to an assertion.
==10380== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  1.62219s         1  1.62219s  1.62219s  1.62219s  gen(int*)
                    0.00%  1.9520us         1  1.9520us  1.9520us  1.9520us  [CUDA memcpy HtoD]
                    0.00%  1.6640us         1  1.6640us  1.6640us  1.6640us  [CUDA memcpy DtoH]
      API calls:   89.61%  1.62248s         2  811.24ms  89.315us  1.62239s  cudaMemcpy
                    8.07%  146.13ms         1  146.13ms  146.13ms  146.13ms  cudaMalloc
                    1.98%  35.925ms         1  35.925ms  35.925ms  35.925ms  cuDevicePrimaryCtxRelease
                    0.23%  4.1557ms         1  4.1557ms  4.1557ms  4.1557ms  cudaLaunch
                    0.05%  939.80us        30  31.326us     568ns  467.63us  cuDeviceGetAttribute
                    0.03%  487.54us         1  487.54us  487.54us  487.54us  cuDeviceGetName
                    0.02%  298.67us         1  298.67us  298.67us  298.67us  cudaFree
                    0.01%  242.92us         1  242.92us  242.92us  242.92us  cuModuleUnload
                    0.00%  7.3950us         1  7.3950us  7.3950us  7.3950us  cuDeviceTotalMem
                    0.00%  2.2760us         1  2.2760us  2.2760us  2.2760us  cuDeviceGetCount
                    0.00%  1.7070us         1  1.7070us  1.7070us  1.7070us  cuDeviceGet
                    0.00%  1.1380us         1  1.1380us  1.1380us  1.1380us  cudaConfigureCall
                    0.00%  1.1380us         1  1.1380us  1.1380us  1.1380us  cudaGetLastError
                    0.00%     568ns         1     568ns     568ns     568ns  cudaSetupArgument

I've added some printf statements to display the clock() output fairly frequently. As is evident, the CPU loop takes about 0.7 s, and the rest, about 1.6 s, is spent in the cudaMemcpy; in total I measured 2.3 s. I've read that CPU and GPU execution proceed concurrently until a function like cudaDeviceSynchronize() or cudaMemcpy() is encountered. If that's the case, why is my code being serialized here?
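A minimal sketch (not from the posted program) of using CUDA events to separate the asynchronous kernel launch from the blocking copy, assuming the same gen kernel and d_cnt buffer:

    cudaEvent_t t0, t1;
    cudaEventCreate(&t0); cudaEventCreate(&t1);
    cudaEventRecord(t0);
    gen<<<dimGrid, dimBlock>>>(d_cnt);   // returns immediately (asynchronous)
    cudaEventRecord(t1);
    // ... CPU work placed here overlaps with the kernel ...
    cudaEventSynchronize(t1);            // block until the kernel has finished
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, t0, t1);   // GPU time of the kernel itself
    printf("kernel time: %f ms\n", ms);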

The code you have posted now clearly is not the code you used to generate that nvprof output.