Issue with Nested for-loops in kernel

I recently got into CUDA and am currently working on a parallel implementation of a basic segmented prime-sieving algorithm. My current approach is to round the input up to a multiple of 2^32, find all primes up to 2^32 with a serial algorithm, and use those results to sieve the remaining segments (each segment covers a range of 2^32 numbers).

As a basic test, I decided to split each segment into smaller sub-segments and work on them in parallel.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

typedef unsigned __int64 integer;
const size_t size = sizeof(integer);
const integer pow2_32 = 4294967296;
const int threads = 512;

//Utility function to calculate positive integer powers
integer power(integer val, integer exp)
{
	integer temp = val;
	for (integer i = 1; i < exp; i++) temp *= val;
	return temp;
}

//Utility function to approximate no. of primes between 1->n as n/ln(n)
integer trimSize(integer n)
{
	long double e = 2.7183;
	integer exp = 1;
	while (pow(e, exp) < n)
		exp++;
	return n / (exp - 2);
}

///////////////////////////KERNEL START///////////////////////////
__device__ void SievePrime(bool *mark, integer p, integer min, integer minb, integer max)
{
	integer j;
	for (j = minb;j <= max;j += (p << 1))
		mark[(j - min) >> 1] = true;
}
__global__ void SieveBlock(integer *P, bool *mark, integer completed)
{
	integer id = threadIdx.x, i, j, minb;
	integer segsize = pow2_32 / threads;
	integer min = (completed * pow2_32) + (id * segsize) + 1;
	integer max = min + segsize - 2;

	//printf("Kernel %3d: [%11llu,%11llu]\n", id + completed, min, max);
	
	for (i = 0;P[i] * P[i] <= max;i++)
	{
		minb = (min / P[i]) * (P[i]) + P[i];
		if (~minb & 1)	minb += P[i];
		for (j = minb;j <= max;j += (P[i] << 1))
			mark[(j - min + 1) >> 1] = true;
	}
	//printf("Kernel %3llu stopped at %llu\n", id + completed, j - (P[i] << 2));
	/*for (j = max; j >= min; j -= 2)
	{	
		if (!mark[(j - min + 1) >> 1])
		{
			printf("Kernel %llu: %llu [Max: %llu] [Skipped %llu]\n", id + completed, j, max,(max-j)>>1);
			break;
		}
	}*/
}
////////////////////////////KERNEL END////////////////////////////

//Driver function
int main(int argc, char* argv[])
{
	//Range: Data-type dependent
	integer n; 
	printf("Enter n: "); 
	scanf("%llu", &n);

	integer m = sqrt(n);
	integer marklen = n >> 1;

	bool smallsieve = false;	//Use serial sieve for n<2^32
	if (n <= pow2_32)
		smallsieve = true;
	else if (n % pow2_32 > 0)	//If n>2^32 then round n to nearest multiple of 2^32
	{
		printf("Rounded %llu to ", n);
		n = ((n / pow2_32) + 1) * pow2_32;
		printf("%llu\n\n", n);
		m = 65536;				//sqrt(pow2_32)
		marklen = pow2_32 >> 1;
	}

	integer limit = (smallsieve) ? n : pow2_32;

	integer plen = trimSize(pow2_32);
	if (~n & 1) n--;
	if (~m & 1) m--;
	
	//Boolean array initialized to false
	bool *mark = (bool *)calloc(marklen + 1, sizeof(bool));	//Represents the odd numbers [1,3,5,...,n]

	//Array to store primes b/w [2,m]
	integer *P = (integer *)calloc(plen + 1, (size_t)size);
	if (mark == NULL || P == NULL) { printf("Memory Allocation Failed!\n"); exit(1); }
	integer i, j, k, offset;

	//Log execution time
	clock_t START_TIME, END_TIME;
	double  CPU_TIME = 0.0;
	float GPU_TIME = 0.0;
	float temp_t;

	//Setup-Phase: Calculate all primes in the range [3,m]
	START_TIME = clock();
	for (i = 5, k = 1, offset = 2; i < m; i += offset, offset = 6 - offset)	//i->[5,7,11,13,...,sqrt(n)], skipping multiples of 2 and 3 | i corresponds to mark[i>>1]
	{
		if (!mark[i >> 1])
		{
			if (i*i <= limit)
				for (j = i * i; j <= limit; j += (i << 1))	//j->[i^2,n] | increments by 2*i
					mark[j >> 1] = 1;
			P[k++] = i;
		}
	}
	END_TIME = clock();
	CPU_TIME = ((double)(END_TIME - START_TIME)) / CLOCKS_PER_SEC;
	
	printf("Stopped primary sieve at prime %llu\n", P[k - 1]);
	for (;i <= limit;i += offset, offset = 6 - offset)
	{
		if(!mark[i >> 1])
			P[k++] = i;
	}

	P[0] = 3;
	plen = k;
	free(mark);	
	printf("Last prime: %llu @ index [%llu]\n\n", P[plen - 1], plen - 1);
	if (smallsieve) goto end;

	integer chunksize = pow2_32 >> 1;						//Elements per chunk of 2^32 numbers
	integer chunkcount = (n - pow2_32 - 1) / chunksize;		//No. of chunks
	integer completed = 1;
	printf("%llu chunks for [%llu->%llu]\n", chunkcount, pow2_32 - 1, n);

	integer *d_P;
	bool *d_mark;

	//CUDA Malloc
	cudaMalloc(&d_P, (plen + 1) * (size));
	cudaMalloc(&d_mark, chunksize + 1);

	//Calculate dimensions
	dim3 TPB(threads);

	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	cudaMemcpy(d_P, P, plen * size, cudaMemcpyHostToDevice);

	while (completed <= chunkcount)
	{
		mark = (bool *)calloc(chunksize + 1, sizeof(bool));
		cudaMemcpy(d_mark, mark, chunksize, cudaMemcpyHostToDevice);	//Copy current block to device

		cudaEventRecord(start);
		SieveBlock<<<1, TPB >>> (d_P, d_mark, completed);		//Execute sieving kernel
		cudaEventRecord(stop);
		cudaEventSynchronize(stop);
		cudaEventElapsedTime(&temp_t, start, stop);
		GPU_TIME += temp_t;

		cudaMemcpy(mark, d_mark, marklen + 1, cudaMemcpyDeviceToHost);	//Write back results to host
		printf("\n  Last prime in block %llu: ", completed);
		integer start = (completed * pow2_32) + 1;
		for (i = start + pow2_32 - 2;i >= start;i -= 2)
		{
			if (!mark[(i-start+1)>>1])
			{
				printf("%llu\n\n", i);
				break;
			}
		}
		completed++;
		free(mark);
		cudaFree(d_mark);
	}

	free(P); 
	cudaFree(d_P);

	GPU_TIME /= 1000;
end:printf("\nSETUP-PHASE CPU Time: %0.3f seconds\n", CPU_TIME);
	printf("COMPUTE-PHASE GPU Time: %0.3f seconds\n", GPU_TIME);
	//printf("FILE_WRITE CPU Time: %0.3f seconds\n", F_CPU_TIME);
	return 0;

}

However, for reasons beyond my understanding, the inner loop is causing a myriad of complications.

When using the above snippet, the kernel fails to execute in its entirety. If I replace the assignment statement with a printf(), the inner loop executes normally while the outer loop only executes once. If I comment out the contents of the inner loop altogether, the outer loop executes normally. And if I change the loop bounds to something like [1->100], the inner loop again executes normally while the outer loop executes only once.

As a newbie, I’m at my wit’s end and can’t figure out what on Earth I’m doing wrong. Any and all suggestions are greatly appreciated.

OS: Windows 10
IDE: VS2017
GPU: GTX 1070 Ti

I don’t know if this additional info will help, but before this I was working on a parallel implementation of a prime sieve with the odd numbers mapped bit-wise to an array. The logic was sound on paper, but I ran into precisely the same nested-loop issue that I did here.

Some suggestions, in no particular order:

  • This kernel looks like it might take a long time to run. If you are running on a system where kernel runtime is limited (check deviceQuery for the GPU you are running on) then that might be an issue. To this end it’s also good to provide the GPU you are running on, the OS, and the compile command line. Sometimes these things matter.

  • I usually recommend that people who want debug assistance provide a complete code, not just a kernel. Provide a complete application around what you have shown, that others can copy, paste, compile, and run, without having to add anything or change anything, and see the issue.

  • I also usually recommend that people do proper CUDA error checking, and also run the code with cuda-memcheck. If cuda-memcheck reports an error, providing that information to others trying to help you is useful. Also, if cuda-memcheck reports a kernel execution error, you can follow the method here:

cuda - Unspecified launch failure on Memcpy - Stack Overflow

to localize that error to a specific line of kernel code.
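For reference, the kind of error checking I have in mind looks something like this - a minimal sketch, with the macro name and formatting purely illustrative:

#define cudaCheck(call)										\
do {														\
	cudaError_t err_ = (call);								\
	if (err_ != cudaSuccess) {								\
		fprintf(stderr, "CUDA error at %s:%d: %s\n",		\
			__FILE__, __LINE__, cudaGetErrorString(err_));	\
		exit(1);											\
	}														\
} while (0)

//Wrap every runtime API call, and check the kernel launch separately:
//	cudaCheck(cudaMalloc(&d_P, (plen + 1) * size));
//	SieveBlock<<<1, TPB>>>(d_P, d_mark, completed);
//	cudaCheck(cudaGetLastError());			//catches launch-configuration errors
//	cudaCheck(cudaDeviceSynchronize());		//catches errors raised during kernel execution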

Your suggestions most certainly did help resolve a number of problems, and helped me notice a couple of honest-to-god stupid mistakes I’d made. I’ve updated the code in my original post to reflect the changes, though the two major ones are related to the array being sieved.

The first change was accounting for the difference in sieving ranges for different threads, and the second was shifting the array’s creation to the thread and printing the results there rather than copying a massive array to and from the host.

However, this has caused another issue - I can’t view the results. The last bit of my kernel code is intended to print the last prime found in the thread’s block, and is as follows:-

printf("Kernel %3llu stopped at %llu [Max: %llu]\n", id + completed, j - (P[i] << 2), max);
for (j = max; j >= min; j -= 2)
{
	if (!mark[(j - min + 1) >> 1])
	{
		printf("Kernel %llu: %llu [Max: %llu]\n", id, j, max);
		break;
	}
}

If I comment out or remove the printf() statement within the j-loop, each thread reports that it has successfully sieved its block via the preceding printf() statement. However, when the statement is un-commented, the kernel fails to launch altogether.
cuda-memcheck reports the following:-

========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaLaunchKernel.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:C:\WINDOWS\system32\nvcuda.dll (cuD3D9UnmapVertexBuffer + 0x2e2c29) [0x2f0feb]
=========     Host Frame:C:\Users\RB99\source\repos\CUDATest\x64\Release\CUDATest.exe (cudaLaunchKernel + 0x1f9) [0x1839]
=========     Host Frame:C:\Users\RB99\source\repos\CUDATest\x64\Release\CUDATest.exe (SieveBlock + 0xbe) [0xf3de]
=========     Host Frame:C:\Users\RB99\source\repos\CUDATest\x64\Release\CUDATest.exe (main + 0x383) [0xf813]
=========     Host Frame:C:\Users\RB99\source\repos\CUDATest\x64\Release\CUDATest.exe (__scrt_common_main_seh + 0x10c) [0xfe38]
=========     Host Frame:C:\WINDOWS\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x181f4]
=========     Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x6a251]
=========
========= ERROR SUMMARY: 1 error

I don’t quite follow what to do with this particular report, and I don’t see how a second printf() could cause a supposed invalid-argument error. Even printing a simple string like “Test\n” within the loop yields the above error. If I clear the loop of printf() statements, I get no errors - but also no way to view the results. Any suggestions?

invalid argument usually means one of your kernel launch configuration arguments is out of range

I suggest printing those out to rule that out. (or perhaps print out the combo on the failing launch)

I’m referring to the number passed between <<<…>>> syntax. For example does your TPB argument exceed 1024?
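Something along these lines, right before the launch, would confirm what is actually being passed (just a sketch):

printf("Launch config: TPB = (%u, %u, %u)\n", TPB.x, TPB.y, TPB.z);
SieveBlock<<<1, TPB>>>(d_P, completed);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
	printf("Launch failed: %s\n", cudaGetErrorString(err));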

Side remark: It is not good practice to use variable names that are identical to the name of standard functions. In this case, ‘exp’.

No, it doesn’t. I’ve been sticking to 1 block per grid with TPB set to (threads,1,1), and I’ve made sure to assign threads a power of two less than 1024.

I only get this invalid kernel argument error when printing anything inside that particular loop. If I don’t print anything inside it, the kernel executes just fine and each thread reports successful execution. I’ll check again to see if my threads value is somehow being affected by this change, though I still don’t understand how a single printf() statement inside the kernel is interfering with its launch arguments.

If you want to provide a short complete code that demonstrates the issue, similar to your first posting, I’ll take a look. I’m not bothering to try your first posting because you said you changed things.

I’ll provide the updated complete code at the end of this reply. However, I noticed something rather curious while watching my TPB variable: for whatever reason, its z value is never 1. Even after explicitly specifying that TPB should be (threads,1,1), its z value takes on other values depending on my kernel’s contents at compile-time - that is, the value assigned to z won’t change until the code is recompiled, and the range of values it can take on depends on the kernel’s contents.

For instance having my kernel end with the following:-

printf("Kernel %3llu stopped at %llu [Max: %llu]\n", id + completed, j - (P[i] << 2), max);
for (j = max; j >= min; j -= 2)
{
	if (!mark[(j - min + 1) >> 1])
	{
		//printf("Kernel %llu: %llu\n", id , j);
		break;
	}
}

Sees TPB holding a value of {x=256 y=1 z=2862480397}.
Outcome: Kernel executes successfully

Ending it with:-

printf("Kernel %3llu stopped at %llu [Max: %llu]\n", id + completed, j - (P[i] << 2), max);
for (j = max; j >= min; j -= 2)
{
	if (!mark[(j - min + 1) >> 1])
	{
		printf("Kernel %llu: %llu\n", id , j);
		break;
	}
}

Sees TPB holding a value of {x=256 y=1 z=3080715277}.
Outcome: Kernel fails to launch (launch argument error)

I tried printing blockDim inside the kernel, and it reports (256,1,1), which is the expected result. So I’m not sure what’s going on here.

Updated Code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

typedef unsigned __int64 integer;
const size_t size = sizeof(integer);
const integer pow2_32 = 4294967296;
const int threads = 256;

//Utility function to calculate positive integer powers
integer power(integer val, integer exp)
{
	integer temp = val;
	for (integer i = 1; i < exp; i++) temp *= val;
	return temp;
}

//Utility function to approximate no. of primes between 1->n as n/ln(n)
integer trimSize(integer n)
{
	long double e = 2.7183;
	integer exp = 1;
	while (pow(e, exp) < n)
		exp++;
	return n / (exp - 2);
}

///////////////////////////KERNEL START///////////////////////////
__global__ void SieveBlock(integer *P, integer completed)
{
	integer id = threadIdx.x, i, j, minb;
	integer segsize = pow2_32 / threads;
	integer min = (completed * pow2_32) + (id * segsize) + 1;
	integer max = min + segsize - 2;
	bool mark[(pow2_32 >> 1) / threads];
	
	for (i = 0;P[i] * P[i] <= max;i++)
	{
		minb = (min / P[i]) * (P[i]) + P[i];
		if (~minb & 1)	minb += P[i];
		for (j = minb;j <= max;j += (P[i] << 1))
			mark[(j - min + 1) >> 1] = true;
	}
	printf("Kernel %3llu stopped at %llu [Max: %llu]\n", id + completed, j - (P[i] << 2), max);
	if (id == 0)
		printf("%d %d %d\n", blockDim.x, blockDim.y, blockDim.z);
	for (j = max; j >= min; j -= 2)
	{
		if (!mark[(j - min + 1) >> 1])
		{
			printf("Kernel %llu: %llu\n", id , j);
			break;
		}
	}
}
////////////////////////////KERNEL END////////////////////////////

//Driver function
int main(int argc, char* argv[])
{
	//Range: Data-type dependent
	integer n; 
	printf("Enter n: "); 
	scanf("%llu", &n);

	integer m = sqrt(n);
	integer marklen = n >> 1;

	bool smallsieve = false;	//Use serial sieve for n<2^32
	if (n <= pow2_32)
		smallsieve = true;
	else if (n % pow2_32 > 0)	//If n>2^32 then round n to nearest multiple of 2^32
	{
		printf("Rounded %llu to ", n);
		n = ((n / pow2_32) + 1) * pow2_32;
		printf("%llu\n\n", n);
		m = 65536;				//sqrt(pow2_32)
		marklen = pow2_32 >> 1;
	}

	integer limit = (smallsieve) ? n : pow2_32;

	integer plen = trimSize(pow2_32);
	if (~n & 1) n--;
	if (~m & 1) m--;
	
	//Boolean array initialized to false
	bool *mark = (bool *)calloc(marklen + 1, sizeof(bool));	//Represents the odd numbers [1,3,5,...,n]

	//Array to store primes b/w [2,m]
	integer *P = (integer *)calloc(plen + 1, (size_t)size);
	if (mark == NULL || P == NULL) { printf("Memory Allocation Failed!\n"); exit(1); }
	integer i, j, k, offset;

	//Log execution time
	clock_t START_TIME, END_TIME;
	double  CPU_TIME = 0.0;
	float GPU_TIME = 0.0;
	float temp_t;

	//Setup-Phase: Calculate all primes in the range [3,m]
	START_TIME = clock();
	for (i = 5, k = 1, offset = 2; i < m; i += offset, offset = 6 - offset)	//i->[5,7,11,13,...,sqrt(n)], skipping multiples of 2 and 3 | i corresponds to mark[i>>1]
	{
		if (!mark[i >> 1])
		{
			if (i*i <= limit)
				for (j = i * i; j <= limit; j += (i << 1))	//j->[i^2,n] | increments by 2*i
					mark[j >> 1] = 1;
			P[k++] = i;
		}
	}
	END_TIME = clock();
	CPU_TIME = ((double)(END_TIME - START_TIME)) / CLOCKS_PER_SEC;
	
	printf("Stopped primary sieve at prime %llu\n", P[k - 1]);
	for (;i <= limit;i += offset, offset = 6 - offset)
	{
		if(!mark[i >> 1])
			P[k++] = i;
	}

	P[0] = 3;
	plen = k;
	free(mark);	
	printf("Last prime: %llu @ index [%llu]\n\n", P[plen - 1], plen - 1);
	if (smallsieve) goto end;

	integer chunksize = pow2_32 >> 1;						//Elements per chunk of 2^32 numbers
	integer chunkcount = (n - pow2_32 - 1) / chunksize;		//No. of chunks
	integer completed = 1;
	printf("%llu chunk(s) for [%llu->%llu]\n", chunkcount, pow2_32 - 1, n);

	integer *d_P;

	//CUDA Malloc
	cudaMalloc(&d_P, (plen + 1) * (size));

	//Calculate dimensions
	dim3 TPB(threads, 1, 1);

	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	cudaMemcpy(d_P, P, plen * size, cudaMemcpyHostToDevice);

	while (completed <= chunkcount)
	{
		cudaEventRecord(start);
		SieveBlock<<<1, TPB >>> (d_P, completed);		//Execute sieving kernel
		cudaEventRecord(stop);
		cudaEventSynchronize(stop);
		cudaEventElapsedTime(&temp_t, start, stop);
		GPU_TIME += temp_t;
		completed++;
	}

	free(P); 
	cudaFree(d_P);

	GPU_TIME /= 1000;
end:printf("\nSETUP-PHASE CPU Time: %0.3f seconds\n", CPU_TIME);
	printf("COMPUTE-PHASE GPU Time: %0.3f seconds\n", GPU_TIME);
	//printf("FILE_WRITE CPU Time: %0.3f seconds\n", F_CPU_TIME);
	return 0;

}

Update:
I modified the end of my kernel to try something different.

__global__ void SieveBlock(integer *P, integer completed)
{
	integer id = threadIdx.x, i, j, minb;
	integer segsize = pow2_32 / threads;
	integer min = (completed * pow2_32) + (id * segsize) + 1;
	integer max = min + segsize - 2;
	bool mark[(pow2_32 >> 1) / threads];
	
	for (i = 0;P[i] * P[i] <= max;i++)
	{
		minb = (min / P[i]) * (P[i]) + P[i];
		if (~minb & 1)	minb += P[i];
		for (j = minb;j <= max;j += (P[i] << 1))
			mark[(j - min + 1) >> 1] = true;
	}
	//printf("Kernel %3llu stopped at %llu [Max: %llu]\n", id + completed, j - (P[i] << 2), max);
	integer last;
	for (j = max; j >= min; j -= 2)
	{
		if (!mark[(j - min + 1) >> 1])
		{
			last = j;
			break;
		}
	}
	printf("Kernel %llu: %llu\n", id , last);
}

Turns out printing anything after the last loop causes the same invalid arg error. If I remove the last printf(), the kernel executes yet again with zero issues. I’ve seen mentions of a cuPrintf header file. Perhaps that would be safer to use? If so, how do I go about adding it?

what value of n should I use?

So far I’ve stuck to anything between 4294967296 and 8589934592.

n is automatically rounded up to the latter. Primes up to 4294967296 are calculated on the host, which takes about 23 seconds for me (not the most efficient method, I realize, but the simplest I had). The while loop will run once, splitting the numbers between 4294967296 and 8589934592 into threads segments, and the GPU works on them in parallel. Assuming the kernel is in fact sieving them properly, a single kernel execution should take under a second.

The problem is here:

bool mark[(pow2_32 >> 1) / threads];

You can’t launch a kernel that requires that much local memory per thread - that’s 2 gigabytes per threadblock.

The maximum amount of local memory per thread is 512KB:

https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications__technical-specifications-per-compute-capability

You’re asking for 2G/256 = 8MB per thread.

You can see how much you are requesting at compile time by adding the -Xptxas=-v parameter to your compile command line
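If the per-thread array is the obstacle, one direction that might work - a sketch only, not tested against the rest of your code - is to go back to a single global-memory buffer allocated with cudaMalloc (as in your earlier version), with each thread indexing its own slice:

//Host side: one buffer covering the whole chunk; 2^31 bytes of global memory fits easily on a 1070 Ti
bool *d_mark;
cudaMalloc(&d_mark, (size_t)(pow2_32 >> 1));
cudaMemset(d_mark, 0, (size_t)(pow2_32 >> 1));	//reset before each chunk, replacing the per-launch calloc/copy

//Kernel side: same sieving logic, but mark points into the global buffer
__global__ void SieveBlock(integer *P, bool *d_mark, integer completed)
{
	integer id = threadIdx.x, i, j, minb;
	integer segsize = pow2_32 / threads;
	integer min = (completed * pow2_32) + (id * segsize) + 1;
	integer max = min + segsize - 2;
	bool *mark = d_mark + id * (segsize >> 1);	//this thread's slice: segsize/2 flags

	for (i = 0; P[i] * P[i] <= max; i++)
	{
		minb = (min / P[i]) * P[i] + P[i];
		if (~minb & 1)	minb += P[i];
		for (j = minb; j <= max; j += (P[i] << 1))
			mark[(j - min + 1) >> 1] = true;
	}
}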

What if I declared a shared variable:

__shared__ bool mark[pow2_32>>1];

Would that work? If not, what if I increased the number of blocks so that each thread has to work on a smaller data set?

shared memory is limited to 48KB per threadblock. You can discover that by reading the link I already gave you.

I haven’t studied your code enough to give any further advice. It might be that increasing the number of blocks would work, I don’t know. Of course it would eventually reduce the local memory to the point where you are under the limit, but I haven’t studied your algorithm to see if it would work that way or if there are any other defects in your code.
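For what it’s worth, if you do try more blocks, the usual pattern is to derive a global thread index and size the segments off the total thread count - again, just a sketch; you’d still need to redo the per-thread memory math (64 blocks of 256 threads, for example, would bring the local array down to 2^31/16384 = 128KB per thread, under the 512KB limit):

//Host side (block/thread counts purely illustrative)
dim3 TPB(256, 1, 1);
SieveBlock<<<64, TPB>>>(d_P, completed);

//Kernel side
integer id = (integer)blockIdx.x * blockDim.x + threadIdx.x;
integer totalthreads = (integer)gridDim.x * blockDim.x;
integer segsize = pow2_32 / totalthreads;
integer min = (completed * pow2_32) + (id * segsize) + 1;
integer max = min + segsize - 2;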