Problem with CUDA code,overflow?

marin.bugati · September 10, 2020, 9:36pm

Hi!
Can someone help me with this code.I am getting overflow in 139 line(copy to GPU)?
Can someone show me where is problem and how ti fix it?
Here is code:
#define MAXSIZE 250000


#include 

#include 

#include 	//Writing to files

#include 	//Keep track of time

#include <cuda.h>

#include <cuda_runtime_api.h>

#include <cuda_runtime.h>

// to remove intellisense highlighting

#include <device_launch_parameters.h>

#ifndef CUDACC

#define CUDACC

#endif

#include 

#include “device_launch_parameters.h”
using namespace std::chrono;
int data[MAXSIZE];

//Main CUDA kernel implementing Sieve of Eratosthenes

global static void CUDASieve(int num, int range, int bNum, int tNum){

const int threadId = threadIdx.x;

const int blockId = blockIdx.x;

int tmp = blockIdtNum + threadId;

while (tmp < range){

int i = 1;

while (((2 * tmp + 3)*i + tmp + 1) < MAXSIZE){

num[(2 * tmp + 3)*i + tmp + 1] = 0;

i++;

}

tmp += bNum * tNum;

}

}

void CUDAFilter(int *number, int size){

for (int i = 0; i<size; i++)

number[i] = 2 * i + 1;

number[0] = 2;

}
void reportTime(const char* msg, steady_clock::duration span) {

auto ms = duration_cast(span);

std::cout << msg << ms.count() << " millisecs" << std::endl;

}
void CPUgenPrime(uint64_t range, bool mode, std::ofstream &fileOut) {

//Start the clock

steady_clock::time_point ts, te;

ts = steady_clock::now();

fileOut << “\nCPU version\n” << “\nCPU version generating from range (0” << “~” << range << “)\n\n”;

//Keep track of results

uint64_t count = 0;

//Outer loop

for (uint64_t i = 0; i < range; i++)

//Inner loop

for (uint64_t j = 2; j*j <= i; j++) {

if (i % j == 0)

break;

else if (j + 1 > sqrt(i)) {

//User wants to see output on screen

if (mode) {

std::cout << std::fixed << i << “\t”;

fileOut << std::fixed << i << “\t”;

count++;

}

//Just write to file if mode is 0

else

{

fileOut << std::fixed << i << “\t”;

count++;

}

}

}

//Stop the clock

te = steady_clock::now();
std::cout << "\n\nTotal number of primes: " << count << std::endl;
reportTime("\nCPU Program Completed in ", te - ts);

fileOut << "\n\nTotal number of primes: " << count << std::endl;

std::cout << "A log file with the current date/time has been placed in the program directory.\n";
std::cout << "--------------------------------------------------------------------------------\n";

}
std::ofstream fileInit(){

//Get current date and time

time_t rawtime;

struct tm * timeinfo;

char buffer[80];

time(&rawtime);

timeinfo = localtime(&rawtime);
//Format in Year-Month-Day_Hour_Minute_Seconds
strftime(buffer, 80, "%y-%m-%d_%H-%M-%S", timeinfo);
std::string dateTime(buffer);

//File handles
std::ofstream fileOut;
fileOut.open("GenPrime_out_" + dateTime + ".txt");
return fileOut;

}
int setupRange(int range) {

if (range == 0) {

std::cout << “[2/3] Please choose the range(3 ~ 500,000): \n”;

std::cin >> range;
	//Error checking
	if (range > 2 && range <= 500000) {
		return range;
	}
	else {
		std::cout << "Invalid input for range, value set to default 500,000\n";
		return 500000;
	}
}
else return range;

}
//Array of MAXSIZE is created and filled with prime numbers, where [i]

//is the prime int and the rest is padded with 0’s

//Example: cpudata[i] = {0,1,0,3,0,5,0,7,0,0,0,11,0,0,0…}

void justDoIt(int range, bool mode, std::ofstream& fileOut) {

//Output to file

fileOut << “CUDA Multithreading Sieve of Eratosthenes\n” << “CUDA Multithreading generating from range (0” << “~” << range << “)\n\n”;
//Filter out even numbers to simplify calculation
CUDAFilter(data, (range / 2) + 1);

//Initialize arrays
int *gpudata;
int cpudata[MAXSIZE];

//Allocate memory
cudaMalloc((void**)&gpudata, sizeof(int)*MAXSIZE);

//Copy to GPU
cudaMemcpy(gpudata, data, sizeof(int)*MAXSIZE, cudaMemcpyHostToDevice);

//Maximum threads per block for CUDA 5.2 is 1024
int bNum = 96, tNum = 1024;

//Start the clock
steady_clock::time_point ts, te;
ts = stead_clock::now();

//Kernel call on the GPU
CUDASieve << <bNum, tNum, 0 >> >(gpudata, range, bNum, tNum);

//Synchronize the device and the host
cudaDeviceSynchronize();

//Copy from GPU back onto host
cudaMemcpy(&cpudata, gpudata, sizeof(int)*MAXSIZE, cudaMemcpyDeviceToHost);

//Free the memory on the GPU
cudaFree(gpudata);

//Reset the device for easy profiling
cudaDeviceReset();

//Stop the clock
te = steady_clock::now();

//Display on screen
if (mode == 1) {
	for (int i = 0; i < MAXSIZE; i++) {
		if (cpudata[i] != 0)
			printf("%d\t", cpudata[i]);
	}
}
//Count number of primes
int count = std::count_if(cpudata, cpudata + MAXSIZE, [](int i){ return i; });
std::cout << "\n\nTotal number of primes: " << count-2 << std::endl;

//Write to file
for (int i = 0; i < MAXSIZE; i++) {
	if (cpudata[i] != 0) {
		fileOut << cpudata[i] << "\t";
	}
}
//Show the amount of time 
reportTime("GPU Program Completed in ", te - ts);
fileOut << "\n\nTotal number of primes: " << count - 2 << std::endl;
std::cout << "A log file with the current date/time has been placed in the program directory.\n";
std::cout << "--------------------------------------------------------------------------------\n";

}
void menu(int range, bool mode, std::ofstream& fileOut){

std::cout << “[3/3] Please select the version of the program you want to run\n”

<< “1. []  CUDA Multithreading Sieve of Eratosthenes version\n"

<< "2. []    Simple CPU version\n"

<< "3. []	Run both versions\n”

<< “0. Quit\n”

<< "Option: ";

int mainMenuOption;

std::cin >> mainMenuOption;	//Accept user input

switch (mainMenuOption) {

case 0:	// User wants to exit

break;

case 1:

std::cout << “CUDA Multithreading generating from range (0” << “~” << range << “)\n”;

std::cout << “--------------------------------------------------------------------------------\n”;

justDoIt(range, mode, fileOut);
		//Close the file handle
		fileOut.close();
		break;
	case 2:
		std::cout << "CPU version generating from range (0" << "~" << range << ")\n";
		std::cout << "--------------------------------------------------------------------------------\n";
		CPUgenPrime(range, mode, fileOut);

		//Close the file handle
		fileOut.close();
		break;
	case 3:
		std::cout << "Running all available options\n";
		justDoIt(range, mode, fileOut);
		CPUgenPrime(range, mode, fileOut);

		//Close the file handle
		fileOut.close();
		break;
	default:
		std::cout << "[Invalid option. Only integers 0-3 are allowed]\n";
		menu(range, mode, fileOut);
		break;
	}

}
void setupScreenMode(int range) {

std::cout << “Team /dev/null GPU610 PRIME NUMBER GENERATOR v3.5\n”

<< “[1/3] Would you like to see the output on screen?\n”

<< “0 = NO, write to file only\n”

<< “1 = YES, display on screen\n”

<< "Show on screen?: ";

int mode = 1;

std::cin >> mode;
	//Initialize file handle
	std::ofstream fileOut = fileInit();

	if (mode == 0) {
		std::cout << "***Writing output to file only***\n\n";
		range = setupRange(range);
		menu(range, mode, fileOut);
	}

	else if (mode == 1) {
		std::cout << "***Outputting results on screen***\n\n";
		range = setupRange(range);
		menu(range, mode, fileOut);
	}
	else {
		std::cout << "[Invalid option selected, default option 0 (output to screen) selected]\n\n";
		range = setupRange(range);
		menu(range, 1, fileOut);
	}

}
//Initialize value to be used in the program using command line arguments

int initRuntimeValue(int argc, char* argv){

//Save runtime parameter into local variable, if provided

int range = 500000;

if (argc == 1) {

std::cout << “[No command line parameters provided]\n\n”;

return 0;

}

if (argc == 2)

range = std::atoi(argv[1]);

if (range > 2 && range < 500000)

return range;

else {

std::cout << “[Bad input for range parameter (must be <= 500,000)]\n”

<< “Range has been set to 500,000\n”;

return range = 500000;

}

}
int main(int argc, char* argv) {

//Grab the command line arguments

int range = initRuntimeValue(argc, argv);
return 0;

}

njuffa · September 10, 2020, 9:46pm

I cannot tell which line that is. Please use one of the code formatting features available in the forum when inserting code into a post, otherwise it will be an unreadable wall of text. The simplest way: Select the text, then click on the </> button. The even better way: Enclose the text within <code> and </code> tags.

Before you do anything else, you would want to add error checking to CUDA API calls and kernel launches. If you don’t know how to do that, Google for proper CUDA error checking and you should see a bunch of useful links on the first page of search results.

When sharing code with others for debugging, the usual approach is to simplify that code as much as possible, so that what remains and is posted is the smallest code that still reproduces the issue.

marin.bugati · September 10, 2020, 9:53pm

I was searching for many links but none of them help me

njuffa · September 10, 2020, 10:00pm

As the error indicates, the problem is on the host side. The root cause is likely this:

Don’t allocate large objects on the stack. Use the heap, i.e. use malloc. The default stack size of host environments is typically on the order of several kilobytes, not megabytes.

[Later:] Seems the default stack size has increased since I last looked. For example, for MSVC, Microsoft states:

For ARM, x86 and x64 machines, the default stack size is 1 MB.

However, my point remains valid: It is best practice to allocate large data objects in the heap.

marin.bugati · September 10, 2020, 10:03pm

Can you show me where and what to change?What should I write in code instead int data[MAXSIZE]

njuffa · September 10, 2020, 10:06pm

I am completely confident that you can find out how to allocate memory with malloc, even if you have never done it before. Give it a try, it is really not hard.

marin.bugati · September 10, 2020, 10:18pm

I need for school project correct code and cannot get it alone so I need help from someone to check this code and to correct errors because I cant see them.

njuffa · September 10, 2020, 10:26pm

The point of school projects is for you to learn, including how to get help other than asking random strangers on the internet (which is fine, but does not provide the maximum amount of learning). I already provided a relevant pointer. I could have said “Use dynamic memory allocation” without mentioning malloc. You can use Google to search How to use malloc. I just tried that and the very first link in the search results seems to provide relevant and useful information.

marin.bugati · September 10, 2020, 10:47pm

For me it is easier to understand that if someone correct my error than using google for school examples of malloc.After seeing links with how to use malloc I still dont understand and dont know how to put him on my problem

marin.bugati · September 10, 2020, 10:51pm

Basically I am asking for someone to correct this code with right and correct code and than to explain what he do

Topic		Replies	Views
Number of items that can be processed in CUDA CUDA Programming and Performance cuda , kernel	5	329	February 17, 2024
when large number stack overflow CUDA Programming and Performance	1	1083	October 19, 2017
An Easy Introduction to CUDA C and C++ Technical Blog	48	1237	July 19, 2018
Number of Blocks CUDA Programming and Performance	3	1574	October 15, 2011
This code doesn't work maybe too much threads assigned? CUDA Programming and Performance	8	1089	February 2, 2014
GPU/CPU precision comparison and Kernel instructions question CUDA Programming and Performance	5	679	April 4, 2017
8-16 fold speed up on reading AsciiGrid files by using the GPU to do the Ascii to float conversions CUDA Programming and Performance	4	6412	September 1, 2009
simplest programming environment (editor) for Cuda? CUDA Programming and Performance	23	22933	March 13, 2009
Is CUDA really that fast? CUDA Programming and Performance	17	11718	September 21, 2009
Cant modify data on the GPU CUDA Programming and Performance	16	10242	December 20, 2008

Problem with CUDA code,overflow?

Related topics