Problem with CUDA code,overflow?

Can someone help me with this code.I am getting overflow in 139 line(copy to GPU)?
Can someone show me where is problem and how ti fix it?
Here is code:

#define MAXSIZE 250000

#include //Writing to files
#include //Keep track of time
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
// to remove intellisense highlighting
#include <device_launch_parameters.h>
#ifndef CUDACC
#define CUDACC
#include “device_launch_parameters.h”

using namespace std::chrono;

int data[MAXSIZE];
//Main CUDA kernel implementing Sieve of Eratosthenes
global static void CUDASieve(int num, int range, int bNum, int tNum){
const int threadId = threadIdx.x;
const int blockId = blockIdx.x;
int tmp = blockId
tNum + threadId;
while (tmp < range){
int i = 1;
while (((2 * tmp + 3)*i + tmp + 1) < MAXSIZE){
num[(2 * tmp + 3)*i + tmp + 1] = 0;
tmp += bNum * tNum;
void CUDAFilter(int *number, int size){
for (int i = 0; i<size; i++)
number[i] = 2 * i + 1;
number[0] = 2;

void reportTime(const char* msg, steady_clock::duration span) {
auto ms = duration_cast(span);
std::cout << msg << ms.count() << " millisecs" << std::endl;

void CPUgenPrime(uint64_t range, bool mode, std::ofstream &fileOut) {
//Start the clock
steady_clock::time_point ts, te;
ts = steady_clock::now();
fileOut << “\nCPU version\n” << “\nCPU version generating from range (0” << “~” << range << “)\n\n”;
//Keep track of results
uint64_t count = 0;
//Outer loop
for (uint64_t i = 0; i < range; i++)
//Inner loop
for (uint64_t j = 2; j*j <= i; j++) {
if (i % j == 0)
else if (j + 1 > sqrt(i)) {
//User wants to see output on screen
if (mode) {
std::cout << std::fixed << i << “\t”;
fileOut << std::fixed << i << “\t”;
//Just write to file if mode is 0
fileOut << std::fixed << i << “\t”;
//Stop the clock
te = steady_clock::now();

std::cout << "\n\nTotal number of primes: " << count << std::endl;
reportTime("\nCPU Program Completed in ", te - ts);

fileOut << "\n\nTotal number of primes: " << count << std::endl;

std::cout << "A log file with the current date/time has been placed in the program directory.\n";
std::cout << "--------------------------------------------------------------------------------\n";


std::ofstream fileInit(){
//Get current date and time
time_t rawtime;
struct tm * timeinfo;
char buffer[80];
timeinfo = localtime(&rawtime);

//Format in Year-Month-Day_Hour_Minute_Seconds
strftime(buffer, 80, "%y-%m-%d_%H-%M-%S", timeinfo);
std::string dateTime(buffer);

//File handles
std::ofstream fileOut;"GenPrime_out_" + dateTime + ".txt");
return fileOut;


int setupRange(int range) {
if (range == 0) {
std::cout << “[2/3] Please choose the range(3 ~ 500,000): \n”;
std::cin >> range;

	//Error checking
	if (range > 2 && range <= 500000) {
		return range;
	else {
		std::cout << "Invalid input for range, value set to default 500,000\n";
		return 500000;
else return range;


//Array of MAXSIZE is created and filled with prime numbers, where [i]
//is the prime int and the rest is padded with 0’s
//Example: cpudata[i] = {0,1,0,3,0,5,0,7,0,0,0,11,0,0,0…}
void justDoIt(int range, bool mode, std::ofstream& fileOut) {
//Output to file
fileOut << “CUDA Multithreading Sieve of Eratosthenes\n” << “CUDA Multithreading generating from range (0” << “~” << range << “)\n\n”;

//Filter out even numbers to simplify calculation
CUDAFilter(data, (range / 2) + 1);

//Initialize arrays
int *gpudata;
int cpudata[MAXSIZE];

//Allocate memory
cudaMalloc((void**)&gpudata, sizeof(int)*MAXSIZE);

//Copy to GPU
cudaMemcpy(gpudata, data, sizeof(int)*MAXSIZE, cudaMemcpyHostToDevice);

//Maximum threads per block for CUDA 5.2 is 1024
int bNum = 96, tNum = 1024;

//Start the clock
steady_clock::time_point ts, te;
ts = stead_clock::now();

//Kernel call on the GPU
CUDASieve << <bNum, tNum, 0 >> >(gpudata, range, bNum, tNum);

//Synchronize the device and the host

//Copy from GPU back onto host
cudaMemcpy(&cpudata, gpudata, sizeof(int)*MAXSIZE, cudaMemcpyDeviceToHost);

//Free the memory on the GPU

//Reset the device for easy profiling

//Stop the clock
te = steady_clock::now();

//Display on screen
if (mode == 1) {
	for (int i = 0; i < MAXSIZE; i++) {
		if (cpudata[i] != 0)
			printf("%d\t", cpudata[i]);
//Count number of primes
int count = std::count_if(cpudata, cpudata + MAXSIZE, [](int i){ return i; });
std::cout << "\n\nTotal number of primes: " << count-2 << std::endl;

//Write to file
for (int i = 0; i < MAXSIZE; i++) {
	if (cpudata[i] != 0) {
		fileOut << cpudata[i] << "\t";
//Show the amount of time 
reportTime("GPU Program Completed in ", te - ts);
fileOut << "\n\nTotal number of primes: " << count - 2 << std::endl;
std::cout << "A log file with the current date/time has been placed in the program directory.\n";
std::cout << "--------------------------------------------------------------------------------\n";


void menu(int range, bool mode, std::ofstream& fileOut){
std::cout << “[3/3] Please select the version of the program you want to run\n”
<< “1. [] CUDA Multithreading Sieve of Eratosthenes version\n"
<< "2. [
] Simple CPU version\n"
<< "3. [
] Run both versions\n”
<< “0. Quit\n”
<< "Option: ";
int mainMenuOption;
std::cin >> mainMenuOption; //Accept user input
switch (mainMenuOption) {
case 0: // User wants to exit
case 1:
std::cout << “CUDA Multithreading generating from range (0” << “~” << range << “)\n”;
std::cout << “--------------------------------------------------------------------------------\n”;
justDoIt(range, mode, fileOut);

		//Close the file handle
	case 2:
		std::cout << "CPU version generating from range (0" << "~" << range << ")\n";
		std::cout << "--------------------------------------------------------------------------------\n";
		CPUgenPrime(range, mode, fileOut);

		//Close the file handle
	case 3:
		std::cout << "Running all available options\n";
		justDoIt(range, mode, fileOut);
		CPUgenPrime(range, mode, fileOut);

		//Close the file handle
		std::cout << "[Invalid option. Only integers 0-3 are allowed]\n";
		menu(range, mode, fileOut);


void setupScreenMode(int range) {
std::cout << “Team /dev/null GPU610 PRIME NUMBER GENERATOR v3.5\n”
<< “[1/3] Would you like to see the output on screen?\n”
<< “0 = NO, write to file only\n”
<< “1 = YES, display on screen\n”
<< "Show on screen?: ";
int mode = 1;
std::cin >> mode;

	//Initialize file handle
	std::ofstream fileOut = fileInit();

	if (mode == 0) {
		std::cout << "***Writing output to file only***\n\n";
		range = setupRange(range);
		menu(range, mode, fileOut);

	else if (mode == 1) {
		std::cout << "***Outputting results on screen***\n\n";
		range = setupRange(range);
		menu(range, mode, fileOut);
	else {
		std::cout << "[Invalid option selected, default option 0 (output to screen) selected]\n\n";
		range = setupRange(range);
		menu(range, 1, fileOut);


//Initialize value to be used in the program using command line arguments
int initRuntimeValue(int argc, char* argv){
//Save runtime parameter into local variable, if provided
int range = 500000;
if (argc == 1) {
std::cout << “[No command line parameters provided]\n\n”;
return 0;
if (argc == 2)
range = std::atoi(argv[1]);
if (range > 2 && range < 500000)
return range;
else {
std::cout << “[Bad input for range parameter (must be <= 500,000)]\n”
<< “Range has been set to 500,000\n”;
return range = 500000;

int main(int argc, char* argv) {
//Grab the command line arguments
int range = initRuntimeValue(argc, argv);

return 0;


I cannot tell which line that is. Please use one of the code formatting features available in the forum when inserting code into a post, otherwise it will be an unreadable wall of text. The simplest way: Select the text, then click on the </> button. The even better way: Enclose the text within <code> and </code> tags.

Before you do anything else, you would want to add error checking to CUDA API calls and kernel launches. If you don’t know how to do that, Google for proper CUDA error checking and you should see a bunch of useful links on the first page of search results.

When sharing code with others for debugging, the usual approach is to simplify that code as much as possible, so that what remains and is posted is the smallest code that still reproduces the issue.

I was searching for many links but none of them help me

As the error indicates, the problem is on the host side. The root cause is likely this:

Don’t allocate large objects on the stack. Use the heap, i.e. use malloc. The default stack size of host environments is typically on the order of several kilobytes, not megabytes.

[Later:] Seems the default stack size has increased since I last looked. For example, for MSVC, Microsoft states:

For ARM, x86 and x64 machines, the default stack size is 1 MB.

However, my point remains valid: It is best practice to allocate large data objects in the heap.

Can you show me where and what to change?What should I write in code instead int data[MAXSIZE]

I am completely confident that you can find out how to allocate memory with malloc, even if you have never done it before. Give it a try, it is really not hard.

I need for school project correct code and cannot get it alone so I need help from someone to check this code and to correct errors because I cant see them.

The point of school projects is for you to learn, including how to get help other than asking random strangers on the internet (which is fine, but does not provide the maximum amount of learning). I already provided a relevant pointer. I could have said “Use dynamic memory allocation” without mentioning malloc. You can use Google to search How to use malloc. I just tried that and the very first link in the search results seems to provide relevant and useful information.

For me it is easier to understand that if someone correct my error than using google for school examples of malloc.After seeing links with how to use malloc I still dont understand and dont know how to put him on my problem

Basically I am asking for someone to correct this code with right and correct code and than to explain what he do