when i want to do this cod for 900000000 of number its going to be stop and this error cuda runtime api error 6 the launch timed out and was terminated and kernel stopped whats is my problem i test it on 420m
#include <tchar.h>
#include <windows.h>
#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include "tbb/parallel_for.h"
#include "tbb/parallel_reduce.h"
#include "tbb/blocked_range.h"
#include "tbb/tick_count.h"
#include "tbb/task_scheduler_init.h"
//****************************************
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand.h"
//****************************************
using namespace std;
using namespace tbb;
const int list_count = 9000000;
int *data = new int[list_count];
void find_primes(int* &my_array, int *&prime_array
){
int prime, factor, limit;
for (int list = 0; list < list_count; list++){
prime = 1;
if ((my_array
<ul> % 2) == 1) {
limit = (int)sqrt((float)my_array
<ul>) + 1;
factor = 3;
while (prime && (factor <= limit)) {
if (my_array
<ul> % factor == 0) prime = 0;
factor += 2;
}
}
else prime = 0;
if (prime) {
prime_array
<ul> = 1;
}
else
prime_array
<ul> = 0;
}
}
void parallel_find_primes(int *&my_array, int *& prime_array){
parallel_for(blocked_range<int>(0, list_count),
[=](const blocked_range<int>& r) {
int prime, factor, limit;
for (int list = r.begin(); list != r.end(); list++){
prime = 1;
if ((my_array
<ul> % 2) == 1) {
limit = (int)sqrt((float)my_array
<ul>) + 1;
factor = 3;
while (prime && (factor <= limit)) {
if (my_array
<ul> % factor == 0) prime = 0;
factor += 2;
}
}
else prime = 0;
if (prime)
prime_array
<ul> = 1;
else
prime_array
<ul> = 0;
}
});
}
__global__ void parallel_find_primes_by_cuda(int* my_array, int *prime_array )
{
int threadsPerBlock = blockDim.x * blockDim.y * blockDim.z;
int threadPosInBlock = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z;
int blockPosInGrid = blockIdx.x + gridDim.x * blockIdx.y + gridDim.x * gridDim.y * blockIdx.z;
int tid = blockPosInGrid * threadsPerBlock + threadPosInBlock;
int prime, factor, limit;
if (tid <= list_count)
{
prime = 1;
if ((my_array[tid] % 2) == 1) {
limit = (int)sqrt((float)my_array[tid]) + 1;
factor = 3;
while (prime && (factor <= limit)) {
if (my_array[tid] % factor == 0) prime = 0;
factor += 2;
}
}
else prime = 0;
if (prime) {
prime_array[tid] = 1;
}
else
prime_array[tid] = 0;
}
}
void make_array(){
int multiplier = 1;
for (int i = 0; i < list_count; i++){
if (i % 1000000 == 0) {
multiplier = 1;
}
data[i] = multiplier;
multiplier++;
}
cout << "Array created." << endl;
}
int _tmain(int argc, _TCHAR* argv[])
{
char x;
int *isprime = new int[list_count];
int *hostrime = new int[list_count];
int *device,*primes;
//cout << "Creating array of " << list_count << " numbers." << endl << "*************************************" << endl << endl;
make_array();
//cout << "Finding primes serially. This may take a few seconds. " << endl << "*************************************" << endl << endl;
//tick_count serial_prime_start = tick_count::now();
//find_primes(data, isprime);
//tick_count serial_prime_end = tick_count::now();
//cout << "Time to find primes serially for " << list_count << " numbers: " << (serial_prime_end - serial_prime_start).seconds() << " seconds." << endl << "*************************************" << endl << endl;
//cout << "Finding primes in parallel. This may take a few seconds. " << endl << "*************************************" << endl;
//tick_count parallel_prime_start = tick_count::now();
//parallel_find_primes(data, isprime);
//tick_count parallel_prime_end = tick_count::now();
//cout << "Time to find primes in parallel for " << list_count << " numbers: " << (parallel_prime_end - parallel_prime_start).seconds() << " seconds." << endl << "*************************************" << endl << endl;
//cout << "coping data to device" << endl << "*************************************" << endl<<endl;
cudaEvent_t starte, ende;
float m;
cudaMalloc(&device,( list_count*sizeof(int)));
cudaMalloc(&primes, (list_count*sizeof(int)));
cudaMemcpy(device, data, list_count*sizeof(int),cudaMemcpyKind::cudaMemcpyHostToDevice);
dim3 block(125,125,8);
dim3 threads(30,30);
cudaError_t cudaStatus;
cout << "coping end and start the exicutint in device device" << endl << "*************************************" << endl << endl;
cudaEventCreate(&starte);
cudaEventCreate(&ende);
cudaEventRecord(starte,0);
parallel_find_primes_by_cuda << <block, threads >> >(device, primes);
cudaMemcpy(hostrime,primes, list_count*sizeof(int), cudaMemcpyKind::cudaMemcpyDeviceToHost);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
}
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
}
cudaEventRecord(ende,0);
cudaEventSynchronize(ende);
cudaEventElapsedTime(&m,starte,ende);
cudaFree(device);
cudaFree(primes);
cout << "Time to find primes in parallel on cuda for " << list_count << " numbers: " << m << " mili seconds." << endl << "*************************************" << endl << endl;
cin >> x;
//for (int i = 0; i < list_count; i++)
//{
// if (isprime[i] == hostrime[i])
//
// cout << "eshah shahriyar" ;
//}
return 0;
}