CUDA runtime API error 6 ("the launch timed out and was terminated") and the kernel is stopped: what is my problem?

When I run this code on 900000000 numbers, it stops with this error: CUDA runtime API error 6, "the launch timed out and was terminated", and the kernel is stopped. What is my problem? I tested it on a 420M.

#include <tchar.h>
#include <windows.h>
#include <iostream>
#define _USE_MATH_DEFINES
#include <math.h>
#include "tbb/parallel_for.h"
#include "tbb/parallel_reduce.h"
#include "tbb/blocked_range.h"
#include "tbb/tick_count.h"
#include "tbb/task_scheduler_init.h"
//****************************************
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand.h"
//****************************************
using namespace std;
using namespace tbb;

const int list_count = 9000000; // Number of values tested for primality; sizes every buffer in this file.
// Host-side input buffer of list_count ints, filled by make_array().
// NOTE(review): no matching delete[] is visible in this file - confirm it is intentionally process-lifetime.
int *data = new int[list_count];





// Serial reference implementation of the primality test.
//
// For each of the first `list_count` entries of `my_array`, writes 1 to the
// matching slot of `prime_array` when the value is prime and 0 otherwise.
// Primality is decided by trial division with odd factors.
//
// my_array    - input values; must hold at least list_count ints.
// prime_array - output flags; must hold at least list_count ints.
void find_primes(int*  &my_array, int *&prime_array){
	for (int list = 0; list < list_count; list++){
		const int value = my_array[list];
		int prime;
		if (value < 2) {
			// 0, 1 and negative numbers are not prime.
			prime = 0;
		}
		else if (value == 2) {
			// The only even prime.
			prime = 1;
		}
		else if (value % 2 == 0) {
			prime = 0;
		}
		else {
			// Truncated square root padded by one, so a float result that
			// lands slightly low cannot hide the largest candidate factor.
			const int limit = (int)sqrt((float)value) + 1;
			prime = 1;
			for (int factor = 3; prime && (factor <= limit); factor += 2) {
				if (value % factor == 0) prime = 0;
			}
		}
		prime_array[list] = prime;
	}
}


// Multi-threaded CPU implementation of the primality test using
// tbb::parallel_for over the index range [0, list_count).
//
// For each index, writes 1 to `prime_array` when the corresponding value of
// `my_array` is prime and 0 otherwise. Each index is written by exactly one
// loop iteration, so the sub-ranges do not overlap in their writes.
//
// my_array    - input values; must hold at least list_count ints.
// prime_array - output flags; must hold at least list_count ints.
void parallel_find_primes(int *&my_array, int *& prime_array){
	parallel_for(blocked_range<int>(0, list_count),
		// [=] copies the two pointers into the lambda.
		[=](const blocked_range<int>& r) {
		for (int list = r.begin(); list != r.end(); list++){
			const int value = my_array[list];
			int prime;
			if (value < 2) {
				// 0, 1 and negative numbers are not prime.
				prime = 0;
			}
			else if (value == 2) {
				// The only even prime.
				prime = 1;
			}
			else if (value % 2 == 0) {
				prime = 0;
			}
			else {
				// Truncated square root padded by one, so a float result that
				// lands slightly low cannot hide the largest candidate factor.
				const int limit = (int)sqrt((float)value) + 1;
				prime = 1;
				for (int factor = 3; prime && (factor <= limit); factor += 2) {
					if (value % factor == 0) prime = 0;
				}
			}
			prime_array[list] = prime;
		}
	});
}
// GPU primality test: one thread classifies one element.
//
// Launch layout: any 1D/2D/3D grid of 1D/2D/3D blocks. The thread's position
// is flattened into a single index `tid`; threads whose index falls outside
// [0, list_count) exit without touching memory, so the launch only has to
// provide at least list_count threads in total. Uses no shared memory.
//
// my_array    - device pointer to at least list_count input values.
// prime_array - device pointer to at least list_count output flags;
//               receives 1 for a prime value and 0 otherwise.
__global__ void parallel_find_primes_by_cuda(int* my_array, int *prime_array )
{
	const int threadsPerBlock = blockDim.x * blockDim.y * blockDim.z;
	const int threadPosInBlock = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z;
	const int blockPosInGrid = blockIdx.x + gridDim.x * blockIdx.y + gridDim.x * gridDim.y * blockIdx.z;
	const int tid = blockPosInGrid * threadsPerBlock + threadPosInBlock;

	// Valid indices are 0 .. list_count-1; index list_count itself is already
	// one element past the end of both buffers.
	if (tid >= list_count)
		return;

	// Read the element once instead of on every trial division.
	const int value = my_array[tid];
	int prime;
	if (value < 2) {
		// 0, 1 and negative numbers are not prime.
		prime = 0;
	}
	else if (value == 2) {
		// The only even prime.
		prime = 1;
	}
	else if (value % 2 == 0) {
		prime = 0;
	}
	else {
		// Single-precision square root; the +1 pads the truncated result so a
		// slightly low float cannot hide the largest candidate factor.
		const int limit = (int)sqrtf((float)value) + 1;
		prime = 1;
		for (int factor = 3; prime && (factor <= limit); factor += 2) {
			if (value % factor == 0) prime = 0;
		}
	}
	prime_array[tid] = prime;
}
// Fills the global `data` buffer with the repeating sequence
// 1, 2, ..., 1000000, 1, 2, ... over its list_count entries, then prints a
// confirmation line to stdout.
void make_array(){
	const int cycle_length = 1000000;
	int *slot = data;
	for (int index = 0; index < list_count; ++index, ++slot){
		// Position inside the current cycle, shifted so the cycle starts at 1.
		*slot = (index % cycle_length) + 1;
	}
	cout << "Array created." << endl;
}

// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// Returns true when `status` is cudaSuccess and false otherwise.
static bool cuda_call_ok(cudaError_t status, const char *what)
{
	if (status == cudaSuccess)
		return true;
	fprintf(stderr, "%s failed: %s (error %d)\n", what, cudaGetErrorString(status), (int)status);
	return false;
}

// Program entry point: builds the input array, runs the CUDA primality kernel
// over it, copies the flags back to the host and prints the kernel time.
//
// Every CUDA call is checked; after the first failure the remaining GPU steps
// are skipped, all resources are released and the process returns 1.
//
// NOTE: on a GPU that also drives the Windows desktop, a kernel that runs
// longer than the display-driver watchdog (WDDM Timeout Detection and
// Recovery) allows is killed and surfaces here as "the launch timed out and
// was terminated". Remedies are to give each launch less work, raise the TDR
// delay, or run on a GPU that is not attached to a display.
int _tmain(int argc, _TCHAR* argv[])
{
	char x;
	int *isprime = new int[list_count];   // only used by the commented-out CPU runs below
	int *hostrime = new int[list_count];  // receives the GPU result
	int *device = NULL, *primes = NULL;
	cudaEvent_t starte = NULL, ende = NULL;
	float m = 0.0f;
	const size_t bytes = (size_t)list_count * sizeof(int);

	//cout << "Creating array of " << list_count << " numbers." << endl << "*************************************" << endl << endl;
	make_array();
	//cout << "Finding primes serially.  This may take a few seconds. " << endl << "*************************************" << endl << endl;
	//tick_count serial_prime_start = tick_count::now();
	//find_primes(data, isprime);
	//tick_count serial_prime_end = tick_count::now();
	//cout << "Time to find primes serially for " << list_count << " numbers: " << (serial_prime_end - serial_prime_start).seconds() << " seconds." << endl << "*************************************" << endl << endl;
	//cout << "Finding primes in parallel.  This may take a few seconds. " << endl << "*************************************" << endl;
	//tick_count parallel_prime_start = tick_count::now();
	//parallel_find_primes(data, isprime);
	//tick_count parallel_prime_end = tick_count::now();
	//cout << "Time to find primes in parallel for " << list_count << " numbers: " << (parallel_prime_end - parallel_prime_start).seconds() << " seconds." << endl << "*************************************" << endl << endl;
	//cout << "coping data to device" << endl << "*************************************" << endl<<endl;

	// Launch shape: 256-thread blocks (a multiple of the 32-thread warp) and
	// just enough blocks to cover list_count. The block count spills into the
	// grid's y dimension once it exceeds 65535, the per-dimension limit on
	// older devices; the kernel flattens 2D grids itself.
	const unsigned int threadsPerBlock = 256;
	const unsigned int maxGridDim = 65535;
	const unsigned int blocksNeeded = (list_count + threadsPerBlock - 1) / threadsPerBlock;
	const unsigned int gridX = blocksNeeded < maxGridDim ? blocksNeeded : maxGridDim;
	const unsigned int gridY = (blocksNeeded + gridX - 1) / gridX;
	dim3 grid(gridX, gridY);
	dim3 threads(threadsPerBlock);

	// && short-circuits, so nothing runs after the first failing call.
	bool ok =
		cuda_call_ok(cudaMalloc(&device, bytes), "cudaMalloc(device)") &&
		cuda_call_ok(cudaMalloc(&primes, bytes), "cudaMalloc(primes)") &&
		cuda_call_ok(cudaMemcpy(device, data, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)") &&
		cuda_call_ok(cudaEventCreate(&starte), "cudaEventCreate(start)") &&
		cuda_call_ok(cudaEventCreate(&ende), "cudaEventCreate(end)");

	cout << "coping end and start the exicutint in device device" << endl << "*************************************" << endl << endl;

	if (ok)
		ok = cuda_call_ok(cudaEventRecord(starte, 0), "cudaEventRecord(start)");

	if (ok) {
		parallel_find_primes_by_cuda << <grid, threads >> >(device, primes);

		ok =
			// Launch-configuration errors are reported immediately.
			cuda_call_ok(cudaGetLastError(), "kernel launch") &&
			// The stop event directly follows the kernel so that only kernel
			// time is measured, not the copy back to the host.
			cuda_call_ok(cudaEventRecord(ende, 0), "cudaEventRecord(end)") &&
			// Waiting on the stop event is where execution errors, including
			// a watchdog timeout, become visible.
			cuda_call_ok(cudaEventSynchronize(ende), "kernel execution") &&
			cuda_call_ok(cudaEventElapsedTime(&m, starte, ende), "cudaEventElapsedTime") &&
			cuda_call_ok(cudaMemcpy(hostrime, primes, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
	}

	if (ok)
		cout << "Time to find primes in parallel on cuda for " << list_count << " numbers: " <<  m << " mili seconds." << endl << "*************************************" << endl << endl;

	// Release everything that was successfully created.
	if (starte != NULL) cudaEventDestroy(starte);
	if (ende != NULL) cudaEventDestroy(ende);
	cudaFree(device);
	cudaFree(primes);

	// Keep the console window open until the user types something.
	cin >> x;
	//for (int i = 0; i < list_count; i++)
	//{
	//	if (isprime[i] == hostrime[i])
	//	
	//		cout << "eshah shahriyar" ;
	//}

	delete[] isprime;
	delete[] hostrime;

	return ok ? 0 : 1;
}

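Take a look at this thread about the WDDM Timeout Detection and Recovery (TDR) mechanism, which resets the display driver when a kernel runs for too long: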

https://devtalk.nvidia.com/default/topic/459869/cuda-programming-and-performance/-quot-display-driver-stopped-responding-and-has-recovered-quot-wddm-timeout-detection-and-recovery-/