When I launch my program it crashes: the screen flickers and the graphics driver restarts. This only happens when the computation takes a long time. cudaDeviceSynchronize() returns error 6.
I found this in the documentation: cudaDeviceSynchronize() returns an error if one of the preceding tasks has failed.
What does that mean in my case?
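If I understand it correctly, cudaDeviceSynchronize() does not fail by itself; it just reports an error left behind by the kernel that ran before it. This is the minimal launch-and-check pattern I compare my code against (dummyKernel and checkedLaunch are only placeholders here, not my real kernel):

#include <cstdio>
#include "cuda_runtime.h"

__global__ void dummyKernel() { }

int checkedLaunch()
{
    dummyKernel<<<1, 1>>>();

    // Reports problems with the launch configuration itself.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(err));
        return -1;
    }

    // Blocks until the kernel has finished and reports errors that happened
    // while it was running, e.g. error 6, whose description I quote at the
    // end of this post.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
        return -1;
    }
    return 0;
}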
Here is my full code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include "auxilary.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctime>
#include <math.h>
#define M_E 2.71828182845904523536
__host__ cudaError_t PrimesWithCuda(int first, int last,int nofb,int noft,int* res,int size);
__host__ double logN(double n);
__host__ int numberOfPrimes(int first,int last);
__host__ int* allocateOutput(int first,int last, int* size);
__host__ void Primes(int first, int last,int* result);
__global__ void PrimesKernel(int first, int last,int* result);
double logN(double n)
{
    return log(n) / log(M_E);
}
int numberOfPrimes(int first, int last)
{
    if (first < 0 && last < 0)
        return -1;
    if (last < 10000 || last - first < 10000)
        return 1500;
    double nop = (0.997344779 * last / (logN(last) - 1.110432659) - 2.0) * 1.01 - 0.997344779 * first / (logN(first) - 1.110432659) - 2.0;
    return nop;
}
int* allocateOutput(int first, int last, int* size)
{
    int* output;
    int nop = numberOfPrimes(first, last);
    *size = nop;
    if (nop <= 0)
    {
        fprintf(stderr, "Wrong parameters: first & last\n");
        return NULL;
    }
    output = (int*)malloc(sizeof(int) * nop);
    if (output == NULL)
    {
        fprintf(stderr, "output: malloc failed\n");
        return NULL;
    }
    memset(output, 0, nop * sizeof(int));
    output[0] = 1;
    return output;
}
void Primes(int first, int last, int* result)
{
    bool prime = false;
    int ij = 1;
    if (first % 2 == 0)
        first++;
    for (int i = first; i <= last; i += 2)
    {
        prime = true;
        for (int j = 3; j * j <= i && prime; j += 2)
        {
            if (i % j == 0)
                prime = false;
        }
        if (prime)
            result[ij++] = i;
    }
    result[0] = ij - 1;
}
__global__ void PrimesKernel(int first, int last, int* result)
{
    int id = blockDim.x * blockIdx.x + threadIdx.x + first;
    int onoft = gridDim.x * blockDim.x;  // total number of threads in the grid
    if (id % 2 == 0)
        id += (onoft % 2 == 0 ? onoft - 1 : onoft);
    bool prime = false;
    int i, ij = 0;
    for (i = id; i <= last; i += 2 * onoft)
    {
        prime = true;
        for (int j = 3; j * j <= i && prime; j += 2)
        {
            if (i % j == 0)
                prime = false;
        }
        if (prime)
        {
            // ij++;
            while (atomicCAS(result + result[0]++, 0, i));
        }
    }
    // atomicAdd(result, ij);
}
__host__ void DisplayPrimes(int* primes)
{
    int i = 0;
    printf("Number of primes: %d ", primes[i++]);
    while (primes[i] || i < 100)
    {
        printf("%d ", primes[i++]);
    }
    printf("\n");
}
int main(int argc, char** argv)
{
    double elapsed;
    clock_t start, end;
    cudaError_t cudaStatus;
    int* res;
    int size;
    if (argc != 6)
        return -1;
    start = clock();
    int first = atoi(argv[1]);
    int last = atoi(argv[2]);
    int nofb = atoi(argv[3]);      // number of blocks
    int noft = atoi(argv[4]);      // number of threads per block
    int parallel = atoi(argv[5]);
    printf("nop: %d\n", numberOfPrimes(first, last));
    res = allocateOutput(first, last, &size);
    if (res == NULL)
        return -1;
    if (parallel)
    {
        cudaStatus = PrimesWithCuda(first, last, nofb, noft, res, size);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "PrimesWithCuda failed!\n");
            return -1;
        }
        // cudaDeviceReset must be called before exiting in order for profiling and
        // tracing tools such as Parallel Nsight and Visual Profiler to show complete traces.
        cudaStatus = cudaDeviceReset();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceReset failed!\n");
            return -1;
        }
    }
    else
    {
        Primes(first, last, res);
    }
    end = clock();
    elapsed = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("Elapsed time: %lf s\n", elapsed);
    DisplayPrimes(res);
    printf("Number of primes: %d\n", res[0]);
    printf("Hit any key to terminate\n");
    getchar();
    free(res);
    return 0;
}
// Helper function that runs the prime search on the GPU.
cudaError_t PrimesWithCuda(int first, int last, int nofb, int noft, int* output, int size)
{
    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        goto Error;
    }
    /*int* dev_first;
    int* dev_last;*/
    int* dev_output;
    /*cudaStatus = cudaMalloc((void**)&dev_first, sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "dev_first: cudaMalloc failed\n");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_last, sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "dev_last: cudaMalloc failed\n");
        goto Error;
    }*/
    cudaStatus = cudaMalloc((void**)&dev_output, sizeof(int) * size);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "dev_output: cudaMalloc failed, code: %d\n", cudaStatus);
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_output, output, sizeof(int) * size, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "dev_output: HtD cudaMemcpy failed, code: %d\n", cudaStatus);
        goto Error;
    }
    /*cudaStatus = cudaMemcpy(dev_first, &first, sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "dev_first: cudaMemcpy failed\n");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_last, &last, sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "dev_last: cudaMemcpy failed\n");
        goto Error;
    }*/
    if (first % 2 == 0)
        first++;
    // Launch the kernel with nofb blocks of noft threads; each thread tests
    // a strided range of odd candidates.
    PrimesKernel<<<nofb, noft>>>(first, last, dev_output);
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching PrimesKernel!\n", cudaStatus);
        goto Error;
    }
    cudaStatus = cudaMemcpy(output, dev_output, sizeof(int) * size, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "dev_output: DtH cudaMemcpy failed, code: %d\n", cudaStatus);
        goto Error;
    }
    cudaFree(dev_output);
Error:
    return cudaStatus;
}
I have also found a description of the error, but I don't fully understand it.
/**
 * This indicates that the device kernel took too long to execute. This can
 * only occur if timeouts are enabled - see the device property
 * \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled"
 * for more information. The device cannot be used until ::cudaThreadExit()
 * is called. All existing device memory allocations are invalid and must be
 * reconstructed if the program is to continue using CUDA.
 */
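To check whether this timeout is enabled on my GPU, I assume the property can be read with cudaGetDeviceProperties, roughly like this (a minimal sketch; device 0 is assumed):

#include <cstdio>
#include "cuda_runtime.h"

int main()
{
    cudaDeviceProp prop;
    cudaError_t err = cudaGetDeviceProperties(&prop, 0);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
        return -1;
    }
    // Nonzero means a watchdog limit on kernel run time is enabled,
    // which is typical when the GPU also drives the display.
    printf("kernelExecTimeoutEnabled: %d\n", prop.kernelExecTimeoutEnabled);
    return 0;
}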