Dear all
when I try to compile this code with geforce gt 740m, it give me the following error:
Error 3 error : Entry function ‘Z10FinalPart1PiS’ uses too much shared data (0x10000 bytes, 0xc000 max)
how can I fix this problem??
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_functions.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cmath>
#define PP 1
#define P (PP*128)
#define N P*32*1024
// Define this to turn on error checking
#define CUDA_ERROR_CHECK
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
//****************************************************************************************
// functions for cuda error checking
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
if (cudaSuccess != err)
{
fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
#endif
return;
}
inline void __cudaCheckError(const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
// More careful checking. However, this will affect performance.
// Comment away if needed.
err = cudaDeviceSynchronize();
if (cudaSuccess != err)
{
fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
#endif
return;
}
__global__ void FinalPart1(int *a, int *temp)
{
__shared__ int part_ary[16* 1024];
int threadId = threadIdx.x;
part_ary[threadId] = a[threadId ];
__syncthreads();
part_ary[threadId] = part_ary[threadId] * blockIdx.x;
__syncthreads();
a[threadId + (blockIdx.x *( 32 * 1024))] = part_ary[threadId];
}
//****************************************************************************************************************************************
int main()
{
int *a; // the main array to be sorted
int *dev_a, *dev_temp1; // array on the device
//*********************************************
cudaMalloc((void **)&dev_a, N*sizeof(int));//allocate memory on device
cudaMalloc((void **)&dev_temp1, N*sizeof(int));//allocate memory on device
a = (int *)malloc(N*sizeof(int)); //allocate memory on host
//*******1*************************************
//--------------------------
//genetate random data
for (int i = 0; i < N; i++)
{
a[i] = 1;
}
//*********************************************************
//transfer data from host memory to device memory, where dev_a is the destination
cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
//*********************************************************
//*************************************************************
// calculating elapsed time, phase1
cudaEvent_t start, stop; float time;
cudaEventCreate(&start); cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//**************************************************************
//start execution
//*********************************************************************************
//sort up to N=1024*1024
//********************************************************************************
FinalPart1 << <32, 1024>> >(dev_a, dev_temp1);
//***********************************************************************
// calculating elapsed time, phase2
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("kernel time in ms:\t%.7f\n", time);
//**********************************************************************
//------------------------------------------------------------
// cuda error checking
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess)
{
printf("CUDA Error: %s\n", cudaGetErrorString(error));
// we can't recover from the error -- exit the program
return 1;
}
//-----------------------------------------------------------
cudaDeviceSynchronize();
cudaMemcpy(a, dev_a, N*sizeof(int), cudaMemcpyDeviceToHost); // return sorted array to host
//***********************************************************************************
for (int i = 0; i <N; i++)
{
printf("\n\n ** \n \n");
printf("%d ", a[i]);
}
printf("\n ++++++++++++++++++++++++++++++++++++++\n\n\n\n ");
cudaFree(dev_a);//free memory on device
cudaFree(dev_temp1);
free(a);//free memory on host
}