Hi there,
I am running a CUDA code in Tesla C2075 card. I run the CUDA example to list the hardware parameters and it said that the card support 1024x1024x64 blocks and each block support 1024 threads (or 32x32 threads dim). I try the following code to use 600x600 blocks and 32x32 threads but it shows the error code saying that too many resources requested for launch. I don’t know why is that
#include <cuda.h>
#include <iostream>
#include "cuPrintf.cu"
using namespace std;
#define PI 4.0*atan(1.0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR() \
do { \
/* Check synchronous errors, i.e. pre-launch */ \
cudaError_t err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
/* Check asynchronous errors, i.e. kernel failed (ULF) */ \
err = cudaThreadSynchronize(); \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
int InitGPUSet()
{
char GPU[100] = "GPU: ";
char str[1000];
cudaDeviceProp tCard;
int num = 0;
if (cudaSuccess == cudaGetDeviceCount(&num))
{
for (int i = 0; i < num; ++ i)
{
cudaSetDevice(i);
cudaGetDeviceProperties(&tCard, i);
puts(strcat(GPU, tCard.name));
}
}
else return 0;
return 1;
}
bool cuPrintInit()
{
cudaError_t err = cudaPrintfInit();
if(0 != strcmp("no error", cudaGetErrorString(err))) return false;
return true;
}
__global__ void test(void)
{
unsigned int x = blockIdx.x;
unsigned int y = blockIdx.y;
unsigned int w = threadIdx.x;
unsigned int z = threadIdx.y;
double X[1000], Y[1000];
double phi=0;
X[0] = x*0.1;
Y[0] = y*0.1;
cuPrintf("blockX=%d blockY=%d threadX=%d threadY=%d %d\n", x, y, w, z, sizeof(X)/sizeof(double));
for (int n=0; n<1000; n++)
{
Y[n+1] = sin(X[n] + phi);
X[n+1] = X[n];
}
}
int main(void)
{
if(!InitGPUSet()) puts("device is not ready!");
else if(!cuPrintInit()) puts("device is not ready!");
else
{
test<<<dim3(600, 600) , dim3(32, 32)>>>();
CHECK_LAUNCH_ERROR();
cudaPrintfDisplay(stdout, true);
cudaPrintfEnd();
}
}
What really confusing me is if I remove the line 68 “Y[n+1]=sin(X(n)+phi)” or replace it with “Y[n+1] = 1;” in the loop of the kernel, it is running without any error. So do you have any idea what to cause the error? Thanks.