I’m receiving an “ERROR: Kernel renderKernel failed” message when nP exceeds 512. If nP > 512, the code limits the number of threads per block to 512 and increases the number of blocks to two; the error message appears after that launch.
This is the relevant code snippet:
/**
 * Fills each row of C with successive complex powers of B scaled by A:
 *
 *     C[i*nP + 0] = A[i]
 *     C[i*nP + j] = C[i*nP + j-1] * B[i]      for 1 <= j < nP
 *
 * Complex values are stored split: real parts in *real, imaginary parts in
 * *imag.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per index i. The grid may
 * contain more than nS threads; surplus threads exit without touching memory.
 *
 * @param Areal,Aimag  inputs, read at index i            (i < nS)
 * @param Breal,Bimag  inputs, read at index i            (i < nS)
 * @param Creal,Cimag  outputs, written at [i*nP, i*nP + nP)
 * @param nS           number of indices i to process
 * @param nP           number of output elements per index i
 *
 * NOTE(review): assumes the C buffers do not alias A or B — the B values and
 * the running product are held in registers for the whole row.
 */
__global__ void rPolyKernel(float* Areal, float* Aimag,
                            float* Breal, float* Bimag,
                            float* Creal, float* Cimag,
                            size_t nS, size_t nP)
{
    // Widen before multiplying so the flat index cannot wrap in 32 bits.
    const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    // Grids rarely divide the data evenly: threads past the end must not
    // read A/B or write C. An empty row (nP == 0) has nothing to write.
    if (i >= nS || nP == 0)
    {
        return;
    }

    const float bRe = Breal[i];
    const float bIm = Bimag[i];

    float* rowRe = Creal + i * nP;
    float* rowIm = Cimag + i * nP;

    // Running product, kept in registers instead of re-reading C[j-1].
    float re = Areal[i];
    float im = Aimag[i];
    rowRe[0] = re;
    rowIm[0] = im;

    for (size_t j = 1; j < nP; j++)
    {
        // (re + i*im) * (bRe + i*bIm); both parts use the previous value.
        const float nextRe = (re * bRe) - (im * bIm);
        const float nextIm = (re * bIm) + (im * bRe);
        re = nextRe;
        im = nextIm;
        rowRe[j] = re;
        rowIm[j] = im;
    }
}
…
// Host-side split-complex buffer for D. The arrays are value-initialised
// (zero-filled) so that the upload to Dreal/Dimag below never copies
// indeterminate memory to the device.
mathutils::splitComplex* scatsD = new mathutils::splitComplex;
scatsD->realp = new float[numP]();
scatsD->imagp = new float[numP]();

// NOTE(review): the return codes of the cudaMalloc/cudaMemcpy calls below are
// not checked; a failed allocation or copy will only surface later (for
// example as a kernel failure). Consider checking each cudaError_t.

// Device allocations: A and B use memSizeS bytes, C uses memSizeSBP bytes,
// D uses memSizeP bytes (real and imaginary parts are separate buffers).
cudaMalloc((void**)&Areal, memSizeS);
cudaMalloc((void**)&Aimag, memSizeS);
cudaMalloc((void**)&Breal, memSizeS);
cudaMalloc((void**)&Bimag, memSizeS);
cudaMalloc((void**)&Creal, memSizeSBP);
cudaMalloc((void**)&Cimag, memSizeSBP);
cudaMalloc((void**)&Dreal, memSizeP);
cudaMalloc((void**)&Dimag, memSizeP);

// Upload the host contents of A, B, C and D to their device buffers.
// NOTE(review): assumes memSizeP does not exceed numP * sizeof(float), the
// size of the host arrays allocated above — confirm where memSizeP is set.
cudaMemcpy(Areal, scatsA->realp, memSizeS, cudaMemcpyHostToDevice);
cudaMemcpy(Aimag, scatsA->imagp, memSizeS, cudaMemcpyHostToDevice);
cudaMemcpy(Breal, scatsB->realp, memSizeS, cudaMemcpyHostToDevice);
cudaMemcpy(Bimag, scatsB->imagp, memSizeS, cudaMemcpyHostToDevice);
cudaMemcpy(Creal, scatsC->realp, memSizeSBP, cudaMemcpyHostToDevice);
cudaMemcpy(Cimag, scatsC->imagp, memSizeSBP, cudaMemcpyHostToDevice);
cudaMemcpy(Dreal, scatsD->realp, memSizeP, cudaMemcpyHostToDevice);
cudaMemcpy(Dimag, scatsD->imagp, memSizeP, cudaMemcpyHostToDevice);
// Choose the launch shape from numP: up to 512 threads fit in one block;
// beyond that the block size is capped at 512 and the block count is numP
// divided by 512, rounded up.
//
// NOTE(review): rPolyKernel uses its global thread index to read
// Areal[i]/Breal[i] (allocated with memSizeS) and receives numS as its
// element count, yet the grid here is sized from numP (and numT). Whenever
// numBlocks * numThreads exceeds the number of valid indices, the surplus
// threads access memory out of range, which would explain a kernel failure
// once numP > 512 is rounded up to a multiple of 512. Confirm the intended
// relation between numS, numT and numP.
if (numP <= 512)
{
    numBlocks = numT;
    numThreads = numP;
}
else
{
    numThreads = 512;
    // Round-up division: one extra block when numP is not a multiple of 512.
    numBlocks = (numP + numThreads - 1) / numThreads;
}

rPolyKernel<<<numBlocks, numThreads>>>(Areal, Aimag, Breal, Bimag,
                                       Creal, Cimag, numS, numP);
…