I queried the maximum number of threads per block supported by my GPU and got the following message: `Maximum number of threads per block: 1024`.

I have a kernel defined below:

```
/*
 * mulFwdRotVecLessSharedMem
 *
 * For each of the `num` descriptors in `rcrDescrs`, multiplies the blocks of
 * the descriptor's forward rotation matrix (fwdRotMat) by the matching slices
 * of its coefficient vector (coeffs) and stores the result in prod. Block n
 * (n = 0 .. p-1) is a (2n+1) x (2n+1) matrix located at rotMat + ROTMATSZ(n).
 *
 * Launch layout (as used by the caller):
 *   - grid : 1-D; block b processes descriptors b, b + gridDim.x, ...
 *   - block: 2-D, at least (2*p-1) x (2*p-1) threads; threadIdx.x selects the
 *            column (m) and threadIdx.y the row (mp) of the current block.
 *   - dynamic shared memory: (4*p-2) * sizeof(cuDoubleComplex) bytes
 *            (2*p-1 entries for the input slice + 2*p-1 for the product).
 *
 * rcrDescrs : device array of descriptors.
 * num       : number of descriptors in rcrDescrs.
 *
 * NOTE(review): atomicAdd on cuDoubleComplex is not a CUDA built-in;
 * presumably a project-provided overload is in scope - confirm.
 * NOTE(review): a launch can fail even when the block size is below the
 * device-wide "max threads per block", because the per-kernel limit also
 * depends on this kernel's register usage. cudaFuncGetAttributes() reports the
 * kernel-specific maxThreadsPerBlock; __launch_bounds__ or -maxrregcount can
 * raise it. Whether that is the cause here should be confirmed on the target
 * device.
 */
__global__ void mulFwdRotVecLessSharedMem(HorizRCR* rcrDescrs, const int num)
{
    // input slice followed by the product: (2*p-1) + (2*p-1) = 4*p-2 entries
    extern __shared__ cuDoubleComplex prod_vec[];

    // idx depends only on blockIdx/gridDim, so all threads of a block execute
    // the same iterations and every __syncthreads() below is reached uniformly.
    for (int idx = blockIdx.x; idx < num; idx += gridDim.x)
    {
        // pointers into global memory for this descriptor
        HorizRCR *rcrDescr = rcrDescrs + idx;
        cuDoubleComplex *rotMat = rcrDescr->fwdRotMat;
        cuDoubleComplex *coeffs = rcrDescr->coeffs;
        cuDoubleComplex *prod = rcrDescr->prod;
        const int p = rcrDescr->p;

        // shared-memory staging areas for the vector slice and the product
        cuDoubleComplex *vecBlock = prod_vec;
        cuDoubleComplex *prodBlock = prod_vec + (2*p-1);

        for (int n = 0; n < p; n++)
        {
            cuDoubleComplex *rotBlock = rotMat + ROTMATSZ(n); // address of the current block
            const int mp = (int)threadIdx.y - n;
            const int m = (int)threadIdx.x - n;
            const bool rowInRange = abs(mp) <= n;
            const bool colInRange = abs(m) <= n;

            // one row of threads loads the input slice, one column clears the product
            if (mp == 0 && colInRange) vecBlock[m+n] = coeffs[NM2IDX0(n,m)];
            if (m == 0 && rowInRange) prodBlock[mp+n] = make_cuDoubleComplex(0.0,0.0);
            __syncthreads();

            // every (row, column) thread contributes one term of the mat-vec product
            if (rowInRange && colInRange)
                atomicAdd(prodBlock+(mp+n),cuCmul(rotBlock[IDXC0(mp+n,m+n,2*n+1)],vecBlock[m+n]));
            __syncthreads();

            if (rowInRange && m == 0) prod[NM2IDX0(n,mp)] = prodBlock[mp+n];

            // Without this barrier a faster warp could zero prodBlock for the
            // next n (or the next descriptor) before the read above has
            // happened in a slower warp.
            __syncthreads();
        }
    }
}
```

where `HorizRCR` is defined by

```
// Descriptor for one RCR operation in the horizontal pass.
// Member order is kept as-is: the same layout is read by device code.
struct HorizRCR
{
    // Forward rotation matrix; read block by block in mulFwdRotVecLessSharedMem.
    cuDoubleComplex *fwdRotMat = nullptr;

    // Not used by the kernel shown here; presumably the coaxial translation
    // matrix and its truncation number - confirm against the other passes.
    cuDoubleComplex *coaxMat = nullptr;
    int pCoax = 0;

    // Not used by the kernel shown here; presumably the inverse rotation matrix.
    cuDoubleComplex *invRotMat = nullptr;

    // Input coefficient vector multiplied by the rotation matrix.
    cuDoubleComplex *coeffs = nullptr;

    // Not used by the kernel shown here; presumably scratch space for coefficients.
    cuDoubleComplex *bufferCoeffs = nullptr;

    // Output of the matrix-vector product.
    cuDoubleComplex *prod = nullptr;

    // Number of rotation blocks processed (n = 0 .. p-1).
    int p = 0;

    // Target address where prod is to be added.
    cuDoubleComplex *tgt = nullptr;
};
```

I launched the kernel using the following statements:

```
// One thread per entry of the largest (2*p-1) x (2*p-1) rotation block, one
// thread block per rotation, and room in shared memory for the input slice
// plus the product (4*p-2 complex values).
dim3 blkDim(2*p-1,2*p-1,1);
dim3 gridDim(numRots,1,1);
mulFwdRotVecLessSharedMem<<<gridDim,blkDim,(4*p-2)*sizeof(cuDoubleComplex)>>>(horizTransDescrs_d,numRots);
{
    // A kernel launch does not return a status; configuration errors (e.g.
    // "too many resources requested for launch") only show up here.
    const cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess)
    {
        printf("mulFwdRotVecLessSharedMem launch failed: %s\n", cudaGetErrorString(launchErr));

        // The device-wide limit (e.g. 1024) is only an upper bound: the limit
        // for a particular kernel also depends on its register usage, and is
        // what cudaFuncGetAttributes reports as maxThreadsPerBlock.
        cudaFuncAttributes kernAttr;
        if (cudaFuncGetAttributes(&kernAttr, mulFwdRotVecLessSharedMem) == cudaSuccess)
        {
            printf("  requested %u threads per block; this kernel allows at most %d (%d registers per thread)\n",
                   blkDim.x*blkDim.y*blkDim.z, kernAttr.maxThreadsPerBlock, kernAttr.numRegs);
        }
    }
}
```

The problem is that the kernel launch fails when p = 12 but succeeds whenever p < 12. As I understand it, with p = 12 the number of threads in a block is 23 * 23 = 529 < 1024. Why, then, does the launch fail?