This tiny kernel fails with an “unspecified launch failure” error on a GeForce GT 430. Lowering LOOP to 600 makes the failure go away.
I know this code is silly, but is it a hardware limitation or a hardware bug that I’ve hit?
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
// Number of atomicAdd iterations each thread executes inside coalesce_shared.
#define LOOP 1000
// Threads per block passed to the kernel launch in main (32 * 3 = 96).
#define THREADS_PER_BLOCK (32 * 3)
// Number of blocks passed to the kernel launch in main.
#define BLOCKS 100
// Stress kernel: each thread issues LOOP floating-point atomicAdd operations
// on its own element of a block-local __shared__ array.
//
// Launch layout: 1D blocks, indexed by threadIdx.x only. Threads whose index
// falls outside the 1000-element array do nothing. The kernel takes no
// arguments and writes nothing to global memory.
//
// NOTE(review): float atomicAdd is presumably only available on devices of
// compute capability 2.0 or newer -- confirm against the CUDA Programming Guide.
__global__ void coalesce_shared()
{
    __shared__ float ff[1000];

    const unsigned int slot = threadIdx.x;

    // Guard against block sizes larger than the array. Returning early is
    // safe because this kernel contains no block-wide barrier.
    if (slot >= sizeof(ff) / sizeof(ff[0]))
        return;

    // Shared memory is not initialized by the runtime; zero this thread's
    // element before accumulating so the atomics never read garbage.
    // No __syncthreads() is required: each thread only touches its own slot.
    ff[slot] = 0.0f;

    for (int i = 0; i < LOOP; i++)
        atomicAdd(&ff[slot], 1.0f);
}
// Launches coalesce_shared with BLOCKS blocks of THREADS_PER_BLOCK threads and
// prints the CUDA status twice: once right after the launch and once after
// waiting for the kernel to finish.
//
// Returns 0 when both checks report cudaSuccess, EXIT_FAILURE otherwise.
int main()
{
    coalesce_shared<<<BLOCKS, THREADS_PER_BLOCK>>>();

    // A kernel launch does not return a status itself; errors detected at
    // launch time are retrieved with cudaGetLastError().
    cudaError_t launchStatus = cudaGetLastError();
    printf("Error: %s\n", cudaGetErrorString(launchStatus));

    // Faults raised while the kernel runs only surface at a synchronizing
    // call. cudaDeviceSynchronize() replaces the deprecated
    // cudaThreadSynchronize(); its return value carries the execution status.
    cudaError_t syncStatus = cudaDeviceSynchronize();
    printf("Error: %s\n", cudaGetErrorString(syncStatus));

    // Propagate failure to the caller instead of always reporting success.
    if (launchStatus != cudaSuccess || syncStatus != cudaSuccess)
        return EXIT_FAILURE;
    return 0;
}