I’ve allocated 256 floats of shared memory, and the program freezes in Device Emulation mode when I try to copy from global memory to shared memory.
(WinXP MSVS2005)
The simple test code that reproduces this problem is as follows:
(The program freezes when the value of tx reaches about 203.)
I’ve also tried increasing the stack size for the program; however, it didn’t help.
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cutil.h>
#define BLOCK_SIZE 256
/**
 * Test kernel: every thread copies one float from global memory into a
 * per-block shared-memory buffer, then waits at a block-wide barrier.
 *
 * Launch layout: 1D thread blocks; only threadIdx.x is used for indexing, so
 *                every block reads the same leading elements of `data`.
 * Shared memory: BLOCK_SIZE floats, statically allocated.
 *
 * @param data  Device pointer. Elements [0, min(blockDim.x, BLOCK_SIZE)) are
 *              read, so the buffer must hold at least that many floats.
 */
__global__ void test_kernel(float* data)
{
    // Thread index within the block
    int tx = threadIdx.x;

    __shared__ float shared_data[BLOCK_SIZE];

    // Guard the copy so a launch with more than BLOCK_SIZE threads per block
    // cannot write past the end of the shared buffer.
    if (tx < BLOCK_SIZE) {
        shared_data[tx] = data[tx];
    }

    // Kept outside the branch: every thread of the block must reach the barrier.
    __syncthreads();
}
/**
 * Host driver for test_kernel: allocates and zero-fills a device buffer of
 * BLOCK_SIZE floats, launches the kernel on 16 blocks of BLOCK_SIZE threads,
 * checks for launch/execution errors, and releases the buffer.
 *
 * @param argc  Argument count, forwarded to CUT_EXIT.
 * @param argv  Argument vector, forwarded to CUT_EXIT.
 * @return 0 if control returns from CUT_EXIT.
 */
int
main(int argc, char** argv)
{
    CUT_CHECK_DEVICE();

    // The kernel indexes the buffer by threadIdx.x, so it needs one float per
    // thread of a block; tie the length to BLOCK_SIZE instead of a literal.
    unsigned int size_A = BLOCK_SIZE;
    unsigned int mem_size_A = sizeof(float) * size_A;

    // allocate device memory
    float* d_A = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_A, mem_size_A));
    // cudaMalloc does not initialize memory; give the kernel defined input.
    CUDA_SAFE_CALL(cudaMemset(d_A, 0, mem_size_A));

    // setup execution parameters
    dim3 threads(BLOCK_SIZE, 1);
    dim3 grid(16, 1);

    // execute the kernel
    test_kernel<<< grid, threads >>>(d_A);
    // A launch returns no status itself: configuration errors are reported by
    // cudaGetLastError(), execution faults by the next synchronizing call.
    CUDA_SAFE_CALL(cudaGetLastError());
    // Legacy-API barrier matching the cutil-era toolkit used by this file
    // (newer toolkits spell this cudaDeviceSynchronize()).
    CUDA_SAFE_CALL(cudaThreadSynchronize());

    CUDA_SAFE_CALL(cudaFree(d_A));
    CUT_EXIT(argc, argv);
    return 0;
}