Dynamic Shared Memory Tesla vs Fermi

Hi all,

I have a simple kernel that perfroms prefix sum. I took this example from nvidia site and try to make some small changes:

http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html

When I run it under GTX 260 it works fine. However, once I move to GTX 580 I start having problems with share memory Warp Out-of-range Address.

After some time I was able to make it work.

Now instead of allocating needed amount of shared memory per block I have to allocate entire shared memory

needed by all blocks. Is there any chance that you could tell me if this problem with the logic in the code or shared memory configuration was changed

for fermi architechture. I read nvidia documents but could not find anything that would suggest that shared memory should be allocated this way.

Here is an example:

I’m running 4 blocks by 256 threads each & I calculate shared memory the following way:

for GTX 260 I used:

int shared_mem_size = 2 * threads_per_block * sizeof(int);

In order to make it work on GTX 580 the previous line should be changed to:

int shared_mem_size = 4 * 2 * threads_per_block * sizeof(int);

So as you can see I had to multiply size by 4(number of blocks)

If I do not adjust the code kernel crashes.

Here is an output from cuda-gdb

============================================================================

Kernel Config is:

============================================================================

Array Size: 2048

4-1-1

256-1-1

Shared memory in bytes: 2048

Shared memory in units: 512

Total Shared memory in bytes: 8192

============================================================================

[Launch of CUDA Kernel 0 (_scan<<<(4,1,1),(256,1,1)>>>) on Device 0]

Program received signal CUDA_EXCEPTION_5, Warp Out-of-range Address.

[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (224,0,0), device 0, sm 0, warp 7, lane 0]

0x0000000000cedb20 in _scan<<<(4,1,1),(256,1,1)>>> ()

=======================================================================================================================

This is the complete program I run:

=======================================================================================================================

#include <stdio.h>

#include <string.h>

#include <unistd.h>

#include <stdlib.h>

#include <strings.h>

#include <iostream>

#include <mpi.h>

#include <cuda.h>

#include <cuda_runtime.h>

#include <pthread.h>

#include <vector>

//####################################################################################//####################################################################################

__global__ void _scan(int* input, int* output, int size)

{

	extern volatile __shared__ int	temp[];

	int	tidx = threadIdx.x;

	int	offset = 1;

	int 	block_displ = 2 * blockDim.x * blockIdx.x;

	//-------------------------------------------------

	// load input into shared memory

	//-------------------------------------------------

	temp[2*tidx]	= input[block_displ + 2*tidx];

	temp[2*tidx+1]	= input[block_displ + 2*tidx+1];

	//-------------------------------------------------

	// build sum in place up the tree

	//-------------------------------------------------

	for(int d = size>>1; d > 0; d >>= 1)

	{

		__syncthreads();

		if(tidx < d)

		{

			int ai = offset * (2 * tidx + 1) - 1;	

			int bi = offset * (2 * tidx + 2) - 1;	

			temp[bi] += temp[ai];

		}

		offset <<= 1;

	}

	//-------------------------------------------------

	// clear the last element

	//-------------------------------------------------

	if(tidx == 0)

	{

		temp = 0;

	}

	for(int d = 1; d < size; d <<= 1)

	{

		offset >>= 1;

		__syncthreads();

		if(tidx < d)

		{

			int ai = offset * (2 * tidx + 1) - 1;	

			int bi = offset * (2 * tidx + 2) - 1;	

			int t = temp[ai];

			temp[ai] = temp[bi];

			temp[bi] += t;

		}

	}

	__syncthreads();

	//-------------------------------------------------

	// now save the result into global array

	//-------------------------------------------------

	output[block_displ + 2*tidx] = temp[2*tidx];

	output[block_displ + 2*tidx+1] = temp[2*tidx+1];

}

//-----------------------------------------------------------------

// Array must have arbitrariy size

//-----------------------------------------------------------------

#define DEBUG_OFF	0

#define DEBUG_ON	1

#define DEBUG_MODE	DEBUG_ON

extern "C" void scan(int* input, int* output, int size)

{

	//|------------------------------------------------------------

	//| This is fixed number of threads per block.

	//|------------------------------------------------------------

	int num_of_blocks = 1;

	int threads_per_block = 256;

	//|------------------------------------------------------------

	//| One block can handle 2*threads_per block elements

	//| if array size exceeds this requirement allocate

	//| additional block(s)

	//|------------------------------------------------------------

	if((size/2) > threads_per_block)

		num_of_blocks = size / (2*threads_per_block);

	//|------------------------------------------------------------

	//| assign needed number of blocks to process an array

	//| in addition estimate size of shared memory needed per block

	//|------------------------------------------------------------

	dim3	grid	(num_of_blocks,		1, 1);

	dim3	block	(threads_per_block,	1, 1);

	int	shared_mem_size = 2 * threads_per_block * sizeof(int);

	#if (DEBUG_MODE == DEBUG_ON)

		printf("============================================================================\n");

		printf("Kernel Config is:\n");

		printf("============================================================================\n");

		printf("Array Size: %d\n", size);

		printf("%d-%d-%d\n", grid.x,  grid.y,  grid.z);

		printf("%d-%d-%d\n", block.x,  block.y,  block.z);

		printf("Shared memory in bytes:       %d\n", shared_mem_size);

		printf("Shared memory in units:       %d\n", shared_mem_size/sizeof(int));

		printf("Total Shared memory in bytes: %d\n", num_of_blocks * shared_mem_size);

		printf("============================================================================\n");

	#endif

	_scan <<< grid, block, shared_mem_size >>> (input, output, size);

}

//####################################################################################//####################################################################################

#define SYS_LOG_INF __FILE__, __PRETTY_FUNCTION__, __LINE__

void validate(cudaError_t error_id, const char* file_name, const char* func_name, const int line_num)

{

	if(error_id != cudaSuccess)

	{

		printf("Cuda error: %s\n %s: %s - %d\n", 

				cudaGetErrorString(error_id), 

				file_name,

				func_name,

				line_num

		);

		exit(1);

	}

}

class DeviceMemory

{

public:

	template <typename T>

	static T* alloc(int size)

	{

		T*	arr;

		validate(cudaMalloc((void**) &arr,	size * sizeof(T)), SYS_LOG_INF); 

		self_.add(arr);

		return arr;

	}

	virtual ~DeviceMemory()

	{

		std::vector <void*>::iterator itr;

	

		for(itr = cuda_mem_list.begin(); itr != cuda_mem_list.end(); ++itr)

			validate(cudaFree(*itr), SYS_LOG_INF);

	}

	template <typename T>

	static void cpy2host(T* _hst, T* _dev, int size)

	{

		validate(cudaMemcpy(_hst, _dev, size * sizeof(T), cudaMemcpyDeviceToHost),  SYS_LOG_INF);

	}

	template <typename T>

	static void cpy2dev(T* _dev, T* _hst, int size)

	{

		validate(cudaMemcpy(_dev, _hst, size * sizeof(T), cudaMemcpyHostToDevice),  SYS_LOG_INF);

	}

private:

	static DeviceMemory	self_;

	void	add(void* mem_ref)

	{

		cuda_mem_list.push_back(mem_ref);

	}

	DeviceMemory()

	{

		printf("DeviceMemory Constructor\n");

	}

	int i;

	std::vector <void*> cuda_mem_list;

};

DeviceMemory DeviceMemory::self_;

//####################################################################################//####################################################################################

void _print(int* arr, int size)

{

	for(int i = 0; i < size;  i++)

	{

		if((i%20) == 0)

			printf("\n");

		printf("%5d|", arr[i]);

	}

	printf("\n");

}

int main(int argc, char** argv)

{

		Benchmark	b("GPU Scan Test");

		int	size = 4*512;

		int*	state	= DeviceMemory::alloc<int>(size);

		int*	index	= DeviceMemory::alloc<int>(size);

		int*	stateH	= new int;

		int*	indexH	= new int;

		for(int i = 0; i < size;  i++)

		{

			stateH[i] = 1;

			indexH[i] = 0;

		}

		DeviceMemory::cpy2dev<int>(state, stateH, size);

		DeviceMemory::cpy2dev<int>(index, indexH, size);

		scan(state, index, size);

		DeviceMemory::cpy2host<int>(stateH, state, size);

		DeviceMemory::cpy2host<int>(indexH, index, size);

	       delete stateH;

	       delete indexH;

	       return 0;

}

Thanks