Hi all,
I have a simple kernel that perfroms prefix sum. I took this example from nvidia site and try to make some small changes:
http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
When I run it under GTX 260 it works fine. However, once I move to GTX 580 I start having problems with share memory Warp Out-of-range Address.
After some time I was able to make it work.
Now instead of allocating needed amount of shared memory per block I have to allocate entire shared memory
needed by all blocks. Is there any chance that you could tell me if this problem with the logic in the code or shared memory configuration was changed
for fermi architechture. I read nvidia documents but could not find anything that would suggest that shared memory should be allocated this way.
Here is an example:
I’m running 4 blocks by 256 threads each & I calculate shared memory the following way:
for GTX 260 I used:
int shared_mem_size = 2 * threads_per_block * sizeof(int);
In order to make it work on GTX 580 the previous line should be changed to:
int shared_mem_size = 4 * 2 * threads_per_block * sizeof(int);
So as you can see I had to multiply size by 4(number of blocks)
If I do not adjust the code kernel crashes.
Here is an output from cuda-gdb
============================================================================
Kernel Config is:
============================================================================
Array Size: 2048
4-1-1
256-1-1
Shared memory in bytes: 2048
Shared memory in units: 512
Total Shared memory in bytes: 8192
============================================================================
[Launch of CUDA Kernel 0 (_scan<<<(4,1,1),(256,1,1)>>>) on Device 0]
Program received signal CUDA_EXCEPTION_5, Warp Out-of-range Address.
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (224,0,0), device 0, sm 0, warp 7, lane 0]
0x0000000000cedb20 in _scan<<<(4,1,1),(256,1,1)>>> ()
=======================================================================================================================
This is the complete program I run:
=======================================================================================================================
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <stdlib.h>
#include <strings.h>
#include <iostream>
#include <mpi.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <pthread.h>
#include <vector>
//####################################################################################//####################################################################################
__global__ void _scan(int* input, int* output, int size)
{
extern volatile __shared__ int temp[];
int tidx = threadIdx.x;
int offset = 1;
int block_displ = 2 * blockDim.x * blockIdx.x;
//-------------------------------------------------
// load input into shared memory
//-------------------------------------------------
temp[2*tidx] = input[block_displ + 2*tidx];
temp[2*tidx+1] = input[block_displ + 2*tidx+1];
//-------------------------------------------------
// build sum in place up the tree
//-------------------------------------------------
for(int d = size>>1; d > 0; d >>= 1)
{
__syncthreads();
if(tidx < d)
{
int ai = offset * (2 * tidx + 1) - 1;
int bi = offset * (2 * tidx + 2) - 1;
temp[bi] += temp[ai];
}
offset <<= 1;
}
//-------------------------------------------------
// clear the last element
//-------------------------------------------------
if(tidx == 0)
{
temp = 0;
}
for(int d = 1; d < size; d <<= 1)
{
offset >>= 1;
__syncthreads();
if(tidx < d)
{
int ai = offset * (2 * tidx + 1) - 1;
int bi = offset * (2 * tidx + 2) - 1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
//-------------------------------------------------
// now save the result into global array
//-------------------------------------------------
output[block_displ + 2*tidx] = temp[2*tidx];
output[block_displ + 2*tidx+1] = temp[2*tidx+1];
}
//-----------------------------------------------------------------
// Array must have arbitrariy size
//-----------------------------------------------------------------
#define DEBUG_OFF 0
#define DEBUG_ON 1
#define DEBUG_MODE DEBUG_ON
extern "C" void scan(int* input, int* output, int size)
{
//|------------------------------------------------------------
//| This is fixed number of threads per block.
//|------------------------------------------------------------
int num_of_blocks = 1;
int threads_per_block = 256;
//|------------------------------------------------------------
//| One block can handle 2*threads_per block elements
//| if array size exceeds this requirement allocate
//| additional block(s)
//|------------------------------------------------------------
if((size/2) > threads_per_block)
num_of_blocks = size / (2*threads_per_block);
//|------------------------------------------------------------
//| assign needed number of blocks to process an array
//| in addition estimate size of shared memory needed per block
//|------------------------------------------------------------
dim3 grid (num_of_blocks, 1, 1);
dim3 block (threads_per_block, 1, 1);
int shared_mem_size = 2 * threads_per_block * sizeof(int);
#if (DEBUG_MODE == DEBUG_ON)
printf("============================================================================\n");
printf("Kernel Config is:\n");
printf("============================================================================\n");
printf("Array Size: %d\n", size);
printf("%d-%d-%d\n", grid.x, grid.y, grid.z);
printf("%d-%d-%d\n", block.x, block.y, block.z);
printf("Shared memory in bytes: %d\n", shared_mem_size);
printf("Shared memory in units: %d\n", shared_mem_size/sizeof(int));
printf("Total Shared memory in bytes: %d\n", num_of_blocks * shared_mem_size);
printf("============================================================================\n");
#endif
_scan <<< grid, block, shared_mem_size >>> (input, output, size);
}
//####################################################################################//####################################################################################
#define SYS_LOG_INF __FILE__, __PRETTY_FUNCTION__, __LINE__
void validate(cudaError_t error_id, const char* file_name, const char* func_name, const int line_num)
{
if(error_id != cudaSuccess)
{
printf("Cuda error: %s\n %s: %s - %d\n",
cudaGetErrorString(error_id),
file_name,
func_name,
line_num
);
exit(1);
}
}
class DeviceMemory
{
public:
template <typename T>
static T* alloc(int size)
{
T* arr;
validate(cudaMalloc((void**) &arr, size * sizeof(T)), SYS_LOG_INF);
self_.add(arr);
return arr;
}
virtual ~DeviceMemory()
{
std::vector <void*>::iterator itr;
for(itr = cuda_mem_list.begin(); itr != cuda_mem_list.end(); ++itr)
validate(cudaFree(*itr), SYS_LOG_INF);
}
template <typename T>
static void cpy2host(T* _hst, T* _dev, int size)
{
validate(cudaMemcpy(_hst, _dev, size * sizeof(T), cudaMemcpyDeviceToHost), SYS_LOG_INF);
}
template <typename T>
static void cpy2dev(T* _dev, T* _hst, int size)
{
validate(cudaMemcpy(_dev, _hst, size * sizeof(T), cudaMemcpyHostToDevice), SYS_LOG_INF);
}
private:
static DeviceMemory self_;
void add(void* mem_ref)
{
cuda_mem_list.push_back(mem_ref);
}
DeviceMemory()
{
printf("DeviceMemory Constructor\n");
}
int i;
std::vector <void*> cuda_mem_list;
};
DeviceMemory DeviceMemory::self_;
//####################################################################################//####################################################################################
void _print(int* arr, int size)
{
for(int i = 0; i < size; i++)
{
if((i%20) == 0)
printf("\n");
printf("%5d|", arr[i]);
}
printf("\n");
}
int main(int argc, char** argv)
{
Benchmark b("GPU Scan Test");
int size = 4*512;
int* state = DeviceMemory::alloc<int>(size);
int* index = DeviceMemory::alloc<int>(size);
int* stateH = new int;
int* indexH = new int;
for(int i = 0; i < size; i++)
{
stateH[i] = 1;
indexH[i] = 0;
}
DeviceMemory::cpy2dev<int>(state, stateH, size);
DeviceMemory::cpy2dev<int>(index, indexH, size);
scan(state, index, size);
DeviceMemory::cpy2host<int>(stateH, state, size);
DeviceMemory::cpy2host<int>(indexH, index, size);
delete stateH;
delete indexH;
return 0;
}
Thanks