I get this error when I try to use some shared memory:
extern __shared__ char shared_mem[];
unsigned long** sptr = (unsigned long**)shared_mem;
sptr[0] = NULL;
I have a lot more code, but this suffices for the error. What is this error “unaligned memory accesses not supported” and why do I get this? Even more strangely if I do the following, the kernel compiles and runs.
if (threadIdx.x == 0) sptr[threadIdx.x] = NULL;
Even more strangely…my real life problem: I have implemented a small subset if vector as bit-values which I want to put into shared memory for fast (read) access. However, for design reasons I cannot modify the backend that uses these values, so the change has to be transparent as far as the calling functions are concerned. What I have to do is copy all bit-vectors to shared memory, backup the pointers, let pointers point to shared memory and after the function is done, put back the backup pointers to their original locations.
So the kernel code is something like . Now the very, very strange thing is when I restore more than a half-warp size of bit-vectors. If the second threadIdx.x is > 15 then the launch fails… even if I use only 16 threads and have each thread backup/restore two elements with ofset 16, and as above, if I directly do the backup/restore with indexes the kernel doesn’t compile:
if (threadIdx.x < VECTOR_COUNT) sptr[threadIdx.x] = structure[threadIdx.x].fires.data;
__syncthreads();
other_structure.execute();
__syncthreads();
if (threadIdx.x < 16) structure[threadIdx.x].fires.data = sptr[threadIdx.x];
my bit-vector is as follows (heavily borrowed from vector and only the minimum supported). It is a small iterator, a length value and a pointer to a large pool of vectors where all bit-vectors are sequentially stored and we index in there:
template <> class myIterator<bool> {
public:
typedef std::forward_iterator_tag iterator_category;
typedef uint32_t value_type;
typedef uint32_t size_type;
typedef value_type* pointer;
typedef myIterator<bool> iterator;
__device__ myIterator() {}
__device__ myIterator(pointer val): _myval(val), _offset(0) {}
enum {VBYTES = sizeof(value_type), VBITS = 8 * VBYTES};
__device__ inline operator bool() const {return (*_myval & _mask()) != 0;}
__device__ inline iterator operator=(bool val) {
if (val)
{
*_myval |= _mask();
}
else
{
*_myval &= ~_mask();
}
return *this;
}
__device__ inline iterator operator+(size_type off) {
_offset += off;
_myval += _offset / VBITS;
_offset %= VBITS;
return *this;
}
protected:
__device__ inline size_type _mask() const {return (size_type)(1 << _offset);}
size_type _offset;
pointer _myval;
};
template<> class mycudaArray<bool>: public myIterator<bool> {
public:
typedef bool value_type;
typedef myIterator<bool> iterator;
typedef myIterator<bool> const_iterator;
typedef uint32_t* pointer;
typedef uint32_t size_type;
mycudaArray(size_type len = 0, value_type* data = NULL): len(len), data(reinterpret_cast<pointer>(data)) {}
size_type len;
pointer data; /* contiguous data array, should reside on GPU */
__device__ inline size_type size() const {return len;}
__device__ inline bool empty() {return data == NULL;}
__device__ inline iterator operator[](size_type i) {return begin() + i;}
__device__ inline const_iterator operator[](size_type i) const {return begin() + i;}
__device__ inline iterator begin() {return iterator(data);}
__device__ inline const_iterator begin() const {return const_iterator(data);}
__host__ __device__ static inline size_type size_of(size_type i) {return (i / 8) + ((i % 8) != 0);}
};
Note: Obviously we cannot dynamically allocate vectors of any size of the device, but luckily once they’re set up these vectors remain of the same length. So I scan all bit-vectors, allocate memory to them in a single block, copy over the values and use that.
Can someone shed light on my problem? :ph34r: What the hell is going on?