Blocksize Trouble

Hi,

I’m writing a normalized 32-bit float to 16-bit float conversion as part of a larger project. If I use 15 batches of 512-element conversions, or any other combination totalling fewer than 8192 elements, it works fine.

If I use 8192 elements or more, e.g. 16*512, I get “cudaError_enum at memory location”. Can anyone help? I could do with a fresh pair of eyes.

Cheers

For reference, the host memory allocation is:

LittleFloat2* In;

CUDA_SAFE_CALL( cudaMallocHost((void**)&In, N*Batch*sizeof(LittleFloat2)) );
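
The device buffers and the call into the wrapper look roughly like this (typed from memory, so the d_In/d_Out names are illustrative rather than pasted):

// Sketch of the calling code, from memory — pointer names illustrative.
LittleFloat2* d_In;
LittleFloat2* d_Out;

CUDA_SAFE_CALL( cudaMalloc((void**)&d_In,  N*Batch*sizeof(LittleFloat2)) );
CUDA_SAFE_CALL( cudaMalloc((void**)&d_Out, N*Batch*sizeof(LittleFloat2)) );
CUDA_SAFE_CALL( cudaMemcpy(d_In, In, N*Batch*sizeof(LittleFloat2), cudaMemcpyHostToDevice) );

RUN_RunTest(d_In, d_Out, N, Batch);   // N = 512, Batch = 16 in the failing case
CUDA_SAFE_CALL( cudaThreadSynchronize() );

The device code is: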
// Original Author: Nicholas Hinitt
// Date: 23/10/08

//##################################################################################
// Beginning Of Class Definition
//##################################################################################

class DeviceLittleFloat
{
public:
    unsigned short Float16;

    // Union for type-punning between a float and its raw bit pattern.
    // (unsigned int rather than long, so the bit field is 32 bits on all hosts.)
    union Bits
    {
        float        Float32;
        unsigned int Float32Bits;
    };

    DeviceLittleFloat();
    DeviceLittleFloat(float FloatIn);
    operator float () const;
    DeviceLittleFloat& operator = (DeviceLittleFloat Float16);
    DeviceLittleFloat& operator = (float Float32);
};

__device__ inline DeviceLittleFloat::DeviceLittleFloat(void)
{
}

__device__ inline DeviceLittleFloat::DeviceLittleFloat(float FloatIn)
{
    Bits Conversion;
    Conversion.Float32 = FloatIn;

    // Pack sign + exponent MSB into bits 14-15, and the low exponent bits
    // plus the top 10 mantissa bits into bits 0-13.
    Float16 = ((Conversion.Float32Bits >> 16) & 0xc000)
            | ((Conversion.Float32Bits >> 13) & 0x3fff);
}

__device__ inline DeviceLittleFloat& DeviceLittleFloat::operator = (float FloatIn)
{
    *this = DeviceLittleFloat(FloatIn);
    return *this;
}

__device__ inline DeviceLittleFloat& DeviceLittleFloat::operator = (DeviceLittleFloat FloatIn)
{
    Float16 = FloatIn.Float16;
    return *this;
}

__device__ inline DeviceLittleFloat::operator float () const
{
    Bits Conversion;
    Conversion.Float32Bits = Float16;

    // Unpack: sign + exponent MSB back into bits 30-31, the low exponent and
    // mantissa bits into bits 13-26. When the exponent MSB is clear, the
    // dropped float exponent bits 27-29 were all ones, so restore them.
    // (Fill constant is 0x38000000, not 0x3c000000 — bit 26 already comes
    // from the shifted bits above and must not be forced on.)
    if ((Conversion.Float32Bits << 16) & 0x40000000)
        Conversion.Float32Bits = ((Conversion.Float32Bits << 16) & 0xc0000000)
                               | ((Conversion.Float32Bits << 13) & 0x07ffffff);
    else
        Conversion.Float32Bits = ((Conversion.Float32Bits << 16) & 0xc0000000)
                               | ((Conversion.Float32Bits << 13) & 0x07ffffff)
                               | 0x38000000;

    return Conversion.Float32;
}

//##################################################################################
// End Of Class Definition
//##################################################################################

struct DeviceLittleFloat2
{
    DeviceLittleFloat x, y;
};

// Dynamically sized shared memory (size supplied at kernel launch) — needs extern.
extern __shared__ float2 SharedFloat[];

void RUN_RunTest(void* In, void* Out, int N, int Batch);

__device__ void ToFloatConvert(DeviceLittleFloat2* In, int N);
__device__ void ToDeviceLittleFloatConvert(DeviceLittleFloat2* In, int N);
__global__ void RunTest(void* In, void* Out, int N);

void RUN_RunTest(void* In, void* Out, int N, int Batch)
{
    dim3 threads = dim3(N, 1);      // N = 512 threads per block
    dim3 blocks  = dim3(Batch, 1);  // one block per batch

    // Third launch parameter is the dynamic shared memory per block:
    // N*sizeof(float2) = 512*8 = 4096 bytes, well under the 16 KB limit.
    RunTest<<< blocks, threads, N*sizeof(float2) >>>(In, Out, N);
}

__global__ void RunTest(void* DataIn, void* DataOut, int N)
{
    DeviceLittleFloat2* DataI = (DeviceLittleFloat2*)DataIn;
    DeviceLittleFloat2* DataO = (DeviceLittleFloat2*)DataOut;

    ToFloatConvert(DataI, N);
    ToDeviceLittleFloatConvert(DataO, N);
}

__device__ void ToDeviceLittleFloatConvert(DeviceLittleFloat2* DataOut, int N)
{
    int ThreadID = threadIdx.x;
    int BlockElementID = blockIdx.y * gridDim.x*blockDim.x*blockDim.y
                       + blockIdx.x * blockDim.x*blockDim.y;

    // Read the floats back out of shared memory, indexed by the global
    // element offset, and store them (packed via operator=).
    float Y0 = SharedFloat[BlockElementID + ThreadID].y;
    float X0 = SharedFloat[BlockElementID + ThreadID].x;

    DataOut[BlockElementID + ThreadID].y = Y0;
    DataOut[BlockElementID + ThreadID].x = X0;
}

__device__ void ToFloatConvert(DeviceLittleFloat2* DataIn, int N)
{
    int ThreadID = threadIdx.x;
    int BlockElementID = blockIdx.y * gridDim.x*blockDim.x*blockDim.y
                       + blockIdx.x * blockDim.x*blockDim.y;

    // Unpack each 16-bit pair to float (via operator float) and stage it in
    // shared memory, indexed by the global element offset.
    DeviceLittleFloat Y0 = DataIn[BlockElementID + ThreadID].y;
    DeviceLittleFloat X0 = DataIn[BlockElementID + ThreadID].x;

    SharedFloat[BlockElementID + ThreadID].y = Y0;
    SharedFloat[BlockElementID + ThreadID].x = X0;
}
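
To rule out the bit twiddling itself, a quick host-only round trip with the same shifts and masks gives sensible values over the range I care about. (Standalone sketch — PackHalf/UnpackHalf are just renamed copies of the masks above, not part of the project.)

#include <cstdio>

// Same pack scheme as DeviceLittleFloat(float), host-only.
static unsigned short PackHalf(float f)
{
    union { float F; unsigned int B; } c;
    c.F = f;
    return ((c.B >> 16) & 0xc000) | ((c.B >> 13) & 0x3fff);
}

// Same unpack scheme as operator float(), host-only.
static float UnpackHalf(unsigned short h)
{
    union { float F; unsigned int B; } c;
    unsigned int x = h;
    c.B = ((x << 16) & 0xc0000000) | ((x << 13) & 0x07ffffff);
    if (!((x << 16) & 0x40000000))   // exponent MSB clear: restore implied exponent bits
        c.B |= 0x38000000;
    return c.F;
}

int main()
{
    for (float f = 0.05f; f < 2.0f; f += 0.1f)
        printf("%f -> %f\n", f, UnpackHalf(PackHalf(f)));
    return 0;
}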