Hi,
I’m writing a normalized 32bit float to 16bit float conversion as part of a larger project. If i used 15 batches of 512 element conversions, or any other combination less than 8192 elements total, it works fine.
If i use 8192 or more e.g 16*512 i get “cudaError_enum at memory location”. Can any one help, i could do with a fresh pair of eyes.
Cheers
For reference the memory allocation is:
LittleFloat2* In;
CUDA_SAFE_CALL( cudaMallocHost((void**)&In, N*Batch*sizeof(LittleFloat2)) );
// Original Author : Nicholas Hinitt
// Date: 23/10/08
//##################################################################################
//Beginning Of Class Definition
//##################################################################################
class DeviceLittleFloat
{
public:
unsigned short Float16;
union Bits
{
float Float32;
long Float32Bits;
};
DeviceLittleFloat();
DeviceLittleFloat (float FloatIn);
operator float () const;
DeviceLittleFloat& operator = (DeviceLittleFloat Float16);
DeviceLittleFloat& operator = (float Float32);
};
__device__ inline DeviceLittleFloat::DeviceLittleFloat(void)
{
}
__device__ inline DeviceLittleFloat::DeviceLittleFloat (float FloatIn)
{
Bits Conversion;
Conversion.Float32 = FloatIn;
Float16 = ((Conversion.Float32Bits >> 16) & 0xc000) | ((Conversion.Float32Bits >> 13) & 0x3fff) ;
}
__device__ inline DeviceLittleFloat& DeviceLittleFloat::operator =(float FloatIn)
{
*this = DeviceLittleFloat(FloatIn);
return *this;
};
__device__ inline DeviceLittleFloat& DeviceLittleFloat::operator = (DeviceLittleFloat FloatIn)
{
Float16 = FloatIn.Float16;
return *this;
}
__device__ inline DeviceLittleFloat::operator float () const
{
Bits Conversion;
Conversion.Float32Bits=Float16;
(Conversion.Float32Bits << 16 & 0x40000000)>0? Conversion.Float32Bits = ((Conversion.Float32Bits << 16) & 0xc0000000) | ((Conversion.Float32Bits << 13) & 0x07ffffff)
:Conversion.Float32Bits = ((Conversion.Float32Bits << 16) & 0xc0000000) | ((Conversion.Float32Bits << 13) & 0x07ffffff) | 0x3c000000;
//Conversion.Float32 = 6.05;
return Conversion.Float32;
};
//##################################################################################
//End Of Class Definition
//##################################################################################
struct DeviceLittleFloat2
{
DeviceLittleFloat x, y;
};
__shared__ float2 SharedFloat[];
void RUN_RuntTest(void* In,void* Out, int N,int Batch);
__device__ void ToFloatConvert(DeviceLittleFloat2* In, int N);
__device__ void ToDeviceLittleFloatConvert(DeviceLittleFloat2* In, int N);
__global__ void RunTest(void* In,void* Out, int N);
void RUN_RunTest(void* In, void* Out, int N,int Batch)
{
dim3 threads=dim3(N,1);
dim3 blocks=dim3(Batch,1);
RunTest <<< blocks,threads ,N*sizeof(float2)>>>(In,Out,N);
}
__global__ void RunTest(void* DataIn,void* DataOut, int N)
{
DeviceLittleFloat2 *DataI = (DeviceLittleFloat2*)DataIn;
DeviceLittleFloat2 *DataO = (DeviceLittleFloat2*)DataOut;
ToFloatConvert(DataI,N);
ToDeviceLittleFloatConvert(DataO,N);
}
__device__ void ToDeviceLittleFloatConvert(DeviceLittleFloat2* DataOut, int N)
{
int ThreadID = threadIdx.x;
int BlockElementID = blockIdx.y * gridDim.x*blockDim.x*blockDim.y + blockIdx.x*blockDim.x*blockDim.y;
float Y0 = SharedFloat[BlockElementID + ThreadID].y;
float X0 = SharedFloat[BlockElementID + ThreadID].x;
DataOut[BlockElementID +ThreadID ].y = Y0;
DataOut[BlockElementID +ThreadID ].x = X0;
}
__device__ void ToFloatConvert(DeviceLittleFloat2* DataIn, int N)
{
int ThreadID = threadIdx.x;
int BlockElementID = blockIdx.y * gridDim.x*blockDim.x*blockDim.y + blockIdx.x*blockDim.x*blockDim.y;
DeviceLittleFloat Y0 = DataIn[BlockElementID + ThreadID ].y;
DeviceLittleFloat X0 = DataIn[BlockElementID + ThreadID ].x;
SharedFloat[BlockElementID + ThreadID ].y = Y0;
SharedFloat[BlockElementID + ThreadID ].x = X0;
}