OK, so I've got a buffer of short data that I'm trying to process on the GPU: transpose, filter, transpose again, and so on. Does anyone know a good way to do a fast type conversion from a short buffer to a float buffer on the GPU? I tried something like this:
// Two adjacent 16-bit samples packed into one 32-bit word.
// The explicit 4-byte alignment matters: without it the struct only has the
// 2-byte alignment of `short`, and the compiler must access it with two
// separate 16-bit memory instructions instead of a single 32-bit one.
// Any pointer cast to dual_short* must therefore be 4-byte aligned.
typedef struct __align__(4) dual_short {
    short a;  // first (lower-address) sample of the pair
    short b;  // second sample of the pair
} dual_short;
// Two adjacent 32-bit floats packed into one 64-bit word.
// The explicit 8-byte alignment matters: without it the struct only has the
// 4-byte alignment of `float`, and the compiler must access it with two
// separate 32-bit memory instructions instead of a single 64-bit one.
// Any pointer cast to dual_float* must therefore be 8-byte aligned.
typedef struct __align__(8) dual_float {
    float a;  // first (lower-address) value of the pair
    float b;  // second value of the pair
} dual_float;
// Converts a row-major width x height buffer of 16-bit signed samples to
// float, two horizontally adjacent samples per thread.
//
// Launch layout: 2D blocks of CONV_BLOCK_DIM x CONV_BLOCK_DIM threads. The
//   grid must cover width/2 pair-columns in x and `height` rows in y; threads
//   that fall outside that range do nothing.
// Preconditions:
//   - `width` is even (each row is processed as width/2 whole pairs; with an
//     odd width the last sample of each row would not line up with a pair).
//   - `inp_buffer` is 4-byte aligned and `out_buffer` is 8-byte aligned.
//   - Both buffers hold width * height elements.
// NOTE: for each pair to move as one 32-bit load and one 64-bit store,
//   dual_short / dual_float must be declared with 4- / 8-byte alignment
//   (e.g. via __align__); otherwise the compiler splits each access in two.
__global__ void convert_input(short* inp_buffer, float* out_buffer, int width, int height) {
    const dual_short* ds_ptr = (const dual_short*)inp_buffer;
    dual_float* df_ptr = (dual_float*)out_buffer;

    // Work in unsigned, clamping non-positive dimensions to an empty range so
    // the comparisons below never mix a negative int with an unsigned index.
    const unsigned int pairsPerRow = (width > 0) ? ((unsigned int)width / 2u) : 0u;
    const unsigned int rows = (height > 0) ? (unsigned int)height : 0u;

    // Column is counted in PAIRS, not in shorts: thread t of a row touches
    // pair t, so consecutive threads touch consecutive 32-bit inputs and
    // consecutive 64-bit outputs. (Indexing the pair arrays with a column
    // measured in shorts makes neighbouring threads skip every other pair
    // and runs past the end of the row.)
    const unsigned int pairCol = blockIdx.x * CONV_BLOCK_DIM + threadIdx.x;
    const unsigned int row = blockIdx.y * CONV_BLOCK_DIM + threadIdx.y;

    if ((pairCol < pairsPerRow) && (row < rows)) {
        const unsigned int pairIndex = row * pairsPerRow + pairCol;

        // Convert two shorts at a time.
        const dual_short in_pair = ds_ptr[pairIndex];
        dual_float out_pair;
        out_pair.a = (float)in_pair.a;
        out_pair.b = (float)in_pair.b;
        df_ptr[pairIndex] = out_pair;
    }
}
However, the Visual Profiler still reports the memory accesses as uncoalesced. Note that the dual_float type is there so that I can try to coalesce the stores as 64-bit accesses. CONV_BLOCK_DIM is defined as 16.