Hello, I'm using a GeForce 9800 GT for my program. I profiled the program I made and was not happy with the result, so I am considering aligning my global memory accesses for more efficient usage. I know that using the float2 type would automatically give aligned memory accesses. The code is as follows.
// Packs a float intensity into an opaque gray pixel (x = y = z = level, w = 255).
// The value is clamped to [0, 255] before the conversion because converting an
// out-of-range float (negative, > 255, or -inf from log10(0)) to unsigned char
// is undefined behavior.
static __device__ uchar4 to_gray_pixel(float intensity)
{
    const float clamped = fminf(fmaxf(intensity, 0.0f), 255.0f);
    const unsigned char level = static_cast<unsigned char>(clamped);
    return make_uchar4(level, level, level, 255);
}

// Processes one line of YDIM samples per thread and writes it to the image `ptr`.
//
// The line handled by a thread is x = blockIdx.x*XDIM/2 + threadIdx.x.
// XDIM, YDIM, D1 and four1() are defined elsewhere in the project.
//
// Parameters:
//   b                  - samples, YDIM ints per line; modified in place when
//                        `dcsubtract` is set.
//   c                  - output; receives the first YDIM/2 (re, im) pairs of the
//                        transformed line, truncated to int, at x*2*YDIM.
//                        Only written when `fft` is set.
//   sum_spectrum       - one value per sample index y; sum_spectrum[y]/XDIM is
//                        subtracted from b (presumably the per-sample sum over
//                        all lines - confirm against the caller).
//   resamp             - 2*YDIM floats: entries [0, YDIM) give the source sample
//                        index, entries [YDIM, 2*YDIM) give the interpolation weight.
//   dcsubtract         - enable the subtraction described above.
//   resample           - enable the linear interpolation (FFT path only).
//   fft                - true: transform the line and display log magnitude;
//                        false: display the raw samples scaled from 65535 to 255.
//   k_resampledspacing - divisor applied to the interpolation slope.
//   ptr                - output image, XDIM pixels per row.
__global__ void kernel( int *b, int *c, int *sum_spectrum, float *resamp, bool dcsubtract, bool resample,
                        bool fft, float k_resampledspacing, uchar4 *ptr )
{
    const int x = blockIdx.x*XDIM/2 + threadIdx.x;
    int *line = b + x*YDIM;   // this thread's YDIM samples

    if (fft) { // Do FFT processing
        // DC subtraction runs as its own pass. Previously it was interleaved with
        // the resampling loop, so the interpolation below read a mix of already
        // subtracted and not yet subtracted neighbours.
        if (dcsubtract) {
            for (int y = 0; y < YDIM; y++) {
                line[y] -= sum_spectrum[y]/XDIM;
            }
        }

        // Interleaved (re, im) working buffer handed to four1().
        float data[2*YDIM];
        for (int y = 0; y < YDIM; y++) {
            float sample;
            if (resample) { // Do resampling
                int i = int(resamp[y+0*YDIM]);
                // Keep both interpolation taps inside this line; i + 1 used to
                // run past the end of the line when i == YDIM - 1.
                if (i < 0)        { i = 0; }
                if (i > YDIM - 1) { i = YDIM - 1; }
                const int next = (i + 1 < YDIM) ? i + 1 : i;
                sample = line[i] - resamp[y+1*YDIM]*(line[next]-line[i])/k_resampledspacing;
            }
            else {
                sample = line[y];
            }
            const float phase = D1*(y-YDIM/2)*(y-YDIM/2); // second order dispersion correction
            data[2*y]   = sample * __cosf(phase);
            data[2*y+1] = sample * __sinf(phase);
        }

        // NOTE(review): four1 is defined elsewhere; presumably an in-place complex
        // FFT over YDIM interleaved pairs - confirm its indexing convention.
        four1(data, YDIM, 1);

        for (int y = 0; y < YDIM/2; y++) {
            const float re = data[2*y];
            const float im = data[2*y+1];
            c[x*2*YDIM+2*y]   = static_cast<int>(re);
            c[x*2*YDIM+2*y+1] = static_cast<int>(im);

            // 12.0f (not 12.) keeps the whole expression in single precision.
            const float temp = __log10f(re*re + im*im)/12.0f;

            // Assign the whole uchar4 at once so the compiler can issue a single
            // 32-bit store instead of four separate byte stores.
            ptr[x + (YDIM/2 - y) * XDIM] = to_gray_pixel(255.0f * temp - 100.0f);
        }
    }
    else { // Don't do FFT processing
        for (int y = 0; y < YDIM; y++) {
            // NOTE(review): this path divides by YDIM while the FFT path divides
            // by XDIM - confirm which divisor is intended.
            if (dcsubtract) { line[y] -= sum_spectrum[y]/YDIM; } // Do DC subtract

            // Load the sample once and reuse it for all three colour channels.
            const int sample = line[y];
            ptr[x + y * XDIM] = to_gray_pixel(255.0f * static_cast<float>(sample) / 65535.0f);
        }
    }
}
The profile showed that accesses to the data array and the ptr array are what matter. So I was considering using float2* instead of float* for the data array. However, I'm struggling and making no progress here. So my questions are:
i) Is using float2* for the data array sufficient for memory alignment? (And float4* for ptr?)
ii) How can I use float2* for FFT?
****FYI YDIM = 1024
Thank you.