I measured the times both with the cutCreateTimer method and with the CUDA profiler.
I have done a lot of testing now:
Windows:
Geforce GTX260 : 18 seconds
Geforce GTX275 : 18 seconds
Geforce GTX285 : 64 seconds
Geforce 8800 512: 99 seconds
Linux:
Geforce GTX285: 84 seconds
Tesla C1060 : 84 seconds
So I think there is a problem with Linux. But I don't know why the GTX285 is so slow under Windows, because judging from the specs the GTX275 is much more like the GTX285 than like the GTX260.
I can post the kernel and the kernel call, but posting a complete .cu file isn't so easy because there are a lot of dependencies.
[codebox]// Accumulates a joint 256x256 intensity histogram of volume A against
// volume B, spread over 256 separate histogram copies.
//
// Each thread handles one voxel of volume A. The flat thread index is split
// into (x, y, z) for a volume of size ax * ay * az, the coordinate is mapped
// into volume B through the coefficients stored in *v (rows at/bt/ct plus the
// offset dt), and if the mapped point lies inside [0,bx) x [0,by) x [0,bz)
// both textures are sampled and one histogram bin is incremented.
//
// Parameters:
//   v              transform coefficients (at, bt, ct, dt), read per thread.
//   histo_A_Btmp   256 histogram copies of 256*256 bins each; the copy is
//                  selected by threadIdx.x, so blockDim.x must not exceed 256.
//                  The kernel only adds to it; it does not zero it.
//   ax, ay, az     dimensions of volume A.
//   bx, by, bz     dimensions of volume B.
//
// NOTE(review): texA / texB are not declared in this snippet; this assumes
// they return floats in [0, 1] so that value * 255 stays in 0..255 - confirm,
// because a larger value would index outside its histogram copy.
__global__ void hist256copiesTexValues(values *v, int histo_A_Btmp[256 * 256 * 256], int ax, int ay, int az, int bx, int by, int bz) {
    const int position = blockIdx.x * blockDim.x + threadIdx.x;
    const int sliceSize = ay * ax;

    // The launch rounds the grid up to whole blocks, so the last block may
    // contain threads past the end of volume A. The previous test `x < ax`
    // could never fail (x is position % ax), which let those tail threads
    // add spurious counts to the histogram.
    const int zIndex = position / sliceSize;
    if (zIndex >= az)
        return;

    // Voxel coordinate in volume A (integer arithmetic, then stored as float).
    const float x = position % ax;
    const float y = (position % sliceSize) / ax;
    const float z = zIndex;

    // Corresponding coordinate in volume B.
    const float xB = x * v->at[0] + y * v->at[1] + z * v->at[2] + v->dt[0];
    const float yB = x * v->bt[0] + y * v->bt[1] + z * v->bt[2] + v->dt[1];
    const float zB = x * v->ct[0] + y * v->ct[1] + z * v->ct[2] + v->dt[2];

    if ((xB >= 0 && xB < bx) && (yB >= 0 && yB < by) && (zB >= 0 && zB < bz))
    {
        // 255.0f, not 255.0: a bare 255.0 is a double literal and would turn
        // the multiply into double-precision arithmetic.
        const int valueA = tex3D(texA, x, y, z) * 255.0f;
        const int valueB = tex3D(texB, xB, yB, zB) * 255.0f;
        atomicAdd(&histo_A_Btmp[threadIdx.x * 256 * 256 + valueA * 256 + valueB], 1);
    }
}[/codebox]
[codebox] // Launch one thread per voxel of volume A, rounding the block count up.
// The kernel picks one of 256 histogram copies by threadIdx.x, so the block
// size has to stay at 256.
const int threadsPerBlock = 256;
dim3 block(threadsPerBlock);
dim3 grid((m_VolA_orig.xDim * m_VolA_orig.yDim * m_VolA_orig.zDim + (threadsPerBlock - 1)) / threadsPerBlock);
hist256copiesTexValues<<<grid, block>>>(
    values_dev,
    histo_A_B_tmp_dev,
    m_VolA_orig.xDim, m_VolA_orig.yDim, m_VolA_orig.zDim,
    m_VolB_orig.xDim, m_VolB_orig.yDim, m_VolB_orig.zDim);
[/codebox]