Hi there,
I’m testing a fluid simulator of my own, based on the LBM method, and I have written two versions (CPU and CUDA) so that I can check the results and benchmark them against each other. I wrote both the CPU and the CUDA code on a MacBook Pro laptop, but now I’m porting them to a much more powerful Ubuntu x64 box. I’ve installed CUDA 2.3 and checked that it is working correctly (the SDK examples run).
The code on the Mac was compiled for 32 bits, so it also works on the Linux box without any changes (compiling for 32 bits with the appropriate library and the -m32 flag). When I compile for 64 bits, however, the execution does not seem to update the variables ux, uy, uz in the code below. Here Callback is a class which hides the write-to-file operations I do.
I’ve tried compiling and executing the 64-bit build in emulation mode, and that way it works as expected, so the problem seems to be that the kernel does not run (or does not run correctly) on the GPU itself.
Any ideas about what I’m doing wrong?
void GPULBMSim::run(long int iterations, unsigned int exec_each, Action *callback){
int iter=0;
float *h_u = NULL;
if(callback!=0) {
callback->preinit((void*) dim);
h_u = new float[matdim*3];
}
while(iter<iterations && !end) {
//Run kernel
dim3 dimBlock(dim[0]); //Must be lower than 512
dim3 dimGrid(dim[1]*dim[2]);
runLBM<<<dimGrid,dimBlock>>>(c_type, rho, ux, uy, uz,
f_rest0, f_u0, f_d0, f_l0, f_r0, f_f0, f_b0, f_lf0, f_df0, f_rf0, f_uf0, f_lu0, f_ru0, f_rd0, f_ld0, f_lb0, f_db0, f_rb0, f_ub0,
f_rest1, f_u1, f_d1, f_l1, f_r1, f_f1, f_b1, f_lf1, f_df1, f_rf1, f_uf1, f_lu1, f_ru1, f_rd1, f_ld1, f_lb1, f_db1, f_rb1, f_ub1,
OMEGA);
if(exec_each!=0 && callback!=0 && iter%exec_each==0){
std::cout<<"Iter:"<<iter<<std::endl;
//Get U and RHO
CUDA_SAFE_CALL(cudaMemcpy(h_rho, rho, sizeof(float)*matdim, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(h_ux, ux, sizeof(float)*matdim, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(h_uy, uy, sizeof(float)*matdim, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(h_uz, uz, sizeof(float)*matdim, cudaMemcpyDeviceToHost));
for(int i=0; i<matdim; i++){
h_u[i*3] = h_ux[i];
h_u[i*3+1] = h_uy[i];
h_u[i*3+2] = h_uz[i];
}
(*callback)(cell_type, h_u, h_rho, dim);
}
swapF(); //Swaps f_rest0 for f_rest1, f_u0 for f_u1 and so on
iter++;
}
freeCUDAres(); //Frees any resources
if(callback!=0){
callback->postdestroy();
delete [] h_u;
}
}