Bug with CUDA 2D array

Hi,

I have a bug in my CUDA code: the output is wrong (I would expect values of 1.0 or 0.125):

rho[10,31] = rho1d[1002] = -431602080.000000
rho[11,31] = rho1d[1003] = -431602080.000000
rho[12,31] = rho1d[1004] = -431602080.000000
rho[13,31] = rho1d[1005] = -431602080.000000
rho[14,31] = rho1d[1006] = -431602080.000000
rho[15,31] = rho1d[1007] = -431602080.000000
rho[16,31] = rho1d[1008] = -431602080.000000
rho[17,31] = rho1d[1009] = -431602080.000000
rho[18,31] = rho1d[1010] = -431602080.000000
rho[19,31] = rho1d[1011] = -431602080.000000
rho[20,31] = rho1d[1012] = -431602080.000000
rho[21,31] = rho1d[1013] = -431602080.000000
rho[22,31] = rho1d[1014] = -431602080.000000
rho[23,31] = rho1d[1015] = -431602080.000000
rho[24,31] = rho1d[1016] = -431602080.000000
rho[25,31] = rho1d[1017] = -431602080.000000
rho[26,31] = rho1d[1018] = -431602080.000000
rho[27,31] = rho1d[1019] = -431602080.000000
rho[28,31] = rho1d[1020] = -431602080.000000
rho[29,31] = rho1d[1021] = -431602080.000000
rho[30,31] = rho1d[1022] = -431602080.000000
rho[31,31] = rho1d[1023] = -431602080.000000

The code listing is below; please help!
Regards, florian

// testArray2d.cu

#include "stdafx.h"
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cutil.h>

// Thread-block dimensions used by the kernel launch (threads in x and y).
#define TILE_I 32
#define TILE_J 16

// Classic min/max macros; arguments are evaluated twice, so avoid side effects.
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) > (b)) ? (a) : (b))

// Grid extents: M cells along i (fastest-varying index), N cells along j.
#define M 32
#define N 32
// Constant used by the kernel in p/(gamma-1) and sqrt(gamma*p/rho).
#define gamma ((float)1.4)
// Flat index of cell (i, j) in a densely packed M x N host array.
// Arguments and the whole expansion are parenthesized so that expressions
// such as I(i, j+1) or 2*I(i, j) expand correctly.
#define I(i, j) ((i) + M*(j))

// Device pointers to the field arrays; each one is allocated in main() with
// cudaMallocPitch and filled by init_kernel.
float *d_rho, *d_u, *d_v, *d_p, *d_c;
float *d_rhou, *d_rhov, *d_rhoE;

// Row pitch in bytes as returned by cudaMallocPitch; a single value is shared
// by all device arrays and passed to the kernel.
size_t pitch;

//
// CUDA kernel prototypes
//
// Forward declaration of the initialization kernel (defined after main).
// `pitch` is the row pitch of the device arrays in bytes.
__global__ void init_kernel (int pitch, float *d_rho, float *d_u, float *d_v, float *d_p,
                             float *d_c, float *d_rhou, float *d_rhov, float *d_rhoE);

////////////////////////////////////////////////////////////////////////////////

// The launch grid below is computed with integer division, so the tile sizes
// must divide the array extents exactly or some cells would never be written.
#if (M % TILE_I) != 0 || (N % TILE_J) != 0
#error "M and N must be multiples of TILE_I and TILE_J"
#endif

// Report a failed CUDA runtime call and terminate the program.
// Without this, a failing call (e.g. an incompatible driver) goes unnoticed
// and the host buffer is printed with whatever garbage malloc returned.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Allocate one pitched M x N float array on the device.
// The kernel receives a single pitch for all arrays, so every allocation
// must report the same pitch as the previous ones; the global `pitch`
// (zero before the first allocation) keeps the common value.
static void allocField(float **field, const char *name)
{
    size_t fieldPitch = 0;

    checkCuda(cudaMallocPitch((void **)field, &fieldPitch, M*sizeof(float), N), name);
    if (pitch != 0 && fieldPitch != pitch) {
        fprintf(stderr, "Pitch mismatch for %s: %lu vs %lu bytes\n",
                name, (unsigned long)fieldPitch, (unsigned long)pitch);
        exit(EXIT_FAILURE);
    }
    pitch = fieldPitch;
}

// Test driver: allocates the pitched device arrays, runs init_kernel on them,
// copies d_rho back to the host and prints every element.
// Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
int main(int argc, char **argv)
{
    float *rho;

    (void)argc;
    (void)argv;

    printf("testArray2d is running...\n");

    rho = (float *) malloc(M*N*sizeof(float));
    if (rho == NULL) {
        fprintf(stderr, "Host allocation of %d floats failed\n", M*N);
        return EXIT_FAILURE;
    }
    rho[I(0,0)] = 3.14f;

    printf( "A[0,0] = %f\n", rho[I(0,0)] );

    // Device arrays: rows of M floats, N rows, padded to the pitch chosen by the runtime.
    allocField(&d_rho,  "cudaMallocPitch(d_rho)");
    allocField(&d_u,    "cudaMallocPitch(d_u)");
    allocField(&d_v,    "cudaMallocPitch(d_v)");
    allocField(&d_p,    "cudaMallocPitch(d_p)");
    allocField(&d_c,    "cudaMallocPitch(d_c)");
    allocField(&d_rhou, "cudaMallocPitch(d_rhou)");
    allocField(&d_rhov, "cudaMallocPitch(d_rhov)");
    allocField(&d_rhoE, "cudaMallocPitch(d_rhoE)");

    // One thread per cell: blocks of TILE_I x TILE_J threads tiling the M x N array.
    dim3 grid  = dim3(M/TILE_I, N/TILE_J);
    dim3 block = dim3(TILE_I, TILE_J);
    init_kernel<<<grid, block>>>((int)pitch, d_rho, d_u, d_v, d_p,
                                 d_c, d_rhou, d_rhov, d_rhoE);
    // A kernel launch returns nothing; launch failures must be queried explicitly.
    checkCuda(cudaGetLastError(), "init_kernel launch");

    // Verification: the blocking copy also surfaces errors raised while the kernel ran.
    checkCuda(cudaMemcpy2D(rho, sizeof(float)*M, d_rho, pitch, sizeof(float)*M, N,
                           cudaMemcpyDeviceToHost),
              "cudaMemcpy2D(d_rho -> rho)");
    for (int j=0; j<N; j++)
        for (int i=0; i<M; i++)
            printf("rho[%d,%d] = rho1d[%d] = %f \n", i, j, I(i,j), rho[I(i,j)]);

    // Keep the console window open until a key is pressed.
    getchar();

    free(rho);
    checkCuda(cudaFree(d_rho),  "cudaFree(d_rho)");
    checkCuda(cudaFree(d_u),    "cudaFree(d_u)");
    checkCuda(cudaFree(d_v),    "cudaFree(d_v)");
    checkCuda(cudaFree(d_p),    "cudaFree(d_p)");
    checkCuda(cudaFree(d_c),    "cudaFree(d_c)");
    checkCuda(cudaFree(d_rhou), "cudaFree(d_rhou)");
    checkCuda(cudaFree(d_rhov), "cudaFree(d_rhov)");
    checkCuda(cudaFree(d_rhoE), "cudaFree(d_rhoE)");
    return 0;
}

////////////////////////////////////////////////////////////////////////////////

// CUDA kernel: initializes every cell of the pitched M x N field arrays.
//
// Launch layout: 2D blocks of exactly TILE_I x TILE_J threads, one thread per
// cell; the global cell index is derived from TILE_I/TILE_J, not blockDim.
// Parameters:
//   pitch   row pitch of all arrays in bytes (as returned by cudaMallocPitch)
//   d_*     device arrays, all sharing the same pitch
// Cells with i < M/2 get rho = 1, p = 1; the others get rho = 0.125, p = 0.1;
// u and v are zero everywhere. The remaining arrays are derived from these.
__global__ void init_kernel (int pitch, float *d_rho, float *d_u, float *d_v, float *d_p,
                             float *d_c, float *d_rhou, float *d_rhov, float *d_rhoE)
{
    const int i = blockIdx.x * TILE_I + threadIdx.x;
    const int j = blockIdx.y * TILE_J + threadIdx.y;

    // Guard against threads that fall outside the array when the grid
    // over-covers the M x N domain.
    if (i >= M || j >= N)
        return;

    // pitch is in bytes; convert the row offset to an element offset.
    const int i2d = i + j*pitch / sizeof(float);

    // Keep the primitive values in registers instead of re-reading global memory.
    const bool  left = (i < M/2);
    const float rho  = left ? 1.0f : 0.125f;
    const float u    = 0.0f;
    const float v    = 0.0f;
    const float p    = left ? 1.0f : 0.1f;

    d_rho[i2d]  = rho;
    d_u[i2d]    = u;
    d_v[i2d]    = v;
    d_p[i2d]    = p;

    d_rhou[i2d] = rho * u;
    d_rhov[i2d] = rho * v;
    d_rhoE[i2d] = 0.5f * rho * (u*u + v*v)
                + p / (gamma - 1.0f);
    d_c[i2d]    = sqrtf(gamma * p / rho);
}

Hi again,

Well, actually there's no bug in the code, only a problem with an outdated, incompatible NVIDIA driver ;p

Cheers,