Shallow water equation with CUDA

i still got some mistakes here, i’m now confused

#include <stdio.h>
#include <cuda_runtime.h>
#include <sys/time.h>

double cpuSecond() {
struct timeval tp;
gettimeofday(&tp, NULL);
return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}

host device float maxVal(float a, float b) { return (a > b) ? a : b; }
host device float minVal(float a, float b) { return (a < b) ? a : b; }
device float solve_Ux(float Ul, float Ur, float hl, float hr) {
return 0.5f * (Ul + Ur) + sqrt(9.8 * hl) - sqrt(9.8 * hr);
}
device float solve_hx(float Ul, float Ur, float hl, float hr) {
return (1.f/9.8f) * pow(0.5f*(sqrt(9.8f * hl) + sqrt(9.8f * hr)) + 0.25f * (Ul - Ur), 2);
}
device float solve_c1(float Ul, float Ur, float hl, float hr, float Ux, float hx) {
if (hl == 0.f) return Ur - 2.f * sqrt(9.8f * hr);
else return minVal(Ul - sqrt(9.8f * hl), Ux - sqrt(9.8f * hx));
}
device float solve_c2(float Ul, float Ur, float hl, float hr, float Ux, float hx) {
if (hr == 0.f) return Ul + 2.f * sqrt(9.8f * hl);
else return maxVal(Ur + sqrt(9.8f * hr), Ux + sqrt(9.8f * hx));
}

device float solve_t1(float c1, float c2) {
return (minVal(c2, 0.f) - minVal(c1, 0.f)) / (c2 - c1);
}
device float solve_t2(float t1) { return (1.f - t1); }
device float solve_t3(float c1, float c2) {
return (c2 * fabs(c1) - c1 * fabs(c2)) / (2.f* c2 - 2.f * c1);
}
global void kernel(float *h, float *u, float *q, float dx, float dt, const int n) {
// thread index
int i = threadIdx.x;

// define some shared variabels on the device
__shared__ float hh[256], uu[256], qq[256];
hh[i] = h[i];
__syncthreads();
uu[i] = u[i];
__syncthreads();
qq[i] = q[i];
__syncthreads();

// define some local variables
float Ux, hx, c1, c2, t1, t2, t3;
float _Ux, _hx, _c1, _c2, _t1, _t2, _t3;
float time = 0.f;

while (time < 0.3) {
    time += dt;
    if (i >= 1 && i <= n-1) {
    //F(U[i-1], U[i])
    Ux = solve_Ux(uu[i - 1], uu[i], hh[i - 1], hh[i]);
    __syncthreads();
    hx = solve_hx(uu[i - 1], uu[i], hh[i - 1], hh[i]);
    __syncthreads();
    c1 = solve_c1(uu[i - 1], uu[i], hh[i - 1], hh[i], Ux, hx);
    __syncthreads();
    c2 = solve_c2(uu[i - 1], uu[i], hh[i - 1], hh[i], Ux, hx);
    __syncthreads();
    t1 = solve_t1(c1, c2);
    t2 = solve_t2(t1);
    t3 = solve_t3(c1, c2);
    //F(U[i], U[i+1])
    _Ux = solve_Ux(uu[i], uu[i + 1], hh[i], hh[i + 1]);
    __syncthreads();
    _hx = solve_hx(uu[i], uu[i + 1], hh[i], hh[i + 1]);
    __syncthreads();
    _c1 = solve_c1(uu[i], uu[i + 1], hh[i], hh[i + 1], _Ux, _hx);
    __syncthreads();
    _c2 = solve_c2(uu[i], uu[i + 1], hh[i], hh[i + 1], _Ux, _hx);
    __syncthreads();
    _t1 = solve_t1(_c1, _c2);
    _t2 = solve_t2(_t1);
    _t3 = solve_t3(_c1, _c2);

    hh[i] = hh[i] - (dt / dx)*((_t1*(hh[i+1]*uu[i+1]) +
        _t2*(hh[i]*uu[i]) - _t3*(hh[i+1] - hh[i])) - (t1*(hh[i]*uu[i]) + 
        t2*(hh[i-1]*uu[i-1]) - t3*(hh[i] - hh[i-1])));
    __syncthreads();
    /*
    hh[i] = temp1;
    __syncthreads();
    */
    qq[i] = qq[i] - (dt/dx) * (float)((_t1*(h[i+1] * (float)pow(uu[i+1], 2) + 
        0.5f*9.8f*(float)pow(hh[i+1], 2)) + _t2*(h[i] * (float)pow(uu[i], 2) + 
        0.5f*9.8f*(float)pow(hh[i], 2)) - _t3*(hh[i+1]*uu[i+1] - hh[i]*uu[i])) - 
        (t1*(hh[i] * (float)pow(uu[i], 2) + 0.5f*9.8f*(float)pow(hh[i], 2)) + 
        t2*(hh[i-1] * (float)pow(uu[i-1], 2) + 0.5f*9.8f*(float)pow(hh[i-1], 2)) - 
        t3*(hh[i]*uu[i] - hh[i-1]*uu[i-1])));
    __syncthreads();
    /*
    qq[i] = temp2;
    __syncthreads();
    */
    uu[i] = (qq[i] / (hh[i] + 0.00001f));
    __syncthreads();
    //uu[i] = temp3;
}
__syncthreads();

// boundary 
if (threadIdx.x == 0) {
    uu[0] = uu[n - 1] = 0.f;
    qq[0] = qq[n - 1] = 0.f;
    hh[0] = hh[1];
    hh[n - 1] = hh[n - 2];
}
__syncthreads();
// update
if (i <= n) {
    h[i] = hh[i];
    u[i] = uu[i];
    q[i] = qq[i];
}
__syncthreads();
}

}

int main() {
FILE *init = fopen(“init.dat”, “w”);
FILE *fin = fopen(“fin.dat”, “w”);

 // define host variable
 float Lx = 1.0f;
 int N = 256, i;
 float dx = (float)(Lx / N);

 int size = (N + 1) * sizeof(float);

 // allocate device 
 float *dh, *du, *dq;
 cudaMalloc((void **)&dh, size);
 cudaMalloc((void **)&du, size);
 cudaMalloc((void **)&dq, size);

 // allocate host
 float *h, *u, *q, *x;
 h = (float *)malloc(size);
 u = (float *)malloc(size);
 q = (float *)malloc(size);
 x = (float *)malloc(size);

 // __init__
 for (i = 0; i <= N; i++) {
     x[i] = (float)i * dx;
     u[i] = 0.f;             
     if (x[i] <= 0.5) h[i] = 1.f;
     else h[i] = 0.00001f;
     q[i] = h[i] * u[i];
     fprintf(init, "%f %f\n", x[i], h[i]);
 }
 // copying from host to device
 cudaMemcpy(dh, h, size, cudaMemcpyHostToDevice);
 cudaMemcpy(du, u, size, cudaMemcpyHostToDevice);
 cudaMemcpy(dq, q, size, cudaMemcpyHostToDevice);

 double iStart, iElaps;
 float dt = 0.0001f;
 
 int time, tFinal;
 time = 0;
 tFinal = 3;

 // invoke the kernel
 iStart = cpuSecond();
 //while (time < tFinal) {
    //time++;
    kernel <<<1, 256>>>(dh, du, dq, dx, dt, N);
 //}
 iElaps = cpuSecond() - iStart;
 printf("Elapsed time = %lf seconds\n", iElaps);

 // copy data back to host
 cudaMemcpy(h, dh, size, cudaMemcpyDeviceToHost);

 for (i = 0; i <= N; i++) fprintf(fin, "%f %f\n", x[i], h[i]);

 free(h); 
 free(u); 
 free(q); 
 free(x);
 cudaFree(dh);
 cudaFree(du);
 cudaFree(dq);
 fclose(init);
 fclose(fin);
 return 0;

}