Shallow water equation with CUDA

i still got some mistakes here, i’m now confused

#include <stdio.h>
#include <cuda_runtime.h>
#include <sys/time.h>

double cpuSecond() {
struct timeval tp;
gettimeofday(&tp, NULL);
return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}

host device float maxVal(float a, float b) { return (a > b) ? a : b; }
host device float minVal(float a, float b) { return (a < b) ? a : b; }
device float solve_Ux(float Ul, float Ur, float hl, float hr) {
return 0.5f * (Ul + Ur) + sqrt(9.8 * hl) - sqrt(9.8 * hr);
}
device float solve_hx(float Ul, float Ur, float hl, float hr) {
return (1.f/9.8f) * pow(0.5f*(sqrt(9.8f * hl) + sqrt(9.8f * hr)) + 0.25f * (Ul - Ur), 2);
}
device float solve_c1(float Ul, float Ur, float hl, float hr, float Ux, float hx) {
if (hl == 0.f) return Ur - 2.f * sqrt(9.8f * hr);
else return minVal(Ul - sqrt(9.8f * hl), Ux - sqrt(9.8f * hx));
}
device float solve_c2(float Ul, float Ur, float hl, float hr, float Ux, float hx) {
if (hr == 0.f) return Ul + 2.f * sqrt(9.8f * hl);
else return maxVal(Ur + sqrt(9.8f * hr), Ux + sqrt(9.8f * hx));
}

device float solve_t1(float c1, float c2) {
return (minVal(c2, 0.f) - minVal(c1, 0.f)) / (c2 - c1);
}
device float solve_t2(float t1) { return (1.f - t1); }
device float solve_t3(float c1, float c2) {
return (c2 * fabs(c1) - c1 * fabs(c2)) / (2.f* c2 - 2.f * c1);
}
global void kernel(float *h, float *u, float *q, float dx, float dt, const int n) {

``````// define some shared variabels on the device
__shared__ float hh[256], uu[256], qq[256];
hh[i] = h[i];
uu[i] = u[i];
qq[i] = q[i];

// define some local variables
float Ux, hx, c1, c2, t1, t2, t3;
float _Ux, _hx, _c1, _c2, _t1, _t2, _t3;
float time = 0.f;

while (time < 0.3) {
time += dt;
if (i >= 1 && i <= n-1) {
//F(U[i-1], U[i])
Ux = solve_Ux(uu[i - 1], uu[i], hh[i - 1], hh[i]);
hx = solve_hx(uu[i - 1], uu[i], hh[i - 1], hh[i]);
c1 = solve_c1(uu[i - 1], uu[i], hh[i - 1], hh[i], Ux, hx);
c2 = solve_c2(uu[i - 1], uu[i], hh[i - 1], hh[i], Ux, hx);
t1 = solve_t1(c1, c2);
t2 = solve_t2(t1);
t3 = solve_t3(c1, c2);
//F(U[i], U[i+1])
_Ux = solve_Ux(uu[i], uu[i + 1], hh[i], hh[i + 1]);
_hx = solve_hx(uu[i], uu[i + 1], hh[i], hh[i + 1]);
_c1 = solve_c1(uu[i], uu[i + 1], hh[i], hh[i + 1], _Ux, _hx);
_c2 = solve_c2(uu[i], uu[i + 1], hh[i], hh[i + 1], _Ux, _hx);
_t1 = solve_t1(_c1, _c2);
_t2 = solve_t2(_t1);
_t3 = solve_t3(_c1, _c2);

hh[i] = hh[i] - (dt / dx)*((_t1*(hh[i+1]*uu[i+1]) +
_t2*(hh[i]*uu[i]) - _t3*(hh[i+1] - hh[i])) - (t1*(hh[i]*uu[i]) +
t2*(hh[i-1]*uu[i-1]) - t3*(hh[i] - hh[i-1])));
/*
hh[i] = temp1;
*/
qq[i] = qq[i] - (dt/dx) * (float)((_t1*(h[i+1] * (float)pow(uu[i+1], 2) +
0.5f*9.8f*(float)pow(hh[i+1], 2)) + _t2*(h[i] * (float)pow(uu[i], 2) +
0.5f*9.8f*(float)pow(hh[i], 2)) - _t3*(hh[i+1]*uu[i+1] - hh[i]*uu[i])) -
(t1*(hh[i] * (float)pow(uu[i], 2) + 0.5f*9.8f*(float)pow(hh[i], 2)) +
t2*(hh[i-1] * (float)pow(uu[i-1], 2) + 0.5f*9.8f*(float)pow(hh[i-1], 2)) -
t3*(hh[i]*uu[i] - hh[i-1]*uu[i-1])));
/*
qq[i] = temp2;
*/
uu[i] = (qq[i] / (hh[i] + 0.00001f));
//uu[i] = temp3;
}

// boundary
uu[0] = uu[n - 1] = 0.f;
qq[0] = qq[n - 1] = 0.f;
hh[0] = hh[1];
hh[n - 1] = hh[n - 2];
}
// update
if (i <= n) {
h[i] = hh[i];
u[i] = uu[i];
q[i] = qq[i];
}
}
``````

}

int main() {
FILE *init = fopen(“init.dat”, “w”);
FILE *fin = fopen(“fin.dat”, “w”);

`````` // define host variable
float Lx = 1.0f;
int N = 256, i;
float dx = (float)(Lx / N);

int size = (N + 1) * sizeof(float);

// allocate device
float *dh, *du, *dq;
cudaMalloc((void **)&dh, size);
cudaMalloc((void **)&du, size);
cudaMalloc((void **)&dq, size);

// allocate host
float *h, *u, *q, *x;
h = (float *)malloc(size);
u = (float *)malloc(size);
q = (float *)malloc(size);
x = (float *)malloc(size);

// __init__
for (i = 0; i <= N; i++) {
x[i] = (float)i * dx;
u[i] = 0.f;
if (x[i] <= 0.5) h[i] = 1.f;
else h[i] = 0.00001f;
q[i] = h[i] * u[i];
fprintf(init, "%f %f\n", x[i], h[i]);
}
// copying from host to device
cudaMemcpy(dh, h, size, cudaMemcpyHostToDevice);
cudaMemcpy(du, u, size, cudaMemcpyHostToDevice);
cudaMemcpy(dq, q, size, cudaMemcpyHostToDevice);

double iStart, iElaps;
float dt = 0.0001f;

int time, tFinal;
time = 0;
tFinal = 3;

// invoke the kernel
iStart = cpuSecond();
//while (time < tFinal) {
//time++;
kernel <<<1, 256>>>(dh, du, dq, dx, dt, N);
//}
iElaps = cpuSecond() - iStart;
printf("Elapsed time = %lf seconds\n", iElaps);

// copy data back to host
cudaMemcpy(h, dh, size, cudaMemcpyDeviceToHost);

for (i = 0; i <= N; i++) fprintf(fin, "%f %f\n", x[i], h[i]);

free(h);
free(u);
free(q);
free(x);
cudaFree(dh);
cudaFree(du);
cudaFree(dq);
fclose(init);
fclose(fin);
return 0;
``````

}