Hello,
Sorry for the late reply, I wasn’t getting notifications.
Regarding your question: yes, it holds a valid value. I even tried declaring a normal integer with the same value.
Well, I did something else to make it more organized and understandable. I declared another struct to gather most of the variables such that I put fewer arguments into the Cuda kernel.
I have read that I cannot pass more than 4 KB of arguments; nevertheless, I used the sizeof() operator to measure each of the arguments, and I am definitely not passing more than 4 KB — it is barely 280 bytes in total. By the way, my GPU is a GTX 1050 Ti, so I know I am able to pass up to 4 KB.
I made it partially work — I mean, it is doing what it is supposed to do; nonetheless, I am not able to debug that particular CUDA kernel. I know that it is working because I am copying the results from device back to host to check them.
Is any of my syntax wrong — is that why I’m not able to debug it — or is it related to the thread and block dimensions?
This is the new struct for the variables.
#ifndef BLOCK_STRUCT_H
#define BLOCK_STRUCT_H
// NOTE(review): the include guard above already prevents double inclusion;
// the #pragma once below is redundant (harmless, but pick one convention).
#pragma once
// Bundle of per-cell device arrays passed to the kernels by value.
// Passing this struct by value copies only the POINTERS (one pointer per
// field), not the arrays they reference — the arrays themselves stay in
// device global memory. Each pointer is expected to reference a device
// allocation of NB doubles (see the cudaMalloc calls at the use site).
// Field meanings below are inferred from names in a reservoir-simulation
// context (o = oil, w = water, g = gas) — TODO confirm with the author.
typedef struct {
double *po1;      // presumably oil-phase pressure per cell — confirm
double *pw1;      // presumably water-phase pressure per cell — confirm
double *pg1;      // presumably gas-phase pressure per cell — confirm
double *so1;      // presumably oil saturation — confirm
double *sw1;      // presumably water saturation — confirm
double *sg1;      // presumably gas saturation — confirm
double *deno1;    // oil density (computed in loop4 from PVT table)
double *denw1;    // water density (computed in loop4)
double *deng1;    // gas density (computed in loop4)
double *denmix;   // mixture density = sum of phase density * saturation (loop4)
double *uo;       // oil viscosity (looked up in loop4) — presumably; confirm
double *uw;       // water viscosity — presumably; confirm
double *ug;       // gas viscosity — presumably; confirm
double *rso1;     // solution gas-oil ratio lookup (loop4) — presumably; confirm
double *rsg1;     // rsg lookup (loop4) — presumably; confirm
double *VB;       // cell volume weights used for interface averaging (loop4)
double *rsoix;    // volume-weighted rso at x-interfaces (loop4, phase 2)
double *rsoiy;    // volume-weighted rso at y-interfaces
double *rsoiz;    // volume-weighted rso at z-interfaces
double *rsgix;    // volume-weighted rsg at x-interfaces
double *rsgiy;    // volume-weighted rsg at y-interfaces
double *rsgiz;    // volume-weighted rsg at z-interfaces
} block_struct;
#endif
PVT struct remains the same.
This is the variable allocation.
// allocate kernel variables
//
// BUG (original): `cudaError_t(cudaMalloc(...))` is a C++ functional-style
// CAST to cudaError_t, not a check — it silently discards the returned error
// code, so a failed allocation goes unnoticed until a kernel dereferences a
// bad pointer. Wrap every runtime call in a real check instead.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                 \
            abort();                                                          \
        }                                                                     \
    } while (0)
block_struct bl;
// One device array of NB doubles per field (NB = number of grid blocks).
CUDA_CHECK(cudaMalloc((void**)&bl.po1, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.pw1, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.pg1, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.so1, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.sw1, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.sg1, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.VB, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.deno1, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.denw1, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.deng1, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.denmix, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.uo, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.uw, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.ug, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.rso1, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.rsg1, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.rsoix, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.rsoiy, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.rsoiz, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.rsgix, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.rsgiy, NB * sizeof(double)));
CUDA_CHECK(cudaMalloc((void**)&bl.rsgiz, NB * sizeof(double)));
// sizeof(bl) is the size of the POINTER struct (what the kernel launch
// actually copies as an argument), not of the arrays it points to.
int jhg = sizeof(bl);
//cuda memory copy: VB is the only bl field seeded from the host; the others
//are produced on the device by loop1..loop4.
CUDA_CHECK(cudaMemcpy(bl.VB, VB, NB * sizeof(double), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMalloc((void**)&gpu_x, DIM * sizeof(double)));
CUDA_CHECK(cudaMemcpy(gpu_x, xc, DIM * sizeof(double), cudaMemcpyHostToDevice));
// Load kr curves to device memory
// (relative-permeability / capillary-pressure tables, judging by the names
// KRW/KROW/PCOW etc. — confirm with the table definitions.)
// NOTE(review): none of the cudaMalloc/cudaMemcpy return codes below are
// checked; a failure here only surfaces later as a bad pointer in a kernel.
curves_struct kr_gpu;
// Scalar table sizes stay in the host-side struct; the struct is passed to
// the launcher by value, so they travel with it to the kernels.
kr_gpu.NPTL = rt_perm[0].getNPTL();
kr_gpu.NPTG = rt_perm[0].getNPTG();
kr_gpu.KROWC = rt_perm[0].getKROWC();
// Liquid-table arrays: NPTL entries each.
cudaMalloc((void**)&kr_gpu.SW, kr_gpu.NPTL *sizeof(double));
cudaMalloc((void**)&kr_gpu.KRW, kr_gpu.NPTL * sizeof(double));
cudaMalloc((void**)&kr_gpu.KROW, kr_gpu.NPTL * sizeof(double));
cudaMalloc((void**)&kr_gpu.PCOW, kr_gpu.NPTL * sizeof(double));
// Gas-table arrays: NPTG entries each.
cudaMalloc((void**)&kr_gpu.SG, kr_gpu.NPTG * sizeof(double));
cudaMalloc((void**)&kr_gpu.KRG, kr_gpu.NPTG * sizeof(double));
cudaMalloc((void**)&kr_gpu.KROG, kr_gpu.NPTG * sizeof(double));
cudaMalloc((void**)&kr_gpu.PCOG, kr_gpu.NPTG * sizeof(double));
// Blocking host-to-device copies of the table contents. The get*() accessors
// presumably return host pointers to NPTL/NPTG doubles — confirm.
cudaMemcpy(kr_gpu.SW, rt_perm[0].getSW(), kr_gpu.NPTL * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(kr_gpu.KRW, rt_perm[0].getKRW(), kr_gpu.NPTL * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(kr_gpu.KROW,rt_perm[0].getKROW(), kr_gpu.NPTL * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(kr_gpu.PCOW,rt_perm[0].getPCOW(), kr_gpu.NPTL * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(kr_gpu.SG, rt_perm[0].getSG(), kr_gpu.NPTG * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(kr_gpu.KRG, rt_perm[0].getKRG(), kr_gpu.NPTG * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(kr_gpu.KROG,rt_perm[0].getKROG(),kr_gpu.NPTG * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(kr_gpu.PCOG,rt_perm[0].getPCOG(),kr_gpu.NPTG * sizeof(double), cudaMemcpyHostToDevice);
// PVT (pressure-volume-temperature) lookup tables, copied to device memory.
// Same pattern as the kr tables above.
// NOTE(review): return codes of cudaMalloc/cudaMemcpy are unchecked here too.
pvt_struct pvt_gpu;
// Size of the host struct (pointers + scalars) — the kernel argument cost,
// not the size of the device tables.
int dfs = sizeof(pvt_gpu);
pvt_gpu.NPT = pvt.getNPT();
// One device array of NPT entries per table column.
cudaMalloc((void**)&pvt_gpu.PRES, pvt_gpu.NPT * sizeof(double));
cudaMalloc((void**)&pvt_gpu.RSO, pvt_gpu.NPT * sizeof(double));
cudaMalloc((void**)&pvt_gpu.DO, pvt_gpu.NPT * sizeof(double));
cudaMalloc((void**)&pvt_gpu.UO, pvt_gpu.NPT * sizeof(double));
cudaMalloc((void**)&pvt_gpu.DW, pvt_gpu.NPT * sizeof(double));
cudaMalloc((void**)&pvt_gpu.UW, pvt_gpu.NPT * sizeof(double));
cudaMalloc((void**)&pvt_gpu.DG, pvt_gpu.NPT * sizeof(double));
cudaMalloc((void**)&pvt_gpu.UG, pvt_gpu.NPT * sizeof(double));
cudaMalloc((void**)&pvt_gpu.RSG, pvt_gpu.NPT * sizeof(double));
// Blocking host-to-device copies of the table contents.
cudaMemcpy(pvt_gpu.PRES, pvt.getPRES(), pvt_gpu.NPT * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(pvt_gpu.RSO, pvt.getRSO(), pvt_gpu.NPT * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(pvt_gpu.DO, pvt.getDO(), pvt_gpu.NPT * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(pvt_gpu.UO, pvt.getUO(), pvt_gpu.NPT * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(pvt_gpu.DW, pvt.getDW(), pvt_gpu.NPT * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(pvt_gpu.UW, pvt.getUW(), pvt_gpu.NPT * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(pvt_gpu.DG, pvt.getDG(), pvt_gpu.NPT * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(pvt_gpu.UG, pvt.getUG(), pvt_gpu.NPT * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(pvt_gpu.RSG, pvt.getRSG(), pvt_gpu.NPT * sizeof(double), cudaMemcpyHostToDevice);
// Run the residual computation on the device (host wrapper defined below),
// then copy every bl array back to the host for inspection. The plain
// (non-Async) cudaMemcpy calls are blocking, so the host arrays are valid
// as soon as each call returns — no extra cudaDeviceSynchronize needed here.
// NOTE(review): return codes are unchecked, and no cudaFree of the bl/gpu_x/
// kr_gpu/pvt_gpu allocations is visible in this fragment — confirm they are
// released elsewhere, or this leaks device memory on every pass.
kernel_residual(NBI, NBJ, NBK, gpu_x, bl, kr_gpu, pvt_gpu);
cudaMemcpy(po1, bl.po1, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(pw1, bl.pw1, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(pg1, bl.pg1, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(so1, bl.so1, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(sw1, bl.sw1, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(sg1, bl.sg1, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(deno1, bl.deno1, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(denw1, bl.denw1, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(deng1, bl.deng1, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(denmix, bl.denmix, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(uo, bl.uo, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(uw, bl.uw, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(ug, bl.ug, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(rso1, bl.rso1, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(rsg1, bl.rsg1, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(rsoix, bl.rsoix, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(rsoiy, bl.rsoiy, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(rsoiz, bl.rsoiz, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(rsgix, bl.rsgix, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(rsgiy, bl.rsgiy, NB * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(rsgiz, bl.rsgiz, NB * sizeof(double), cudaMemcpyDeviceToHost);
// Host wrapper: launches loop1..loop4 in sequence on the default stream.
// Grid layout: the kernels map threadIdx/blockIdx (x -> j over NBJ,
// y -> k over NBK, as loop4 below shows) and loop over i internally,
// so a 2D launch sized from NBJ and NBK is sufficient.
void kernel_residual(int NBI,int NBJ,int NBK,double *x,block_struct bl, curves_struct kr_gpu,pvt_struct pvt_gpu)
{
// FIX(review): the original hard-coded dim3 threads(16,16,4) and
// dim3 blocks(16,16,16). That grid (a) did not depend on the problem size,
// so NBJ > 256 or NBK > 256 would silently go uncomputed, and (b) its 4*16
// z-layers made 64 copies of every (j,k) thread redundantly recompute and
// rewrite the same cells — also a likely reason the debugger behaves oddly.
// Size the grid from NBJ/NBK with ceiling division instead.
dim3 threads(16, 16, 1);
dim3 blocks((NBJ + threads.x - 1) / threads.x,
            (NBK + threads.y - 1) / threads.y,
            1);
// call the kernels using "triple chevron" notation; each launch is followed
// by a device-wide sync so the next kernel sees completed results.
loop1<<<blocks,threads>>>(NBI,NBJ,NBK, x, bl.po1,bl.pw1,bl.pg1,bl.so1,bl.sw1,bl.sg1);
cudaDeviceSynchronize();
//capillar pressure loop
loop2<<<blocks,threads>>>(NBI, NBJ, NBK, bl.sw1, bl.sg1, bl.pw1, bl.pg1, bl.po1, kr_gpu);
cudaDeviceSynchronize();
//boundary conditions
loop3<<<blocks,threads>>>(NBI, NBJ, NBK, bl.po1, bl.pw1, bl.pg1, bl.so1, bl.sw1, bl.sg1);
cudaDeviceSynchronize();
//pvt properties
loop4<<<blocks,threads>>>(NBI, NBJ, NBK, bl, pvt_gpu);
// Check for any errors launching the kernel.
// FIX(review): the original did `cudaError_t e = cudaGetLastError();` first —
// cudaGetLastError() CLEARS the sticky error, so cudaCheckError() afterwards
// could never see a launch failure. Let the check macro read the error itself.
cudaCheckError();
// Surface asynchronous in-kernel faults before the host reads results.
cudaDeviceSynchronize();
}
I don’t copy host data into the bl arrays except VB, since everything else is derived from gpu_x and produced by the earlier loops before loop4 runs.
Well, this is now the loop4
// PVT-property kernel: phase 1 fills per-cell densities/viscosities/ratios
// via table lookups; phase 2 computes volume-weighted interface averages.
// Thread mapping: x -> j (0..NBJ), y -> k (0..NBK); each thread loops over i.
// NOTE(review): if the launch configuration has blockDim.z > 1 (the original
// launch used threads(16,16,4)), every z-layer redundantly recomputes the
// same cells, since threadIdx.z is never read here.
__global__ void loop4(int NBI, int NBJ, int NBK, block_struct bl, pvt_struct pvt_gpu) {
int j = threadIdx.x + blockDim.x * blockIdx.x;
int k = threadIdx.y + blockDim.y * blockIdx.y;
//printf("Hello from thread");
//__syncthreads();
// ---- Phase 1: per-cell table lookups (independent per cell) ----
if (k < NBK) {
if (j < NBJ) {
int t;
for (int i = 0; i < NBI; i++)
{
// Flat index for cell (i, j, k) in an i-fastest layout.
t = i + j*NBI + NBI*NBJ*k;
// kernel_ddn is presumably a table interpolation over (PRES, column) —
// TODO confirm its contract.
bl.deno1[t] = kernel_ddn(bl.po1[t], pvt_gpu.PRES, pvt_gpu.DO, pvt_gpu.NPT);
bl.denw1[t] = kernel_ddn(bl.pw1[t], pvt_gpu.PRES, pvt_gpu.DW, pvt_gpu.NPT);
bl.deng1[t] = kernel_ddn(bl.pg1[t], pvt_gpu.PRES, pvt_gpu.DG, pvt_gpu.NPT);
// Saturation-weighted mixture density from the three phase densities.
bl.denmix[t] = bl.deno1[t] * bl.so1[t] + bl.denw1[t] * bl.sw1[t] + bl.deng1[t] * bl.sg1[t];
bl.rso1[t] = kernel_ddn(bl.po1[t], pvt_gpu.PRES, pvt_gpu.RSO, pvt_gpu.NPT);
bl.rsg1[t] = kernel_ddn(bl.pg1[t], pvt_gpu.PRES, pvt_gpu.RSG, pvt_gpu.NPT);
bl.uo[t] = kernel_ddn(bl.po1[t], pvt_gpu.PRES, pvt_gpu.UO, pvt_gpu.NPT);
bl.uw[t] = kernel_ddn(bl.pw1[t], pvt_gpu.PRES, pvt_gpu.UW, pvt_gpu.NPT);
bl.ug[t] = kernel_ddn(bl.pg1[t], pvt_gpu.PRES, pvt_gpu.UG, pvt_gpu.NPT);
}
}
}
// This barrier is in uniform control flow (all threads reach it), which is
// correct — but it only synchronizes threads WITHIN ONE BLOCK.
__syncthreads();
// ---- Phase 2: interface averages from neighbor cells ----
// NOTE(review): RACE — the south/top neighbors (t - NBI, t - NBI*NBJ) of a
// cell on a block boundary are written in phase 1 by a DIFFERENT thread
// block, and __syncthreads() cannot order across blocks. Phase 2 may read
// stale rso1/rsg1 values there. The standard fix is to split phase 1 and
// phase 2 into two separate kernel launches (a launch boundary is a
// grid-wide sync). This is also a prime suspect for the odd debugger
// behavior the post describes — run compute-sanitizer --tool racecheck.
if (k > 0 && k < NBK) {
if (j > 0 && j < NBJ) {
int t;
int south, top, west;
// i starts at 1 so every cell has a west neighbor; j > 0 / k > 0 above
// guarantee the south / top neighbors exist.
for (int i = 1; i < NBI; i++)
{
t = i + j*NBI + NBI*NBJ*k;
west = t - 1;
south = t - NBI;
top = t - NBI*NBJ;
// Volume-weighted (VB) average of rso/rsg at the three upstream faces.
bl.rsoix[t] = (bl.rso1[west] * bl.VB[west] + bl.rso1[t] * bl.VB[t]) / (bl.VB[t] + bl.VB[west]);
bl.rsoiy[t] = (bl.rso1[south] * bl.VB[south] + bl.rso1[t] * bl.VB[t]) / (bl.VB[t] + bl.VB[south]);
bl.rsoiz[t] = (bl.rso1[top] * bl.VB[top] + bl.rso1[t] * bl.VB[t]) / (bl.VB[t] + bl.VB[top]);
bl.rsgix[t] = (bl.rsg1[west] * bl.VB[west] + bl.rsg1[t] * bl.VB[t]) / (bl.VB[t] + bl.VB[west]);
bl.rsgiy[t] = (bl.rsg1[south] * bl.VB[south] + bl.rsg1[t] * bl.VB[t]) / (bl.VB[t] + bl.VB[south]);
bl.rsgiz[t] = (bl.rsg1[top] * bl.VB[top] + bl.rsg1[t] * bl.VB[t]) / (bl.VB[t] + bl.VB[top]);
}
}
}
}
Thank you very much for reading