Hello,
I’m new to CUDA programming, so my question may be foolish. My question is about the use of memory between kernels, so let me explain. I have two functions in my device code: the first is calc_UP and the second is calc_DOWN.
The problem is that when calc_UP has finished its computation, I need to use its results as input to calc_DOWN, so the two computations must execute in order. I would like to avoid copying calc_UP’s result from the device to the host and then back from the host to the device for calc_DOWN, because I have noticed that this is time-consuming.
So, is it possible for the second kernel (calc_DOWN) to use an array that the first kernel (calc_UP) has stored in device memory, once the first has finished executing, and if so, how? (I have added sketches below of what I imagine this would look like.)
I hope I was clear. Thank you in advance!
__global__
void calc_UP(float* Power, float* time, float* T1_11, float Rth, float Tau, float Tini, int dim)
{
    // Grid-stride loop: each thread processes elements index, index + stride, ...
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int m = index; m < dim; m += stride)
    {
        // Exponential rise from Tini toward Power[m] * Rth with time constant Tau
        T1_11[m] = (((Power[m] * Rth) - Tini) * (1.0f - expf(-time[m] / Tau))) + Tini;
    }
}
__global__
void calc_DOWN(float* time, float* T1_11, float Tau, float Tini, int dim)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int m = index; m < dim; m += stride)
    {
        // Exponential decay from Tini with time constant Tau
        T1_11[m] = Tini * expf(-time[m] / Tau);
    }
}
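To make concrete what I mean by feeding calc_UP’s results into calc_DOWN, here is a hypothetical variant (my own untested sketch; the name calc_DOWN_from_UP and the choice to start the decay from calc_UP’s per-element result instead of a fixed Tini are assumptions, not working code):

// Hypothetical sketch: start the decay from the per-element value that
// calc_UP wrote into T_up, instead of from a fixed Tini.
__global__
void calc_DOWN_from_UP(float* time, float* T_up, float* T1_11, float Tau, int dim)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int m = index; m < dim; m += stride)
    {
        T1_11[m] = T_up[m] * expf(-time[m] / Tau);
    }
}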
Now my host code runs like this:
// Allocate input vectors in host memory
............
// Initialize input vectors
............
// Allocate vectors in device memory
...........
// Copy vectors from host memory to device memory
cudaMemcpy(d_T, time_read, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_P, power_read, size, cudaMemcpyHostToDevice);
// Launch the kernels on N elements on the GPU
int blockSize = 256;
int numBlocks = (N + blockSize - 1) / blockSize;
calc_UP<<<numBlocks, blockSize>>>(d_P, d_T, UP, 0.12341, 0.0235, 1, N);
calc_DOWN<<<numBlocks, blockSize>>>(d_T, DOWN, 0.235, 10, N);
// Copy the results from device memory to host memory
// (h_U1 and h_D1 receive the results in host memory)
cudaMemcpy(h_U1, UP, size, cudaMemcpyDeviceToHost);
cudaMemcpy(h_D1, DOWN, size, cudaMemcpyDeviceToHost);
// Wait for GPU to finish before accessing results on the host
cudaDeviceSynchronize();
// Free device memory
................
...........
// Free host memory
...........
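For completeness, this is a minimal sketch of the launch sequence I am hoping is valid, assuming kernels launched on the same (default) stream execute in launch order, so that the second kernel only starts after calc_UP has finished. It uses the hypothetical calc_DOWN_from_UP from above; the UP buffer would stay in device memory the whole time, with no intermediate copy back to the host:

// Sketch (my assumption): both kernels share the device buffer UP and run
// in launch order because they are issued to the same stream.
calc_UP<<<numBlocks, blockSize>>>(d_P, d_T, UP, 0.12341f, 0.0235f, 1.0f, N);
calc_DOWN_from_UP<<<numBlocks, blockSize>>>(d_T, UP, DOWN, 0.235f, N);
// Only the final results are copied back to the host.
cudaMemcpy(h_U1, UP, size, cudaMemcpyDeviceToHost);
cudaMemcpy(h_D1, DOWN, size, cudaMemcpyDeviceToHost);

Is this correct, or do I need an explicit synchronization between the two kernel launches?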