Okay, I believe I have a good question for GPU programming.
I am currently trying to develop a fluid simulation program on the GPU. The simulation is very iterative for mass convergence and within that mass convergence an iterative solver  conjugate gradient. I built a conjugate gradient solver based on the Concurrent Number Cruncher (CNC) code I found online, but discovered that the costs of writing back and forth to the HOST/CPU from the GPU killed any gains in performance. So I thought about pushing the whole algorithm onto the GPU using CUDA. The following illustrates the code that interfaces with the CUDA file for the GPUBased Solver and works great:
extern "C" void cudaMatVecMult(float *matrix, unsigned int size_matrix, uint2 *rowptr, unsigned int size_rowptr, unsigned int *colind, unsigned int size_colind, float * x, float *b, unsigned int size_vec);
extern "C" void cudaVecVecMult(unsigned int size, float *x, float *y, float *r);
...
void GPUFertm::gpuMult(const void *x, void *y, unsigned int vec_size){
// Invoke the CCode function that inturn calls the
// CUDA/GPU Kernel for MatrixVector Multiplication
cudaMatVecMult((float*)gAhat, matrixSize_, (uint2*)gpu_rowptr, rowLength_,
(unsigned int*)gpu_colind, colLength_, (float*)x, (float*)y, vec_size);
}//gpuMult()
...
bool GPUFertm::pccgSolver(float *presoln){
try{
// Validity Check(s):
if(N_ == 0  rowLength_ == 0  colLength_ == 0  matrixSize_ == 0){
printf("Invalid values set for MASSConvergence Call!\n");
return false;
}
// Local Variable(s):
unsigned int its = 0;
float alpha = 0.0f, beta = 0.0f;
// r = A*x
gpuMult(gpu_x, gpu_r, N_);
// r = b  A*x
cublasSaxpy(N_, 1.0f, (float*)gpu_b, 1, (float*)gpu_r, 1);
cublasSscal(N_, 1.0f, (float*)gpu_r, 1);
// d = M1 * r
cudaVecVecMult(N_, (float*)gAdiagpre, (float*)gpu_r, (float*)gpu_d);
// cur_err = rT*d
float cur_err = cublasSdot(N_, (float*)gpu_r, 1, (float*)gpu_d, 1);
// err = cur_err
float err = (float)(cur_err * EPSILON * EPSILON);
while (cur_err > err && (int)its < MAX_ITERS) {
// Ad = A*d
gpuMult(gpu_d, gpu_Ad, N_);
// alpha = cur_err / (dT*Ad)
alpha = cur_err / cublasSdot(N_, (float*)gpu_d, 1, (float*)gpu_Ad, 1);
// x = x + alpha * d
cublasSaxpy(N_, alpha, (float*)gpu_d, 1, (float*)gpu_x, 1);
// r = r  alpha * Ad
cublasSaxpy(N_, alpha, (float*)gpu_Ad, 1, (float*)gpu_r, 1);
// h = M1r
cudaVecVecMult(N_, (float*)gAdiagpre, (float*)gpu_r, (float*)gpu_h);
float old_err = cur_err;
// cur_err = rT * h
cur_err = cublasSdot(N_, (float*)gpu_r, 1, (float*)gpu_h, 1);
beta = cur_err / old_err;
// d = h + beta * d
cublasSscal(N_, beta, (float*)gpu_d, 1);
cublasSaxpy(N_, 1.0f, (float*)gpu_h, 1, (float*)gpu_d, 1);
++its;
}//WHILELOOP (PCCG Convergence)
// Retrieve the solution from GPU:
// (GPU>CPU)
cublasGetVector(N_, 4, (float*)gpu_x, 1, presoln, 1);
// Stop GPU:
stopGPU();
// Operation was successful?
return (its < MAX_ITERS);
}catch(...){
return false;
}
}//pccgSolver()
The important aspects of the CUDA File follow:
extern "C" void cudaMatVecMult(float *matrix, unsigned int size_matrix, uint2 *rowptr, unsigned int size_rowptr, unsigned int *colind,unsigned int size_colind, float *x, float *b,unsigned int size_vec);
extern "C" void cudaVecVecMult(unsigned int size, float *x, float *y, float *r);
__device__ unsigned int compute_thread_index() {
return (blockIdx.x*THREAD_BLOCK_SIZE*THREAD_BLOCK_SIZE+
blockIdx.y*THREAD_BLOCK_SIZE*THREAD_BLOCK_SIZE*gridDim.x+
threadIdx.x+threadIdx.y*THREAD_BLOCK_SIZE);
}
...
void cudaMatVecMult(float *matrix, unsigned int size_matrix, uint2 *rowptr,
unsigned int size_rowptr,unsigned int *colind, unsigned int size_colind,
float *x, float *b, unsigned int size_vec) {
MatVecMultKernel<<<dimGrid, dimBlock>>>(matrix, size_matrix,rowptr, size_rowptr,
colind, size_colind, x, b, size_vec);
}
void cudaVecVecMult(unsigned int size, float * x, float * y, float * r) {
VecVecMultKernel<<<dimGrid, dimBlock>>>(size, x, y, r);
}
__global__ void MatVecMultKernel(float *matrix, unsigned int size_matrix, uint2 *rowptr,
unsigned int size_rowptr, unsigned int *colind,
unsigned int size_colind, float *x, float *b,
unsigned int size_vec) {
const unsigned int index = compute_thread_index();
if (index < size_vec) {
uint2 rowptr_bounds = rowptr[index];
float res = 0.0f;
// For each block of the block_row, mult
for (unsigned int i = rowptr_bounds.x; i < rowptr_bounds.y; i++) {
res += matrix[i]*x[colind[i]];
}
b[index] = res;
}//IFCONDITIONAL
}//MatVecMultKernel()
__global__ void VecVecMultKernel(unsigned int size, float *x, float *y, float *r) {
const unsigned int index = compute_thread_index();
if (index < size)
r[index] = x[index]*y[index];
}//VecVecMultKernel()
I decided to push the entire solver onto the GPU. The code (from CNC) uses different calls to GLOBAL Kernels (gpuMult, and cublas) does this mean that each call to the different kernels defines a new communication across the PCIe?
As I said, I want to keep as much of the simulation on the GPU as possible  minimizing read/writes to the host.
The general simulation algorithm is:

Calculate Pressures

Create Diagonal Precondtioner for solver

Save previous iteration solution

Solve system of equations Ax = b (already on GPU ?)

Update system

Does Convergence occur? Yes  EXIT, NO  Go to 1
Any Ideas?
Thank you.