Hi everyone,
Recently I tried to optimize my working code, and instead of calling
// Original (working) version: launches the Jacobi kernel JACOBI_ITERATIONS
// times, writing each result into the temporary buffer p1 and copying it
// back into p after every launch.
void func(pType * p){
...
pType *p1;
// Temporary device buffer that receives each launch's output.
// NOTE(review): allocated as MN elements but copied below as dx*dy elements — confirm MN >= dx*dy.
cudaMalloc((void**)&p1, MN*sizeof(pType));
for (int i = 0; i < JACOBI_ITERATIONS; i++)
{
jacobi_shared3<<<grid, block>>>(p1, p, rhs, dx, dy, alpha, beta, pitch); //stores results to first argument
// Block until the kernel has finished; the return code is not checked here.
cudaThreadSynchronize();
// Copy the fresh result back into p so the next launch reads it as input.
cudaMemcpy(p, p1, sizeof(pType)*dx*dy, cudaMemcpyDeviceToDevice);
}
cudaFree(p1);
}
I tried something like this:
// Ping-pong version: each loop pass launches the kernel twice with the roles
// of p and p1 swapped, so no device-to-device copy is performed.
void func(pType * p){
...
pType *p1;
cudaMalloc((void**)&p1, MN*sizeof(pType));
// NOTE(review): integer division — when JACOBI_ITERATIONS is odd, this runs
// one kernel launch fewer than a loop of JACOBI_ITERATIONS single launches.
for (int i = 0; i < JACOBI_ITERATIONS/2; i++)
{
jacobi_shared3<<<grid, block>>>(p1, p, rhs, dx, dy, alpha, beta, pitch); //stores results to first argument
cudaThreadSynchronize();
// Second half-step: reads p1 and writes p, so every pass ends with the
// latest result in p.
// NOTE(review): p is now the kernel's output buffer; presumably it must be a
// writable device allocation with the same size/pitch as p1 — verify.
jacobi_shared3<<<grid, block>>>(p, p1, rhs, dx, dy, alpha, beta, pitch);
// Kernel launches are asynchronous, so a failure of the launch above is
// reported by this synchronizing call rather than at the launch itself.
cudaThreadSynchronize();
}
cudaFree(p1);
}
The problem is, I get cudaError=30 (checked with cudaGetLastError()) when calling the second cudaThreadSynchronize().
Why is that so? What am I doing wrong?
And will this change really improve the performance of the calculation?
I have a GTX275 with CUDA 3.2 installed.
Any help will be appreciated.
(The func parameter p is declared at global scope and allocated in virtually the same way as p1, with the same size.)