Hi.
I am trying to use CUDA streams in my code, but I am getting error number 11, i.e. cudaErrorInvalidDevice.
Can anyone please help me?
I am posting my code below:
// Host-side setup: uploads T and T_ to pitched device buffers on stream1 and
// stream2, uploads D synchronously, then launches transpose_kernel on stream3.
//
// NOTE(review): no cudaMallocPitch for gpu_T is visible before the first copy;
// gpu_T and pitch_t must already be valid at this point - confirm against the
// code above this fragment. An unallocated destination makes the copy fail.
// NOTE(review): stream1/stream2/stream3 must have been created (e.g. with
// cudaStreamCreate) before use, and the host buffers T and T_ should be
// page-locked (cudaMallocHost / cudaHostAlloc) for the async copies to be
// truly asynchronous - neither is visible here; confirm.

// Reports a failed step the same way the original code did (print the numeric
// code, wait for a key press) and then carries on.
#define REPORT_CUDA_ERROR()                     \
    do {                                        \
        if (error) {                            \
            printf("\nerror is %d", error);     \
            getch();                            \
        }                                       \
    } while (0)

error = cudaMemcpy2DAsync(gpu_T, pitch_t, T, size, size, N1,
                          cudaMemcpyHostToDevice, stream1);
REPORT_CUDA_ERROR();

error = cudaMallocPitch((void**)&gpu_T_, &pitch_t_, size, N1);
REPORT_CUDA_ERROR();
error = cudaMemcpy2DAsync(gpu_T_, pitch_t_, T_, size, size, N1,
                          cudaMemcpyHostToDevice, stream2);
REPORT_CUDA_ERROR();

error = cudaMallocPitch((void**)&gpu_D, &pitch_d, size, N1);
REPORT_CUDA_ERROR();
error = cudaMemcpy2D(gpu_D, pitch_d, D, size, size, N1, cudaMemcpyHostToDevice);
REPORT_CUDA_ERROR();

// The kernel runs on stream3 but reads gpu_T (filled on stream1) and writes
// gpu_T_ (filled on stream2). Wait for both uploads explicitly instead of
// relying on any implicit default-stream ordering.
error = cudaStreamSynchronize(stream1);
REPORT_CUDA_ERROR();
error = cudaStreamSynchronize(stream2);
REPORT_CUDA_ERROR();

// NOTE(review): the kernel takes a single pitch (in elements) and applies it to
// both buffers; this is only correct if pitch_t == pitch_t_ - confirm.
transpose_kernel<<<dimGrid, dimBlock, 0, stream3>>>(gpu_T, gpu_T_, pitch_t / sizeof(float));
// A kernel launch returns no status itself; launch-configuration errors are
// picked up here.
error = cudaGetLastError();
REPORT_CUDA_ERROR();

#undef REPORT_CUDA_ERROR
And my kernel code is:
// Transposes the N1 x N1 matrix T into T_.
//
// Launch layout: 2D grid of 2D blocks; each thread handles one element, with
// x mapped to the column and y to the row of T. Threads outside the N1 x N1
// range do nothing, so the grid may overshoot the matrix.
//
// T      source matrix, row-major with a row stride of `pitch` elements
// T_     destination matrix, indexed with the same row stride `pitch`
// pitch  row stride in elements (not bytes); the same value is applied to
//        both T and T_
//
// N1 is not defined in this block; it must be visible to device code (e.g. a
// macro or compile-time constant) - confirm where it is declared.
__global__ void transpose_kernel(float *T, float *T_, int pitch)
{
    int xid = blockIdx.x * blockDim.x + threadIdx.x;
    int yid = blockIdx.y * blockDim.y + threadIdx.y;

    if (xid < N1 && yid < N1)
    {
        // Element (row yid, col xid) of T goes to (row xid, col yid) of T_.
        // Adjacent threads in x write addresses `pitch` elements apart.
        T_[xid * pitch + yid] = T[yid * pitch + xid];
    }
}
Please help me understand why I am getting this error.
Thanks in advance.