I made a example
u_int8_t *u8_PtImageHost;
u_int8_t *u8_PtImageDevice;
u_int8_t *u8_ptDataOutHost;
u_int8_t *u8_ptDataOutDevice;
u_int8_t u8_Used[NB_STREAM];
u8_ptDataOutHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
checkCudaErrors(cudaMalloc((void**)&u8_ptDataOutDevice,WIDTH*HEIGHT*sizeof(u_int8_t)));
u8_PtImageHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
checkCudaErrors(cudaMalloc((void**)&u8_PtImageDevice,WIDTH*HEIGHT*sizeof(u_int8_t)));
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned char>();
checkCudaErrors(cudaMallocArray(&Array_PatchsMaxDevice, &channelDesc,WIDTH,HEIGHT ));
checkCudaErrors(cudaBindTextureToArray(Image,Array_PatchsMaxDevice));
int blockh = HEIGHT/NB_STREAM;
for(int i=0;i<NB_STREAM;i++)
{
cudaSetDevice(0);
cudaStreamCreate(&Stream[i]);
}
for(int i=0;i<NB_STREAM;i++)
{
printf("i: %d\n",i);
if(i == 0)
{
int yy =0;
int localHeight = blockh+1;
checkCudaErrors(cudaMemcpy2DToArrayAsync( Array_PatchsMaxDevice,
0,
0,
u8_PtImageDevice,
WIDTH,
WIDTH,
blockh,
cudaMemcpyHostToDevice ,
Stream[i]));
dim3 threads(BLOC_X,BLOC_Y);
dim3 blocks(ceil(WIDTH/BLOC_X),ceil(localHeight/BLOC_Y));
SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,0,WIDTH,localHeight-1);
u8_Used[i] = 1;
}else{
int ioffset = WIDTH*(HEIGHT/NB_STREAM );
int localHeight = min(HEIGHT - (blockh +1 +blockh*i),blockh);
printf("ioffset: %d\n",ioffset);
checkCudaErrors(cudaMemcpy2DToArrayAsync( Array_PatchsMaxDevice,
0,
ioffset*i,
&u8_PtImageDevice[ioffset*i],
WIDTH,
WIDTH,
localHeight,
cudaMemcpyHostToDevice ,
Stream[i])); ##### HERE THE ERROR
dim3 threads(BLOC_X,BLOC_Y);
dim3 blocks(ceil(WIDTH/BLOC_X),ceil(localHeight/BLOC_Y));
SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,ioffset,WIDTH,localHeight);
u8_Used[i] = 1;
if(localHeight<blockh-1)
{
break;
}
}
}
for(int i=0;i<NB_STREAM;i++)
{
cudaStreamSynchronize(Stream[i]);
}
checkCudaErrors(cudaMemcpy(u8_ptDataOutHost,u8_ptDataOutDevice,WIDTH*HEIGHT*sizeof(u_int8_t),cudaMemcpyDeviceToHost));
int iioffset = 0;
int jjoffset = 0;
for(int i=iioffset;i<iioffset+25;i++)
{
for(int j=jjoffset;j<jjoffset+25;j++)
{
printf(" %d",u8_ptDataOutHost[j*WIDTH+i]);
// std::cout << (int)u8_ptDataOutHost[j*WIDTH+i] <<" ";
}
printf(" \n");
}
On the line with the ####, I get that error:
…/Sobel_Stream/cuda_interface.cu(125) : CUDA Runtime API error 11: invalid argument.
It is due to the arogument: ioffset*i