Is it OK to launch a kernel repeatedly from inside a loop, like this:
// Illustrative sketch (not compilable as written): launches the DoSomeWork kernel
// once per loop iteration, passing the loop index i as the last kernel argument.
for(int i = 0 ; i < n ; i++)
{
DoSomeWork<<<dim3,dim3>>>(someargs,i);
}
I ask this for two reasons:
-
performance
-
I am trying to do the same in my program right now and it fails in a very strange way.
The situation is this: I have a kernel that accepts 7 arguments; all of them are of type int except one, the result array, which is of type int*.
I allocate a lot of memory for that array and then start a loop in which I launch my kernel (I’m using the loop variable as an argument for the kernel).
And what is REALLY STRANGE: the first kernel launch is successful, but all the others result in “invalid argument”, even though I don’t reallocate or otherwise change the pointer to the result array. Moreover, I can launch an empty kernel and the result is the same. By the way, I’ve made a similar test app and it works =|
Can anyone help?
This is the code that doesn’t work (all libraries are included and so on):
//I've replaced some #defined values and variables with literal numbers
// Minimal reproduction kernel: every thread writes 1 into its own element of points.
//
// Launch layout: 1D grid of 1D blocks. The kernel takes no length argument, so it
// cannot bounds-check; the caller must make sure points holds at least
// gridDim.x * blockDim.x ints.
//
// The body can even be left completely empty - the reported failure still occurs.
__global__ void Test(int* points)
{
    // Flat global index. The previous form (blockIdx.x + threadIdx.x) made threads of
    // different blocks alias the same elements; for a <<<1,1>>> launch both forms give 0.
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    points[idx] = 1;
}
// Host-side reproduction of the failing case: allocates a device buffer, then launches
// Test 1024 times, synchronizing and copying the whole buffer back after every launch.
// Lines consisting of "..." mark code the author left out of the post.
// NOTE(review): no return type is written; C++ requires `int main` - presumably just
// elided in the post, confirm.
main( )
{
...
int* d_points;
// Requests 1024*1024*10 bytes of device memory. The cudaError_t returned here is not checked.
cudaMalloc( (void**)&d_points , 1024*1024*10);
...
for (int i = 0 ; i < 1024; i++)
{
// One block with one thread per launch.
Test<<<1,1>>>(d_points);
// CUDA_CHECK_ERROR is defined outside the visible code.
// NOTE(review): cudaThreadSynchronize is deprecated in the CUDA runtime in favor of
// cudaDeviceSynchronize.
CUDA_CHECK_ERROR(cudaThreadSynchronize());
// Copy-back that was added to inspect the results after the first iteration.
// NOTE(review): `points` is declared in the elided code; it presumably is a host buffer
// of at least 1024*1024*10 bytes - TODO confirm. The return value of this copy is not checked.
cudaMemcpy(points, d_points, 1024*1024*10, cudaMemcpyDeviceToHost);
// Second synchronize, added because the author suspected the memcpy to be asynchronous.
CUDA_CHECK_ERROR(cudaThreadSynchronize());
// Reads (and clears) the last runtime error and reports it on stderr.
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
// NOTE(review): `msg` is not declared in the visible code - presumably defined in the elided part.
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
}
}
...
}
This is the simple test app, which works:
// Stores each thread's own index into points, in the slot threadIdx.x + offset*5.
//
// points: device array that receives the values.
// offset: selects which group of slots is written; the group starts at offset*5.
__global__ void DoCuda(int* points,int offset)
{
    // Destination slot for this thread.
    const unsigned int slot = threadIdx.x + offset*5;
    points[slot] = threadIdx.x;
}
// Host driver for the DoCuda test.
//
// Allocates a 125-int buffer on the host and on the device, launches DoCuda 25 times
// (one block of 5 threads per launch, offset = 0..24, so all 125 elements are written),
// copies the result back and increments every host element.
//
// Every CUDA runtime call is checked, both buffers are released on all paths, and the
// process exit code reports failure.
//
// Returns 0 on success, 1 if an allocation, a launch, the kernel execution or the
// copy-back failed.
int main(int argc, char* argv[])
{
    (void)argc;
    (void)argv;

    const int    numInts  = 125;                    // 25 launches * 5 threads
    const size_t numBytes = numInts * sizeof(int);  // 500 bytes with 4-byte int

    int* h_points = (int*)malloc(numBytes);
    if (h_points == NULL)
    {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)numBytes);
        return 1;
    }

    int* d_points = NULL;
    cudaError_t err = cudaMalloc( (void**)&d_points, numBytes );
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
        free(h_points);
        return 1;
    }

    int status = 0;

    for (int i = 0 ; i < 25 ; i++)
    {
        DoCuda<<<1,5>>>(d_points,i);
        // A kernel launch returns nothing; launch-configuration errors are
        // picked up through cudaGetLastError().
        err = cudaGetLastError();
        if (err != cudaSuccess)
        {
            fprintf(stderr, "DoCuda launch %d: %s\n", i, cudaGetErrorString(err));
            status = 1;
            break;
        }
    }

    if (status == 0)
    {
        // The blocking copy waits for the preceding kernels in the default stream,
        // so its return value also surfaces errors raised while they executed.
        err = cudaMemcpy( h_points, d_points, numBytes, cudaMemcpyDeviceToHost );
        if (err != cudaSuccess)
        {
            fprintf(stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err));
            status = 1;
        }
    }

    if (status == 0)
    {
        // Only touch the host data when the copy actually delivered it.
        for (int i = 0 ; i < numInts ; i++)
        {
            h_points[i]++;
        }
    }

    err = cudaFree(d_points);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaFree: %s\n", cudaGetErrorString(err));
        status = 1;
    }
    free(h_points);

    return status;
}