Hello all ;-)
I try to make my first cuda/c++ program with testing of performance, so I had problem with its…
Kernel code:
__global__ void krnFindClosestGpu(float3 *dev_points, int *dev_indices, int size)
{
if (size <= 1) return;
int currentIndex = threadIdx.x + blockDim.x * blockIdx.x;
if (currentIndex < size)
{
float distToClosest = FLT_MAX;
for (int i = 0; i < size; ++i)
{
if (currentIndex == i) continue;
float distance = sqrt(
(dev_points[currentIndex].x - dev_points[i].x) * (dev_points[currentIndex].x - dev_points[i].x) +
(dev_points[currentIndex].y - dev_points[i].y) * (dev_points[currentIndex].y - dev_points[i].y) +
(dev_points[currentIndex].z - dev_points[i].z) * (dev_points[currentIndex].z - dev_points[i].z));
if (distance < distToClosest)
{
distToClosest = distance;
// // dev_indices[currentIndex] = i;
}
}
}
}
short of code in main:
int main ()
{
const int size = 1500000;
srand((unsigned)time(NULL));
float3 *points = new float3;
int *indices = new int;
for (int i = 0; i < size; ++i)
{
points[i].x = (float)((rand() % 1001) + 100);
points[i].y = (float)((rand() % 1001) + 100);
points[i].z = (float)((rand() % 1001) + 100);
}
RUN_GPU_FUNCTION....
delete[] points;
delete[] indices;
cudaDeviceReset();
}
function of starting kernel:
cudaError_t runGpu(void(*fun)(float3*, int*, int), float3 *points, int *indices, int size)
{
float3 *dev_points;
int *dev_indices;
int i_size = threadsPerBlock * 1024; //blockdimx = 1024
int nBlock = (size + threadsPerBlock - 1) / threadsPerBlock;
cudaError_t status = cudaSuccess;
do
{
status = cudaSetDevice(DEVICE_INDEX);
if (status != cudaSuccess)
{
cout << "Unable to set device index. Check if your GPU is supporting CUDA." << endl;
break;
}
status = cudaMalloc((void**)&dev_points, size * sizeof(float3));
if (status != cudaSuccess)
{
cout << "cudaMalloc failed for dev_points" << endl;
break;
}
status = cudaMalloc((void**)&dev_indices, size * sizeof(int));
if (status != cudaSuccess)
{
cout << "cudaMalloc failed for dev_indices" << endl;
break;
}
status = cudaMemcpy(dev_points, points, size * sizeof(float3), cudaMemcpyHostToDevice);
if (status != cudaSuccess)
{
cout << "cudaMemcpy failed for dev_points" << endl;
break;
}
//status = cudaMemcpy(dev_indices, points, size * sizeof(int), cudaMemcpyHostToDevice);
//// status = cudaMemcpy(address, indices, size * sizeof(int), cudaMemcpyHostToDevice);
//if (status != cudaSuccess)
//{
// cout << "cudaMemcpy failed for dev_indices" << endl;
// break;
//}
(*fun) <<< nBlock, threadsPerBlock >>> (dev_points, dev_indices, size);
status = cudaGetLastError();
if (status != cudaSuccess)
{
cout << "Kernel startup failed. " << cudaGetErrorString(status) << endl;
break;
}
status = cudaDeviceSynchronize();
if (status != cudaSuccess)
{
cout << "cudaDeviceSynchronize failed with code: " << (int)status << endl;
break;
}
status = cudaMemcpy(indices, dev_indices, size * sizeof(int), cudaMemcpyDeviceToHost);
if (status != cudaSuccess)
{
cout << "cudaMemcpy failed for dev_indices" << endl;
break;
}
} while (false);
cudaFree(dev_points);
cudaFree(dev_indices);
return status;
}
where is the problem:
when the “size” is more than 100 000 - I had problem with write “dev_indices[currentIndex] = i;”
if I comment this line - problem disappear
when the “size” is more than 1 000 000 - I had problem with write “distToClosest = distance;”
if I comment this line - problem disappear
Result of problem:
cudaDeviceSynchronize failed with code: 4
cudaMalloc failed for dev_points
Info:
MSI GE62 6QC, with GTX960M (2gb)
Win 10, prof 64bit
MS Visual Studio 2015 prof.
In VisualStudio I change from CC 2.0 to CC 5.0
I try to run this program on x86 and x64, is not resolve the problem
I try to find similar problem in google - but no one have this problem ;-(
What is wrong?