I have a very simple kernel that takes works to find the closest of a number of cluster centroids in an input array and writes the number of the closest centroid for each entry into another array of integers that is passed in. This is part of a KMeans algorithm implemented on the GPU. I have a test platform written in unmanaged c++ the runs the kernels and when I run it there is no problem. I have created a interfac into a C# manag4d assembly using pInvoke to the top level functions and when I invoke this from C# I can follow the debugger into the c++ function that calls the kernel but when I get to the kernel the kernel gets bypassed.
Here is the function in unmanaged c++that calls the kernel and the kernel code.
__declspec(dllexport) double KMeans(float * data, int* clusters, int nPoints, double * clusterMeans, int nMeans, float ndv, float precision, int& errCode)
{
double res = 0.0;
if (devId == -1)
{
initCuda();
if (devId == -1)
{
printf(“Unable to initialize Cuda Device\n”);
res = -1.0;
}
}
if (devId != -1)
{
double clusterMotion = 1.0e12;
int iter = 0;
float eps = std::numeric_limits<float>::epsilon();
printf(" Max threads per block = %d\n", prop.maxThreadsPerBlock);
int SIZE = prop.maxThreadsPerBlock;
deviceVector<float> d_data(data, nPoints * sizeof(float));
deviceVector<int> d_assignments(clusters, nPoints * sizeof(int));
deviceVector<double> d_clusterMeans(clusterMeans, nMeans * sizeof(double));
double minMotion = 0.0;
for (int i = 0; i < nMeans; i++)
{
minMotion += clusterMeans[i] * precision;
}
while (clusterMotion > minMotion)
{
int smemSize = (nMeans * sizeof(double));
dim3 dimBlock(SIZE, 1, 1);
dim3 dimGrid(((nPoints) / dimBlock.x) + 1, 1, 1);
cluster_assignment_kernel << < dimGrid, dimBlock, smemSize >> >(
d_data.devPtr(), d_assignments.devPtr(),
d_clusterMeans.devPtr(),
nPoints, nMeans,
ndv, eps);
cudaDeviceSynchronize();
clusters = d_assignments.readDeviceData();
if (unassigned(clusters, nPoints))
{
res = -2.0; //assignment kernel failed to launch
break;
}
// calculate new average mean for each cluster
int maxThreads = nPoints / 4;
int numBlocks = 0;
int numThreads = 0;
int maxBlocks = MIN(nPoints / maxThreads, MAX_BLOCK_DIM_SIZE);
getNumBlocksAndThreads(6, 1024, maxBlocks, maxThreads, numBlocks, numThreads);
/*cpair<double, int>* init = new cpair<double, int>[nMeans];
for (int i = 0; i < nMeans; i++)
{
init[i] = cpair<double, int>(0.0, 0);
}
*/
//deviceVector<cpair<double, int>> d_opairs(h_output, nMeans * sizeof(cpair<double, int>));
cpair<double, int>* h_output = new cpair<double, int>[nMeans]();
deviceVector<cpair<double, int>> d_output(h_output, sizeof(cpair<double, int>));
for (int i = 0; i < nMeans; i++)
{
partitionedSumCount(nPoints, numThreads, numBlocks,
d_data.devPtr(), d_assignments.devPtr(), i,
d_output.devPtr(), nPoints);
cudaDeviceSynchronize();
}
h_output = d_output.readDeviceData();
if (noPartitionSums(h_output, nMeans))
{
res = -3.0; // partitionSumCount kernel failed to launch
break;
}
clusterMotion = 0.0;
minMotion = 0.0;
for (int i = 0; i < nMeans; i++)
{
clusterMotion += fabs(clusterMeans[i] - h_output[i].avg());
clusterMeans[i] = h_output[i].avg();
minMotion += clusterMeans[i] * precision;
printf("Cluster %d has average %f\n", (i + 1), clusterMeans[i]);
}
iter++;
printf("Iteration %d Total Cluster motion = %f, min motion = %f\n", iter, clusterMotion, minMotion);
d_clusterMeans.AssignData(clusterMeans);
}
if (!(res < 0.0))
{
//copy element assignments to clusters back to host memory
clusters = d_assignments.readDeviceData();
clusterMeans = d_clusterMeans.readDeviceData();;
cudaDeviceReset();
}
}
errCode = (int)res;
return res;
}
global void cluster_assignment_kernel(float * data, int * assignments, double * means, int nPoints, int nMeans, float ndv, float eps)
{
double *sdata = SharedMemory();
int n = threadIdx.x + blockIdx.x * blockDim.x;
// if thread zero initialize shared memory to initial means
if (threadIdx.x == 0)
{
for (int i = 0; i < nMeans; i++)
{
sdata[i] = means[i];
}
}
//all threads need to wait until the shared memory is initialized
__syncthreads();
float testValue = data[n];
float df = fabs(ndv - testValue);
if (df > eps)
{
if (n < nPoints)
{
double diff = sdata[nMeans - 1] * 1000.0f;
double tdiff = 0.0;
int res = -1;
for (int i = 0; i < nMeans; i++)
{
double x1 = sdata[i];
tdiff = fabs(x1 - testValue);
if (tdiff < diff || tdiff < 0.0)
{
diff = tdiff;
res = i;
}
}
if (n % 1000 == 0)
printf("point %i is assigned to cluster %i\n", n, res);
assignments[n] = res;
}
}
}
I use a helper class to manage device and host memory;
#include “common.cuh”
#include “helper_cuda.h”
//#include <cuda_texture_types.h>
#include “cuda_runtime.h”
#include <stdio.h>
#ifndef DEVICE_VECTOR_CUH
#define DEVICE_VECTOR_CUH
using namespace std;
template
class deviceVector
{
private:
T* hst_data = nullptr;
T* dev_data = nullptr;
size_t size;
public:
deviceVector(T* data, size_t size) : hst_data(data), size(size)
{
//printf(“Calling Device Vector constructor\n”);
checkCudaErrors(cudaMalloc((void **)&dev_data, size));
checkCudaErrors(cudaMemcpy(dev_data, data, size, cudaMemcpyHostToDevice));
}
~deviceVector()
{
//printf("deviceVector destructor called\n");
if (dev_data != nullptr)
checkCudaErrors(cudaFree(dev_data));
}
T* readDeviceData()
{
checkCudaErrors(cudaMemcpy(hst_data, dev_data, size, cudaMemcpyDeviceToHost));
return hst_data;
}
void AssignData(T* data)
{
checkCudaErrors(cudaMemcpy(dev_data, data, size, cudaMemcpyHostToDevice));
}
T* devPtr()
{
return dev_data;
}
};
Here is the definition in managed code for the function call
[DllImport(dllname)]
private static extern float KMeans(
[In] float raster,
[In, Out] int clusters,
int nsize,
[In, Out] double means,
int nMeans,
float ndv,
float precision,
[In, Out] ref int errCode);
dllname is defined to point to either the debug version or the releaase version and everything is compiled in x64 platform.
Here is the unmanaged c++ test platform code on which all of the above code runs perfectly.
int main(int argc, char **argv)
{
printf(“Enter TestGPUReduction\n”);
string textName = “C:\wedev\agverdict-doc-testdata\Testing\TestData\Rasters\RawYieldMap.text”;
if (readTestData(textName))
{
//auto midpt = std::partition(test_data, test_data + nItems, bind2nd(std::not_equal_to<float>(), ndv));
float min = testDataMin(); // *std::min_element(test_data, midpt - 1); //100; //(float[995643])
//double min = (double)rasterMin(test_data, nItems, ndv);
printf("min value = %f\n", min);
//float max = rasterMax(test_data, nItems, ndv); //5000; //
float max = testDataMax(); //*std::max_element(test_data, midpt - 1);
printf("max value = %f\n", max);
float range = max - min;
printf("range = %f\n", range);
double step_size = range / 5.0;
printf("step size value = %f\n", step_size);
double * means = new double[5];
for (int p = 0; p < 5; p++)
{
means[p] = (min + (step_size * ((double)p + 0.5)));
printf("MEAN %d = %f\n", p + 1, means[p]);
}
int * assignments = new int[nItems];
for (int i = 0; i < nItems; i++)
assignments[i] = -1;
printf("\n\nCall kmeans with precision of 0.001\n");
KMeans(test_data, assignments, nItems, means, 5, ndv, 0.001f);
for (int p = 1; p <= 5; p++)
{
printf("RESULT MEAN %d = %f\n", p, means[p - 1]);
}
printf("************************************************************\n");
for (int p = 0; p < 5; p++)
{
means[p] = (min + (step_size * ((double)p + 0.5)));
printf("MEAN %d = %f\n", p + 1, means[p]);
}
for (int i = 0; i < nItems; i++)
assignments[i] = -1;
printf("\n\nCall kmeans with precision of 0.0005\n");
KMeans(test_data, assignments, nItems, means, 5, ndv, 0.0005f);
for (int p = 1; p <= 5; p++)
{
printf("RESULT MEAN %d = %f\n", p, means[p - 1]);
}
printf("************************************************************\n");
for (int p = 0; p < 5; p++)
{
means[p] = (min + (step_size * ((double)p + 0.5)));
printf("MEAN %d = %f\n", p + 1, means[p]);
}
for (int i = 0; i < nItems; i++)
assignments[i] = -1;
printf("\n\nCall kmeans with precision of 0.00025\n");
KMeans(test_data, assignments, nItems, means, 5, ndv, 0.00025f);
for (int p = 1; p <= 5; p++)
{
printf("RESULT MEAN %d = %f\n", p, means[p - 1]);
}
printf("************************************************************\n");
for (int p = 0; p < 5; p++)
{
means[p] = (min + (step_size * ((double)p + 0.5)));
printf("MEAN %d = %f\n", p + 1, means[p]);
}
for (int i = 0; i < nItems; i++)
assignments[i] = -1;
printf("\n\nCall kmeans with precision of 0.0001\n");
KMeans(test_data, assignments, nItems, means, 5, ndv, 0.0001f);
for (int p = 1; p <= 5; p++)
{
printf("RESULT MEAN %d = %f\n", p, means[p - 1]);
}
delete[] means;
delete[] test_data;
delete[] assignments;
}
}
If anyone can help me with this please let me knw.
Thanks
David