Cuda runtime failing to launch kernel from CSharp Wrapper function.

I have a very simple kernel that takes works to find the closest of a number of cluster centroids in an input array and writes the number of the closest centroid for each entry into another array of integers that is passed in. This is part of a KMeans algorithm implemented on the GPU. I have a test platform written in unmanaged c++ the runs the kernels and when I run it there is no problem. I have created a interfac into a C# manag4d assembly using pInvoke to the top level functions and when I invoke this from C# I can follow the debugger into the c++ function that calls the kernel but when I get to the kernel the kernel gets bypassed.

Here is the function in unmanaged c++that calls the kernel and the kernel code.
__declspec(dllexport) double KMeans(float * data, int* clusters, int nPoints, double * clusterMeans, int nMeans, float ndv, float precision, int& errCode)
{
double res = 0.0;
if (devId == -1)
{
initCuda();
if (devId == -1)
{
printf(“Unable to initialize Cuda Device\n”);
res = -1.0;
}
}

	if (devId != -1)
	{
		double clusterMotion = 1.0e12;
		int iter = 0;
		float eps = std::numeric_limits<float>::epsilon();
		printf(" Max threads per block = %d\n", prop.maxThreadsPerBlock);
		int SIZE = prop.maxThreadsPerBlock;


		deviceVector<float> d_data(data, nPoints * sizeof(float));
		deviceVector<int> d_assignments(clusters, nPoints * sizeof(int));
		deviceVector<double> d_clusterMeans(clusterMeans, nMeans * sizeof(double));
		double minMotion = 0.0;
		for (int i = 0; i < nMeans; i++)
		{
			minMotion += clusterMeans[i] * precision;
		}
		while (clusterMotion > minMotion)
		{


			int smemSize = (nMeans * sizeof(double));
			dim3 dimBlock(SIZE, 1, 1);
			dim3 dimGrid(((nPoints) / dimBlock.x) + 1, 1, 1);
			cluster_assignment_kernel << < dimGrid, dimBlock, smemSize >> >(
				d_data.devPtr(), d_assignments.devPtr(),
				d_clusterMeans.devPtr(),
				nPoints, nMeans,
				ndv, eps);

			cudaDeviceSynchronize();
			clusters = d_assignments.readDeviceData();
			if (unassigned(clusters, nPoints))
			{
				res =  -2.0;  //assignment kernel failed to launch
				break;					
			}
			// calculate new average mean for each cluster
			int maxThreads = nPoints / 4;
			int numBlocks = 0;
			int numThreads = 0;
			int maxBlocks = MIN(nPoints / maxThreads, MAX_BLOCK_DIM_SIZE);
			getNumBlocksAndThreads(6, 1024, maxBlocks, maxThreads, numBlocks, numThreads);

			/*cpair<double, int>* init = new cpair<double, int>[nMeans];
			for (int i = 0; i < nMeans; i++)
			{
			init[i] = cpair<double, int>(0.0, 0);
			}
			*/
			//deviceVector<cpair<double, int>> d_opairs(h_output, nMeans * sizeof(cpair<double, int>));
			cpair<double, int>* h_output = new cpair<double, int>[nMeans]();
			deviceVector<cpair<double, int>> d_output(h_output, sizeof(cpair<double, int>));

			for (int i = 0; i < nMeans; i++)
			{
				partitionedSumCount(nPoints, numThreads, numBlocks,
					d_data.devPtr(), d_assignments.devPtr(), i,
					d_output.devPtr(), nPoints);
				cudaDeviceSynchronize();
			}

			h_output = d_output.readDeviceData();

			if (noPartitionSums(h_output, nMeans))
			{					
				res =  -3.0; // partitionSumCount kernel failed to launch
				break;
			}

			clusterMotion = 0.0;
			minMotion = 0.0;
			for (int i = 0; i < nMeans; i++)
			{
				clusterMotion += fabs(clusterMeans[i] - h_output[i].avg());
				clusterMeans[i] = h_output[i].avg();
				minMotion += clusterMeans[i] * precision;
				printf("Cluster %d has average %f\n", (i + 1), clusterMeans[i]);
			}
			iter++;
			printf("Iteration %d Total Cluster motion = %f, min motion = %f\n", iter, clusterMotion, minMotion);
			d_clusterMeans.AssignData(clusterMeans);

		}

		if (!(res < 0.0))
		{
			//copy element assignments to clusters back to host memory
			clusters = d_assignments.readDeviceData();
			clusterMeans = d_clusterMeans.readDeviceData();;

			cudaDeviceReset();
		}
	}
	errCode = (int)res;
	return res;
}

global void cluster_assignment_kernel(float * data, int * assignments, double * means, int nPoints, int nMeans, float ndv, float eps)
{
double *sdata = SharedMemory();

int n = threadIdx.x + blockIdx.x * blockDim.x;

// if thread zero initialize shared memory to initial means
if (threadIdx.x == 0)
{
	for (int i = 0; i < nMeans; i++)
	{
		sdata[i] = means[i];
	}		

}

//all threads need to wait until the shared memory is initialized
__syncthreads();

float testValue = data[n];
float df = fabs(ndv - testValue);
if (df > eps)
{
	if (n < nPoints)
	{
		double diff = sdata[nMeans - 1] * 1000.0f;
		double tdiff = 0.0;
		int res = -1;
		for (int i = 0; i < nMeans; i++)
		{
			double x1 = sdata[i];
			tdiff = fabs(x1 - testValue);
			
			if (tdiff < diff || tdiff < 0.0)
			{
				diff = tdiff;
				res = i;
			}
		}

		if (n % 1000 == 0)
			printf("point %i is assigned to cluster %i\n", n, res);

		assignments[n] = res;
	}
}

}

I use a helper class to manage device and host memory;
#include “common.cuh”
#include “helper_cuda.h”
//#include <cuda_texture_types.h>
#include “cuda_runtime.h”
#include <stdio.h>

#ifndef DEVICE_VECTOR_CUH
#define DEVICE_VECTOR_CUH
using namespace std;
template
class deviceVector
{
private:
T* hst_data = nullptr;
T* dev_data = nullptr;
size_t size;
public:
deviceVector(T* data, size_t size) : hst_data(data), size(size)
{
//printf(“Calling Device Vector constructor\n”);
checkCudaErrors(cudaMalloc((void **)&dev_data, size));
checkCudaErrors(cudaMemcpy(dev_data, data, size, cudaMemcpyHostToDevice));
}

~deviceVector()
{
	//printf("deviceVector destructor called\n");
	if (dev_data != nullptr)
		checkCudaErrors(cudaFree(dev_data));
}

T* readDeviceData()
{
	checkCudaErrors(cudaMemcpy(hst_data, dev_data, size, cudaMemcpyDeviceToHost));
	return hst_data;
}

void AssignData(T* data)
{
	checkCudaErrors(cudaMemcpy(dev_data, data, size, cudaMemcpyHostToDevice));
}

T* devPtr()


{
	return dev_data;
}

};

Here is the definition in managed code for the function call
[DllImport(dllname)]
private static extern float KMeans(
[In] float raster,
[In, Out] int clusters,
int nsize,
[In, Out] double means,
int nMeans,
float ndv,
float precision,
[In, Out] ref int errCode);

dllname is defined to point to either the debug version or the releaase version and everything is compiled in x64 platform.

Here is the unmanaged c++ test platform code on which all of the above code runs perfectly.
int main(int argc, char **argv)
{
printf(“Enter TestGPUReduction\n”);
string textName = “C:\wedev\agverdict-doc-testdata\Testing\TestData\Rasters\RawYieldMap.text”;

    if (readTestData(textName))
    {
        //auto midpt = std::partition(test_data, test_data + nItems, bind2nd(std::not_equal_to<float>(), ndv));
		float min = testDataMin(); // *std::min_element(test_data, midpt - 1);  //100; //(float[995643])
        //double min = (double)rasterMin(test_data, nItems, ndv);		
        printf("min value = %f\n", min);

        //float max = rasterMax(test_data, nItems, ndv);  //5000; //
		float max = testDataMax(); //*std::max_element(test_data, midpt - 1);
        printf("max value = %f\n", max);
        float range = max - min;
        printf("range = %f\n", range);
        double step_size = range / 5.0;
        printf("step size value = %f\n", step_size);
        double * means = new double[5];

        for (int p = 0; p < 5; p++)
        {
            means[p] = (min + (step_size * ((double)p + 0.5)));
            printf("MEAN %d = %f\n", p + 1, means[p]);
        }

        int * assignments = new int[nItems];

        for (int i = 0; i < nItems; i++)
            assignments[i] = -1;

        printf("\n\nCall kmeans with precision of 0.001\n");
        KMeans(test_data, assignments, nItems, means, 5, ndv, 0.001f);

        for (int p = 1; p <= 5; p++)
        {
            printf("RESULT MEAN %d = %f\n", p, means[p - 1]);
        }

        printf("************************************************************\n");

		for (int p = 0; p < 5; p++)
		{
			means[p] = (min + (step_size * ((double)p + 0.5)));
			printf("MEAN %d = %f\n", p + 1, means[p]);
		}


        for (int i = 0; i < nItems; i++)
            assignments[i] = -1;
        printf("\n\nCall kmeans with precision of 0.0005\n");
        KMeans(test_data, assignments, nItems, means, 5, ndv, 0.0005f);
        for (int p = 1; p <= 5; p++)
        {
            printf("RESULT MEAN %d = %f\n", p, means[p - 1]);
        }

        printf("************************************************************\n");

		for (int p = 0; p < 5; p++)
		{
			means[p] = (min + (step_size * ((double)p + 0.5)));
			printf("MEAN %d = %f\n", p + 1, means[p]);
		}



        for (int i = 0; i < nItems; i++)
            assignments[i] = -1;
        printf("\n\nCall kmeans with precision of 0.00025\n");
        KMeans(test_data, assignments, nItems, means, 5, ndv, 0.00025f);
        for (int p = 1; p <= 5; p++)
        {
            printf("RESULT MEAN %d = %f\n", p, means[p - 1]);
        }

        printf("************************************************************\n");

		for (int p = 0; p < 5; p++)
		{
			means[p] = (min + (step_size * ((double)p + 0.5)));
			printf("MEAN %d = %f\n", p + 1, means[p]);
		}



        for (int i = 0; i < nItems; i++)
            assignments[i] = -1;
        printf("\n\nCall kmeans with precision of 0.0001\n");
        KMeans(test_data, assignments, nItems, means, 5, ndv, 0.0001f);
        for (int p = 1; p <= 5; p++)
        {
            printf("RESULT MEAN %d = %f\n", p, means[p - 1]);
        }


        delete[] means;
        delete[] test_data;
        delete[] assignments;
    }
}

If anyone can help me with this please let me knw.

Thanks
David

I usually use C++/CLI to wrap native C++ for .NET
or, take a look ‘managedCuda’[url]https://github.com/kunzmi/managedCuda[/url]

small sample here (sorry, in Japanese) [url]http://blog.zaq.ne.jp/fareastprogramming/article/96/[/url]

I looked at Managed Cuda. I felt it was still too raw and unfinished. but I might go consider it again. The thing that puzzles me is that this code was working through the managed interface for several months before it suddenly just stopped working. Did something change in the latest Cuda 7.5 that could have caused this?

As far as C++/Cli interface I have no problem calling the unmanaged C++ functions and can even debug into them by turning on debug native code. It is just when I get to the place where the Cuda Kernel’s are called the kernel is bypassed. In my C++ Test exe the debugger goes on into the kernel cde and I can trace the execution of the kernel.

Called cudaGetLastError and getting error 8: invalid Device Function. I have researched this on the net and tried adding a suggested value to the Cuda / Device / Code Generation parameter but this did not help. I am beginning to think the is is caused by a new Device Driver.

… think the issue is caused by a new Device Driver.

invalid device function usually means your code is not compiled for the correct architecture.

identify the actual GPU you are trying to run on, and provide the nvcc compile command in its entirety, from the VS console window, that is compiling the CUDA code.