Internal Profiling error - insufficient kernel bounds data

I have a implementation of the Discrete Fourier Transform to calculate the maximum of each frequency bin of a 1024 PT DFT. (Yes, I know use the cuFFT, but this is for experimentation). It is called with 8 blocks and 1024 threads per block, each block processes 1/8th of the input time series (a simple tone) which in this case is 512 FFTs. It runs fine, albeit slow. But it is causing errors on two different GPUs when we try to run an individual kernel analysis (select this kernel and press Perform Kernel Analysis)

==10384== Error: Internal profiling error 3105:999.

and says “insufficient kernel bounds data. The data needed to calculate compute, memory, and latency bounds for the kernel could not be collected”

Display flashes, Windows pop up “Display driver stops responding and has recovered. Display Driver NVIDIA Windows Kernel Mode Driver Version 353.90 stopped responding and has successfully recovered”

Cuda 7.5
Linux with Quadro M4000
WIndows with Quadro K3100M

#include <stdio.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <helper_cuda.h>
#include <helper_functions.h>  
#include <cuda_profiler_api.h>

#define PI 3.14159265358979

__device__ float	D_PI = PI;

typedef struct {
	short	x;
	short	y;
} ComplexShort;

void  generateCmplxShortTone(float scaleFactor, ComplexShort *data, int count, float freq, float sampleFreq)
{
	float twoPI = 2 * PI;
	float step = (twoPI *freq) / sampleFreq;
	float phase;
	for (int n = 0; n < count; n++)  {
		phase = n*step;
		phase = fmod(phase, twoPI);
		data[n].x = cos(phase)*scaleFactor;
		data[n].y = sin(phase)*scaleFactor;
	}
	return;
}

// Each thread in this kernel will compute the maximum magnitude squared (power) of a 
// single frequency bin of a 1024 PT DFT.  It must be called with 1024 threads per block and 8 blocks.
// Each block will process 512 FFTs, so the d_data structures must be 512 * 8 * 1024.
// The results are stored in d_peakPickDFT which must be 8 * 1024 
__global__ void DFTPeakPick(ComplexShort *d_Data, float *d_peakPickDFT, int fftSize) {

	__shared__ ComplexShort timeSeries[1024];

	register float sumReals;
	register float sumImags;

	// Same as K
	int fftBin = threadIdx.x;
	int numFFTsToProcess = 512; // (Data Size / fftSize) / GridDim.x
	int numFFTsProcessed = 0;

	register float max = 0;
	register float newMagSquaredVal;

	int currentDataIndex = blockIdx.x * fftSize * numFFTsToProcess;

	timeSeries[fftBin].x = d_Data[currentDataIndex + fftBin].x;
	timeSeries[fftBin].y = d_Data[currentDataIndex + fftBin].y;

	// Wait for all threads to load a data sample
	__syncthreads();

	register float realCoef = cos(fftBin * 2 * D_PI / fftSize);
	register float imagCoef = -1 * sin(fftBin * 2 * D_PI / fftSize);

	while (numFFTsProcessed < numFFTsToProcess) {

		register float nextRealCoef = realCoef;
		register float tempNextRealCoef;
		register float nextImagCoef = imagCoef;

		sumReals = timeSeries[0].x;
		sumImags = timeSeries[0].y;

		for (int n = 1; n < fftSize; n++) {

			sumReals += nextRealCoef * timeSeries[n].x - nextImagCoef * timeSeries[n].y;
			sumImags += nextRealCoef * timeSeries[n].y + nextImagCoef * timeSeries[n].x;

			tempNextRealCoef = nextRealCoef * nextRealCoef - nextImagCoef * nextImagCoef;
			nextImagCoef = nextRealCoef * nextImagCoef * 2; // + imagCoef * realCoef;
			nextRealCoef = tempNextRealCoef;

		}

		newMagSquaredVal = sumReals * sumReals + sumImags * sumImags;

		if (newMagSquaredVal > max) {
			max = newMagSquaredVal;
		}


		//// Wait for everyone to finish before loading another FFT
		currentDataIndex += fftSize;
		__syncthreads();
		timeSeries[fftBin].x = d_Data[currentDataIndex + fftBin].x;
		timeSeries[fftBin].y = d_Data[currentDataIndex + fftBin].y;
		__syncthreads();

		numFFTsProcessed++;

	}

	// 8 Peak picked DFT that must be decimated further
	d_peakPickDFT[blockIdx.x*1024 + fftBin] = max;

}


int main(int argc, char **argv)
{

	int	fftSize = 1024;  // same as threads per block
	int numFFTsPerBatch = 4096;
	int numBlocks = 8;  // 
	int dataLength = fftSize * numFFTsPerBatch; 
	
	ComplexShort *hostComplexShortBuffer; // Tone
	ComplexShort *d_ComplexShortBuffer;   // Kernel input
	float *d_peakPickDataBuffer;          // Kernel output

	cudaHostAlloc(&hostComplexShortBuffer, dataLength * sizeof(ComplexShort), cudaHostAllocWriteCombined);
	cudaMalloc((void**)&d_ComplexShortBuffer, dataLength * sizeof(ComplexShort));
	cudaMalloc((void**)&d_peakPickDataBuffer, numBlocks * fftSize * sizeof(float));

    // Fill host complex short buffer with a tone
	float freq = 100.0;
	float sampleFreq = 1024.0;
	float scaleFactor = 10000;
	generateCmplxShortTone(scaleFactor, hostComplexShortBuffer, dataLength, freq, sampleFreq);

	// Do we need this if we're runnning NVIDIA profiler?
	cudaProfilerStart();
	
	// Copy data from host to device
	cudaMemcpyAsync(d_ComplexShortBuffer, hostComplexShortBuffer, dataLength*sizeof(ComplexShort), cudaMemcpyHostToDevice);
	
	// This will fully occupy NVIDIA Qaudro K3100M (4 MPs, 2048 thread per MP)
	DFTPeakPick << <  8, 1024, 0  >> >
		(d_ComplexShortBuffer,d_peakPickDataBuffer,fftSize);

	// Copy data from device to host (tbd)
	cudaDeviceSynchronize();
	cudaProfilerStop();
	cudaFreeHost(hostComplexShortBuffer);
	cudaFree((void*)d_ComplexShortBuffer);
	cudaFree((void*)d_peakPickDataBuffer);

    cudaDeviceReset();
	exit(true ? EXIT_SUCCESS : EXIT_FAILURE);
	
}