I have a implementation of the Discrete Fourier Transform to calculate the maximum of each frequency bin of a 1024 PT DFT. (Yes, I know use the cuFFT, but this is for experimentation). It is called with 8 blocks and 1024 threads per block, each block processes 1/8th of the input time series (a simple tone) which in this case is 512 FFTs. It runs fine, albeit slow. But it is causing errors on two different GPUs when we try to run an individual kernel analysis

== 21652 == Error Internal Profiling Error OR

= 7880 == Error Internal Profiling Error

and says “insufficient kernel bounds data. The data needed to calculate compute, memory, and latency bounds for the kernel could not be collected”

I hope this is way to post code …

```
#include <stdio.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <helper_cuda.h>
#include <helper_functions.h>
#include <cuda_profiler_api.h>
#define PI 3.14159265358979
__device__ float D_PI = PI;
typedef struct {
short x;
short y;
} ComplexShort;
void generateCmplxShortTone(float scaleFactor, ComplexShort *data, int count, float freq, float sampleFreq)
{
float twoPI = 2 * PI;
float step = (twoPI *freq) / sampleFreq;
float phase;
for (int n = 0; n < count; n++) {
phase = n*step;
phase = fmod(phase, twoPI);
data[n].x = cos(phase)*scaleFactor;
data[n].y = sin(phase)*scaleFactor;
}
return;
}
// Each thread in this kernel will compute the maximum magnitude squared (power) of a
// single frequency bin of a 1024 PT DFT. It must be called with 1024 threads per block and 8 blocks.
// Each block will process 512 FFTs, so the d_data structures must be 512 * 8 * 1024.
// The results are stored in d_peakPickDFT which must be 8 * 1024
__global__ void DFTPeakPick(ComplexShort *d_Data, float *d_peakPickDFT, int fftSize) {
__shared__ ComplexShort timeSeries[1024];
register float sumReals;
register float sumImags;
// Same as K
int fftBin = threadIdx.x;
int numFFTsToProcess = 512; // (Data Size / fftSize) / GridDim.x
int numFFTsProcessed = 0;
register float max = 0;
register float newMagSquaredVal;
int currentDataIndex = blockIdx.x * fftSize * numFFTsToProcess;
timeSeries[fftBin].x = d_Data[currentDataIndex + fftBin].x;
timeSeries[fftBin].y = d_Data[currentDataIndex + fftBin].y;
// Wait for all threads to load a data sample
__syncthreads();
register float realCoef = cos(fftBin * 2 * D_PI / fftSize);
register float imagCoef = -1 * sin(fftBin * 2 * D_PI / fftSize);
while (numFFTsProcessed < numFFTsToProcess) {
register float nextRealCoef = realCoef;
register float tempNextRealCoef;
register float nextImagCoef = imagCoef;
sumReals = timeSeries[0].x;
sumImags = timeSeries[0].y;
for (int n = 1; n < fftSize; n++) {
sumReals += nextRealCoef * timeSeries[n].x - nextImagCoef * timeSeries[n].y;
sumImags += nextRealCoef * timeSeries[n].y + nextImagCoef * timeSeries[n].x;
tempNextRealCoef = nextRealCoef * nextRealCoef - nextImagCoef * nextImagCoef;
nextImagCoef = nextRealCoef * nextImagCoef * 2; // + imagCoef * realCoef;
nextRealCoef = tempNextRealCoef;
}
newMagSquaredVal = sumReals * sumReals + sumImags * sumImags;
if (newMagSquaredVal > max) {
max = newMagSquaredVal;
}
//// Wait for everyone to finish before loading another FFT
currentDataIndex += fftSize;
__syncthreads();
timeSeries[fftBin].x = d_Data[currentDataIndex + fftBin].x;
timeSeries[fftBin].y = d_Data[currentDataIndex + fftBin].y;
__syncthreads();
numFFTsProcessed++;
}
// 8 Peak picked DFT that must be decimated further
d_peakPickDFT[blockIdx.x*1024 + fftBin] = max;
}
int main(int argc, char **argv)
{
int fftSize = 1024; // same as threads per block
int numFFTsPerBatch = 4096;
int numBlocks = 8; //
int dataLength = fftSize * numFFTsPerBatch;
ComplexShort *hostComplexShortBuffer; // Tone
ComplexShort *d_ComplexShortBuffer; // Kernel input
float *d_peakPickDataBuffer; // Kernel output
cudaHostAlloc(&hostComplexShortBuffer, dataLength * sizeof(ComplexShort), cudaHostAllocWriteCombined);
cudaMalloc((void**)&d_ComplexShortBuffer, dataLength * sizeof(ComplexShort));
cudaMalloc((void**)&d_peakPickDataBuffer, numBlocks * fftSize * sizeof(float));
// Fill host complex short buffer with a tone
float freq = 100.0;
float sampleFreq = 1024.0;
float scaleFactor = 10000;
generateCmplxShortTone(scaleFactor, hostComplexShortBuffer, dataLength, freq, sampleFreq);
// Do we need this if we're runnning NVIDIA profiler?
cudaProfilerStart();
// Copy data from host to device
cudaMemcpyAsync(d_ComplexShortBuffer, hostComplexShortBuffer, dataLength*sizeof(ComplexShort), cudaMemcpyHostToDevice);
// This will fully occupy NVIDIA Qaudro K3100M (4 MPs, 2048 thread per MP)
DFTPeakPick << < 8, 1024, 0 >> >
(d_ComplexShortBuffer,d_peakPickDataBuffer,fftSize);
// Copy data from device to host (tbd)
cudaFreeHost(hostComplexShortBuffer);
cudaFree((void*)d_ComplexShortBuffer);
cudaFree((void*)d_peakPickDataBuffer);
cudaDeviceReset();
cudaDeviceSynchronize();
cudaProfilerStop();
}
```