This was my old code:
global void window(float **waveform, float* coeff, int n_waveform, int number_of_coefficients) → Here was the error
My new code is :
global void window(float waveform, float coeff, int n_waveform, int number_of_coefficients)
{
nt tidx = threadIdx.x + blockIdx.x*blockDim.x;
int tidy = threadIdx.y + blockIdx.y*blockDim.y;
if( tidx >= number_of_coefficients )
{
return;
}
if( tidy >= n_waveform )
{
return;
}
waveform[tidy * number_of_coefficients + tidx] = waveform[tidy * number_of_coefficients + tidx] * coeff[tidx];
}
Now I can compile the projekt without errros. But the result is right only for the first waveform.
I create a coeff array. The coefficient[0] = 0, the coefficient[2] = 1 … the coefficient[1024] = 1023
I create a waveform array. The waveform[0] = 0, waveform[1] = 1 … waveform[1023999] = 1023999
But my Result is: → Result[0] = 0, Result[1] = 1, Result[2] = 4, Result[3] = 9 … Result[1023] = 1046529 → This results are right
But Result[1024] = 1024 → This Result is frong. The right result must be 0
But Result[1024] = 1025 → This Result is frong. The right result must be 1025
But Result[1024] = 1026 → This Result is frong. The right result must be 2052
But Result[1024] = 1027 → This Result is frong. The right result must be 4108
…
Result [1023999]
Here is my new complete code:
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cufft.h>
#include <cutil_inline.h>
// Complex data type
typedef float2 Complex;
static global void window(float , float , int, int);
unsigned int timer = 0;
float elapsedTimeInMs = 0.0f;
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char** argv);
// The filter size is assumed to be a number smaller than the signal size
#define SIGNAL_SIZE 1024*1000
#define FILTER_SIZE 1024
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
runTest(argc, argv);
cutilExit(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char** argv)
{
if( cutCheckCmdLineFlag(argc, (const char**)argv, “device”) )
cutilDeviceInit(argc, argv);
else
cudaSetDevice( cutGetMaxGflopsDeviceId() );
// Allocate host memory for the waveform
float* h_waveform = (float*)malloc(sizeof(float) * SIGNAL_SIZE);
float* h_coeff = (float*)malloc(sizeof(float) * FILTER_SIZE);
// Initalize the memory for the signal
for(unsigned int i = 0; i < SIGNAL_SIZE; ++i)
{
//h_waveform[i] = 2.4374328223;
h_waveform[i] = i;
}
// Initalize the memory for the signal
for(unsigned int i = 0; i < FILTER_SIZE; ++i)
{
//h_coeff[i] = 4.312312312;
h_coeff[i] = i;
}
for(unsigned int i = 1019; i < 1019 + 10; ++i)
{
printf("Waveform: %f \n", h_waveform[i]);
}
for(unsigned int i = 0; i < 10; ++i)
{
printf("Coefficiens: %f \n", h_coeff[i]);
}
cutilCheckError( cutCreateTimer( &timer ) );
int mem_size_waveform = sizeof(float) * SIGNAL_SIZE;
int mem_size_coeff = sizeof(float) * FILTER_SIZE;
// Allocate device memory for signal
float* d_waveform;
cutilSafeCall(cudaMalloc((void**)&d_waveform, mem_size_waveform));
float* d_coeff;
cutilSafeCall(cudaMalloc((void**)&d_coeff, mem_size_coeff));
// Copy waveform from CPU memory to GPU memory
cutilSafeCall(cudaMemcpy(d_waveform, h_waveform, mem_size_waveform, cudaMemcpyHostToDevice));
// Copy coeff from CPU memory to GPU memory
cutilSafeCall(cudaMemcpy(d_coeff, h_coeff, mem_size_coeff, cudaMemcpyHostToDevice));
//den Timer starten
cutilCheckError( cutStartTimer( timer));
window<<<SIGNAL_SIZE/512, 512>>>(d_waveform, d_coeff, 1000, 1024);
//den Timer anhalten
cutilCheckError( cutStopTimer( timer));
elapsedTimeInMs = cutGetTimerValue( timer);
// Check if kernel execution generated and error
cutilCheckMsg("Kernel execution failed [ ComplexPointwiseMulAndScale ]");
// Allocate host memory for the result
float* h_result = (float*)malloc(sizeof(float) * SIGNAL_SIZE);
// Copy device memory to host
cutilSafeCall(cudaMemcpy(h_result, d_waveform, mem_size_waveform, cudaMemcpyDeviceToHost));
// Das Ergebnis anzeigen
for (unsigned int i = 1019; i < 1019+10; ++i)
{
printf(" Ergebnis: %2.20f \n", h_result[i]);
}
printf(“\nZeit: %f \n”, elapsedTimeInMs);
// cleanup memory
free(h_waveform);
free(h_coeff);
cutilSafeCall(cudaFree(d_waveform));
cutilSafeCall(cudaFree(d_coeff));
cudaThreadExit();
}
// Betrag berechnen
global void window(float waveform, float coeff, int n_waveform, int number_of_coefficients)
{
int tidx = threadIdx.x + blockIdx.x*blockDim.x;
int tidy = threadIdx.y + blockIdx.y*blockDim.y;
if( tidx >= number_of_coefficients )
{
return;
}
if( tidy >= n_waveform )
{
return;
}
waveform[tidy * number_of_coefficients + tidx] = waveform[tidy * number_of_coefficients + tidx] * coeff[tidx];
}