How do I know I am in release mode?
Here is the code I am running. You will see it doesnt give the correct results. What is wrong with it?
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, project
#include <cufft.h>
#include <cutil.h>
// includes, special
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// Complex data type
typedef float2 Complex;
int createArray(Complex *arrayPtr, int arrayLength);
int main(int argc, char** argv)
{
int fftSize = 1*1024;
int memSizeHost = 512*(1024*1024);
Complex* hostMem = 0;
Complex* deviceMem = 0;
int batchSize = 1;
CUT_DEVICE_INIT(1, "");
// Allocate memory on GPU and RAM
memSizeHost = (memSizeHost / (fftSize*sizeof(Complex))) * (fftSize*sizeof(Complex));
int memSizeDevice = sizeof(Complex) * fftSize * batchSize;
// Allocate memory in RAM
CUDA_SAFE_CALL(cudaMallocHost((void**)&hostMem, memSizeHost));
CUDA_SAFE_CALL(cudaMalloc((void **) &deviceMem, memSizeDevice)); // Allocate array on device
// Initialize GPU memory to nan and host memory to zero
CUDA_SAFE_CALL(cudaMemset(deviceMem, 255, memSizeDevice));
memset(hostMem, 0, memSizeHost);
// Ensure memory allocation functions worked correctly.
if(hostMem == NULL)
{
printf("Error -- Failed to allocate host memory!\n");
exit(EXIT_FAILURE);
}
if(deviceMem == NULL)
{
printf("Error -- Failed to allocate device memory!\n");
exit(EXIT_FAILURE);
}
// Create input data array.
createArray(hostMem, memSizeHost/sizeof(Complex));
// Create FFT plan
cufftHandle plan;
CUFFT_SAFE_CALL(cufftPlan1d(&plan, fftSize, CUFFT_C2C, batchSize));
int ioi = 13;
int idx = 0;
if (0)
{
// Synchronous Processing
CUDA_SAFE_CALL(cudaMemcpy(deviceMem, &(hostMem[idx]), memSizeDevice, cudaMemcpyHostToDevice));
CUFFT_SAFE_CALL(cufftExecC2C(plan, (cufftComplex *)deviceMem, (cufftComplex *)deviceMem, CUFFT_FORWARD));
// This modifies some of the host memory. If we see it after copying back the data
// from the GPU to the host machine, the copy did not work.
hostMem[ioi].x = -131313;
CUDA_SAFE_CALL(cudaMemcpy(&(hostMem[idx]), deviceMem, memSizeDevice, cudaMemcpyDeviceToHost));
}
else
{
// Asychronous Processing
cudaEvent_t event;
cudaEventCreate(&event);
cudaEventRecord(event, 0);
cudaMemcpyAsync(deviceMem, &(hostMem[idx]), memSizeDevice, cudaMemcpyHostToDevice, 0);
cufftExecC2C(plan, (cufftComplex *)deviceMem, (cufftComplex *)deviceMem, CUFFT_FORWARD);
// This modifies some of the host memory. If we see it after copying back the data
// from the GPU to the host machine, the copy did not work.
hostMem[ioi].x = -131313;
cudaMemcpyAsync(&(hostMem[idx]), deviceMem, memSizeDevice, cudaMemcpyHostToDevice, 0);
cudaEventRecord(event, 0);
//cudaThreadSynchronize();
//cudaEventSynchronize(event);
while (cudaEventQuery(event) == cudaErrorNotReady)
{
printf("In while loop.\n");
usleep(1000);
}
CUDA_SAFE_CALL(cudaEventDestroy(event));
}
// Display result
printf("Result: %d = %f +%fi\n", ioi, hostMem[ioi].x, hostMem[ioi].y);
printf("-----------------------------\n");
printf("If test passed, result should be: 13 = -13342.587891 +12318.576172i\n");
printf("If FFT and GPU memcpy did not occur, result should be: Result: 13 = -131313.000000 +13.000000i\n");
// Free memory
cudaFreeHost(hostMem);
cudaFree(deviceMem);
return EXIT_SUCCESS;
}
int createArray(Complex *arrayPtr, int arrayLength)
{
unsigned int a;
int status;
float timeVal;
timeVal = time(NULL);
srand(timeVal);
//printf("timeVal = %f\n", timeVal);
for(a = 0;a < arrayLength;++a)
{
arrayPtr[a].x = a;
arrayPtr[a].y = a;
//printf("%d = %f +%fi\n", a, arrayPtr[a].x, arrayPtr[a].y);
}
status = 1;
//printf("%d = %f +%fi\n", 3, arrayPtr[3].x, arrayPtr[3].y);
return status;
}
When I run it, I get the following output:
[root@localhost CudaFFTBenchmark]# asyncFftTest
Using device 0: GeForce GTX 280
In while loop.
Result: 13 = -131313.000000 +13.000000i
-----------------------------
If test passed, result should be: 13 = -13342.587891 +12318.576172i
If FFT and GPU memcpy did not occur, result should be: Result: 13 = -131313.000000 +13.000000i