Hi everybody,
I am a new user of cuda API and I have a very annoying problem. I am working on a filter engine to perform separable convolution in my application on Win 7 64bits.
I have been studying the example given in Nvidia SDK and there is something that I don’t understand when dealing with pointers that are pointing to data on the device.
Here is the code of main.cpp from the example :
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* This sample implements a separable convolution filter
* of a 2D image with an arbitrary kernel.
*/
// Utilities and system includes
#include <shrUtils.h>
#include <cutil_inline.h>
#include "convolutionSeparable_common.h"
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
// start logs
shrSetLogFileName ("convolutionSeparable.txt");
shrLog("%s Starting...\n\n", argv[0]);
float
*h_Kernel,
*h_Input,
*h_Buffer,
*h_OutputCPU,
*h_OutputGPU;
float
*d_Input,
*d_Output,
*d_Buffer;
const int imageW =768;
const int imageH = 576;
const int iterations = 16;
unsigned int hTimer;
//Use command-line specified CUDA device, otherwise use device with highest Gflops/s
if ( shrCheckCmdLineFlag(argc, (const char **)argv, "device") )
cutilDeviceInit(argc, argv);
else
cudaSetDevice( cutGetMaxGflopsDeviceId() );
cutilCheckError(cutCreateTimer(&hTimer));
shrLog("Image Width x Height = %i x %i\n\n", imageW, imageH);
shrLog("Allocating and intializing host arrays...\n");
h_Kernel = (float *)malloc(KERNEL_LENGTH * sizeof(float));
h_Input = (float *)malloc(imageW * imageH * sizeof(float));
h_Buffer = (float *)malloc(imageW * imageH * sizeof(float));
h_OutputCPU = (float *)malloc(imageW * imageH * sizeof(float));
h_OutputGPU = (float *)malloc(imageW * imageH * sizeof(float));
srand(200);
for(unsigned int i = 0; i < KERNEL_LENGTH; i++)
h_Kernel[i] = (float)(rand() % 16);
for(unsigned i = 0; i < imageW * imageH; i++)
h_Input[i] = (float)(rand() % 16);
shrLog("Allocating and initializing CUDA arrays...\n");
cutilSafeCall( cudaMalloc((void **)&d_Input, imageW * imageH * sizeof(float)) );
cutilSafeCall( cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float)) );
cutilSafeCall( cudaMalloc((void **)&d_Buffer , imageW * imageH * sizeof(float)) );
setConvolutionKernel(h_Kernel);
cutilSafeCall( cudaMemcpy(d_Input, h_Input, imageW * imageH * sizeof(float), cudaMemcpyHostToDevice) );
shrLog("Running GPU convolution (%u identical iterations)...\n\n", iterations);
for(int i = -1; i < iterations; i++){
//i == -1 -- warmup iteration
if(i == 0){
cutilSafeCall( cudaThreadSynchronize() );
cutilCheckError( cutResetTimer(hTimer) );
cutilCheckError( cutStartTimer(hTimer) );
}
convolutionRowsGPU(
d_Buffer,
d_Input,
imageW,
imageH
);
convolutionColumnsGPU(
d_Output,
d_Buffer,
imageW,
imageH
);
}
cutilSafeCall( cudaThreadSynchronize() );
cutilCheckError(cutStopTimer(hTimer));
double gpuTime = 0.001 * cutGetTimerValue(hTimer) / (double)iterations;
shrLogEx(LOGBOTH | MASTER, 0, "convolutionSeparable, Throughput = %.4f MPixels/sec, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n",
(1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0);
shrLog("\nReading back GPU results...\n\n");
cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToHost) );
shrLog("Checking the results...\n");
shrLog(" ...running convolutionRowCPU()\n");
convolutionRowCPU(
h_Buffer,
h_Input,
h_Kernel,
imageW,
imageH,
KERNEL_RADIUS
);
shrLog(" ...running convolutionColumnCPU()\n");
convolutionColumnCPU(
h_OutputCPU,
h_Buffer,
h_Kernel,
imageW,
imageH,
KERNEL_RADIUS
);
shrLog(" ...comparing the results\n");
double sum = 0, delta = 0;
for(unsigned i = 0; i < imageW * imageH; i++){
delta += (h_OutputGPU[i] - h_OutputCPU[i]) * (h_OutputGPU[i] - h_OutputCPU[i]);
sum += h_OutputCPU[i] * h_OutputCPU[i];
}
double L2norm = sqrt(delta / sum);
shrLog(" ...Relative L2 norm: %E\n\n", L2norm);
shrLog((L2norm < 1e-6) ? "PASSED\n\n" : "FAILED\n\n");
shrLog("Shutting down...\n");
cutilSafeCall( cudaFree(d_Buffer ) );
cutilSafeCall( cudaFree(d_Output) );
cutilSafeCall( cudaFree(d_Input) );
free(h_OutputGPU);
free(h_OutputCPU);
free(h_Buffer);
free(h_Input);
free(h_Kernel);
cutilCheckError(cutDeleteTimer(hTimer));
cudaThreadExit();
shrEXIT(argc, (const char**)argv);
}
With this code, everything is working great I just put it for comparison.
Then, I modified main.cpp in order to create an InitGPU() function that is design to allocate memory on the device and the pointers are now global variable in main.cpp :
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* This sample implements a separable convolution filter
* of a 2D image with an arbitrary kernel.
*/
// Utilities and system includes
#include <shrUtils.h>
#include <cutil_inline.h>
#include "convolutionSeparable_common.h"
void InitGPU();
float *h_Kernel;
float *h_Input;
float *h_Buffer;
float *h_OutputCPU;
float *h_OutputGPU;
float *d_Input;
float *d_Output;
float *d_Buffer;
const int imageW =768;
const int imageH = 576;
const int iterations = 16;
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
// start logs
shrSetLogFileName ("convolutionSeparable.txt");
shrLog("%s Starting...\n\n", argv[0]);
unsigned int hTimer;
//Use command-line specified CUDA device, otherwise use device with highest Gflops/s
if ( shrCheckCmdLineFlag(argc, (const char **)argv, "device") )
cutilDeviceInit(argc, argv);
else
cudaSetDevice( cutGetMaxGflopsDeviceId() );
cutilCheckError(cutCreateTimer(&hTimer));
InitGPU();
setConvolutionKernel(h_Kernel);
cutilSafeCall( cudaMemcpy(d_Input, h_Input, imageW * imageH * sizeof(float), cudaMemcpyHostToDevice) );
shrLog("Running GPU convolution (%u identical iterations)...\n\n", iterations);
for(int i = -1; i < iterations; i++){
//i == -1 -- warmup iteration
if(i == 0){
cutilSafeCall( cudaThreadSynchronize() );
cutilCheckError( cutResetTimer(hTimer) );
cutilCheckError( cutStartTimer(hTimer) );
}
convolutionRowsGPU(
d_Buffer,
d_Input,
imageW,
imageH
);
convolutionColumnsGPU(
d_Output,
d_Buffer,
imageW,
imageH
);
}
cutilSafeCall( cudaThreadSynchronize() );
cutilCheckError(cutStopTimer(hTimer));
double gpuTime = 0.001 * cutGetTimerValue(hTimer) / (double)iterations;
shrLogEx(LOGBOTH | MASTER, 0, "convolutionSeparable, Throughput = %.4f MPixels/sec, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n",
(1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0);
shrLog("\nReading back GPU results...\n\n");
cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToHost) );
shrLog("Checking the results...\n");
shrLog(" ...running convolutionRowCPU()\n");
convolutionRowCPU(
h_Buffer,
h_Input,
h_Kernel,
imageW,
imageH,
KERNEL_RADIUS
);
shrLog(" ...running convolutionColumnCPU()\n");
convolutionColumnCPU(
h_OutputCPU,
h_Buffer,
h_Kernel,
imageW,
imageH,
KERNEL_RADIUS
);
shrLog(" ...comparing the results\n");
double sum = 0, delta = 0;
for(unsigned i = 0; i < imageW * imageH; i++){
delta += (h_OutputGPU[i] - h_OutputCPU[i]) * (h_OutputGPU[i] - h_OutputCPU[i]);
sum += h_OutputCPU[i] * h_OutputCPU[i];
}
double L2norm = sqrt(delta / sum);
shrLog(" ...Relative L2 norm: %E\n\n", L2norm);
shrLog((L2norm < 1e-6) ? "PASSED\n\n" : "FAILED\n\n");
shrLog("Shutting down...\n");
cutilSafeCall( cudaFree(d_Buffer ) );
cutilSafeCall( cudaFree(d_Output) );
cutilSafeCall( cudaFree(d_Input) );
free(h_OutputGPU);
free(h_OutputCPU);
free(h_Buffer);
free(h_Input);
free(h_Kernel);
cutilCheckError(cutDeleteTimer(hTimer));
cudaThreadExit();
shrEXIT(argc, (const char**)argv);
}
void InitGPU()
{
shrLog("Image Width x Height = %i x %i\n\n", imageW, imageH);
shrLog("Allocating and intializing host arrays...\n");
h_Kernel = (float *)malloc(KERNEL_LENGTH * sizeof(float));
h_Input = (float *)malloc(imageW * imageH * sizeof(float));
h_Buffer = (float *)malloc(imageW * imageH * sizeof(float));
h_OutputCPU = (float *)malloc(imageW * imageH * sizeof(float));
h_OutputGPU = (float *)malloc(imageW * imageH * sizeof(float));
srand(200);
for(unsigned int i = 0; i < KERNEL_LENGTH; i++)
h_Kernel[i] = (float)(rand() % 16);
for(unsigned i = 0; i < imageW * imageH; i++)
h_Input[i] = (float)(rand() % 16);
shrLog("Allocating and initializing CUDA arrays...\n");
cutilSafeCall( cudaMalloc((void **)&d_Input, imageW * imageH * sizeof(float)) );
cutilSafeCall( cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float)) );
cutilSafeCall( cudaMalloc((void **)&d_Buffer , imageW * imageH * sizeof(float)) );
}
When I launch the program I get an error message :
.\main.cpp(66) : cudaSafeCall() Runtime API error : invalid argument.
it correspond to the following call :
cutilSafeCall( cudaMemcpy(d_Input, h_Input, imageW * imageH * sizeof(float), cudaMemcpyHostToDevice) );
The problem happens when I want to send the data to the device at the address pointed “d_Input”.
As I said I am a newcomer when it comes to cuda and there must be something I don’t understand.
Can someone help me on that matter ?