Hi everybody,
I am a new user of the CUDA API and I have a very annoying problem. I am working on a filter engine that performs separable convolution in my application, on Windows 7 64-bit.
I have been studying the example given in the NVIDIA SDK, and there is something I don't understand about handling pointers to data on the device.
Here is the code of main.cpp from the example:
/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */
/*
 * This sample implements a separable convolution filter 
 * of a 2D image with an arbitrary kernel.
 */
// Utilities and system includes
#include <shrUtils.h>
#include <cutil_inline.h>
#include "convolutionSeparable_common.h"
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    // start logs
    shrSetLogFileName ("convolutionSeparable.txt");
    shrLog("%s Starting...\n\n", argv[0]); 
    float
        *h_Kernel,
        *h_Input,
        *h_Buffer,
        *h_OutputCPU,
        *h_OutputGPU;

    float
        *d_Input,
        *d_Output,
        *d_Buffer;

    const int imageW = 768;
    const int imageH = 576;
    const int iterations = 16;

    unsigned int hTimer;
    //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if ( shrCheckCmdLineFlag(argc, (const char **)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    cutilCheckError(cutCreateTimer(&hTimer));

    shrLog("Image Width x Height = %i x %i\n\n", imageW, imageH);
    shrLog("Allocating and initializing host arrays...\n");
        h_Kernel    = (float *)malloc(KERNEL_LENGTH * sizeof(float));
        h_Input     = (float *)malloc(imageW * imageH * sizeof(float));
        h_Buffer    = (float *)malloc(imageW * imageH * sizeof(float));
        h_OutputCPU = (float *)malloc(imageW * imageH * sizeof(float));
        h_OutputGPU = (float *)malloc(imageW * imageH * sizeof(float));
        srand(200);
        for(unsigned int i = 0; i < KERNEL_LENGTH; i++)
            h_Kernel[i] = (float)(rand() % 16);
        for(unsigned i = 0; i < imageW * imageH; i++)
            h_Input[i] = (float)(rand() % 16);
shrLog("Allocating and initializing CUDA arrays...\n");
        cutilSafeCall( cudaMalloc((void **)&d_Input,   imageW * imageH * sizeof(float)) );
        cutilSafeCall( cudaMalloc((void **)&d_Output,  imageW * imageH * sizeof(float)) );
        cutilSafeCall( cudaMalloc((void **)&d_Buffer , imageW * imageH * sizeof(float)) );
setConvolutionKernel(h_Kernel);
        cutilSafeCall( cudaMemcpy(d_Input, h_Input, imageW * imageH * sizeof(float), cudaMemcpyHostToDevice) );
shrLog("Running GPU convolution (%u identical iterations)...\n\n", iterations);
        for(int i = -1; i < iterations; i++){
            //i == -1 -- warmup iteration
            if(i == 0){
                cutilSafeCall( cudaThreadSynchronize() );
                cutilCheckError( cutResetTimer(hTimer) );
                cutilCheckError( cutStartTimer(hTimer) );
            }
            convolutionRowsGPU(
                d_Buffer,
                d_Input,
                imageW,
                imageH
            );
            convolutionColumnsGPU(
                d_Output,
                d_Buffer,
                imageW,
                imageH
            );
        }
        cutilSafeCall( cudaThreadSynchronize() );
        cutilCheckError(cutStopTimer(hTimer));
        double gpuTime = 0.001 * cutGetTimerValue(hTimer) / (double)iterations;
    shrLogEx(LOGBOTH | MASTER, 0, "convolutionSeparable, Throughput = %.4f MPixels/sec, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", 
        (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0);
shrLog("\nReading back GPU results...\n\n");
        cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToHost) );
shrLog("Checking the results...\n");
        shrLog(" ...running convolutionRowCPU()\n");
        convolutionRowCPU(
            h_Buffer,
            h_Input,
            h_Kernel,
            imageW,
            imageH,
            KERNEL_RADIUS
        );
shrLog(" ...running convolutionColumnCPU()\n");
        convolutionColumnCPU(
            h_OutputCPU,
            h_Buffer,
            h_Kernel,
            imageW,
            imageH,
            KERNEL_RADIUS
        );
shrLog(" ...comparing the results\n");
        double sum = 0, delta = 0;
        for(unsigned i = 0; i < imageW * imageH; i++){
            delta += (h_OutputGPU[i] - h_OutputCPU[i]) * (h_OutputGPU[i] - h_OutputCPU[i]);
            sum   += h_OutputCPU[i] * h_OutputCPU[i];
        }
        double L2norm = sqrt(delta / sum);
        shrLog(" ...Relative L2 norm: %E\n\n", L2norm);
    shrLog((L2norm < 1e-6) ? "PASSED\n\n" : "FAILED\n\n");
shrLog("Shutting down...\n");
        cutilSafeCall( cudaFree(d_Buffer ) );
        cutilSafeCall( cudaFree(d_Output) );
        cutilSafeCall( cudaFree(d_Input) );
        free(h_OutputGPU);
        free(h_OutputCPU);
        free(h_Buffer);
        free(h_Input);
        free(h_Kernel);
    cutilCheckError(cutDeleteTimer(hTimer));

    cudaThreadExit();

    shrEXIT(argc, (const char**)argv);
}
With this code, everything works great; I include it only for comparison.
Then I modified main.cpp to create an InitGPU() function that is designed to allocate memory on the device, and the pointers are now global variables in main.cpp:
/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */
/*
 * This sample implements a separable convolution filter 
 * of a 2D image with an arbitrary kernel.
 */
// Utilities and system includes
#include <shrUtils.h>
#include <cutil_inline.h>
#include "convolutionSeparable_common.h"
void InitGPU();
float *h_Kernel;
float *h_Input;
float *h_Buffer;
float *h_OutputCPU;
float *h_OutputGPU;
float *d_Input;
float *d_Output;
float *d_Buffer;
const int imageW = 768;
const int imageH = 576;
const int iterations = 16;
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    // start logs
    shrSetLogFileName ("convolutionSeparable.txt");
    shrLog("%s Starting...\n\n", argv[0]); 
    unsigned int hTimer;

    //Use command-line specified CUDA device, otherwise use device with highest Gflops/s
    if ( shrCheckCmdLineFlag(argc, (const char **)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    cutilCheckError(cutCreateTimer(&hTimer));

    InitGPU();

    setConvolutionKernel(h_Kernel);
    cutilSafeCall( cudaMemcpy(d_Input, h_Input, imageW * imageH * sizeof(float), cudaMemcpyHostToDevice) );

    shrLog("Running GPU convolution (%u identical iterations)...\n\n", iterations);
        for(int i = -1; i < iterations; i++){
            //i == -1 -- warmup iteration
            if(i == 0){
                cutilSafeCall( cudaThreadSynchronize() );
                cutilCheckError( cutResetTimer(hTimer) );
                cutilCheckError( cutStartTimer(hTimer) );
            }
            convolutionRowsGPU(
                d_Buffer,
                d_Input,
                imageW,
                imageH
            );
            convolutionColumnsGPU(
                d_Output,
                d_Buffer,
                imageW,
                imageH
            );
        }
        cutilSafeCall( cudaThreadSynchronize() );
        cutilCheckError(cutStopTimer(hTimer));
        double gpuTime = 0.001 * cutGetTimerValue(hTimer) / (double)iterations;
    shrLogEx(LOGBOTH | MASTER, 0, "convolutionSeparable, Throughput = %.4f MPixels/sec, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", 
        (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0);
shrLog("\nReading back GPU results...\n\n");
        cutilSafeCall( cudaMemcpy(h_OutputGPU, d_Output, imageW * imageH * sizeof(float), cudaMemcpyDeviceToHost) );
shrLog("Checking the results...\n");
        shrLog(" ...running convolutionRowCPU()\n");
        convolutionRowCPU(
            h_Buffer,
            h_Input,
            h_Kernel,
            imageW,
            imageH,
            KERNEL_RADIUS
        );
shrLog(" ...running convolutionColumnCPU()\n");
        convolutionColumnCPU(
            h_OutputCPU,
            h_Buffer,
            h_Kernel,
            imageW,
            imageH,
            KERNEL_RADIUS
        );
shrLog(" ...comparing the results\n");
        double sum = 0, delta = 0;
        for(unsigned i = 0; i < imageW * imageH; i++){
            delta += (h_OutputGPU[i] - h_OutputCPU[i]) * (h_OutputGPU[i] - h_OutputCPU[i]);
            sum   += h_OutputCPU[i] * h_OutputCPU[i];
        }
        double L2norm = sqrt(delta / sum);
        shrLog(" ...Relative L2 norm: %E\n\n", L2norm);
    shrLog((L2norm < 1e-6) ? "PASSED\n\n" : "FAILED\n\n");
shrLog("Shutting down...\n");
        cutilSafeCall( cudaFree(d_Buffer ) );
        cutilSafeCall( cudaFree(d_Output) );
        cutilSafeCall( cudaFree(d_Input) );
        free(h_OutputGPU);
        free(h_OutputCPU);
        free(h_Buffer);
        free(h_Input);
        free(h_Kernel);
    cutilCheckError(cutDeleteTimer(hTimer));

    cudaThreadExit();

    shrEXIT(argc, (const char**)argv);
}
void InitGPU()
{
	shrLog("Image Width x Height = %i x %i\n\n", imageW, imageH);
    shrLog("Allocating and intializing host arrays...\n");
        h_Kernel    = (float *)malloc(KERNEL_LENGTH * sizeof(float));
        h_Input     = (float *)malloc(imageW * imageH * sizeof(float));
        h_Buffer    = (float *)malloc(imageW * imageH * sizeof(float));
        h_OutputCPU = (float *)malloc(imageW * imageH * sizeof(float));
        h_OutputGPU = (float *)malloc(imageW * imageH * sizeof(float));
        srand(200);
        for(unsigned int i = 0; i < KERNEL_LENGTH; i++)
            h_Kernel[i] = (float)(rand() % 16);
        for(unsigned i = 0; i < imageW * imageH; i++)
            h_Input[i] = (float)(rand() % 16);
shrLog("Allocating and initializing CUDA arrays...\n");
        cutilSafeCall( cudaMalloc((void **)&d_Input,   imageW * imageH * sizeof(float)) );
        cutilSafeCall( cudaMalloc((void **)&d_Output,  imageW * imageH * sizeof(float)) );
        cutilSafeCall( cudaMalloc((void **)&d_Buffer , imageW * imageH * sizeof(float)) );
}
When I launch the program, I get an error message:
.\main.cpp(66) : cudaSafeCall() Runtime API error : invalid argument.
It corresponds to the following call:
cutilSafeCall( cudaMemcpy(d_Input, h_Input, imageW * imageH * sizeof(float), cudaMemcpyHostToDevice) );
The problem happens when I try to copy the data to the device memory pointed to by d_Input.
As I said, I am a newcomer to CUDA, so there must be something I don't understand.
Can someone help me with this?
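To make my question clearer, here is a stripped-down, hand-written sketch of the pattern I am trying to use (without the SDK helpers, so the names and sizes are only illustrative, not my actual project code). It allocates a global device pointer inside a separate init function and then does the cudaMemcpy in main, checking every CUDA call explicitly:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Global pointers, as in my modified main.cpp (names are illustrative)
float *d_Input = NULL;
float *h_Input = NULL;
const int imageW = 768;
const int imageH = 576;

// Allocate and fill the host buffer, then allocate the device buffer,
// checking the cudaMalloc result explicitly
void InitGPU()
{
    h_Input = (float *)malloc(imageW * imageH * sizeof(float));
    for (int i = 0; i < imageW * imageH; i++)
        h_Input[i] = (float)(rand() % 16);

    cudaError_t err = cudaMalloc((void **)&d_Input, imageW * imageH * sizeof(float));
    if (err != cudaSuccess)
        printf("cudaMalloc failed: %s\n", cudaGetErrorString(err));
    printf("d_Input after cudaMalloc = %p\n", (void *)d_Input);
}

int main()
{
    InitGPU();

    // This is the copy that fails with "invalid argument" in my real code
    cudaError_t err = cudaMemcpy(d_Input, h_Input,
                                 imageW * imageH * sizeof(float),
                                 cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
        printf("cudaMemcpy failed: %s\n", cudaGetErrorString(err));
    else
        printf("cudaMemcpy succeeded\n");

    cudaFree(d_Input);
    free(h_Input);
    return 0;
}

If a standalone program like this is supposed to work (a global device pointer filled in by cudaMalloc inside another function, then used in main), then I suppose the real issue is somewhere else in my project.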