Visual Studio: Not able to call a kernel method defined in a .cu file from the main method in a .cpp file

I am not able to build a solution in MSVS 2008 for a CUDA project.

Contents are:

[indent]1. scalarProd_gold.cpp that defines an extern method scalarProdCPU.

  2. scalarProd_kernal.cu that defines an extern __global__ method scalarProdGPU.

[indent][codebox]

[indent]scalarProd_kernal.cu[/indent]

////////////////////////////////////////////////////////////////////////////////
// Scalar (dot) product kernel.
//
// For every vector pair `vec` in [0, vectorN) computes
//     d_C[vec] = sum over i in [0, elementN) of
//                d_A[vec * elementN + i] * d_B[vec * elementN + i]
//
// Parameters:
//   d_C      - output array, one float per vector pair (vectorN entries written)
//   d_A, d_B - input arrays holding vectorN vectors of elementN floats each,
//              stored back to back
//   vectorN  - number of vector pairs
//   elementN - number of elements per vector
//
// Launch layout: 1D grid, 1D block. Each block walks the vectors starting at
// blockIdx.x with a step of gridDim.x, so any grid size is valid. Each thread
// walks the accumulators starting at threadIdx.x with a step of blockDim.x,
// so any block size is valid.
//
// Shared memory: ACCUM_N floats (static). ACCUM_N must be a power of two for
// the tree reduction below.
//
// Note: the linkage specification has to come first, i.e.
// `extern "C" __global__ void ...`.
////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void scalarProdGPU(
    float *d_C,
    float *d_A,
    float *d_B,
    int vectorN,
    int elementN
){
    //Accumulators cache
    __shared__ float accumResult[ACCUM_N];

    ////////////////////////////////////////////////////////////////////////////
    // Cycle through every pair of vectors,
    // taking into account that vector counts can be different
    // from total number of thread blocks
    ////////////////////////////////////////////////////////////////////////////
    for(int vec = blockIdx.x; vec < vectorN; vec += gridDim.x){
        int vectorBase = IMUL(elementN, vec);
        int vectorEnd  = vectorBase + elementN;

        ////////////////////////////////////////////////////////////////////////
        // Each accumulator cycles through vectors with
        // stride equal to number of total number of accumulators ACCUM_N
        // At this stage ACCUM_N is only preferred be a multiple of warp size
        // to meet memory coalescing alignment constraints.
        ////////////////////////////////////////////////////////////////////////
        for(int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x){
            float sum = 0;

            for(int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
                sum += d_A[pos] * d_B[pos];

            accumResult[iAccum] = sum;
        }

        ////////////////////////////////////////////////////////////////////////
        // Perform tree-like reduction of accumulators' results.
        // ACCUM_N has to be power of two at this stage
        ////////////////////////////////////////////////////////////////////////
        for(int stride = ACCUM_N / 2; stride > 0; stride >>= 1){
            // Make the previous step's writes visible before reading them.
            // The loop bounds are uniform across the block, so every thread
            // reaches this barrier.
            __syncthreads();

            for(int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
                accumResult[iAccum] += accumResult[stride + iAccum];
        }

        // Wait until the whole block has finished the reduction. Without this
        // barrier, threads that are done early could start the next vector and
        // overwrite accumResult[] entries that slower threads are still reading
        // in the last reduction step.
        __syncthreads();

        if(threadIdx.x == 0) d_C[vec] = accumResult[0];
    }
}[/codebox][/indent]

  3. testprg.cpp that contains the main method, where I call both the scalarProdCPU and scalarProdGPU methods.

[indent][codebox]

[indent]testprg.cpp[/indent]

// Calculate scalar products of VectorN vectors of ElementN elements on CPU.
// Declared with C linkage so the symbol name is not C++-mangled.
//   h_C      - output array (host)
//   h_A, h_B - input arrays (host)
//   vectorN  - number of vectors
//   elementN - number of elements per vector
extern "C"
void scalarProdCPU(
    float *h_C,
    float *h_A,
    float *h_B,
    int vectorN,
    int elementN
);

// Calculate scalar products of VectorN vectors of ElementN elements on GPU.
// Declared with C linkage so the symbol name is not C++-mangled.
//   d_C      - output array (device)
//   d_A, d_B - input arrays (device)
//   vectorN  - number of vectors
//   elementN - number of elements per vector
extern "C"
void scalarProdGPU(
    float *d_C,
    float *d_A,
    float *d_B,
    int vectorN,
    int elementN
);

// Main program

int main(int argc, char **argv){

float *h_A, *h_B, *h_C_CPU, *h_C_GPU;

float *d_A, *d_B, *d_C;

double delta, ref, sum_delta, sum_ref, L1norm;

unsigned int hTimer;

int i;

// use command-line specified CUDA device, otherwise use device with highest Gflops/s

if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

    cutilDeviceInit(argc, argv);

else

    cudaSetDevice( cutGetMaxGflopsDeviceId() );

cutilCheckError( cutCreateTimer(&hTimer) );

printf(“Initializing data…\n”);

    printf("...allocating CPU memory.\n");

    h_A     = (float *)malloc(DATA_SZ);

    h_B     = (float *)malloc(DATA_SZ);

    h_C_CPU = (float *)malloc(RESULT_SZ);

    h_C_GPU = (float *)malloc(RESULT_SZ);

printf(“…allocating GPU memory.\n”);

    cutilSafeCall( cudaMalloc((void **)&d_A, DATA_SZ)   );

    cutilSafeCall( cudaMalloc((void **)&d_B, DATA_SZ)   );

    cutilSafeCall( cudaMalloc((void **)&d_C, RESULT_SZ) );

printf(“…generating input data in CPU mem.\n”);

    srand(123);

    //Generating input data on CPU

    for(i = 0; i < DATA_N; i++){

        h_A[i] = RandFloat(0.0f, 1.0f);

        h_B[i] = RandFloat(0.0f, 1.0f);

    }

printf(“…copying input data to GPU mem.\n”);

    //Copy options data to GPU memory for further processing 

    cutilSafeCall( cudaMemcpy(d_A, h_A, DATA_SZ, cudaMemcpyHostToDevice) );

    cutilSafeCall( cudaMemcpy(d_B, h_B, DATA_SZ, cudaMemcpyHostToDevice) );

printf("Data init done.\n");

printf(“Executing GPU kernel…\n”);

    cutilSafeCall( cudaThreadSynchronize() );

    cutilCheckError( cutResetTimer(hTimer) );

    cutilCheckError( cutStartTimer(hTimer) );

    scalarProdGPU(d_C, d_A, d_B, VECTOR_N, ELEMENT_N);

    cutilCheckMsg("scalarProdGPU() execution failed\n");

    cutilSafeCall( cudaThreadSynchronize() );

    cutilCheckError( cutStopTimer(hTimer) );

printf("GPU time: %f msecs.\n", cutGetTimerValue(hTimer));

printf(“Reading back GPU result…\n”);

    //Read back GPU results to compare them to CPU results

    cutilSafeCall( cudaMemcpy(h_C_GPU, d_C, RESULT_SZ, cudaMemcpyDeviceToHost) );

printf(“Checking GPU results…\n”);

    printf("..running CPU scalar product calculation\n");

    scalarProdCPU(h_C_CPU, h_A, h_B, VECTOR_N, ELEMENT_N);

printf(“…comparing the results\n”);

    //Calculate max absolute difference and L1 distance

    //between CPU and GPU results

    sum_delta = 0;

    sum_ref   = 0;

    for(i = 0; i < VECTOR_N; i++){

        delta = fabs(h_C_GPU[i] - h_C_CPU[i]);

        ref   = h_C_CPU[i];

        sum_delta += delta;

        sum_ref   += ref;

    }

    L1norm = sum_delta / sum_ref;

printf("L1 error: %E\n", L1norm);

printf((L1norm < 1e-6) ? "TEST PASSED\n" : "TEST FAILED\n");

printf(“Shutting down…\n”);

    cutilSafeCall( cudaFree(d_C) );

    cutilSafeCall( cudaFree(d_B)   );

    cutilSafeCall( cudaFree(d_A)   );

    free(h_C_GPU);

    free(h_C_CPU);

    free(h_B);

    free(h_A);

    cutilCheckError( cutDeleteTimer(hTimer) );

cudaThreadExit();

cutilExit(argc, argv);

}

[/codebox][/indent]

[/indent]

Error: error LNK2019: unresolved external symbol _scalarProdGPU referenced in function _main File:testprg.obj

The build log

[codebox]

Build Log Build started: Project: scalarProd, Configuration: Debug|Win32

Command Lines

Creating temporary file "c:\Program Files\NVIDIA Corporation\C\src\scalarProd\Debug\RSP00000955162196.rsp" with contents

[

/OUT:"../../bin/win32/Debug/scalarProd.exe" /INCREMENTAL:NO /LIBPATH:"C:\CUDA\lib" /LIBPATH:"C:\Program Files\NVIDIA Corporation\C\common\lib" /LIBPATH:"../../common/lib" /LIBPATH:"./Debug" /MANIFEST /MANIFESTFILE:"Debug\scalarProd.exe.intermediate.manifest" /MANIFESTUAC:"level='asInvoker' uiAccess='false'" /DEBUG /PDB:"Debug/scalarProd.pdb" /SUBSYSTEM:CONSOLE /OPT:NOICF /DYNAMICBASE:NO /MACHINE:X86 cudart.lib cutil32D.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib

".\Debug\scalarProd_gold.obj"

".\Debug\testprg.obj"

".\Debug\scalarProd_kernel.obj"

]

Creating command line "link.exe @"c:\Program Files\NVIDIA Corporation\C\src\scalarProd\Debug\RSP00000955162196.rsp" /NOLOGO /ERRORREPORT:PROMPT"

Output Window Linking...

LINK : warning LNK4098: defaultlib 'LIBCMT' conflicts with use of other libs; use /NODEFAULTLIB:library

testprg.obj : error LNK2019: unresolved external symbol _scalarProdGPU referenced in function _main

../../bin/win32/Debug/scalarProd.exe : fatal error LNK1120: 1 unresolved externals

Results Build log was saved at "file://c:\Program Files\NVIDIA Corporation\C\src\scalarProd\Debug\BuildLog.htm"

scalarProd - 2 error(s), 1 warning(s)

[/codebox]

Using custom build rule from file: C:\Program Files\NVIDIA Corporation\C\common\Cuda.Rules

Please help. I have been trying to get this working for more than a day.

Thanks