I am not able to build solution in MSVS 2008 for a Cuda project.
Contents are:
[indent]1. scalarProd_gold.cpp that defines an extern method scalarProdCPU.
- scalarProd_kernal.cu that defines an extern global method scalarProdGPU.
[indent][codebox]
[indent]scalarProd_kernal.cu[/indent]
global extern “C” void scalarProdGPU(
float *d_C,
float *d_A,
float *d_B,
int vectorN,
int elementN
){
//Accumulators cache
__shared__ float accumResult[ACCUM_N];
////////////////////////////////////////////////////////////////////////////
// Cycle through every pair of vectors,
// taking into account that vector counts can be different
// from total number of thread blocks
////////////////////////////////////////////////////////////////////////////
for(int vec = blockIdx.x; vec < vectorN; vec += gridDim.x){
int vectorBase = IMUL(elementN, vec);
int vectorEnd = vectorBase + elementN;
////////////////////////////////////////////////////////////////////////
// Each accumulator cycles through vectors with
// stride equal to number of total number of accumulators ACCUM_N
// At this stage ACCUM_N is only preferred be a multiple of warp size
// to meet memory coalescing alignment constraints.
////////////////////////////////////////////////////////////////////////
for(int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x){
float sum = 0;
for(int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
sum += d_A[pos] * d_B[pos];
accumResult[iAccum] = sum;
}
////////////////////////////////////////////////////////////////////////
// Perform tree-like reduction of accumulators' results.
// ACCUM_N has to be power of two at this stage
////////////////////////////////////////////////////////////////////////
for(int stride = ACCUM_N / 2; stride > 0; stride >>= 1){
__syncthreads();
for(int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
accumResult[iAccum] += accumResult[stride + iAccum];
}
if(threadIdx.x == 0) d_C[vec] = accumResult[0];
}
}[/codebox][/indent]
- testprg.cpp that contains the main method where I access both scalarProdCPU and scalarProdGPU methods.
[indent][codebox]
[indent]testprg.cpp[/indent]
// Calculate scalar products of VectorN vectors of ElementN elements on CPU
extern “C”
void scalarProdCPU(
float *h_C,
float *h_A,
float *h_B,
int vectorN,
int elementN
);
// Calculate scalar products of VectorN vectors of ElementN elements on GPU
extern “C”
void scalarProdGPU(
float *d_C,
float *d_A,
float *d_B,
int vectorN,
int elementN
);
// Main program
int main(int argc, char **argv){
float *h_A, *h_B, *h_C_CPU, *h_C_GPU;
float *d_A, *d_B, *d_C;
double delta, ref, sum_delta, sum_ref, L1norm;
unsigned int hTimer;
int i;
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
cutilDeviceInit(argc, argv);
else
cudaSetDevice( cutGetMaxGflopsDeviceId() );
cutilCheckError( cutCreateTimer(&hTimer) );
printf(“Initializing data…\n”);
printf("...allocating CPU memory.\n");
h_A = (float *)malloc(DATA_SZ);
h_B = (float *)malloc(DATA_SZ);
h_C_CPU = (float *)malloc(RESULT_SZ);
h_C_GPU = (float *)malloc(RESULT_SZ);
printf(“…allocating GPU memory.\n”);
cutilSafeCall( cudaMalloc((void **)&d_A, DATA_SZ) );
cutilSafeCall( cudaMalloc((void **)&d_B, DATA_SZ) );
cutilSafeCall( cudaMalloc((void **)&d_C, RESULT_SZ) );
printf(“…generating input data in CPU mem.\n”);
srand(123);
//Generating input data on CPU
for(i = 0; i < DATA_N; i++){
h_A[i] = RandFloat(0.0f, 1.0f);
h_B[i] = RandFloat(0.0f, 1.0f);
}
printf(“…copying input data to GPU mem.\n”);
//Copy options data to GPU memory for further processing
cutilSafeCall( cudaMemcpy(d_A, h_A, DATA_SZ, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaMemcpy(d_B, h_B, DATA_SZ, cudaMemcpyHostToDevice) );
printf("Data init done.\n");
printf(“Executing GPU kernel…\n”);
cutilSafeCall( cudaThreadSynchronize() );
cutilCheckError( cutResetTimer(hTimer) );
cutilCheckError( cutStartTimer(hTimer) );
scalarProdGPU(d_C, d_A, d_B, VECTOR_N, ELEMENT_N);
cutilCheckMsg("scalarProdGPU() execution failed\n");
cutilSafeCall( cudaThreadSynchronize() );
cutilCheckError( cutStopTimer(hTimer) );
printf("GPU time: %f msecs.\n", cutGetTimerValue(hTimer));
printf(“Reading back GPU result…\n”);
//Read back GPU results to compare them to CPU results
cutilSafeCall( cudaMemcpy(h_C_GPU, d_C, RESULT_SZ, cudaMemcpyDeviceToHost) );
printf(“Checking GPU results…\n”);
printf("..running CPU scalar product calculation\n");
scalarProdCPU(h_C_CPU, h_A, h_B, VECTOR_N, ELEMENT_N);
printf(“…comparing the results\n”);
//Calculate max absolute difference and L1 distance
//between CPU and GPU results
sum_delta = 0;
sum_ref = 0;
for(i = 0; i < VECTOR_N; i++){
delta = fabs(h_C_GPU[i] - h_C_CPU[i]);
ref = h_C_CPU[i];
sum_delta += delta;
sum_ref += ref;
}
L1norm = sum_delta / sum_ref;
printf("L1 error: %E\n", L1norm);
printf((L1norm < 1e-6) ? "TEST PASSED\n" : "TEST FAILED\n");
printf(“Shutting down…\n”);
cutilSafeCall( cudaFree(d_C) );
cutilSafeCall( cudaFree(d_B) );
cutilSafeCall( cudaFree(d_A) );
free(h_C_GPU);
free(h_C_CPU);
free(h_B);
free(h_A);
cutilCheckError( cutDeleteTimer(hTimer) );
cudaThreadExit();
cutilExit(argc, argv);
}
[/codebox][/indent]
[/indent]
Error: error LNK2019: unresolved external symbol _scalarProdGPU referenced in function _main File:testprg.obj
The build log
[codebox]
Build Log Build started: Project: scalarProd, Configuration: Debug|Win32
Command Lines
Creating temporary file “c:\Program Files\NVIDIA Corporation\C\src\scalarProd\Debug\RSP00000955162196.rsp” with contents
[
/OUT:“…/…/bin/win32/Debug/scalarProd.exe” /INCREMENTAL:NO /LIBPATH:“C:\CUDA\lib” /LIBPATH:“C:\Program Files\NVIDIA Corporation\C\common\lib” /LIBPATH:“…/…/common/lib” /LIBPATH:“./Debug” /MANIFEST /MANIFESTFILE:“Debug\scalarProd.exe.intermediate.manifest” /MANIFESTUAC:“level=‘asInvoker’ uiAccess=‘false’” /DEBUG /PDB:“Debug/scalarProd.pdb” /SUBSYSTEM:CONSOLE /OPT:NOICF /DYNAMICBASE:NO /MACHINE:X86 cudart.lib cutil32D.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib
“.\Debug\scalarProd_gold.obj”
“.\Debug\testprg.obj”
“.\Debug\scalarProd_kernel.obj”
]
Creating command line “link.exe @“c:\Program Files\NVIDIA Corporation\C\src\scalarProd\Debug\RSP00000955162196.rsp” /NOLOGO /ERRORREPORT:PROMPT”
Output Window Linking…
LINK : warning LNK4098: defaultlib ‘LIBCMT’ conflicts with use of other libs; use /NODEFAULTLIB:library
testprg.obj : error LNK2019: unresolved external symbol _scalarProdGPU referenced in function _main
…/…/bin/win32/Debug/scalarProd.exe : fatal error LNK1120: 1 unresolved externals
Results Build log was saved at “file://c:\Program Files\NVIDIA Corporation\C\src\scalarProd\Debug\BuildLog.htm”
scalarProd - 2 error(s), 1 warning(s)
[/codebox]
Using custom build rule from file: C:\Program Files\NVIDIA Corporation\C\common\Cuda.Rules
Please help. Having been trying to get this working for more than a day.
Thanks