compilation

Hi,

I have a problem compiling my code. I'm new to GPU programming and I don't understand why I get an error.

Could anyone have a look and explain where my mistakes are?

Makefile

[codebox]

################################################################################
# Build script for project
################################################################################

# Add source files here
EXECUTABLE := myVectorAdd

# Cuda source files (compiled with cudacc)
# NOTE(review): cppIntegration.cu #includes myVectorAdd.cu, so listing both files
# here may build the kernel twice -- confirm against the link step.
CUFILES := myVectorAdd.cu \
           cppIntegration.cu

# C/C++ source files (compiled with gcc / c++)
# No trailing backslash on the last entry of a list: a dangling continuation
# glues the following line (e.g. the CFLAGS assignment) onto the variable value.
CCFILES := main.cpp

CFLAGS := -W -Wall -ansi -pedantic

LDFLAGS :=

NVCCFLAGS += --host-compilation 'C'

################################################################################
# Rules and targets
include ../../common/common.mk[/codebox]

myVectorAdd.cu

[codebox]#include “vectorCst.h”

/**
 * Element-wise vector addition kernel: R[i] = A[i] + B[i].
 *
 * Each thread handles at most one element. The element index is computed from
 * the x components of the block and thread indices only, so the kernel is
 * written for a 1D launch. The element count is the constant vectorSize that
 * this file gets from vectorCst.h.
 *
 * @param A first input vector
 * @param B second input vector
 * @param R output vector, receives A + B
 */
__global__ void vectorAdd(const float* A, const float* B, float* R) {
    int index = blockDim.x * blockIdx.x + threadIdx.x;

    // The launch may contain more threads than elements; the surplus threads
    // must not touch memory.
    if (index < vectorSize) R[index] = A[index] + B[index];
}[/codebox]

cppIntegration.cu

[codebox]#include “myVectorAdd.cu”

#include <cutil_inline.h>

#include <cuda.h>

//premier version

/**
 * Host wrapper, callable from C/C++ code, that runs vectorAdd on the GPU.
 *
 * Launches the kernel with the blocksPerGrid / threadsPerBlock configuration
 * and blocks until it has finished.
 *
 * @param V1 first input vector, forwarded to the kernel
 * @param V2 second input vector, forwarded to the kernel
 * @param VR output vector, forwarded to the kernel
 */
extern "C"
void computeAddOnDevice(const float* V1, const float* V2, float* VR) {
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(V1, V2, VR);

    // A kernel launch returns nothing; a bad launch configuration is only
    // visible through the runtime's last-error state.
    cutilCheckMsg("vectorAdd launch failed");

    // Wait for the result before doing anything else, and surface any error
    // raised while the kernel was running.
    cutilSafeCall(cudaThreadSynchronize());

    // cudaThreadExit() is deliberately not called here: the caller copies the
    // result back and frees its buffers after this function returns, and
    // tearing the context down first would invalidate those buffers.
}

[/codebox]

main.cpp

[codebox]#include

#include <stdio.h>

#include <string.h>

#include <cuda.h>

#include “vectorCst.h”

#include “cppIntegration.cu”

#include “cutil_inline.h”

using namespace std;

extern “C” void computeAddOnDevice(const float* V1,const float* V2,float* VR);

// Entry point (first version posted in the thread): selects a CUDA device,
// allocates three host vectors and three "device" vectors of vectorSize floats,
// calls computeAddOnDevice() and prints the result.
//
// NOTE(review): this version has several defects, marked inline below. None of
// the CUDA return codes are checked, so failures are silent.
int main(int argc,char** argv){

cout << "Commencement du programme" << endl;

// shrLog("Starting up CUDA context...\n");

    // Honour a "device" command-line flag if present, otherwise use the device
    // returned by cutGetMaxGflopsDeviceId().
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

        cutilDeviceInit(argc, argv);

    else

        cudaSetDevice( cutGetMaxGflopsDeviceId() );



float* h_A;

float* h_B;

float* h_Result;



float* d_A;

float* d_B;

float* d_Result;



h_A = new float[vectorSize];

h_B = new float[vectorSize];

h_Result = new float[vectorSize];



// NOTE(review): the d_* buffers come from host new[], yet below they are used as
// device pointers (cudaMemcpy destination, kernel arguments, cudaFree). Device
// buffers are normally obtained with cudaMalloc.
d_A = new float[vectorSize];

d_B = new float[vectorSize];

d_Result = new float[vectorSize];



// NOTE(review): memset takes a byte count; vectorSize bytes is less than
// vectorSize floats, so only part of each array is cleared.
memset(h_A,0,vectorSize);

memset(h_B,0,vectorSize);

memset(h_Result,0,vectorSize);



memset(d_A,0,vectorSize);

memset(d_B,0,vectorSize);

memset(d_Result,0,vectorSize);



// NOTE(review): cudaMemcpy also takes a byte count; vectorSize * sizeof(float)
// is needed to copy whole vectors.
cudaMemcpy(d_A, h_A, vectorSize, cudaMemcpyHostToDevice);

cudaMemcpy(d_B, h_B, vectorSize, cudaMemcpyHostToDevice);

cudaMemcpy(d_Result, h_Result, vectorSize, cudaMemcpyHostToDevice);



computeAddOnDevice(d_A,d_B,d_Result);

	

cudaMemcpy(h_Result, d_Result, vectorSize, cudaMemcpyDeviceToHost);

// NOTE(review): the loop prints d_Result, not the h_Result buffer that was just
// filled by the copy above.
for(int i = 0;i< vectorSize;++i){

	cout << d_Result[i] << endl;

}

//free variables

// NOTE(review): arrays allocated with new[] must be released with delete[].
delete h_A;

delete h_B;

delete h_Result;



// NOTE(review): cudaFree is applied to pointers that were allocated with new[].
cudaFree(d_A);

cudaFree(d_B);

cudaFree(d_Result);

cout << "fin du programme" << endl;



return 0;

}[/codebox]

vectorCst.h

[codebox]#ifndef VECTOR_CST_H

#define VECTOR_CST_H

/* Number of float elements in each vector. */
const int vectorSize = 3;

/* Number of blocks in the grid.
   Alternative: (vectorSize + threadsPerBlock - 1) / threadsPerBlock */
const unsigned int blocksPerGrid = 4;

/* Threads per block: vectorSize / blocksPerGrid rounded up, so that
   blocksPerGrid * threadsPerBlock >= vectorSize.
   Alternative: 256 */
const unsigned int threadsPerBlock = vectorSize / blocksPerGrid + (vectorSize % blocksPerGrid == 0?0:1);

#endif[/codebox]

Here is what the console says:

[codebox]

In file included from cppIntegration.cu:1,

             from main.cpp:6:

myVectorAdd.cu:3: error: expected constructor, destructor, or type conversion before ‘void’

In file included from main.cpp:6:

cppIntegration.cu: In function ‘void computeAddOnDevice(const float*, const float*, float*)’:

cppIntegration.cu:8: error: ‘vectorAdd’ was not declared in this scope

cppIntegration.cu:8: error: expected primary-expression before ‘<’ token

cppIntegration.cu:8: error: expected primary-expression before ‘>’ token

cppIntegration.cu:8: warning: left-hand operand of comma has no effect

cppIntegration.cu:8: warning: right-hand operand of comma has no effect

make: *** [obj/x86_64/release/main.cpp.o] Erreur 1

[/codebox]

I think that is because you use gcc to compile main.cpp, but main.cpp also
includes cppIntegration.cu, so gcc is handed CUDA code and does not know what `__global__` is.

You can use nvcc to compile main.cpp instead.

It is possible to mix these files: when I look at the NVIDIA examples, that is what they do.

I found the solution: it was caused by an include in main.cpp that was not necessary.

However, now everything compiles fine but the addition doesn't work.

Here are the new files.

main.cpp

[codebox]

#include

#include <cuda.h>

#include “vectorCst.h”

#include “cutil_inline.h”

using namespace std;

extern “C” void computeAddOnDevice(const float* V1,const float* V2,float* VR);

// Entry point (second version posted in the thread): selects a CUDA device,
// fills two host vectors with 5 and 4, copies them to device buffers obtained
// with cudaMalloc, calls computeAddOnDevice() and prints the result vector.
//
// NOTE(review): this version still has defects, marked inline below.
int main(int argc,char** argv){

cout << "Commencement du programme" << endl;

// shrLog("Starting up CUDA context...\n");

    // Honour a "device" command-line flag if present, otherwise use the device
    // returned by cutGetMaxGflopsDeviceId().
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

        cutilDeviceInit(argc, argv);

    else

        cudaSetDevice( cutGetMaxGflopsDeviceId() );



float* h_A;

float* h_B;

float* h_Result;



float* d_A;

float* d_B;

float* d_Result;



h_A = new float[vectorSize];

h_B = new float[vectorSize];

h_Result = new float[vectorSize];



// Test data: every element of A is 5 and every element of B is 4.
for(int i = 0; i < vectorSize;++i)	h_A[i] = 5;

for(int i = 0; i < vectorSize;++i)	h_B[i] = 4;

for(int i = 0; i < vectorSize;++i)	h_Result[i] = 0;





// NOTE(review): cudaMalloc takes a size in bytes. Passing vectorSize allocates
// vectorSize bytes, not vectorSize floats; vectorSize * sizeof(float) is needed.
// The return codes of these three calls are also ignored.
cudaMalloc((void**)&d_A, vectorSize);

cudaMalloc((void**)&d_B, vectorSize);

cudaMalloc((void**)&d_Result, vectorSize);



// NOTE(review): same problem: the count argument of cudaMemcpy is in bytes, so
// only vectorSize bytes of each vector are transferred.
cudaError err_cp1 = cudaMemcpy(d_A, h_A, vectorSize, cudaMemcpyHostToDevice);

cudaError err_cp2 = cudaMemcpy(d_B, h_B, vectorSize, cudaMemcpyHostToDevice);

cudaError err_cp3 = cudaMemcpy(d_Result, h_Result, vectorSize, cudaMemcpyHostToDevice);

// If any of the three copies failed, print all three status strings and quit.
if((err_cp1 != cudaSuccess) || (err_cp2 != cudaSuccess) || (err_cp3 != cudaSuccess) ){

	cout << cudaGetErrorString(err_cp1) << endl;

	cout << cudaGetErrorString(err_cp2) << endl;

	cout << cudaGetErrorString(err_cp3) << endl;

 	exit(1);

}



computeAddOnDevice(d_A,d_B,d_Result);



// NOTE(review): byte count is again vectorSize instead of
// vectorSize * sizeof(float), and the return code is not checked.
cudaMemcpy(h_Result, d_Result, vectorSize, cudaMemcpyDeviceToHost);



for(int i = 0;i < vectorSize;++i){

	cout << h_Result[i] << endl;

}

//free variables

// NOTE(review): arrays allocated with new[] must be released with delete[].
delete h_A;

delete h_B;

delete h_Result;



cudaFree(d_A);

cudaFree(d_B);

cudaFree(d_Result);

cout << "fin du programme" << endl;



return 0;

}

[/codebox]

cppIntegration.cu

[codebox]#include “myVectorAdd.cu”

#include <cutil_inline.h>

#include <cuda.h>

//premier version

/**
 * Host wrapper, callable from C/C++ code, that runs vectorAdd on the GPU.
 *
 * Launches the kernel with the blocksPerGrid / threadsPerBlock configuration
 * and blocks until it has finished. The CUDA context is left alive so the
 * caller can still copy results back and free its buffers.
 *
 * @param V1 first input vector, forwarded to the kernel
 * @param V2 second input vector, forwarded to the kernel
 * @param VR output vector, forwarded to the kernel
 */
extern "C"
void computeAddOnDevice(const float* V1, const float* V2, float* VR) {
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(V1, V2, VR);

    // A kernel launch returns nothing; a bad launch configuration is only
    // visible through the runtime's last-error state.
    cutilCheckMsg("vectorAdd launch failed");

    // Wait for the result before doing anything else, and surface any error
    // raised while the kernel was running.
    cutilSafeCall(cudaThreadSynchronize());

    //cudaThreadExit();
}

[/codebox]

myVectorAdd.cu

[codebox]#include “vectorCst.h”

/**
 * Element-wise vector addition kernel: R[i] = A[i] + B[i].
 *
 * Each thread handles at most one element. The element index is computed from
 * the x components of the block and thread indices only, so the kernel is
 * written for a 1D launch. The element count is the constant vectorSize that
 * this file gets from vectorCst.h.
 *
 * @param A first input vector
 * @param B second input vector
 * @param R output vector, receives A + B
 */
__global__ void vectorAdd(const float* A, const float* B, float* R) {
    int index = blockDim.x * blockIdx.x + threadIdx.x;

    // Threads whose index falls past the end of the vectors do nothing.
    if (index < vectorSize) {
        R[index] = A[index] + B[index];
    }
}

[/codebox]

vectorCst.h

[codebox]#ifndef VECTOR_CST_H

#define VECTOR_CST_H

/* Number of float elements in each vector. */
const int vectorSize = 3;

/* Number of blocks in the grid.
   Alternative: (vectorSize + threadsPerBlock - 1) / threadsPerBlock */
const unsigned int blocksPerGrid = 4;

/* Threads per block.
   Alternative: vectorSize / blocksPerGrid + (vectorSize % blocksPerGrid == 0?0:1) */
const unsigned int threadsPerBlock = 10;

#endif[/codebox]

I found the solution.

cudaMalloc and cudaMemcpy take their size in bytes, so it is essential to pass sizeof(float) * vectorSize rather than just vectorSize.

So nothing changes except main.cpp:

[codebox]#include <cstdlib>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>

#include "vectorCst.h"

#include "cutil_inline.h"

using namespace std;

// Implemented in cppIntegration.cu; declared with C linkage so that this
// declaration matches the extern "C" definition there.
extern "C" void computeAddOnDevice(const float* V1,const float* V2,float* VR);

int main(int argc,char** argv){

cout << "Commencement du programme" << endl;

// shrLog("Starting up CUDA context...\n");

    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

        cutilDeviceInit(argc, argv);

    else

        cudaSetDevice( cutGetMaxGflopsDeviceId() );



float* h_A;

float* h_B;

float* h_Result;



float* d_A;

float* d_B;

float* d_Result;



h_A = new float[vectorSize];

h_B = new float[vectorSize];

h_Result = new float[vectorSize];



for(int i = 0; i < vectorSize;++i)	h_A[i] = 36;

for(int i = 0; i < vectorSize;++i)	h_B[i] = 4;

for(int i = 0; i < vectorSize;++i)	h_Result[i] = 0;





cudaMalloc((void**)&d_A, vectorSize * sizeof(float));

cudaMalloc((void**)&d_B, vectorSize * sizeof(float));

cudaMalloc((void**)&d_Result, vectorSize * sizeof(float));



cudaError err_cp1 = cudaMemcpy(d_A, h_A, vectorSize * sizeof(float), cudaMemcpyHostToDevice);

cudaError err_cp2 = cudaMemcpy(d_B, h_B, vectorSize * sizeof(float), cudaMemcpyHostToDevice);

cudaError err_cp3 = cudaMemcpy(d_Result, h_Result, vectorSize * sizeof(float), cudaMemcpyHostToDevice);

if((err_cp1 != cudaSuccess) || (err_cp2 != cudaSuccess) || (err_cp3 != cudaSuccess) ){

	cout << cudaGetErrorString(err_cp1) << endl;

	cout << cudaGetErrorString(err_cp2) << endl;

	cout << cudaGetErrorString(err_cp3) << endl;

 	exit(1);

}



computeAddOnDevice(d_A,d_B,d_Result);



cudaMemcpy(h_Result, d_Result, vectorSize * sizeof(float), cudaMemcpyDeviceToHost);



for(int i = 0;i < vectorSize;++i){

	cout << h_Result[i] << endl;

}

//free variables

delete h_A;

delete h_B;

delete h_Result;



cudaFree(d_A);

cudaFree(d_B);

cudaFree(d_Result);

cout << "fin du programme" << endl;



return 0;

}[/codebox]