error LNK2005 using CUDA

Hello everyone,

I am relatively new to CUDA. I am trying to compile

a kernel and I get the following error:


1> : error LNK2005: _device_stub__Z6Mult_dPdPiS0_S0_S_S already defined in

1>LINK : warning LNK4098: defaultlib ‘LIBCMT’ conflicts with use of other libs; use /NODEFAULTLIB:library

1>C:\Documents and Settings\Administrator\My Documents\Visual Studio 2008\Projects\tests_solver_cuda\x64\Debug\tests_solver_cuda.exe : fatal error LNK1169: one or more multiply defined symbols found

Does anyone know how this can be fixed?

Thank You!!

These are the host and device code:

// DEVICE CODE BY 17/03/2008

// Sparse Matrix-Vector multiplication.



#include <stdio.h>

#include "matvecmult.h"

// Sparse matrix multiplication A * x = res !!

// Sparse matrix-vector product, CSR format: res = A * x.
// One thread per matrix row. ndim_ = { nrow, ncol, nnzero } on the device.
// (The forum paste stripped the underscores from "__global__".)
__global__ void Mult_d(double* gval_, int* gcolind_, int* growptr_,
                       int* ndim_, double* gx_, double* res_)
{
    // Global row index — the original used only threadIdx.x, which is
    // wrong as soon as the launch uses more than one block.
    int row = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail: the grid may overshoot the row count.
    if (row >= ndim_[0])
        return;

    double acc = 0.0;

    // Non-zeros of this row occupy [row_beg, row_end) in gval_/gcolind_.
    int row_beg = growptr_[row];
    int row_end = growptr_[row + 1];

    // Accumulate dot(A[row, :], x). The pasted while-loop never
    // incremented its counter; a for-loop makes the iteration explicit.
    for (int k = row_beg; k < row_end; ++k) {
        acc += gval_[k] * gx_[gcolind_[k]];
    }

    res_[row] = acc;
}



// HOST CODE BY 17/03/2008

// Sparse Matrix-Vector multiplication.

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <cutil_inline.h>

#include ""  /* header name was lost in the paste — per the note below, this header likely defines another kernel also named Mult_d, which is what triggers LNK2005 */

// Multiplicacion matriz por vector A * x = res

// Host wrapper: computes res = A * x on the GPU for a CSR sparse matrix.
//   gval_    : nnzero non-zero values
//   gcolind_ : nnzero column indices
//   growptr_ : nrow+1 row pointers (CSR — note the +1)
//   gx_      : input vector, length ncol
//   res_     : output vector, length nrow (overwritten)
void Mult(const double* gval_, const int* gcolind_, const int* growptr_,
          const int nrow, const int ncol, const int nnzero,
          const double* gx_, double* res_)
{
    // Device copies; the _d suffix marks device pointers.
    double* gval_d    = NULL;
    int*    gcolind_d = NULL;
    int*    growptr_d = NULL;
    int*    ndim_d    = NULL;
    double* gx_d      = NULL;
    double* res_d     = NULL;

    // Pack the dimensions for the kernel.
    int ndim_[3];
    ndim_[0] = nrow; ndim_[1] = ncol; ndim_[2] = nnzero;

    // Byte sizes. Fixes vs. the original:
    //  - row pointers need nrow+1 ints (the kernel reads growptr_[row+1]);
    //    the original allocated/copied only nrow, an out-of-bounds read.
    //  - x has ncol entries and res has nrow; the original sized both
    //    as (nrow+1) doubles.
    size_t size_val  = (size_t)nnzero * sizeof(double);  // non-zero values
    size_t size_col  = (size_t)nnzero * sizeof(int);     // column indices
    size_t size_ptr  = (size_t)(nrow + 1) * sizeof(int); // row pointers
    size_t size_x    = (size_t)ncol * sizeof(double);    // input vector
    size_t size_res  = (size_t)nrow * sizeof(double);    // result vector
    size_t size_ndim = 3 * sizeof(int);                  // dimension triple

    // Device allocations.
    cudaMalloc((void**)&gval_d, size_val);
    cudaMalloc((void**)&gcolind_d, size_col);
    cudaMalloc((void**)&growptr_d, size_ptr);
    cudaMalloc((void**)&ndim_d, size_ndim);
    cudaMalloc((void**)&gx_d, size_x);
    cudaMalloc((void**)&res_d, size_res);

    // Host -> device copies. res_d is write-only in the kernel, so the
    // original's upload of res_ was unnecessary and is dropped.
    cudaMemcpy(gval_d, gval_, size_val, cudaMemcpyHostToDevice);
    cudaMemcpy(gcolind_d, gcolind_, size_col, cudaMemcpyHostToDevice);
    cudaMemcpy(growptr_d, growptr_, size_ptr, cudaMemcpyHostToDevice);
    cudaMemcpy(ndim_d, ndim_, size_ndim, cudaMemcpyHostToDevice);
    cudaMemcpy(gx_d, gx_, size_x, cudaMemcpyHostToDevice);

    // One thread per row, grid rounded up so nrow need not be a multiple
    // of BLOCK_SIZE. The original sized the grid from ndim_[1] (columns)
    // even though the kernel iterates over rows.
    // NOTE(review): the kernel must guard row < nrow for the rounded-up
    // tail threads — confirm the kernel has that bounds check.
    dim3 dimBlock(BLOCK_SIZE);
    dim3 dimGrid((nrow + BLOCK_SIZE - 1) / BLOCK_SIZE);

    // Launch the device computation.
    Mult_d<<<dimGrid, dimBlock>>>(gval_d, gcolind_d, growptr_d,
                                  ndim_d, gx_d, res_d);

    // Read the result back (blocking memcpy synchronizes with the kernel).
    cudaMemcpy(res_, res_d, size_res, cudaMemcpyDeviceToHost);

    // Free device memory — missing in the original, which leaked all six
    // allocations on every call.
    cudaFree(gval_d);
    cudaFree(gcolind_d);
    cudaFree(growptr_d);
    cudaFree(ndim_d);
    cudaFree(gx_d);
    cudaFree(res_d);
}




Thank you very much

Update: there is an `#include ""` pulling in a header for another kernel, probably one that defines a function with the same name `Mult_d` — which would explain the LNK2005 "already defined" error.