undeclared identifier using CUDA

Hello everybody,

I am relatively new to CUDA. I wrote a kernel that I am including in one of my C++ projects.
I think the kernel is well implemented, however when I try to compile the project, I get the following error message:

error C2065: ‘threadIdx’ : undeclared identifier
error C2228: left of ‘.x’ must have class/struct/union

Can anyone help me solve this problem, please ???

Thank you very much in advance.
Greetings.

  1. did you include the cuda.h file?

  2. did you write your cuda code in .cu file?

  3. did you use nvcc to compile the .cu file?

  1. did you include the cuda.h file?

yes, I include this header.

  2. did you write your cuda code in .cu file?

Yes, I wrote my kernel in a .cu file (matvecmult_kernel.cu)

  3. did you use nvcc to compile the .cu file?

I am not sure about that … I include a new Build Rule which identifies the
.cu file, and I think that in this rule, the command argument nvcc … is
already specified.

Please, can someone help me?

You might need to post your code.

If you’re using visual studio, you should check out the CUDA VS Wizard. This should set the building environment up for you.

Since it’s cl (the Visual Studio host C++ compiler) that’s complaining rather than nvcc, I’m pretty sure your .cu file is being compiled as ordinary host code — `threadIdx` only exists inside device code compiled by nvcc.

These are the host and device code that I am using:

HOST CODE (matvecmult.cu):

======
// HOST CODE 17/03/2008
// Sparse Matrix-Vector multiplication.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include <cutil_inline.h>
#include “matvecmult_kernel.cu”

// Multiplicacion matriz por vector A * x = res
void Mult(const double* gval_, const int* gcolind_, const int* growptr_, const int nrow, const int ncol,
const int nnzero, const double* gx_, double* res_)
{

int size_a, size_a2, size_b, size_b2, size_c;

// Load SparseMatrix “A” and vectors “x” and “res” to the device
// Note: Subscript _d identifies a device variable !!

double* gval_d ;
int* growptr_d ;
int* gcolind_d ;
int* ndim_d;
double* gx_d ;
double* res_d;

int ndim_[3] ;
ndim_[0] = nrow ; ndim_[1] = ncol ; ndim_[2] = nnzero ;

size_a = nnzero * sizeof(double); // non-zero elements size
size_a2 = nnzero * sizeof(int); // non-zero elements size
size_b = (nrow+1) * sizeof(double); // row size
size_b2 = nrow * sizeof(int); // row size
size_c = 3*sizeof(int); // ndim size

// Memory allocation into the device
cudaMalloc((void**)&gval_d, size_a);
cudaMalloc((void**)&gcolind_d, size_a2);
cudaMalloc((void**)&growptr_d, size_b2);
cudaMalloc((void**)&ndim_d, size_c);
cudaMalloc((void**)&gx_d, size_b);
cudaMalloc((void**)&res_d, size_b);

// Copy host to device variables
cudaMemcpy(gval_d, gval_, size_a, cudaMemcpyHostToDevice);
cudaMemcpy(gcolind_d, gcolind_, size_a2, cudaMemcpyHostToDevice);
cudaMemcpy(growptr_d, growptr_, size_b2, cudaMemcpyHostToDevice);
cudaMemcpy(ndim_d, ndim_, size_c, cudaMemcpyHostToDevice);
cudaMemcpy(gx_d, gx_, size_b, cudaMemcpyHostToDevice);
cudaMemcpy(res_d, res_, size_b, cudaMemcpyHostToDevice);

// La organizacion del grid en terminos de los bloques y threads.
// Primera version muy simple para el caso de una matriz 10x10.

//if (BLOCK_SIZE/ndim_(1) >= 512){
// cout << “Quietor!!! Demasiado Thread para tan poco bloque …” << endl;
// system(“pause”);
//}

dim3 dimBlock(BLOCK_SIZE);
dim3 dimGrid(ndim_[1]/BLOCK_SIZE);

// Launch the device computation
Mult_d<<< dimGrid, dimBlock >>>(gval_d, gcolind_d, growptr_d, ndim_d, gx_d, res_d);

// Read result from the device
cudaMemcpy(res_, res_d, size_b, cudaMemcpyDeviceToHost);

// Free device memory
cudaFree(gval_d);
cudaFree(gcolind_d);
cudaFree(growptr_d);
cudaFree(ndim_d);

}

==

DEVICE CODE

==

// DEVICE CODE
// Sparse Matrix-Vector multiplication.

#ifndef MATVECMULT_KERNEL_H
#define MATVECMULT_KERNEL_H

#include <stdio.h>

#include "matvecmult.h"

// Sparse (CSR) matrix-vector multiplication kernel: res = A * x.
//
// Launch layout: 1-D grid, one thread per matrix row; expects
// gridDim.x * blockDim.x >= ndim_[0] (surplus tail threads exit early).
//
//   gval_    - nnzero stored non-zero values
//   gcolind_ - nnzero CSR column indices
//   growptr_ - CSR row pointer, ndim_[0]+1 entries
//   ndim_    - packed dimensions { nrow, ncol, nnzero }
//   gx_      - input vector x
//   res_     - output vector, one entry per row
__global__ void Mult_d(double* gval_, int* gcolind_, int* growptr_, int* ndim_, double* gx_, double* res_)
{
    // Global row index. The original used threadIdx.x alone, which makes
    // every block recompute the same rows; fold in blockIdx.x so multi-block
    // grids work. (Note: the qualifier is __global__ with double underscores;
    // a bare 'global' does not compile.)
    int row = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail when nrow is not a multiple of the block size.
    if (row >= ndim_[0]) return;

    double acc = 0.0;

    // Dot product of row 'row' with x over that row's stored entries.
    for (int k = growptr_[row]; k < growptr_[row + 1]; ++k)
        acc += gval_[k] * gx_[gcolind_[k]];

    res_[row] = acc;
}

// Close the include guard after the kernel: the original #endif sat before
// Mult_d, leaving the kernel unguarded and redefined on double inclusion.
#endif // MATVECMULT_KERNEL_H

These are the host and device code that I am using:

HOST CODE (matvecmult.cu):

======
// HOST CODE 17/03/2008
// Sparse Matrix-Vector multiplication.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include <cutil_inline.h>
#include “matvecmult_kernel.cu”

// Multiplicacion matriz por vector A * x = res
void Mult(const double* gval_, const int* gcolind_, const int* growptr_, const int nrow, const int ncol,
const int nnzero, const double* gx_, double* res_)
{

int size_a, size_a2, size_b, size_b2, size_c;

// Load SparseMatrix “A” and vectors “x” and “res” to the device
// Note: Subscript _d identifies a device variable !!

double* gval_d ;
int* growptr_d ;
int* gcolind_d ;
int* ndim_d;
double* gx_d ;
double* res_d;

int ndim_[3] ;
ndim_[0] = nrow ; ndim_[1] = ncol ; ndim_[2] = nnzero ;

size_a = nnzero * sizeof(double); // non-zero elements size
size_a2 = nnzero * sizeof(int); // non-zero elements size
size_b = (nrow+1) * sizeof(double); // row size
size_b2 = nrow * sizeof(int); // row size
size_c = 3*sizeof(int); // ndim size

// Memory allocation into the device
cudaMalloc((void**)&gval_d, size_a);
cudaMalloc((void**)&gcolind_d, size_a2);
cudaMalloc((void**)&growptr_d, size_b2);
cudaMalloc((void**)&ndim_d, size_c);
cudaMalloc((void**)&gx_d, size_b);
cudaMalloc((void**)&res_d, size_b);

// Copy host to device variables
cudaMemcpy(gval_d, gval_, size_a, cudaMemcpyHostToDevice);
cudaMemcpy(gcolind_d, gcolind_, size_a2, cudaMemcpyHostToDevice);
cudaMemcpy(growptr_d, growptr_, size_b2, cudaMemcpyHostToDevice);
cudaMemcpy(ndim_d, ndim_, size_c, cudaMemcpyHostToDevice);
cudaMemcpy(gx_d, gx_, size_b, cudaMemcpyHostToDevice);
cudaMemcpy(res_d, res_, size_b, cudaMemcpyHostToDevice);

// La organizacion del grid en terminos de los bloques y threads.
// Primera version muy simple para el caso de una matriz 10x10.

//if (BLOCK_SIZE/ndim_(1) >= 512){
// cout << “Quietor!!! Demasiado Thread para tan poco bloque …” << endl;
// system(“pause”);
//}

dim3 dimBlock(BLOCK_SIZE);
dim3 dimGrid(ndim_[1]/BLOCK_SIZE);

// Launch the device computation
Mult_d<<< dimGrid, dimBlock >>>(gval_d, gcolind_d, growptr_d, ndim_d, gx_d, res_d);

// Read result from the device
cudaMemcpy(res_, res_d, size_b, cudaMemcpyDeviceToHost);

// Free device memory
cudaFree(gval_d);
cudaFree(gcolind_d);
cudaFree(growptr_d);
cudaFree(ndim_d);

}

==

DEVICE CODE

==

// DEVICE CODE
// Sparse Matrix-Vector multiplication.

#ifndef MATVECMULT_KERNEL_H
#define MATVECMULT_KERNEL_H

#include <stdio.h>

#include "matvecmult.h"

// Sparse (CSR) matrix-vector multiplication kernel: res = A * x.
//
// Launch layout: 1-D grid, one thread per matrix row; expects
// gridDim.x * blockDim.x >= ndim_[0] (surplus tail threads exit early).
//
//   gval_    - nnzero stored non-zero values
//   gcolind_ - nnzero CSR column indices
//   growptr_ - CSR row pointer, ndim_[0]+1 entries
//   ndim_    - packed dimensions { nrow, ncol, nnzero }
//   gx_      - input vector x
//   res_     - output vector, one entry per row
__global__ void Mult_d(double* gval_, int* gcolind_, int* growptr_, int* ndim_, double* gx_, double* res_)
{
    // Global row index. The original used threadIdx.x alone, which makes
    // every block recompute the same rows; fold in blockIdx.x so multi-block
    // grids work. (Note: the qualifier is __global__ with double underscores;
    // a bare 'global' does not compile.)
    int row = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail when nrow is not a multiple of the block size.
    if (row >= ndim_[0]) return;

    double acc = 0.0;

    // Dot product of row 'row' with x over that row's stored entries.
    for (int k = growptr_[row]; k < growptr_[row + 1]; ++k)
        acc += gval_[k] * gx_[gcolind_[k]];

    res_[row] = acc;
}

// Close the include guard after the kernel: the original #endif sat before
// Mult_d, leaving the kernel unguarded and redefined on double inclusion.
#endif // MATVECMULT_KERNEL_H

Your code compiles fine for me. (I had to define BLOCK_SIZE since I don’t have matvecmult.h).

You are very probably using the wrong compiler. Perhaps try using one of the build rules that other people have built.