[Solved] Error when specifying macro definitions for use during preprocessing or compilation.

Does anyone know how to solve this problem?
If I remove “-DN=1024”, it compiles correctly.
Compiling error:

/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda.h(6121): error: expected a ")"

/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda.h(6155): error: expected a ")"

/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda.h(6189): error: expected a ")"

/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda.h(6344): error: expected a ")"

/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda.h(6380): error: expected a ")"

/usr/local/cuda/bin/../targets/x86_64-linux/include/cuda.h(6415): error: expected a ")"

6 errors detected in the compilation of "/tmp/tmpxft_00001279_00000000-9_vd_tex.cpp1.ii".

Makefile:

vd_tex:vd_tex.cu
        nvcc -DN=1024 -o vd_tex vd_tex.cu

vd_tex.cu

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#ifndef N
    #define N (2048)
#endif

texture<float> tex_a;
texture<float> tex_b;
texture<float> tex_c;

// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(float *c)
{
    // Get our global thread ID
    int id = blockIdx.x*blockDim.x+threadIdx.x;
    // Make sure we do not go out of bounds
   // if (id < N) {
		
		c[id] = tex1Dfetch(tex_a,id) + tex1Dfetch(tex_b,id);
//	}
        
}

int main( int argc, char* argv[] )
{
    // Size of vectors
    //int n = 10000;
	
    // Host input vectors
    float *h_a;
    float *h_b;
    //Host output vector
    float *h_c;
	
    // Device input vectors
    float *d_a;
    float *d_b;
    //Device output vector
    float *d_c;
	
    // Size, in bytes, of each vector
    size_t bytes = N*sizeof(float);
	
    // Allocate memory for each vector on host
    h_a = (float*)malloc(bytes);
    h_b = (float*)malloc(bytes);
    h_c = (float*)malloc(bytes);
	// Allocate memory for each vector on GPU
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);
	
    int i;
    // Initialize vectors on host
    for( i = 0; i < N; i++ ) {
        h_a[i] = sin(i)*sin(i);
        h_b[i] = cos(i)*cos(i);
		//h_c[i] = 0.0f;
    }
	// bind to texture memory
	cudaBindTexture( NULL, tex_a,
					 d_a,
					 bytes );
	cudaBindTexture( NULL, tex_b,
					 d_b,
					 bytes );
	cudaBindTexture( NULL, tex_c,
					 d_c,
					 bytes );
    // Copy host vectors to device
    cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);
	
	
    int blockSize, gridSize;
	
    // Number of threads in each thread block
    blockSize = 1024;
	
    // Number of thread blocks in grid
    gridSize = (int)ceil((float)N/blockSize);
	
    // Execute the kernel
    vecAdd<<<gridSize, blockSize>>>(d_c);
	
    // Copy array back to host
    cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );
	
    // Sum up vector c and print result divided by n, this should equal 1 within error
    float sum = 0;
    for(i=0; i<N; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum/N);
	
    // Release device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
	
    // Release host memory
    free(h_a);
    free(h_b);
    free(h_c);
	return 0;
}

For codes that use the runtime API, it’s not necessary to include cuda.h like this:

#include <cuda.h>

simply delete that line and the problem should go away. If you don’t wish to make that change:

The -DN=1024 switch is clashing with various lines in cuda.h which have a function prototype that uses a variable named N, such as this one:

CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N);
                                                                            ^

Either don’t use this particular compile switch, or change your N variable to something else, like NN, which doesn’t appear in any CUDA headers, and use -DNN=1024 instead.

$ cat t691.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#ifndef NN
    #define NN (2048)
#endif

texture<float> tex_a;
texture<float> tex_b;
texture<float> tex_c;

// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(float *c)
{
    // Get our global thread ID
    int id = blockIdx.x*blockDim.x+threadIdx.x;
    // Make sure we do not go out of bounds
   // if (id < NN) {

                c[id] = tex1Dfetch(tex_a,id) + tex1Dfetch(tex_b,id);
//      }

}

int main( int argc, char* argv[] )
{
    // Size of vectors
    //int n = 10000;

    // Host input vectors
    float *h_a;
    float *h_b;
    //Host output vector
    float *h_c;

    // Device input vectors
    float *d_a;
    float *d_b;
    //Device output vector
    float *d_c;

    // Size, in bytes, of each vector
    size_t bytes = NN*sizeof(float);

    // Allocate memory for each vector on host
    h_a = (float*)malloc(bytes);
    h_b = (float*)malloc(bytes);
    h_c = (float*)malloc(bytes);
        // Allocate memory for each vector on GPU
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);

    int i;
    // Initialize vectors on host
    for( i = 0; i < NN; i++ ) {
        h_a[i] = sin(i)*sin(i);
        h_b[i] = cos(i)*cos(i);
                //h_c[i] = 0.0f;
    }
        // bind to texture memory
        cudaBindTexture( NULL, tex_a,
                                         d_a,
                                         bytes );
        cudaBindTexture( NULL, tex_b,
                                         d_b,
                                         bytes );
        cudaBindTexture( NULL, tex_c,
                                         d_c,
                                         bytes );
    // Copy host vectors to device
    cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);

int blockSize, gridSize;

    // Number of threads in each thread block
    blockSize = 1024;

    // Number of thread blocks in grid
    gridSize = (int)ceil((float)NN/blockSize);

    // Execute the kernel
    vecAdd<<<gridSize, blockSize>>>(d_c);

    // Copy array back to host
    cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );

    // Sum up vector c and print result divided by n, this should equal 1 within error
    float sum = 0;
    for(i=0; i<NN; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum/NN);

    // Release device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Release host memory
    free(h_a);
    free(h_b);
    free(h_c);
        return 0;
}
$ nvcc -DNN=1024 -o t691 t691.cu
$

Thank you.
Your answer solved my problem.