nvcc doesn't find default includes

I want to compile a simple CUDA program:

#include "stdafx.h"

#include <stdio.h>

#include <cuda.h>

#include <cutil.h>

// Kernel that executes on the CUDA device

/*
 * Updates a[] in place on the device.
 *
 * Every thread handles n_elem_per_thread = N / (gridDim.x * blockDim.x)
 * elements.  A block owns one contiguous range of the array; inside it, each
 * run of STRIDE consecutive threads shares a sub-range and every thread steps
 * through that sub-range STRIDE elements at a time, starting at its own
 * position (shifted by OFFSET) within the run.
 *
 * Threads whose GROUP_SIZE-sized group index within the block is even square
 * their elements; threads in an odd group double them.
 *
 * a : device pointer to at least N floats, modified in place
 * N : number of elements in a
 *
 * Only the .x components of the grid and block dimensions are used.  Because
 * the per-thread count is an integer division, elements left over when N is
 * not a multiple of gridDim.x * blockDim.x are not processed.
 */
__global__ void square_array(float *a, int N)
{
#define STRIDE       32
#define OFFSET        0
#define GROUP_SIZE  512

  int n_elem_per_thread = N / (gridDim.x * blockDim.x);

  // First element owned by this block.
  int block_start_idx = n_elem_per_thread * blockIdx.x * blockDim.x;

  // First element owned by this thread: start of the sub-range shared by its
  // run of STRIDE threads, plus the thread's (shifted) position in that run.
  int thread_start_idx = block_start_idx
			+ (threadIdx.x / STRIDE) * n_elem_per_thread * STRIDE
			+ ((threadIdx.x + OFFSET) % STRIDE);

  // One past the last element of this thread, clamped to the array size.
  int thread_end_idx = thread_start_idx + n_elem_per_thread * STRIDE;
  if(thread_end_idx > N) thread_end_idx = N;

  // 0 for even GROUP_SIZE-sized groups of threads in the block, 1 for odd ones.
  int group = (threadIdx.x / GROUP_SIZE) & 1;

  for(int idx=thread_start_idx; idx < thread_end_idx; idx+=STRIDE)
  {
    if(!group) a[idx] = a[idx] * a[idx];
    else       a[idx] = a[idx] + a[idx];
  }
}

// main routine that executes on the host

/*
 * Host driver: fills an array of 2**25 floats, copies it to the device, runs
 * square_array on it while timing the launch with a cutil timer, copies the
 * result back and prints a sample of the values together with the elapsed time.
 *
 * Returns 0 on success and 1 if the host allocation or any CUDA runtime call
 * fails.
 */
int main(void)
{
  // Report a failed CUDA runtime call and leave main with a non-zero status.
#define CHECK_CUDA(call) \
  do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
      fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__, \
              cudaGetErrorString(err_)); \
      return 1; \
    } \
  } while (0)

  float *a_h, *a_d;  // Pointers to host & device arrays
  const int N = 1<<25;  // Make a big array with 2**25 elements
  size_t size = N * sizeof(float);

  a_h = (float *)malloc(size);        // Allocate array on host
  if (a_h == NULL) {
    fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)size);
    return 1;
  }
  CHECK_CUDA(cudaMalloc((void **) &a_d, size));   // Allocate array on device

  // Initialize host array and copy it to CUDA device
  for (int i=0; i<N; i++) a_h[i] = (float)i;
  CHECK_CUDA(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice));

  // Create timer for timing CUDA calculation
  unsigned int timer = 0;
  cutCreateTimer( &timer );

  // Set number of threads and blocks
  int n_threads_per_block = 1<<9;  // 512 threads per block
  int n_blocks = 1<<10;  // 1024 blocks

  // Do calculation on device
  cutStartTimer( timer );  // Start timer
  square_array <<< n_blocks, n_threads_per_block >>> (a_d, N);
  CHECK_CUDA(cudaGetLastError());  // A bad launch configuration is reported here
  // Wait for square_array to finish on CUDA; errors raised while the kernel
  // runs surface from this call.  cudaThreadSynchronize() is kept because the
  // toolkit in use is release 3.2; later toolkits call it cudaDeviceSynchronize().
  CHECK_CUDA(cudaThreadSynchronize());
  cutStopTimer( timer );  // Stop timer

  // Retrieve result from device and store it in host array
  CHECK_CUDA(cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost));

  // Print some of the results and the CUDA execution time
  for (int i=0; i<N; i+=N/50) printf("%d %f\n", i, a_h[i]);
  printf("CUDA execution time = %f ms\n",cutGetTimerValue( timer ));

  // Cleanup
  cutDeleteTimer( timer );
  free(a_h);
  CHECK_CUDA(cudaFree(a_d));

#undef CHECK_CUDA
  return 0;
}

but I cannot compile it with nvcc:

$ nvcc example.cu

cc1plus: error: cuda_runtime.h: No such file or directory

example.cu:4:20: error: stdafx.h: No such file or directory

example.cu:7:18: error: cuda.h: No such file or directory

example.cu:8:19: error: cutil.h: No such file or directory

CUDA Driver and SDK (in /usr/cuda/) should be properly installed.

$ nvcc -V

nvcc: NVIDIA (R) Cuda compiler driver

Copyright (c) 2005-2010 NVIDIA Corporation

Built on Wed_Nov__3_16:16:57_PDT_2010

Cuda compilation tools, release 3.2, V0.2.1221

$PATH has /usr/cuda/bin

$LD_LIBRARY_PATH has /usr/cuda/lib

The system is Red Hat with a 2.6.32 kernel, NVIDIA driver 260.19.44, card: GeForce 8600 GT

Try running the compilation command with the --dry-run option. nvcc will dump out the commands in full, including the include and library search paths. You might be able to spot the problem from there. Also check that the include directories exist, that they are properly populated, and that you have read permission for them.