invalid device function

In the following program I am getting the error "invalid device function". I am compiling it with `nvcc -arch sm_13` and still get the error, even though I think I am doing everything right.

#include<stdio.h>
#define NOE 4
#define BLOCKSIZE 512
#include<cuda.h>
/*
 * Abort the program if the CUDA runtime has a pending error.
 *
 * msg: short label for the call being checked; it is printed in front of
 *      the runtime's own error description.
 *
 * Reads the error state with cudaGetLastError(). When that is anything other
 * than cudaSuccess, the label and cudaGetErrorString() text are written to
 * stderr and the process exits with EXIT_FAILURE. Otherwise the function
 * returns without doing anything.
 */
void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        /* Plain ASCII double quotes: typographic quotes are not a valid
           string-literal delimiter and stop the file from compiling. */
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/*
 * Count, per thread block, how many entries of arr equal each of the NOE
 * search values.
 *
 * arr:      input array of num ints (device-accessible).
 * num:      number of valid entries in arr.
 * elements: the NOE values to look for (device-accessible).
 * count:    output, NOE ints per block; slot [blockIdx.x * NOE + k] receives
 *           the number of matches for elements[k] found by that block.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per input entry.
 * blockDim.x must be at least NOE, because the first NOE threads of each
 * block stage the search values and write the block's results.
 * Shared memory: 2 * NOE ints, statically allocated.
 * Each entry is credited to the first matching search value only (the loop
 * breaks on the first hit).
 * atomicAdd on a __shared__ int needs compute capability 1.2 or higher.
 */
__global__ void bsearching(int *arr, int num, int *elements, int *count)
{
    __shared__ int selements[NOE];
    __shared__ int scount[NOE];
    int i, j;

    i = blockIdx.x * blockDim.x + threadIdx.x;

    /* The first NOE threads load the search values and zero the counters. */
    if (threadIdx.x < NOE) {
        selements[threadIdx.x] = elements[threadIdx.x];
        scount[threadIdx.x] = 0;
    }
    __syncthreads();

    /* The grid is rounded up to whole blocks, so threads past the end of the
       input must not read arr. The barriers stay outside this guard so every
       thread of the block still reaches them. */
    if (i < num) {
        for (j = 0; j < NOE; j++) {
            if (arr[i] == selements[j]) {
                atomicAdd(&scount[j], 1);
                break;
            }
        }
    }
    __syncthreads();

    /* Publish this block's partial counts; the host sums them across blocks. */
    if (threadIdx.x < NOE)
        count[blockIdx.x * NOE + threadIdx.x] = scount[threadIdx.x];
}

/*
 * Host driver: reads an element count, fills a mapped pinned host array with
 * rand() values, picks NOE rand() search values, launches bsearching with
 * BLOCKSIZE threads per block, then sums the per-block counts and prints them.
 *
 * All three buffers are allocated with cudaHostAlloc(cudaHostAllocMapped) and
 * handed to the kernel through the pointer cudaHostGetDevicePointer returns,
 * so no explicit cudaMemcpy is used.
 *
 * Returns 0 on success, EXIT_FAILURE on invalid input. CUDA errors terminate
 * the process inside checkCUDAError.
 */
int main()
{
    int *a_h, *a_d;            // host and device pointers for the source array
    int *count, *dcount;       // host and device pointers for the per-block counts
    int *elements, *delements; // host and device pointers for the search values
    int i, j;
    int num;
    size_t size;
    int fcount[NOE] = {0};

    printf("\nEnter the number of elements");
    if (scanf("%d", &num) != 1 || num <= 0) {
        fprintf(stderr, "Invalid number of elements.\n");
        return EXIT_FAILURE;
    }
    size = (size_t)num * sizeof(int);
    printf("\n Random function gives input to the array\n");

    /* Must be set before the allocations below so they can be mapped. */
    cudaSetDeviceFlags(cudaDeviceMapHost);
    checkCUDAError("cudaSetDeviceFlags");

    cudaHostAlloc((void **)&a_h, size, cudaHostAllocMapped);
    checkCUDAError("cudaHostAllocMapped");
    cudaHostGetDevicePointer((void **)&a_d, (void *)a_h, 0);
    checkCUDAError("cudaHostGetDevicePointer");

    /* Fill every slot inside the loop; a store placed after the loop would
       land on a_h[num], one element past the allocation. */
    for (j = 0; j < num; j++)
        a_h[j] = rand();

    /* One thread per element, rounded up to whole blocks. */
    int noofblocks = num / BLOCKSIZE + ((num % BLOCKSIZE == 0) ? 0 : 1);
    size_t size2 = (size_t)noofblocks * NOE * sizeof(int);

    cudaHostAlloc((void **)&count, size2, cudaHostAllocMapped);
    checkCUDAError("cudaHostAllocMapped");
    /* The device alias must be derived from count itself, not from a_h. */
    cudaHostGetDevicePointer((void **)&dcount, (void *)count, 0);
    checkCUDAError("cudaHostGetDevicePointer");

    cudaHostAlloc((void **)&elements, NOE * sizeof(int), cudaHostAllocMapped);
    checkCUDAError("cudaHostAllocMapped");
    /* Likewise, alias elements rather than a_h. */
    cudaHostGetDevicePointer((void **)&delements, (void *)elements, 0);
    checkCUDAError("cudaHostGetDevicePointer");
    for (i = 0; i < NOE; i++)
        elements[i] = rand();

    bsearching<<<noofblocks, BLOCKSIZE>>>(a_d, num, delements, dcount);
    /* Wait for the kernel so its writes to count are complete before the host
       reads them, then pick up any launch or execution error. */
    cudaThreadSynchronize();
    checkCUDAError("kernel failure");

    printf("\n The elements are present following no of times \n");
    for (j = 0; j < NOE; j++) {
        for (i = 0; i < noofblocks; i++)
            fcount[j] += count[i * NOE + j];
        printf(" count= %d -->number = %d", j, fcount[j]);
    }

    /* Release the pinned host buffers. */
    cudaFreeHost(elements);
    cudaFreeHost(count);
    cudaFreeHost(a_h);
    return 0;
}

I have searched the NVIDIA forums, but the posts related to this error are for Windows only; I am using Ubuntu 8.10.
Thanks in advance.

Try -arch=sm_13

N.

Did you make sure your device supports sm13?