"Call can not be configured" kernel error with CUDA 1.1 on VC 2005

Hello :) I have been trying CUDA for about a month, and

I’m working hard on a pi-computing program.

After debugging it a few times, I still can’t fix an error…

1>------ Build started: Project: win32-console-app-cuda-msvc2005, Configuration: Debug Win32 ------

1>Compiling...

1>PI.cu

1>"C:\CUDA\include\common_functions.h", line 55: warning: dllexport/dllimport

1>          conflict with "clock" (declared at line 176 of "C:\Program

1>          Files\Microsoft Visual Studio 8\VC\INCLUDE\time.h");

1>          dllimport/dllexport dropped

1>  extern __declspec(__host__) __declspec(__device__) clock_t clock(void);

1>                                                             ^

1>"PI.cu", line 39: error: call can not be configured

1>    _incircle<<<grid,threads>>>(dev_sum,log_r,i,j);

1>    ^

1>"PI.cu", line 102: error: identifier "atomicAdd" is undefined

1>    atomicAdd(&dev_sum[bidx_x+gdim_x*i],5);

1>    ^

1>2 errors detected in the compilation of "C:\DOCUME~1\bbs\LOCALS~1\Temp/tmpxft_00000c70_00000000-5.ii".

1>Build log was saved at "file://c:\Documents and Settings\circ\My Documents\Visual Studio 2005\Projects\PI\Debug\BuildLog.htm"

1>win32-console-app-cuda-msvc2005 - 2 error(s), 1 warning(s)

========== Build: 0 succeeded, 1 failed, 0 up-to-date, 0 skipped ==========

#include <stdio.h>

#include <stdlib.h>

#include <math.h>

#include "cuda_runtime.h"

#include "cutil.h"

#include "device_functions.h"

#include "sm_11_atomic_functions.h"

#define radium 100000000

// Host-side helper; defined elsewhere.
void cudaInit(void);

// Device kernel launched from main() with <<<grid,threads>>>.
// It must be declared __global__: nvcc rejects an execution configuration
// on a plain host function with "call can not be configured".
__global__ void _incircle(int *dev_sum,float log_r,int i,int j);

// Host-side post-processing of the result array; defined elsewhere.
int _CONFIG(int *sum,long long r);

/*
 * Program entry point.
 *
 * Allocates a 250000-int result array on host and device, launches the
 * _incircle kernel 50 x 200 times, copies the result back, post-processes
 * it with _CONFIG() and writes the selected entries to "pi_data.txt".
 *
 * Returns 0 on success and 1 if a host allocation, a kernel launch or
 * opening the output file fails.
 */
int main() {
    // Initialize the CUDA device and display device properties.
    cudaInit();

    printf("Start Calculating PI~\n");

    // Host copy of the result array.
    const size_t data_size = sizeof(int) * 250000;
    int *sum = (int*)malloc(data_size);
    if (sum == NULL) {
        fprintf(stderr, "Failed to allocate %lu bytes of host memory\n",
                (unsigned long)data_size);
        return 1;
    }

    // Device copy of the result array.
    int *dev_sum = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void**)&dev_sum, data_size));
    // cudaMemset fills bytes: 0xFF in every byte makes each int read as -1.
    CUDA_SAFE_CALL(cudaMemset((void*)dev_sum, -1, data_size));

    // Launch configuration: 32*16 = 512 threads per block.
    dim3 grid(62500, 31250, 1);
    dim3 threads(32, 16, 1);

    float log_r = logf((float)radium);

    // NOTE(review): the _incircle kernel posted with this program writes
    // dev_sum[blockIdx.x + gridDim.x*i]; with gridDim.x = 62500 and i up to 49
    // that index reaches 3124999, well past the 250000 ints allocated above.
    // Confirm the intended indexing / buffer size before trusting the output.
    for (int i = 0; i < 50; i++) {
        for (int j = 0; j < 200; j++) {
            _incircle<<<grid, threads>>>(dev_sum, log_r, i, j);

            // A kernel launch returns nothing; configuration errors are only
            // visible through cudaGetLastError().
            cudaError_t launch_err = cudaGetLastError();
            if (launch_err != cudaSuccess) {
                fprintf(stderr, "_incircle launch failed (i=%d, j=%d): %s\n",
                        i, j, cudaGetErrorString(launch_err));
                cudaFree(dev_sum);
                free(sum);
                return 1;
            }
        }
    }

    // Blocking copy: waits for the queued kernels to finish.
    CUDA_SAFE_CALL(cudaMemcpy(sum, dev_sum, data_size, cudaMemcpyDeviceToHost));

    // Configure the value of array.
    int size = _CONFIG(sum, radium);

    // Write the result into "pi_data.txt". The file is opened as a plain
    // narrow text stream: the previous mode string "w,ccs=<UNICODE>" copied the
    // documentation placeholder literally, and the data is written with the
    // narrow fprintf_s anyway.
    FILE *PI = NULL;
    if (fopen_s(&PI, "pi_data.txt", "w") != 0 || PI == NULL) {
        fprintf(stderr, "Failed to open pi_data.txt for writing\n");
        cudaFree(dev_sum);
        free(sum);
        return 1;
    }

    // NOTE(review): the start index depends on the value returned by
    // _CONFIG(); presumably it is <= 0 so the index stays in range - confirm.
    for (int i = 249999 + size; i < 250000; i++) {
        fprintf_s(PI, "%d", sum[i]);
    }
    fclose(PI);

    printf("Calculating succeeded --> pi_data.txt\n");

    cudaFree(dev_sum);
    free(sum);

    system("pause");
    return 0;
}

The block and grid sizes look valid to me,

and so do the arrays in shared memory,

but I just can’t figure it out…

The other error might come from the nvcc command line:

nvcc.exe -ccbin "C:\Program Files\Microsoft Visual Studio 8\VC\bin" -c -DWIN32 -D_DEBUG -D_CONSOLE -Xcompiler "/EHsc /W3 /nologo /Wp64 /Od /Zi /MDd /GR" -I"C:\CUDA\include" -o Debug\win32-console-app-cuda-msvc2005.obj win32-console-app-cuda-msvc2005.vcproj

Where is the definition of _incircle?
Where is the include file for atomicAdd?

Check it again :)

Thank you~ I found that if I separate _incircle.cu from the .cpp code

and compile _incircle.cu from the nvcc command line before building with VC++ 2005, it works fine!

But all I can do so far is compile it…

My program stops at the kernel function.

I have to complete it in two days~ :'(

 

#include <stdio.h>

#include "cuda_runtime.h"

#include "sm_11_atomic_functions.h"

/*
 * _incircle: per-thread test of log(x) + log(y) <= log_r, i.e. x*y <= r
 * evaluated in the log domain with the fast __logf intrinsic.
 *
 *   x = gridDim.x*i + blockIdx.x*blockDim.x + threadIdx.x
 *   y = gridDim.y*j + blockIdx.y*blockDim.y + threadIdx.y
 *
 * Every thread that passes the test atomically adds 5 to
 * dev_sum[blockIdx.x + gridDim.x*i].
 *
 * Parameters:
 *   dev_sum - device array of int accumulators.
 *   log_r   - threshold, already in the log domain.
 *   i, j    - outer-loop offsets supplied by the host for this launch.
 *
 * Requirements:
 *   - atomicAdd on global memory needs compute capability 1.1 or higher;
 *     build with nvcc -arch=sm_11 (or newer), otherwise the identifier is
 *     undefined at compile time.
 *   - NOTE(review): dev_sum must hold at least gridDim.x*(i+1) elements for
 *     the largest i used by the caller; the kernel cannot check this itself.
 *
 * The values are kept in registers. The earlier version staged them in
 * __shared__ X[32] / Y[16], which hard-wired a 32x16 block shape (any larger
 * block overran the arrays) although each thread only read back the value it
 * could compute on its own; no shared memory or barrier is needed.
 */
__global__ void _incircle(int *dev_sum,float log_r,int i,int j){
    const float log_x = __logf((float)(gridDim.x*i + blockIdx.x*blockDim.x + threadIdx.x));
    const float log_y = __logf((float)(gridDim.y*j + blockIdx.y*blockDim.y + threadIdx.y));

    if (log_x + log_y <= log_r)
        atomicAdd(&dev_sum[blockIdx.x + gridDim.x*i], 5);
}

// C-linkage entry point so that the separately compiled host (.cpp) code can
// call into this .cu translation unit without C++ name mangling.
extern "C" void kernel(int *dev_sum,float log_r,dim3 grid,dim3 threads);

/*
 * Launches _incircle once for every (i, j) pair with i in [0, 50) and
 * j in [0, 200), using the caller-supplied launch configuration.
 *
 * Parameters:
 *   dev_sum - device array passed through to the kernel.
 *   log_r   - threshold passed through to the kernel.
 *   grid    - grid dimensions for every launch.
 *   threads - block dimensions for every launch.
 *
 * Launches are asynchronous and return no status, so cudaGetLastError() is
 * queried after each one. On the first failure the error is reported on
 * stderr and the remaining launches are skipped. Errors raised while a kernel
 * executes only show up at the caller's next synchronizing CUDA call.
 */
void kernel(int *dev_sum,float log_r,dim3 grid,dim3 threads) {
    for (int i = 0; i < 50; i++) {
        for (int j = 0; j < 200; j++) {
            _incircle<<<grid,threads>>>(dev_sum,log_r,i,j);

            cudaError_t launch_err = cudaGetLastError();
            if (launch_err != cudaSuccess) {
                fprintf(stderr, "_incircle launch failed (i=%d, j=%d): %s\n",
                        i, j, cudaGetErrorString(launch_err));
                return;
            }
        }
    }
}