about nvcc in command line

i have written a cuda program, if i compile it in vs IDE, then there is no problem and the result is right,

but if i compile it with nvcc command under commandline prompt the result is wrong,

what is the reason

here is my source code

[codebox]#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <cutil.h>

#include <cutil_math.h>

#include <cutil_inline.h>

#define BLOCK 256

#define SOD 1000

#define ODD 230

#define DETECTOR_CELLSIZE 0.254

#define DETECTOR_XRESOLUTION 960

#define DETECTOR_YRESOLUTION 768

#define PI 3.14159265

#define SAMPLES_PER_TURN 720

#define DLAMBDA (2*PI/SAMPLES_PER_TURN)

#define PHANTOM_RADIUS (DETECTOR_XRESOLUTIONDETECTOR_CELLSIZE/2SOD/sqrt((SOD+ODD)(SOD+ODD)+(DETECTOR_XRESOLUTIONDETECTOR_CELL

SIZE/2)(DETECTOR_XRESOLUTIONDETECTOR_CELLSIZE/2)))

#define CONEBEAM_PITCH (PHANTOM_RADIUS*6)

// 事实上我是转了两圈

#define DRNUMBER 1440

const int split_number = 10; //每个探测器单元细分的数量,用于正演

global void forwardprojection_CUDA(float *d_dr_data, int dr);

host device float mysquare(float x);

host device float get_length(float3 source_position, float3 direction, float3 half_axes);

noinline host device float3 trans_to_ellipsolid(float3 src, float3 angle);

device void one_address_fp(float *tmp, \

						   float3 ellipsoid_position, float3 half_axis, float3 rotate_angles, float density,\

						   float lambda, \

						   int w, int h,\

						   int thidy, 

						   int dr);

int main()

{

// 一幅DR的空间

size_t dr_mem_size = sizeof(float) * DETECTOR_XRESOLUTION * DETECTOR_YRESOLUTION;

// 为投影数据分配内存空间

float *dr_data = (float *)malloc(dr_mem_size);

if (dr_data == NULL)

{

	printf("分配内存出错!\n");

	exit(0);

}

// 为投影数据分配显存空间

float *d_dr_data;

CUDA_SAFE_CALL(cudaMalloc((void **)&d_dr_data, dr_mem_size));



// block size

dim3 bDim(1, BLOCK);

dim3 gDim(1, (DETECTOR_XRESOLUTION * DETECTOR_YRESOLUTION + BLOCK - 1) / BLOCK);

char dr_name[100];



FILE *fp; 

for(int dr = 150; dr < 159/*DRNUMBER*/; dr ++)

{

	forwardprojection_CUDA<<<gDim, bDim>>>(d_dr_data, dr);

	CUDA_SAFE_CALL(cudaMemcpy(dr_data, d_dr_data, dr_mem_size, cudaMemcpyDeviceToHost));

	

	sprintf(dr_name, "../proj_data/CT%4.4i.proj", dr);

	printf("%s\n", dr_name);

	fp = fopen(dr_name, "wb");

	if (!fp)

	{

		printf("ERROR Create FILE!\n");

		exit(0);

	}

	fwrite(dr_data, sizeof(float), DETECTOR_XRESOLUTION * DETECTOR_YRESOLUTION, fp);

	fclose(fp);

}



free(dr_data);

cudaFree(d_dr_data);

printf("Done!\n");

return 0;

}

host device float mysquare(float x)

{

return x*x;

}

host device float get_length(float3 source_position, float3 direction, float3 half_axes)

{

float coa = mysquare(direction.x)/mysquare(half_axes.x) +\

	mysquare(direction.y)/mysquare(half_axes.y) +\

	mysquare(direction.z)/mysquare(half_axes.z);

float cob = 2 * (direction.x * source_position.x) / mysquare(half_axes.x) +\

	2 * (direction.y * source_position.y) / mysquare(half_axes.y) +\

	2 * (direction.z * source_position.z) / mysquare(half_axes.z);

float coc =  mysquare(source_position.x)/mysquare(half_axes.x) +\

	mysquare(source_position.y)/mysquare(half_axes.y) +\

	mysquare(source_position.z)/mysquare(half_axes.z) - 1;

float delta = mysquare(cob) - 4 * coa * coc;

if (delta <= 0 )

	return 0;

else

{

	float t1 = (-cob + sqrt(delta)) / (2 * coa);

	float t2 = (-cob - sqrt(delta)) / (2 * coa);

	return sqrt(mysquare(direction.x) + mysquare(direction.y) + mysquare(direction.z)) * abs(t1 - t2);

}

}

noinline host device float3 trans_to_ellipsolid(float3 src, float3 angle)

{

float alpha = angle.x / 180.0 * PI;

float beta = angle.y / 180.0 * PI;

float gama = angle.z / 180.0 * PI;

float3 des;

float a11 = cos(beta)*cos(gama);

float a12 = cos(gama)*sin(alpha)*sin(beta)+cos(alpha)*sin(gama);

float a13 = -cos(alpha)*cos(gama)*sin(beta)+sin(alpha)*sin(gama);

float a21 = -cos(beta)*sin(gama);

float a22 = cos(alpha)*cos(gama)-sin(alpha)*sin(beta)*sin(gama);

float a23 = cos(gama)*sin(alpha)+cos(alpha)*sin(beta)*sin(gama);

float a31 = sin(beta);

float a32 = -cos(beta)*sin(alpha);

float a33 = cos(alpha)*cos(beta);

des.x = a11 * src.x + a12 * src.y + a13 * src.z;

des.y = a21 * src.x + a22 * src.y + a23 * src.z;

des.z = a31 * src.x + a32 * src.y + a33 * src.z;

return des;

}

global void forwardprojection_CUDA(float *d_dr_data, int dr)

{

__shared__ float tmp_shared_array[BLOCK];

tmp_shared_array[threadIdx.y] = 0;

__syncthreads();

int index = blockIdx.y * BLOCK + threadIdx.y;

int w = index % DETECTOR_XRESOLUTION;

int h = index / DETECTOR_XRESOLUTION;

float lambda = dr * DLAMBDA;

// phantom 参数

float density;

float3 ellipsoid_position;

float3 half_axis;

float3 rotate_angles;	

//////////////////////////////////////////////////////////////////////////

// 1

half_axis.x = 0.69 * PHANTOM_RADIUS;

half_axis.y = 0.92 * PHANTOM_RADIUS;

half_axis.z = 0.81 * PHANTOM_RADIUS;

ellipsoid_position.x = 0 * PHANTOM_RADIUS;

ellipsoid_position.y = 0 * PHANTOM_RADIUS;

ellipsoid_position.z = 0 * PHANTOM_RADIUS;

rotate_angles.x = 0;

rotate_angles.y = 0;

rotate_angles.z = 0;

density = 1.0;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);

__syncthreads();

//////////////////////////////////////////////////////////////////////////

// 2

half_axis.x = 0.6624 * PHANTOM_RADIUS;

half_axis.y = 0.874 * PHANTOM_RADIUS;

half_axis.z = 0.78 * PHANTOM_RADIUS ;

//ellipsoid_position.x = -0.0184 * PHANTOM_RADIUS;

ellipsoid_position.x = 0 * PHANTOM_RADIUS;

ellipsoid_position.y = 0 * PHANTOM_RADIUS;

ellipsoid_position.z = 0  * PHANTOM_RADIUS;

rotate_angles.x = 0;

rotate_angles.y = 0;

rotate_angles.z = 0;

density = -0.8;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);

__syncthreads();



//////////////////////////////////////////////////////////////////////////

// 3

half_axis.x = 0.11 * PHANTOM_RADIUS;

half_axis.y = 0.31 * PHANTOM_RADIUS;

half_axis.z = 0.22  * PHANTOM_RADIUS;

ellipsoid_position.x = 0.22 * PHANTOM_RADIUS;

ellipsoid_position.y = 0 * PHANTOM_RADIUS;

ellipsoid_position.z = 0  * PHANTOM_RADIUS;

rotate_angles.x = -18;

rotate_angles.y = 0;

rotate_angles.z = -10;

density = -0.2;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);

__syncthreads();



//////////////////////////////////////////////////////////////////////////

// 4

half_axis.x = 0.16 * PHANTOM_RADIUS;

half_axis.y = 0.41 * PHANTOM_RADIUS;

half_axis.z = 0.28  * PHANTOM_RADIUS;

ellipsoid_position.x = -0.22 * PHANTOM_RADIUS;

ellipsoid_position.y = 0 * PHANTOM_RADIUS;

ellipsoid_position.z = 0  * PHANTOM_RADIUS;

rotate_angles.x = 18;

rotate_angles.y = 0;

rotate_angles.z = 10;

density = -0.2;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);

__syncthreads();



//////////////////////////////////////////////////////////////////////////

// 5

half_axis.x = 0.21 * PHANTOM_RADIUS;

half_axis.y = 0.25 * PHANTOM_RADIUS;

half_axis.z = 0.41  * PHANTOM_RADIUS;

ellipsoid_position.x = 0 * PHANTOM_RADIUS;

ellipsoid_position.y = 0.35 * PHANTOM_RADIUS;

ellipsoid_position.z = -0.15  * PHANTOM_RADIUS;

rotate_angles.x = 0;

rotate_angles.y = 0;

rotate_angles.z = 0;

density = 0.1;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);

__syncthreads();

/* //////////////////////////////////////////////////////////////////////////

// 6

half_axis.x = 0.046 * PHANTOM_RADIUS;

half_axis.y = 0.046 * PHANTOM_RADIUS;

half_axis.z = 0.05  * PHANTOM_RADIUS;

ellipsoid_position.x = 0 * PHANTOM_RADIUS;

ellipsoid_position.y = 0.1 * PHANTOM_RADIUS;

ellipsoid_position.z = 0.25  * PHANTOM_RADIUS;

rotate_angles.x = 0;

rotate_angles.y = 0;

rotate_angles.z = 0;

density = 0.1;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);



//////////////////////////////////////////////////////////////////////////

// 7

half_axis.x = 0.046 * PHANTOM_RADIUS;

half_axis.y = 0.046 * PHANTOM_RADIUS;

half_axis.z = 0.05  * PHANTOM_RADIUS;

ellipsoid_position.x = 0 * PHANTOM_RADIUS;

ellipsoid_position.y = -0.1 * PHANTOM_RADIUS;

ellipsoid_position.z = 0.25  * PHANTOM_RADIUS;

rotate_angles.x = 0;

rotate_angles.y = 0;

rotate_angles.z = 0;

density = 0.1;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);

//////////////////////////////////////////////////////////////////////////

// 8

half_axis.x = 0.046 * PHANTOM_RADIUS;

half_axis.y = 0.023 * PHANTOM_RADIUS;

half_axis.z = 0.05  * PHANTOM_RADIUS;

ellipsoid_position.x = -0.08 * PHANTOM_RADIUS;

ellipsoid_position.y = -0.605 * PHANTOM_RADIUS;

ellipsoid_position.z = 0  * PHANTOM_RADIUS;

rotate_angles.x = 0;

rotate_angles.y = 0;

rotate_angles.z = 0;

density = 0.1;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);

//////////////////////////////////////////////////////////////////////////

// 9

half_axis.x = 0.023 * PHANTOM_RADIUS;

half_axis.y = 0.023 * PHANTOM_RADIUS;

half_axis.z = 0.02  * PHANTOM_RADIUS;

ellipsoid_position.x = 0 * PHANTOM_RADIUS;

ellipsoid_position.y = -0.606 * PHANTOM_RADIUS;

ellipsoid_position.z = 0  * PHANTOM_RADIUS;

rotate_angles.x = 0;

rotate_angles.y = 0;

rotate_angles.z = 0;

density = 0.1;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);

//////////////////////////////////////////////////////////////////////////

// 10

half_axis.x = 0.023 * PHANTOM_RADIUS;

half_axis.y = 0.046 * PHANTOM_RADIUS;

half_axis.z = 0.02  * PHANTOM_RADIUS;

ellipsoid_position.x = 0.06 * PHANTOM_RADIUS;

ellipsoid_position.y = -0.605 * PHANTOM_RADIUS;

ellipsoid_position.z = 0  * PHANTOM_RADIUS;

rotate_angles.x = 0;

rotate_angles.y = 0;

rotate_angles.z = 0;

density = 0.1;

one_address_fp(tmp_shared_array, ellipsoid_position, half_axis, rotate_angles, density, lambda, w, h, threadIdx.y, dr);

*/

__syncthreads();

if (h < DETECTOR_YRESOLUTION && w < DETECTOR_XRESOLUTION)

	d_dr_data[h * DETECTOR_XRESOLUTION + w] = tmp_shared_array[threadIdx.y] / float(split_number*split_number);

}

device void one_address_fp(float *tmp, \

						   float3 ellipsoid_position, float3 half_axis, float3 rotate_angles, float density,\

						   float lambda, \

						   int w, int h,\

						   int thidy, 

						   int dr)

{

float3 source_position_fix;

float3 source_position_ellipsolid;

float3 tmp_pt;

float3 dector_prosition_fix;

float3 dector_prosition_ellipsolid;

float3 direction;

source_position_fix.x = SOD * cos(lambda);

source_position_fix.y = SOD * sin(lambda);

source_position_fix.z = CONEBEAM_PITCH * lambda / (2 * PI);

if (dr >= DRNUMBER * 2 / 3)

	source_position_fix.z -= CONEBEAM_PITCH * 5 / 3;

else if(dr >= DRNUMBER / 3)

	source_position_fix.z -= CONEBEAM_PITCH;

else

	source_position_fix.z -= CONEBEAM_PITCH / 3;

// 平移

source_position_fix.x -= ellipsoid_position.x;

source_position_fix.y -= ellipsoid_position.y;

source_position_fix.z -= ellipsoid_position.z;

// 旋转

source_position_ellipsolid = trans_to_ellipsolid(source_position_fix, rotate_angles);

for(int t = 0; t < split_number; t ++)

{

	for(int s = 0; s < split_number; s ++)

	{	

		tmp_pt.x = -ODD;

		tmp_pt.y = - DETECTOR_XRESOLUTION / 2.0 * DETECTOR_CELLSIZE + w * DETECTOR_CELLSIZE;

		tmp_pt.z = DETECTOR_YRESOLUTION / 2.0 * DETECTOR_CELLSIZE - h * DETECTOR_CELLSIZE;

		tmp_pt.y = tmp_pt.y + DETECTOR_CELLSIZE / float(split_number) / 2 + DETECTOR_CELLSIZE / float(split_number) * s;

		tmp_pt.z = tmp_pt.z - DETECTOR_CELLSIZE / float(split_number) / 2 - DETECTOR_CELLSIZE / float(split_number) * t;

		dector_prosition_fix.x = tmp_pt.x * cos(lambda) - tmp_pt.y * sin(lambda);

		dector_prosition_fix.y = tmp_pt.x * sin(lambda) + tmp_pt.y * cos(lambda);

		dector_prosition_fix.z = tmp_pt.z + CONEBEAM_PITCH * lambda / (2 * PI);	

		if (dr >= DRNUMBER * 2 / 3)

			dector_prosition_fix.z -= CONEBEAM_PITCH * 5 / 3;

		else if(dr >= DRNUMBER / 3)

			dector_prosition_fix.z -= CONEBEAM_PITCH;

		else

			dector_prosition_fix.z -= CONEBEAM_PITCH / 3;

		dector_prosition_fix.x -= ellipsoid_position.x;

		dector_prosition_fix.y -= ellipsoid_position.y;

		dector_prosition_fix.z -= ellipsoid_position.z;

		dector_prosition_ellipsolid = trans_to_ellipsolid(dector_prosition_fix, rotate_angles);

		direction.x = source_position_ellipsolid.x - dector_prosition_ellipsolid.x;

		direction.y = source_position_ellipsolid.y - dector_prosition_ellipsolid.y;

		direction.z = source_position_ellipsolid.z - dector_prosition_ellipsolid.z;

		float p = density * get_length(source_position_ellipsolid, direction, half_axis);

		tmp[thidy] += p;

	}

}

}[/codebox]

the command i used is " nvcc -arch=sm_13 forward_projection.cu"