My cuda program gets different results when I run it many times

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <malloc.h>

#include <string>

#include <cutil.h>

#include <cutil_math.h>

#include <cutil_inline.h>

#include <cufft.h>

#include <Windows.h>

using namespace std;

#define BLOCK 16

#define PI (3.14159265f)

#define detector_xresolution 1320

#define detector_yresolution 1536

#define number_of_projections 50

//////////////////////////////////////////////////////////////////////////

// 函数声明

void bpf();

__global__ void get_derivative_CUDA(float *derivative_data, float *proj_data_0);

void load_projection_and_calc_derivative_TD_SGN(float **derivative_data);

int main()

{

	bpf();

	return 0;

}

void bpf()

{

	long start = GetTickCount();

	size_t proj_mem_size = sizeof(float) * detector_xresolution * detector_yresolution;

	//////////////////////////////////////////////////////////////////////////

	// 分配内存空间,存放投影数据的导函数

	float **derivative_data = (float **)malloc(sizeof(float *) * number_of_projections);

	if (!derivative_data)

	{

		printf("Error malloc data derivative_data!\n");

		exit(-1);

	}

	for(int i = 0; i < number_of_projections; i ++)

	{

		derivative_data[i] = (float *)malloc(proj_mem_size);

		if (!derivative_data[i])

		{

			printf("Error malloc data derivative_data[%d]!\n", i);

			exit(-1);

		}

	}

	//////////////////////////////////////////////////////////////////////////

	// è¯»å–æŠ•å½±æ•°æ®å¹¶æ±‚å¯¼åŠ æƒ

	load_projection_and_calc_derivative_TD_SGN(derivative_data);

	// 释放内存空间,投影数据的导数

	for(int i = 0; i < number_of_projections; i ++)

	{

		free(derivative_data[i]);

	}

	free(derivative_data);

	long stop = GetTickCount();

	printf("time used: %f s\n", (stop - start) / 1000.0f);

}

void load_projection_and_calc_derivative_TD_SGN(float **derivative_data)

{

	size_t proj_mem_size = sizeof(float) * detector_xresolution * detector_yresolution;

	dim3 bDim_proj(BLOCK, BLOCK);

	dim3 gDim_proj((detector_xresolution + BLOCK - 1) / BLOCK, (detector_yresolution + BLOCK - 1) / BLOCK);

	FILE *fp;

	char proj_name[100];

	float *proj_data = (float *)malloc(proj_mem_size);

	float *d_proj_data_0;

	float *d_derivative_data;

	CUDA_SAFE_CALL(cudaMalloc((void **)&d_proj_data_0, proj_mem_size));

	CUDA_SAFE_CALL(cudaMalloc((void **)&d_derivative_data, proj_mem_size));

	cudaThreadSynchronize();

	for(int dr = 0; dr < number_of_projections; dr ++)

	{

		sprintf(proj_name, "../proj_data/CT%4.4i.proj", dr);

		printf("%s\n", proj_name);

		fp = fopen(proj_name, "rb");

		if (!fp)

		{

			printf("error open file!\n");

			exit(-1);

		}

		fread(proj_data, 1, proj_mem_size, fp);

		fclose(fp);

		CUDA_SAFE_CALL(cudaMemcpy(d_proj_data_0, proj_data, proj_mem_size, cudaMemcpyHostToDevice));

		

		

		get_derivative_CUDA<<<gDim_proj, bDim_proj>>>(d_derivative_data, d_proj_data_0);

		cudaThreadSynchronize();

		CUDA_SAFE_CALL(cudaMemcpy(derivative_data[dr], d_derivative_data, proj_mem_size, cudaMemcpyDeviceToHost));

		cudaThreadSynchronize();

	}

	for(int i = 0; i < 50; i ++)

	{

		sprintf(proj_name, "%4.4i", i);

		fp = fopen(proj_name, "wb");

		fwrite(derivative_data[i], 1, proj_mem_size, fp);

		fclose(fp);

	}

	free(proj_data);

	cudaFree(d_proj_data_0);

	cudaFree(d_derivative_data);

}

__global__ void get_derivative_CUDA(float *derivative_data, float *proj_data_0)

{

	int w = blockIdx.x * blockDim.x + threadIdx.x;

	int h = blockIdx.y * blockDim.y + threadIdx.y;

	

	if (h < detector_yresolution || w < detector_xresolution)

	{

		if (h == 0 || h == detector_yresolution - 1 || w == 0 || w == detector_xresolution - 1)

			derivative_data[h * detector_xresolution + w] = 0.0f;

		else

		{

			derivative_data[h * detector_xresolution + w] = proj_data_0[h * detector_xresolution + w + 1] - proj_data_0[h * detector_xresolution + w - 1];

		}

	}

}

this is my program, just to calculate the derivative of the data.