Global memory access problem

I allocated a global memory variable in a host function, and I changed this variable’s value in a device function.

But the value wasn’t changed when I copied it back to the host.

I don’t know the reason for this.


My System

OS: Fedora 8

Cuda version: 2.1

Device: Tesla C870


#include <stdio.h>

#include <stdlib.h>

/* Number of blocks in the grid (first launch-configuration argument). */
#define BLOCK_SIZE  4

/* Number of threads per block (second launch-configuration argument). */
#define THREAD_SIZE 10

/*
 * Total number of threads in the launch.  Parenthesized so the macro expands
 * as a single value inside larger expressions (e.g. `x / TOTAL_THREAD`).
 */
#define TOTAL_THREAD (BLOCK_SIZE * THREAD_SIZE)

/* Number of int elements in the debug buffer. */
#define DEBUG_SIZE 1000

/*
 * Writes, into every element of debug[0 .. DEBUG_SIZE), the flat global index
 * of the thread that owns that element.
 *
 * The index range is split into gridDim.x contiguous slices (one per block),
 * and each slice is split again into blockDim.x contiguous sub-ranges (one
 * per thread).  The split is derived from the actual launch configuration,
 * so the kernel covers exactly [0, DEBUG_SIZE) for any 1D grid of 1D blocks.
 *
 * debug: buffer the kernel writes; must be valid for DEBUG_SIZE ints.
 */
__global__ void test(int *debug)
{
	int i;
	int mi_start, mi_end;
	int mi_local_start, mi_local_end;
	int tid = blockIdx.x * blockDim.x + threadIdx.x;

	/* Slice [mi_start, mi_end) of the buffer owned by this block. */
	mi_start = (blockIdx.x * DEBUG_SIZE) / gridDim.x;
	mi_end   = ((blockIdx.x + 1) * DEBUG_SIZE) / gridDim.x;

	/* Explicitly pin the last block's slice to the end of the buffer. */
	if (blockIdx.x == gridDim.x - 1)
		mi_end = DEBUG_SIZE;

	/* Sub-range [mi_local_start, mi_local_end) owned by this thread. */
	mi_local_start = mi_start + (threadIdx.x * (mi_end - mi_start) / blockDim.x);
	mi_local_end   = mi_start + ((threadIdx.x + 1) * (mi_end - mi_start) / blockDim.x);

	for (i = mi_local_start; i < mi_local_end; i++)
	{
		debug[i] = tid;
	}
}

/*
 * Terminates the program with a diagnostic if a CUDA runtime call failed.
 *
 * status: value returned by the runtime call.
 * what:   short description of the step, printed in the error message.
 */
static void checkCuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess)
	{
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/*
 * Fills a host buffer with 1s, copies it to the device, runs the `test`
 * kernel over it, copies the result back and prints every element.
 *
 * Every CUDA call is checked: without the checks, a failed allocation, copy
 * or launch goes unnoticed and the host buffer simply keeps its initial 1s.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
 */
int main()
{
	int *debug, *g_debug;
	int i;
	const size_t bytes = sizeof(int) * DEBUG_SIZE;

	debug = (int *)malloc(bytes);
	if (debug == NULL)
	{
		fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)bytes);
		return EXIT_FAILURE;
	}

	for ( i = 0; i < DEBUG_SIZE; i++ )
		debug[i] = 1;

	checkCuda(cudaMalloc((void**)&g_debug, bytes), "cudaMalloc");
	checkCuda(cudaMemcpy(g_debug, debug, bytes, cudaMemcpyHostToDevice),
	          "host-to-device copy");

	test<<< BLOCK_SIZE, THREAD_SIZE >>>(g_debug);

	/* A launch returns no status itself; launch errors are read here. */
	checkCuda(cudaGetLastError(), "kernel launch");

	/* Errors raised while the kernel runs surface at the next synchronizing call. */
	checkCuda(cudaThreadSynchronize(), "kernel execution");

	checkCuda(cudaMemcpy(debug, g_debug, bytes, cudaMemcpyDeviceToHost),
	          "device-to-host copy");

	for ( i = 0; i < DEBUG_SIZE; i++ )
		printf("debug[%d] = %d\n", i, debug[i]);

	free(debug);
	checkCuda(cudaFree(g_debug), "cudaFree");

	return 0;
}