Data lost in device memory

I copy host memory to device memory, and in the kernel function each thread writes some data into the device memory.
After the kernel function has finished, I copy the device memory back to host memory
and print out the host memory; then I find that some of the data is missing.

The code is below.
Can you help me solve this problem? It has perplexed me for days.

// Pay attention to the string "= these characters are losing" in the function Test_kernel(char* logInfo).

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>
#define Log 1
#define THREAD_SIZE 1024
#define THREAD_NUM 4

/************************************************************************/
/* Init CUDA                                                            */
/************************************************************************/
#if DEVICE_EMULATION

// Device-emulation build: no device is probed, so initialization always
// reports success.
bool InitCUDA(void)
{
	return true;
}

#else
/**
 * Picks the first device whose properties report a major compute capability
 * of at least 1 and selects it with cudaSetDevice().
 *
 * @return true once a device has been selected; false (after printing a
 *         message to stderr) when the device count cannot be queried, no
 *         device is present, no device qualifies, or selecting it fails.
 */
bool InitCUDA(void)
{
	int count = 0;
	int i = 0;

	// A failed query must not be mistaken for "zero devices" or, worse, success.
	cudaError_t err = cudaGetDeviceCount(&count);
	if(err != cudaSuccess) {
		fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
		return false;
	}
	if(count == 0) {
		fprintf(stderr, "There is no device.\n");
		return false;
	}

	// Stop at the first device whose properties can be read and that qualifies.
	for(i = 0; i < count; i++) {
		cudaDeviceProp prop;
		if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
			if(prop.major >= 1) {
				break;
			}
		}
	}
	// i == count means the loop ran to completion without finding a device.
	if(i == count) {
		fprintf(stderr, "There is no device supporting CUDA.\n");
		return false;
	}

	err = cudaSetDevice(i);
	if(err != cudaSuccess) {
		fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", i, cudaGetErrorString(err));
		return false;
	}

	printf("CUDA initialized.\n");
	return true;
}

#endif

/**
 * Writes the decimal text of v into s and returns the number of characters
 * written. No terminating NUL is stored, so the caller can keep appending at
 * s + (return value).
 *
 * Callable from both host and device code.
 *
 * @param s  destination; must have room for up to 11 characters
 *           (a '-' sign plus 10 digits)
 * @param v  value to format
 * @return   number of characters stored in s
 */
__host__ __device__ int sprintf_int(char *s, int v) {
	int len = 0;
	if (v == 0) {
		s[0] = '0';
		return 1;
	}

	// Keep the magnitude in an unsigned variable: negating the most negative
	// int directly would overflow, while unsigned negation is well defined.
	unsigned int mag = (unsigned int)v;
	if (v < 0) {
		s[len++] = '-';
		mag = 0u - mag;
	}

	// Find the highest power of ten that does not exceed the magnitude...
	unsigned int base;
	for (base = 1000000000u; base > 0; base /= 10) {
		if (mag >= base) break;
	}
	// ...then emit one digit per power of ten, most significant first.
	while (base > 0) {
		s[len++] = (char)('0' + (mag / base) % 10);
		base /= 10;
	}
	return len;
}
/**
 * Copies the characters of s2, up to but not including its terminating NUL,
 * into s and returns how many were copied. s is NOT NUL-terminated, so the
 * caller can keep appending at s + (return value).
 *
 * Callable from both host and device code. The source is const so that string
 * literals can be passed without a deprecated literal-to-char* conversion.
 *
 * @param s   destination; must have room for every character of s2
 * @param s2  NUL-terminated source string
 * @return    number of characters copied
 */
__host__ __device__ int sprintf_str(char *s, const char *s2) {
	int i = 0;
	while (s2[i]) {
		s[i] = s2[i];
		++i;
	}
	return i;
}

/**
 * Logging test kernel: every thread formats a short text record into its own
 * THREAD_SIZE-byte slot of logInfo.
 *
 * Launch layout: the slot is chosen from threadIdx.x alone, so the kernel is
 * meant for a single block, and logInfo must hold at least
 * blockDim.x * THREAD_SIZE bytes.
 *
 * The record is not NUL-terminated; bytes the thread does not write keep
 * whatever value they had before the launch.
 *
 * @param logInfo  device buffer holding one slot per thread
 */
__global__ void Test_kernel(char* logInfo)
{
	if(!Log) return;

	int threadId = threadIdx.x;
	char* pLog = logInfo + threadId*THREAD_SIZE;
	int oI = 0;   // number of bytes already written to this thread's slot

	// Each append is skipped once fewer than kHeadroom bytes remain. The
	// headroom must cover the longest single piece appended below (35
	// characters), otherwise an append could run past the end of the slot.
	const int kHeadroom = 40;
	const int limit = THREAD_SIZE - kHeadroom;

	if(oI<limit) oI+=sprintf_str(pLog+oI,"=============threadid= ");
	if(oI<limit) oI+=sprintf_int(pLog+oI,threadId);
	if(oI<limit) oI+=sprintf_str(pLog+oI,"= these characters are losing\n");

	if(oI<limit) oI+=sprintf_str(pLog+oI,"123456\n");
	if(oI<limit) oI+=sprintf_str(pLog+oI,"123456\n");

	if(oI<limit) oI+=sprintf_str(pLog+oI,"************** end thread logsize= ");
	if(oI<limit) oI+=sprintf_int(pLog+oI,oI);
	if(oI<limit) oI+=sprintf_str(pLog+oI,"************\n");
}

/**
 * Host-side driver for Test_kernel: copies the host log buffer to the device,
 * runs the kernel with one block of THREAD_NUM threads, and copies the buffer
 * back into h_logInfo.
 *
 * Does nothing when logging is compiled out (Log == 0).
 *
 * @param h_logInfo  host buffer, read and then overwritten with the result
 * @param logSize    size of h_logInfo in bytes; must be at least
 *                   THREAD_SIZE * THREAD_NUM because the kernel writes one
 *                   THREAD_SIZE-byte slot per thread
 */
extern "C"
void TestCuda(char* h_logInfo,
              long logSize)
{
	if(!Log) return;

	// Refuse buffers the kernel would overrun instead of corrupting device memory.
	const long required = (long)THREAD_SIZE * THREAD_NUM;
	if(h_logInfo == NULL || logSize < required) {
		fprintf(stderr, "TestCuda: log buffer is missing or smaller than %ld bytes.\n", required);
		return;
	}

	char *d_logInfo = NULL;

	CUDA_SAFE_CALL(cudaMalloc((void**)&d_logInfo,logSize));
	CUDA_SAFE_CALL(cudaMemcpy(d_logInfo,h_logInfo,logSize,cudaMemcpyHostToDevice));

	Test_kernel<<<1,THREAD_NUM>>>(d_logInfo);
	// A kernel launch returns no status itself; launch-configuration errors
	// are only visible through cudaGetLastError().
	CUDA_SAFE_CALL(cudaGetLastError());

	// The blocking copy back also surfaces errors raised while the kernel ran.
	CUDA_SAFE_CALL(cudaMemcpy(h_logInfo,d_logInfo,logSize,cudaMemcpyDeviceToHost));
	CUDA_SAFE_CALL(cudaFree(d_logInfo));
}

/**
 * Removes every occurrence of delChar from the NUL-terminated string str,
 * in place, preserving the order of the remaining characters.
 *
 * @param str      NUL-terminated string to filter; a NULL pointer is returned
 *                 unchanged
 * @param delChar  character to delete
 * @return         str
 */
char* compress(char* str,char delChar)
{
	if(str == NULL)
		return str;

	// Single pass with a read cursor and a write cursor. The write cursor
	// never overtakes the read cursor, so filtering in place is safe, and an
	// empty string is handled without stepping before the start of the buffer.
	char* out = str;
	for(const char* in = str; *in; ++in)
	{
		if(*in != delChar)
			*out++ = *in;
	}
	*out = 0;
	return str;
}

/************************************************************************/
/* HelloCUDA                                                            */
/************************************************************************/
/**
 * Program entry point: initializes CUDA, fills a host log buffer with '@'
 * filler, lets TestCuda() run the logging kernel over it, then strips the
 * filler and prints whatever text the kernel wrote.
 *
 * @return 0 on success, EXIT_FAILURE when CUDA initialization or the host
 *         allocation fails
 */
int main(int argc, char** argv)
{
	if(!InitCUDA()) {
		return EXIT_FAILURE;
	}

	long logSize = sizeof(char) * THREAD_SIZE * THREAD_NUM;

	// One extra byte for a terminating NUL: compress() and printf() both scan
	// until they find one, and the '@' fill alone would let them run past the
	// end of the allocation.
	char* logInfo = (char*)malloc(logSize + 1);
	if(logInfo == NULL) {
		fprintf(stderr, "Failed to allocate %ld bytes for the log buffer.\n", logSize + 1);
		return EXIT_FAILURE;
	}
	memset(logInfo,'@',logSize);
	logInfo[logSize] = '\0';

	TestCuda(logInfo,logSize);

	// Print through "%s" so that log text is never interpreted as a format string.
	printf("%s", compress(logInfo,'@'));

	// Release the buffer before CUT_EXIT, in case that macro does not return.
	free(logInfo);
	CUT_EXIT(argc, argv);

	return 0;
}