Hi all,
I am doing a lot of malloc/memcpy in my program. I’ve checked my code and there shouldn’t be anything wrong. Most of the Memcpy2D’s are fine, however, there are two which reported errors. The only thing different about the host arrays which have the reported errors are that I had to convert them from 24 bits word to 32 bits words. But then I am memcpy’ing 4 such arrays and only 2 reported memory errors.
I am only seeing this in valgrind running deviceemu. On release mode, all my memcpy2D gives invalid reads error. The program seems to be running fine, but I haven’t been able to verify my outputs yet.
Maybe this isn’t something that would come back and haunt me later on??
EDIT: Reproducible case in reply.
Valgrind
copy size onto device
size of pitch: 4
size of cpitch: 0
copy refid
copy refloc
copy ilist
copy crefid
copy crefloc
copy cilist
malloc binayseqs
copy binayseqs
==29800==
==29800== Invalid read of size 1
==29800== at 0x4C27200: memcpy (mc_replace_strmem.c:402)
==29800== Address 0x6c98cd7 is 1 bytes before a block of size 40 alloc'd
==29800== at 0x4C265AE: malloc (vg_replace_malloc.c:207)
==29800==
==29800== Invalid read of size 1
==29800== at 0x4C27208: memcpy (mc_replace_strmem.c:402)
==29800== Address 0x6c98cd6 is 2 bytes before a block of size 40 alloc'd
==29800== at 0x4C265AE: malloc (vg_replace_malloc.c:207)
==29800==
==29800== Invalid read of size 1
==29800== at 0x4C27212: memcpy (mc_replace_strmem.c:402)
==29800== Address 0x6c98cd5 is 3 bytes before a block of size 40 alloc'd
==29800== at 0x4C265AE: malloc (vg_replace_malloc.c:207)
==29800==
==29800== Invalid read of size 1
==29800== at 0x4C2721C: memcpy (mc_replace_strmem.c:402)
==29800== Address 0x6c98cd4 is 4 bytes before a block of size 40 alloc'd
==29800== at 0x4C265AE: malloc (vg_replace_malloc.c:207)
copy bseq
copy reg
==29800==
==29800== Invalid read of size 1
==29800== at 0x4C272B8: memcpy (mc_replace_strmem.c:402)
==29800== Address 0x6c98ca8 is 0 bytes after a block of size 40 alloc'd
==29800== at 0x4C265AE: malloc (vg_replace_malloc.c:207)
==29800==
==29800== Invalid read of size 1
==29800== at 0x4C272BF: memcpy (mc_replace_strmem.c:402)
==29800== Address 0x6c98ca9 is 1 bytes after a block of size 40 alloc'd
==29800== at 0x4C265AE: malloc (vg_replace_malloc.c:207)
==29800==
==29800== Invalid read of size 1
==29800== at 0x4C272C8: memcpy (mc_replace_strmem.c:402)
==29800== Address 0x6c98caa is 2 bytes after a block of size 40 alloc'd
==29800== at 0x4C265AE: malloc (vg_replace_malloc.c:207)
==29800==
==29800== Invalid read of size 1
==29800== at 0x4C272D1: memcpy (mc_replace_strmem.c:402)
==29800== Address 0x6c98cab is 3 bytes after a block of size 40 alloc'd
==29800== at 0x4C265AE: malloc (vg_replace_malloc.c:207)
copy cbseq
copy creg
before 1kernel invocation: Total GPU Memory: 536150016, free memory: 480644864
before 1kernel invocation: Total GPU Memory: 536608768, free memory: 489701120
malloc/memcpy code
cout<<"copy size onto device"<<endl;
cutilSafeCall(cudaMalloc((void**) &d_size, sizeof(int)*num));
cutilSafeCall(cudaMemcpy(d_size, h_size, sizeof(int)*num, cudaMemcpyHostToDevice));
//direct chain
cout<<"size of pitch: "<<sizeof(ref_id_t)*maxM<<endl;
cout<<"size of cpitch: "<<sizeof(ref_id_t)*cmaxM<<endl;
cutilSafeCall(cudaMallocPitch((void**) &d_refid, &p_refid, maxM*sizeof(ref_id_t), num));
cout<<"copy refid"<<endl;
cutilSafeCall(cudaMemcpy2D(d_refid, p_refid, h_refid, maxM*sizeof(ref_id_t), maxM*sizeof(ref_id_t), num, cudaMemcpyHostToDevice));
cutilSafeCall(cudaMallocPitch((void**) &d_refloc, &p_refloc, maxM*sizeof(ref_loc_t), num));
cout<<"copy refloc"<<endl;
cutilSafeCall(cudaMemcpy2D(d_refloc, p_refloc, h_refloc, maxM*sizeof(ref_loc_t), maxM*sizeof(ref_loc_t), num, cudaMemcpyHostToDevice));
cutilSafeCall(cudaMallocPitch((void**) &d_ilist, &p_ilist, maxM*sizeof(int), num));
cout<<"copy ilist"<<endl;
cutilSafeCall(cudaMemcpy2D(d_ilist, p_ilist, h_ilist, maxM*sizeof(int), maxM*sizeof(int), num, cudaMemcpyHostToDevice));
//complementary chain
cutilSafeCall(cudaMallocPitch((void**) &d_crefid, &p_crefid, cmaxM*sizeof(ref_id_t), num));
cout<<"copy crefid"<<endl;
cutilSafeCall(cudaMemcpy2D(d_crefid, p_crefid, h_crefid, cmaxM*sizeof(ref_id_t), cmaxM*sizeof(ref_id_t), num, cudaMemcpyHostToDevice));
cutilSafeCall(cudaMallocPitch((void**) &d_crefloc, &p_crefloc, cmaxM*sizeof(ref_loc_t), num));
cout<<"copy crefloc"<<endl;
cutilSafeCall(cudaMemcpy2D(d_crefloc, p_crefloc, h_crefloc, cmaxM*sizeof(ref_loc_t), cmaxM*sizeof(ref_loc_t), num, cudaMemcpyHostToDevice));
cutilSafeCall(cudaMallocPitch((void**) &d_cilist, &p_cilist, cmaxM*sizeof(int), num));
cout<<"copy cilist"<<endl;
cutilSafeCall(cudaMemcpy2D(d_cilist, p_cilist, h_cilist, cmaxM*sizeof(int), cmaxM*sizeof(int), num, cudaMemcpyHostToDevice));
cout<<"malloc binayseqs"<<endl;
// alloc binay seqs
cutilSafeCall(cudaMallocPitch((void**) &d_bseq, &p_bseq, 12*FIXELEMENT*sizeof(bit32_t), num));
cutilSafeCall(cudaMallocPitch((void**) &d_reg, &p_reg, 12*FIXELEMENT*sizeof(bit32_t), num));
cutilSafeCall(cudaMallocPitch((void**) &d_cbseq, &p_cbseq, 12*FIXELEMENT*sizeof(bit32_t), num));
cutilSafeCall(cudaMallocPitch((void**) &d_creg, &p_creg, 12*FIXELEMENT*sizeof(bit32_t), num));
cout<<"copy binayseqs"<<endl;
//copy binary seqs
cutilSafeCall(cudaMemcpy2D(d_bseq, p_bseq, h_bseq, 12*FIXELEMENT*sizeof(bit32_t), 12*FIXELEMENT*sizeof(bit32_t), num, cudaMemcpyHostToDevice));
cout<<"copy bseq"<<endl;
cutilSafeCall(cudaMemcpy2D(d_reg, p_reg, h_reg, 12*FIXELEMENT*sizeof(bit32_t), 12*FIXELEMENT*sizeof(bit32_t), num, cudaMemcpyHostToDevice));
cout<<"copy reg"<<endl;
cutilSafeCall(cudaMemcpy2D(d_cbseq, p_cbseq, h_cbseq, 12*FIXELEMENT*sizeof(bit32_t), 12*FIXELEMENT*sizeof(bit32_t), num, cudaMemcpyHostToDevice));
cout<<"copy cbseq"<<endl;
cutilSafeCall(cudaMemcpy2D(d_creg, p_creg, h_creg, 12*FIXELEMENT*sizeof(bit32_t), 12*FIXELEMENT*sizeof(bit32_t), num, cudaMemcpyHostToDevice));
cout<<"copy creg"<<endl;
code that convert 24->32 bits
extern "C" void
cpyBinaySeq(bit24_t bseq[][FIXELEMENT], bit24_t reg [][FIXELEMENT], bit24_t cbseq[][FIXELEMENT], bit24_t creg[][FIXELEMENT], size_t size, int tt)
{
bit32_t *temp1 = (bit32_t*)malloc(sizeof(bit32_t)*12*FIXELEMENT);
bit32_t *temp2 = (bit32_t*)malloc(sizeof(bit32_t)*12*FIXELEMENT);
bit32_t *temp3 = (bit32_t*)malloc(sizeof(bit32_t)*12*FIXELEMENT);
bit32_t *temp4 = (bit32_t*)malloc(sizeof(bit32_t)*12*FIXELEMENT);
for(int j=0; j<12; j++) {
for(int i=0; i<FIXELEMENT; i++) {
//memcpy(&temp1[j*FIXELEMENT + i], &bseq[i][j], sizeof(bit24_t));
temp1[j*FIXELEMENT + i] = (bit32_t)bseq[i][j].a;
temp2[j*FIXELEMENT + i] = (bit32_t)reg[i][j].a;
temp3[j*FIXELEMENT + i] = (bit32_t)cbseq[i][j].a;
temp4[j*FIXELEMENT + i] = (bit32_t)creg[i][j].a;
}
}
h_bseq[tt] = temp1;
h_reg[tt] = temp2;
h_cbseq[tt] = temp3;
h_creg[tt] = temp4;
h_size[tt] = (int)size;
}