erroneous data transfer using cudaMemcpy3D

Hi,

I’m trying to copy data device-to-device from a linear buffer allocated with cudaMalloc into a cudaArray using cudaMemcpy3D. However, the data transfer is not working.
I need this transfer in order to close the loop and make the process iterative. I don’t know why it is not working; when I call cudaGetLastError I get: invalid argument.
Maybe I am constructing the cudaPitchedPtr incorrectly. I am posting a cleaned-up version of my code so that anyone can look at it. Can anybody see an error in the way I am using cudaMemcpy3D the second time?

Best regards, and many thanks in advance,
C.


// Round trip exercised by this routine:
//   host f1 -> f1_array (bound to f1_tex) -> kernel writes f1_data
//   -> f1_data copied back into f1_array -> kernel again -> f1_data -> host f2.
// ni, nj, nk, totpoints, i, grid, block and operation_kernel are defined
// outside this excerpt.

// arrays on host:
float *f1 = NULL, *f2 = NULL;

// arrays on device (f2_data is declared but not used in this excerpt):
float *f1_data = NULL, *f2_data = NULL;

// cudaArrays on device:
cudaArray *f1_array = NULL;

// textures:
// NOTE(review): in a compilable file a texture reference has to be declared
// at file scope; it is kept inline here only to mirror the excerpt.
texture <float, 3, cudaReadModeElementType> f1_tex;

// First failing CUDA call is latched in `err`; every later CUDA_TRY becomes a
// no-op, so the code that is reported is the one from the call that actually
// failed rather than a stale value picked up later by cudaGetLastError().
cudaError_t err = cudaSuccess;
#define CUDA_TRY(call) do { if (err == cudaSuccess) err = (call); } while (0)

// memory allocation:
// One size, in BYTES, shared by the host buffers, the device buffer and the
// final copy back. The device buffer was previously allocated without the
// sizeof(float) factor, leaving it 4x smaller than what cudaMemcpy3D reads.
const size_t vol_bytes = sizeof(float) * ni * nj * nk;
f1 = (float *)malloc(vol_bytes);
f2 = (float *)malloc(vol_bytes);
if (f1 == NULL || f2 == NULL)
    err = cudaErrorMemoryAllocation;
CUDA_TRY(cudaMalloc((void **)&f1_data, vol_bytes));

// texture binding:
cudaChannelFormatDesc desc;
cudaExtent extent;
desc = cudaCreateChannelDesc<float>();
extent.width  = ni;   // array extents are given in elements
extent.height = nj;
extent.depth  = nk;
CUDA_TRY(cudaMalloc3DArray(&f1_array, &desc, extent));
// Configure the reference before binding it.
f1_tex.filterMode = cudaFilterModePoint;
CUDA_TRY(cudaBindTextureToArray(f1_tex, f1_array, desc));

// array initialization:
if (err == cudaSuccess) {
    for (i=0; i<totpoints; i++) {
        f1[i] = i;
        f2[i] = -1.f;
    }
}

// data transfer: f1 ---> f1_array:
cudaMemcpy3DParms p = { 0 };
p.extent = extent;
p.kind = cudaMemcpyHostToDevice;
p.dstArray = f1_array;
// pitch is the row length in bytes; the last two arguments are the logical
// width and height of the tightly packed source volume.
p.srcPtr = make_cudaPitchedPtr( (void*)f1, ni*sizeof(float), ni, nj );
CUDA_TRY(cudaMemcpy3D(&p));

// kernel call, uses texture fetch to modify f1_data by adding 1.f:
if (err == cudaSuccess) {
    operation_kernel<<<grid, block>>>(ni, nj, f1_data);
    err = cudaGetLastError();   // launch-configuration errors surface here
}

CUDA_TRY(cudaUnbindTexture(f1_tex));

// data transfer: f1_data ---> f1_array (feeds the result back to the texture):
cudaMemcpy3DParms p1 = { 0 };
p1.extent = extent;
p1.kind = cudaMemcpyDeviceToDevice;
p1.srcPtr = make_cudaPitchedPtr( (void*)f1_data, ni*sizeof(float), ni, nj );
p1.dstArray = f1_array;
CUDA_TRY(cudaMemcpy3D(&p1));

f1_tex.filterMode = cudaFilterModePoint;
CUDA_TRY(cudaBindTextureToArray(f1_tex, f1_array, desc));

// Kernel call to add 1.f again:
if (err == cudaSuccess) {
    operation_kernel<<<grid, block>>>(ni, nj, f1_data);
    err = cudaGetLastError();
}

// return data to CPU (blocking copy, so it also waits for the kernel):
CUDA_TRY(cudaMemcpy(f2, f1_data, vol_bytes, cudaMemcpyDeviceToHost));

// Cleanup runs unconditionally, whatever `err` holds.
if(f1_data)
    cudaFree(f1_data);
if(f1)
    free(f1);
if(f2)
    free(f2);

cudaUnbindTexture(f1_tex);

if(f1_array)
    cudaFreeArray(f1_array);

#undef CUDA_TRY

// 0 on success, 1 if any CUDA call or host allocation failed.
return err == cudaSuccess ? 0 : 1;
}

So after the program ends and I inspect the results, I should see each value of f1 increased by 2; however, this is not the case: it is only increased once.
I’m baffled…