Why did pitched memory perform worse than linear memory?

In my algorithm I used both linear memory (allocated with cudaMalloc) and pitched memory (allocated with cudaMalloc3D), but to my surprise the linear memory showed better performance than the pitched (aligned) memory. Does anyone have an idea why this is happening?

Note: I tested this algorithm on 3 different architectures (Fermi, Kepler and Maxwell).

Could you show a simple, buildable, runnable program that would allow others to reproduce your findings? With no notion of what your code is doing and what is being timed, it is impossible to diagnose what may be happening.

Unfortunately I can't show the buildable code because it is part of a larger software framework. But I used the same kernel for the linear and the pitched memory versions. The only difference between the two codes is how the data is indexed.

Linear Memory code:

unsigned int xOffset, yOffset, zOffset, fOffset;

xOffset = uint_t(1);                             // consecutive elements along x
yOffset = uint32_c( f.xAllocSize() );            // xAllocSize() = size of the x dimension
zOffset = uint32_c( yOffset * f.yAllocSize() );  // elements per x-y slice
fOffset = uint32_c( zOffset * f.zAllocSize() );  // elements per x-y-z cube
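
With linear memory these are element offsets, so an element (x, y, z, q) is read with plain array indexing. Roughly like this (a sketch only, not the real framework code; data stands in for the field's underlying double pointer):

// Sketch only: with linear memory the offsets are element strides.
__device__ double & linearAt( double * data, unsigned int x, unsigned int y,
                              unsigned int z, unsigned int q,
                              unsigned int xOffset, unsigned int yOffset,
                              unsigned int zOffset, unsigned int fOffset )
{
        return data[ x * xOffset + y * yOffset + z * zOffset + q * fOffset ];
}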

Pitched Memory code:

xOffset = sizeof(T);                             // T = double; offsets are in bytes here
yOffset = uint32_c( f.pitchedPtr().pitch );      // pitch = padded row size in bytes
zOffset = yOffset * uint32_c( f.yAllocSize() );  // bytes per x-y slice
fOffset = zOffset * uint32_c( f.zAllocSize() );  // bytes per x-y-z cube
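
With pitched memory the offsets are byte strides instead (the pitch already contains the padding added by cudaMalloc3D), so the address is computed on a char pointer and cast back to double. Again only a sketch, not the framework code:

// Sketch only: with pitched memory the offsets are byte strides, so the
// address is computed on a char* and then cast back to double*.
__device__ double & pitchedAt( char * base, unsigned int x, unsigned int y,
                               unsigned int z, unsigned int q,
                               unsigned int xOffset, unsigned int yOffset,
                               unsigned int zOffset, unsigned int fOffset )
{
        return *reinterpret_cast< double * >(
                   base + x * xOffset + y * yOffset + z * zOffset + q * fOffset );
}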

Since I cannot post all the code, what test can I run to find out what is happening?

Build a new working test case out of your code that shows just the kernel and the minimum necessary to launch it, and compare the timing. Make sure it is a complete code that someone else can compile and run.
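
For the timing part, CUDA events around the kernel launch are enough. A minimal sketch (MyKernel, its arguments and the launch configuration are placeholders for whatever your test uses):

// Sketch: timing a kernel with CUDA events.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
MyKernel <<<blocks_per_grid, threads_per_block>>> (/* kernel arguments */);
cudaEventRecord(stop);

cudaEventSynchronize(stop);
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
printf("kernel time: %f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);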

This is code similar to what I do in the software framework I work on.

#include <stdlib.h>
#include <stdio.h>
#include <iostream>

// Device code
__global__ void MyKernel(cudaPitchedPtr devPitchedPtr, cudaExtent extent)
{
        char* devPtr = (char*) devPitchedPtr.ptr;
        size_t pitch = devPitchedPtr.pitch;
        // extent.depth = z*f, so extent.depth/19 recovers z (f = 19 fields);
        // cubePitch is the size in bytes of one complete x-y-z cube.
        size_t cubePitch = pitch * extent.height * (extent.depth/19);

        int x = threadIdx.x;
        int y = blockIdx.x;
        int z = blockIdx.y;
        int f = blockIdx.z;

        char* cube  = devPtr + f * cubePitch;
        char* slice = cube + z * pitch * extent.height;
        double* row = (double*) (slice + y * pitch); // the buffers hold doubles, not floats
        row[x] = f + (z*1) + (y*1*1) + (x*1*1*19);   // Sets any value.
}

int main(void)
{

        cudaSetDevice(0);
        size_t x = 128;
        size_t y = 128;
        size_t z = 128;
        size_t f = 19;
        double *array, *result;
        int i, j, k, q;
        cudaError_t status = cudaSuccess;

        array  = (double *) malloc(x*y*z*f*sizeof(double));
        result = (double *) malloc(x*y*z*f*sizeof(double));

        //initialise array
        for (q = 0; q < f; q++) {
                for (k = 0; k < z; k++) {
                        for (j = 0; j < y; j++) {
                                for (i = 0; i < x; i++) {
                                        array[i + (x*j) + (x*y*k) + (x*y*z*q)] = 0.0;
                                        result[i + (x*j) + (x*y*k) + (x*y*z*q)] = 0.0;
                                }
                        }
                }
        }

        // Allocate memory on the device for a 3D matrix
        cudaExtent extent;          // Extent (x, y, z) of the 3D array; width is in bytes.
        extent.width    = x*sizeof(double);
        extent.height   = y;
        extent.depth    = z*f;
        cudaPitchedPtr mem_device;  // Pointer to the allocated (pitched) memory.
        status = cudaMalloc3D( &mem_device, extent );

        if (status != cudaSuccess)
                fprintf(stderr, "Malloc: %s\n", cudaGetErrorString(status));

        // Copy memory to device
        cudaMemcpy3DParms s = { 0 };
        s.srcPtr = make_cudaPitchedPtr( (void*) array, x * sizeof(double), x, y ); // Source
        s.dstPtr = mem_device;              // Destination
        s.extent = extent;                  // Extent
        s.kind = cudaMemcpyHostToDevice;    // Copy kind
        status = cudaMemcpy3D(&s);

        if (status != cudaSuccess)
                fprintf(stderr, "MemcpyHtD: %s\n", cudaGetErrorString(status));

        // Run 3D kernel
        dim3 blocks_per_grid(y, z, f);
        MyKernel <<<blocks_per_grid, x>>> (mem_device, extent);

        status = cudaGetLastError();
        if (status != cudaSuccess)
                fprintf(stderr, "Kernel: %s\n", cudaGetErrorString(status));

        // Copy result array back to host
        cudaMemcpy3DParms d = {0};
        d.srcPtr = mem_device;
        d.dstPtr = make_cudaPitchedPtr( (void*) result, x * sizeof(double), x, y );
        d.extent = extent;
        d.kind = cudaMemcpyDeviceToHost;
        status = cudaMemcpy3D(&d);

        if (status != cudaSuccess)
                fprintf(stderr, "MemcpyDtoH: %s\n", cudaGetErrorString(status));

        cudaFree(mem_device.ptr);
        free(array);
        free(result);

        return 0;
}
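
For comparison, the linear-memory version of the same test differs only in the allocation and in the indexing. A minimal sketch (assuming the same x, y, z, f sizes and host arrays as above):

// Device code: same work as MyKernel, but on a flat buffer from cudaMalloc.
__global__ void MyKernelLinear(double* devPtr, size_t x, size_t y, size_t z)
{
        size_t ix = threadIdx.x;
        size_t iy = blockIdx.x;
        size_t iz = blockIdx.y;
        size_t iq = blockIdx.z;

        // Element strides are 1, x, x*y, x*y*z (no padding between rows).
        devPtr[ix + x*iy + x*y*iz + x*y*z*iq] = iq + (iz*1) + (iy*1*1) + (ix*1*1*19);
}

// Host side (inside main, after the host arrays are initialised):
double* mem_linear;
size_t bytes = x*y*z*f*sizeof(double);
cudaMalloc((void**) &mem_linear, bytes);
cudaMemcpy(mem_linear, array, bytes, cudaMemcpyHostToDevice);

dim3 blocks_per_grid(y, z, f);
MyKernelLinear <<<blocks_per_grid, x>>> (mem_linear, x, y, z);

cudaMemcpy(result, mem_linear, bytes, cudaMemcpyDeviceToHost);
cudaFree(mem_linear);

Both kernels write exactly one double per thread, so the two runs touch the same number of elements and only the addressing differs.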