cudaMemcpy2D with pitch gives random results

Would any of you please mind running or having a look at this code and seeing if it works for you? I’m not even calling a kernel.

I’m struggling with this one and am beginning to think that my implementation must be buggy or unstable. Weird things are happening here on x86_64 in linux newest 3.1, and also with a stable 3.0. Or, maybe I’m just coding something wrong.

The operations seem so basic and yet the results are so weird…I must be missing something silly!?

Or, is this a bug?

[codebox]/*

  • File: testHMAX.cu

  • Author: martin

  • Created on August 14, 2010, 3:29 AM

*/

#include <stdio.h>

#include <stdlib.h>

#include <time.h>

#include <math.h>

#define TYPE float

void checkError(char* file, int line);

/*

*/

int main(int argc, char** argv) {

int image_width = 5;

int image_height = 5;

int patch_width = 3;

int patch_height = 3;

int filter_width = 3;

int filter_height = 3;

TYPE image[image_height][image_width];

TYPE filters[1][3][3];

TYPE patches[1][3][3];

printf(“Here is the image: \n”);

 for(int i=0;i<image_height;i++) {                            //new image and filter

     for(int  j=0;j<image_width;j++)  {

        image[i][j]=(TYPE)(rand()%256);

            printf("%d ", (int) (image[i][j]));

     }

     printf("\n");

}

printf(“Here is the patch: \n”);

for(int i=0;i<patch_height;i++) {                            //new image and filter

     for(int  j=0;j<patch_width;j++) {

        patches[1][i][j]=(TYPE)(rand()%256);

        printf("%d ", (int) (patches[1][i][j]));

     }

     printf("\n");

}

printf(“Here is the filter: \n”);

for(int i=0;i<patch_height;i++) {                            //new image and filter

     for(int  j=0;j<patch_width;j++) {

        filters[1][i][j]=(TYPE)(rand()%256);

        printf("%d ", (int) (filters[1][i][j]));

     }

     printf("\n");

}

int num_filters = 1;

int num_patches = 1;

int filter_rows[num_filters];

int filter_columns[num_filters];

for(int i=0;i<num_filters;i++) {

    filter_rows[i] = 3;

    filter_columns[i] =3;

}

/images/

TYPE *imageGPU;

/*filters*/

TYPE *filtersGPU[num_filters];

/*patches*/

TYPE *patchesGPU[num_patches];

size_t filterpitches[num_filters];

size_t patchpitches[num_patches];

/* Allocate and copy memory to GPU */

for(int i=0;i<num_filters;i++) {

            printf("Trying to allocate space for a filter on the GPU...\n");

            cudaMallocPitch((void**) &(filtersGPU[i]), &filterpitches[i],filter_columns[i] * sizeof(TYPE),filter_rows[i]);

            checkError(__FILE__,__LINE__);

/*

            printf("Trying to delete space to zero for a filter on the GPU...\n");

            cudaMemset2D(filtersGPU[i], filterpitches[i],0, filterpitches[i] * filter_columns[i],filter_rows[i]);

            checkError(__FILE__,__LINE__);

*/

printf(“Copying filter to GPU!\n”);

            cudaMemcpy2D((void**) filtersGPU[i],filterpitches[i],filters[i],filter_columns[i] * sizeof(TYPE ),filter_columns[i] * sizeof(TYPE),filter_rows[i],cudaMemcpyHostToDevice);

            checkError(__FILE__,__LINE__);

            cudaThreadSynchronize();

}

    cudaThreadSynchronize();

    printf("Finished allocating filters on GPU with pitches!\n");

for(int i=0;i<num_filters;i++) {

            printf("Here is the filter from the CPU before copying!!\n");

            for(int k=0;k<filter_height;k++) {                            //new image and filter

                for(int  j=0;j<filter_width;j++)  {

                    printf("%f ", (TYPE) filters[1][k][j]);

                }

                printf("\n");

            }

printf(“Here is the filter from the GPU after copying!!\n”);

TYPE tmp[filter_rows[i]][filter_columns[i]];

            cudaMemcpy2D((void**) tmp,filter_columns[i] * sizeof(TYPE),filtersGPU[i],filterpitches[i],filter_columns[i

] * sizeof(TYPE),filter_rows[i],cudaMemcpyDeviceToHost);

            cudaThreadSynchronize();

            checkError(__FILE__,__LINE__);

for(int k=0;k<filter_height;k++) { //new image and filter

                for(int  j=0;j<filter_width;j++)  {

                    printf("%d ", (int) tmp[k][j]);

                }

                printf("\n");

            }

for(int j=0;j<filter_rows[i];j++) {

                printf("Freeing row %d\n",j);

    		cudaFree(tmp[j]);

            }

            cudaFree(tmp);

/*

            TYPE **tmp = (TYPE**)malloc(filter_rows[i]*sizeof(TYPE*));

            for(int j=0;j<filter_rows[i];j++)

                tmp[j] = (TYPE*)malloc(filter_columns[i]*sizeof(TYPE));

cudaMemcpy2D(tmp,filter_columns[i] * sizeof(TYPE),filtersGPU[i],filterpitches[i],filter_columns[i

] * sizeof(TYPE),filter_rows[i],cudaMemcpyDeviceToHost);

            checkError(__FILE__,__LINE__);

            print(tmp,filter_columns[i],filter_rows[i]);

*/

/* for(j=0;j<filter_rows[i];j++)

    		free(tmp[j]);

            free(tmp);

*/

}

    fflush(stdout);

    fflush(stderr);

return (EXIT_SUCCESS);

}

void checkError(char* file,int line) {

cudaError_t err = cudaGetLastError();

if (cudaSuccess != err) {

    printf("There was an error!\n");

    fprintf(stderr,"Cuda error: %s in file '%s' in line %i : %s.\n",

            "CUDA operation failed",file,line, cudaGetErrorString(err));

    printf("error code = %d\n", err);

fflush(stdout);

    fflush(stderr);

      fflush ((FILE*)0);

    exit(EXIT_FAILURE);

}

}[/codebox]

And here is the output I get:

[codebox]Here is the image:

103 198 105 115 81

255 74 236 41 205

186 171 242 251 227

70 124 194 84 248

27 232 231 141 118

Here is the patch:

90 46 99

51 159 201

154 102 50

Here is the filter:

13 183 49

88 163 90

37 93 5

Trying to allocate space for a filter on the GPU…

Copying filter to GPU!

Finished allocating filters on GPU with pitches!

Here is the filter from the CPU before copying!!

13.000000 183.000000 49.000000

88.000000 163.000000 -1719463078730986946560.000000

0.000000 -1719458575131359576064.000000 0.000000

Here is the filter from the GPU after copying!!

51 159 201

154 102 50

0 0 -2147483648

Freeing row 0

Freeing row 1

Freeing row 2

[/codebox]

Then, the next time I run the same compiled code, it’ll be something really weird like this:

[codebox]Here is the image:

103 198 105 115 81

255 74 236 41 205

186 171 242 251 227

70 124 194 84 248

27 232 231 141 118

Here is the patch:

90 46 99

51 159 201

154 102 50

Here is the filter:

13.000000 183.000000 49.000000

88.000000 163.000000 90.000000

37.000000 93.000000 5.000000

Trying to allocate space for a filter on the GPU…

Copying filter to GPU!

Finished allocating filters on GPU with pitches!

Here is the filter from the CPU before copying!!

13.000000 183.000000 49.000000

88.000000 163.000000 -3492112793945374751089657643008.000000

0.000000 -3492103122538817834056259993600.000000 0.000000

Here is the filter from the GPU after copying!!

51 159 201

154 102 50

0 0 -2147483648

Freeing row 0

Freeing row 1

Freeing row 2

[/codebox]