cudaMemcpy() returns success and copy incorrect data

I have been having an issue with cudaMemcpy() where cudaSuccess is returned and incorrect data is copied to the host. I have double check the host/device pointers and am pretty sure they are correct. I have run this code on a GTX970 and K20m with the same results. The directory structure of this project is,

lab2
|_src
|__main.cu
|_build
|__csv.o
|__gpu.o
|__kernels.o
|__main.o
|__pgm.o
|_include
|__csv.h
|__gpu.h
|__kernels.h
|__pgm.h
|_lib
|__csv.cu
|__gpu.cu
|__kernels.cu
|__pgm.cu

And some useful code,

=====================================================================================
makefile

lab2: main.o pgm.o gpu.o kernels.o csv.o
nvcc -G -g -o lab2 build/main.o build/pgm.o build/gpu.o
build/kernels.o build/csv.o

main.o: src/main.cu
nvcc -G -g -c src/main.cu -o build/main.o -I ./include

pgm.o: lib/pgm.cu
nvcc -G -g -c lib/pgm.cu -o build/pgm.o -I ./include

gpu.o: lib/gpu.cu
nvcc -G -g -c lib/gpu.cu -o build/gpu.o -I ./include

kernels.o: lib/kernels.cu
nvcc -gencode arch=compute_30,code=sm_30 -G -g -c lib/kernels.cu -o build/kernels.o -I ./include

csv.o: lib/csv.cu
nvcc -G -g -c lib/csv.cu -o build/csv.o -I ./include

clean:
rm -r ./build/*.o

Header Files

pgm.h

//Prevent multiple inclusions
#ifndef PGMLIB_H
#define PGMLIB_H

//Global Variables
typedef struct
{
char magic_number[2];
unsigned int width;
unsigned int height;
unsigned int max_val;
char *image;
} pgm;

extern pgm h_i_image;
extern pgm h_o_image;
extern char *fp_input;
//extern unsigned int fp_length = 0;

//Functions
void read_pgm(char *filename);
void write_pgm(char *filename);
void parse_pgm();
#endif

=====================================================================================
csv.h

//Preven multiple inclusions
#ifndef CSVLIB_H
#define CSVLIB_H

//Global Vars
extern unsigned char memory_type;
extern unsigned char memory_check;
extern unsigned char dim_check;
extern int *h_kernel;
extern int h_kern_rows;
extern int h_kern_cols;

//Functions
void csv_read(char *filename);

#endif

=====================================================================================
gpu.h

#ifndef LIBGPU_H
#define LIBGPU_H

#include <cuda.h>
#include <cuda_runtime.h>
#include <pgm.h>
//Global Variables
extern dim3 blockSize, gridSize;
extern int *d_kernel;
extern pgm d_image_i;
extern pgm d_image_o;

//Functions
void gpu_setup();
void gpu_run(dim3 gSize, dim3 bSize);
void gpu_results();

#endif

=====================================================================================
kernels.h

#ifndef LIBKERN_H
#define LIBKERN_H

//Kernel Functions
global void d1convolution(int *kern, char *i_input, char *i_output,
unsigned int kern_rows, unsigned int kern_cols,
unsigned int num_rows, unsigned int num_cols);
#endif

Source Files

main.cu

#include <stdio.h>
#include <stdlib.h>
#include <pgm.h>
#include <gpu.h>
#include <csv.h>

int main(int argv, char *argc)
{

//Main Program

//Debug Program Parts
//0. Check for correct usage.
if (argv != 4)
{
printf(“Usage: ./lab2 kernel.csv input.pgm output.pgm\n”);
exit(1);
}
//1. Read in pgm file on command line.
read_pgm(argc[2]);

//2. Read in the kernel from file.
csv_read(argc[1]);

//3. Run the user selected operation on the image.
gpu_setup();
gpu_run(gridSize, blockSize);
gpu_results();

//3. Print out the image.
//a. Assign input image parameters to output.
h_o_image.width = h_i_image.width;
h_o_image.height = h_i_image.height;
int i;
for (i = 0; i != 2; i++)
{
h_o_image.magic_number[i] = h_i_image.magic_number[i];
}
write_pgm(argc[3]);

return 0;
}

=====================================================================================
pgm.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <pgm.h>

//Global Variables
char *fp_input;
unsigned int fp_length = 0;
pgm h_i_image;
pgm h_o_image;

//Functions
void read_pgm(char *filename)
{
//0. Opem pgm file for reading.
FILE *fp;
const char *mode = “r”;
fp = fopen(filename, mode);
if (fp == NULL)
{
printf(“Can’t open file, %s\n”, filename);
exit(1);
}

//1. Allocate memory to read in the pgm file.
while (fgetc(fp) != EOF)
{
fp_length++;
}
rewind(fp);

fp_input = (char )malloc(fp_lengthsizeof(char));
fread(fp_input, sizeof(char), fp_length, fp);

//2. Parse the image data.
parse_pgm();
}

void parse_pgm()
{
unsigned int i,j;
unsigned int line_count = 0;
char *buffer;
buffer = (char )malloc(100sizeof(char));
unsigned int buffer_index = 0;
unsigned int mgk_num_i = 0;
unsigned int image_index = 0;
char memory_allocated = 0;
for (i = 0; i != fp_length; i++)
{
//a. Get the header information.
if (line_count <= 3)
{
//i. Get the image magick number.
if (line_count == 0)
{
if (fp_input[i] != ‘\n’)
{
//Magic Number to i_mage struct.
h_i_image.magic_number[mgk_num_i] = fp_input[i];
mgk_num_i++;
}
if (fp_input[i] == ‘\n’)
{
//Increae the line_count;
line_count++;
i++;
}
}

  //iii. Get the width and height.
  if (line_count == 1)
    {
      if (fp_input[i] != ' ' & fp_input[i] != '\n')
	{
	  buffer[buffer_index] = fp_input[i];
	  buffer_index++;
	}
      if (fp_input[i] == ' ')
	{
	  //Assign buffer to width integer.
	  h_i_image.width = atoi(buffer);
	  //printf("width buffer = %s\n", buffer);
	  //Reset the buffer and buffer_index.
	  for (j = 0; j != 100; j++)
	    {
	      buffer[j] = 0;
	    }
	  buffer_index = 0;
	}
      if (fp_input[i] == '\n')
	{
	  //Assign buffer to height integer.
	  h_i_image.height = atoi(buffer);
	  //printf("height buffer = %s\n", buffer);
	  //Reset the buffer_index.
	  buffer_index = 0;
	  for (j = 0; j != 100; j++)
	    {
	      buffer[j] = 0;
	    }
	  //Move to next line.
	  line_count++;
	  i++;
	}
    }
  
  //iv. Get the image max_val.
  if (line_count == 2)
    {
      if (fp_input[i] != '\n')
	{
	  buffer[buffer_index] = fp_input[i];
	  buffer_index++;
	}
      if (fp_input[i] == '\n')
	{
	  //Assign max_val to integer.
	  h_i_image.max_val = atoi(buffer);
	  //Move to the image.
	  line_count++;
	  //Reset the buffer index.
	  buffer_index = 0;
	  for (j = 0; j != 100; j++)
	    {
	      buffer[j] = 0;
	    }
	  i++;
	}
    }
}
  
  //b. Read in the image.
  if (line_count == 3)
{
  //printf("index = %d\n", i);
  if (memory_allocated == 0)
    {
      h_i_image.image = (char *)malloc(h_i_image.width*
				       h_i_image.height*
				       sizeof(char));
      memory_allocated = 1;
    }
  h_i_image.image[image_index] = fp_input[i];
  image_index++;
}
}

///*
//Debug
printf(“\n”);
printf(“i_image.magic_number = %s\n”, h_i_image.magic_number);
printf(“i_image.width = %d\n”, h_i_image.width);
printf(“i_image.height = %d\n”, h_i_image.height);
printf(“i_image.max_val = %d\n”, h_i_image.max_val);
//int w_index = 0;
/*
for (i = 0; i != h_i_image.width*h_i_image.height; i++)
{
printf(“%d “, h_i_image.image[i]);
w_index++;
if (w_index == h_i_image.width)
{
printf(”\n”);
w_index = 0;
}
}
printf(“\n”);
printf(“%d #\n”, ‘#’);
//End Debug
*/
}

void write_pgm(char *filename)
{
FILE *fp;
const char *mode = “w”;
fp = fopen(filename, mode);
if (fp == NULL)
{
printf(“Can’t open file %s”, filename);
exit(1);
}

fprintf(fp, “%s\n”, h_o_image.magic_number);
fprintf(fp, “%d %d\n”, h_o_image.width, h_o_image.height);
fprintf(fp, “%d\n”, h_o_image.max_val);
fwrite(h_o_image.image, 1, h_o_image.width*h_o_image.height, fp);
fclose(fp);
}

=====================================================================================
csv.cu

#include <stdio.h>
#include <stdlib.h>
#include <gpu.h>

//Global Host Variables
int *h_kernel;
unsigned int h_kern_rows;
unsigned int h_kern_cols;
void csv_read(char *filename)
{
//Args
//filename → filename of the csv file to readin
//matrix → character of the input matrix (a or b).
char *fp_input;
unsigned int fp_length=0;
//0. Open the file.
FILE *fp;
fp = fopen(filename, “r”);
if (fp == NULL)
{
printf(“Can’t open file %s\n”, filename);
exit(1);
}

//1. Find the length of the file.
while(fgetc(fp) != EOF)
{
fp_length++;
}
rewind(fp);

//2. Allocate memory to store the file.
fp_input = (char )malloc(fp_lengthsizeof(char));

//3. Read in the file.
fread(fp_input, sizeof(char), fp_length, fp);
fclose(fp);
//4. Parse the file for array size

unsigned int i;

for (i = 0; i != fp_length; i++)
{
if (fp_input[i] == ‘\n’)
h_kern_rows++;
if (fp_input[i] == ‘,’)
h_kern_cols++;
}
//
if (h_kern_rows != 0)
{
h_kern_cols = h_kern_cols/h_kern_rows+1;
//printf(“h_kern_cols = %d\n”, h_h_kern_cols);
}

if (h_kern_rows == 0)
{
h_kern_cols++;
h_kern_rows++;
printf(“h_kern_rows = %d\n”, h_kern_rows);
printf(“h_kern_cols = %d\n”, h_kern_cols);
}

//5. Allocate memory to store the array.

h_kernel = (int )malloc(h_kern_rowsh_kern_cols*sizeof(int));

//6. Parse the file for data and store.

char *buffer;
unsigned int buffer_index=0;
unsigned int k_i=0; //Row index.
unsigned int k_j=0; //Col index.

buffer = (char *)calloc(100,sizeof(char));

unsigned int j;

for (i = 0; i != fp_length; i++)
{
if (fp_input[i] != ‘,’ && fp_input[i] != ‘\n’)
{
//Add characters to the buffer.
buffer[buffer_index] = fp_input[i];
//Increase the buffer index
buffer_index++;
}

 if (fp_input[i] == ',')
   {
		   //Reset the buffer index.
	buffer_index = 0;
	//Convert the value on the buffer.
	//kernel[k_i*h_kern_cols+k_j] = atoi(buffer);
	//Increase the col index.
	k_j++;
	//Clean the buffer.
	for (j = 0; j != 100; j++)
	  {
	    buffer[j] = 0;
	  }
   }
 
 if (fp_input[i] == '\n')
   {
	//Reset the buffer index.
	buffer_index = 0;
	//Convert the value on the buffer.
	//kernel[k_i*h_kern_cols+k_j] = atoi(buffer);
	//Reset the col. index.
	k_j=0;
	//Increase the row index.
	k_i++;
	//Clean the buffer.
	for (j = 0; j != 100; j++)
	  {
	    buffer[j] = 0;
	  }
   }
}

}

=====================================================================================
kernels.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

global void d1convolution(int *kern, char *i_input, char *i_output,
unsigned int kern_rows, unsigned int kern_cols,
unsigned int num_rows, unsigned int num_cols)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;

unsigned int k;
//printf(“Kernel Running\n”);
if (idx < num_cols && idy < num_rows)
{
if (idx == 0 & idy == 0)
{
printf(“Kernel Running…\n”);
printf(“num_rows = %d and num_cols = %d\n”, num_rows, num_cols);
printf(“Address of i_output = %p\n”, (void *)i_output);
}

  //i_output[idy*num_cols+idx] = 0;
 for (k = 0; k != kern_cols; k++)
   {
     //i_output[idy*num_cols+idx] += kern[k]*i_input[idy*num_cols+idx];
   }
 i_output[idy*num_cols+idx] = 2;
 /*
 printf("i_output[%d] = %d\n",
	idy*num_cols+idx,
	i_output[idy*num_cols+idx]);
 */
}

if (idx < 10 && idy == 0)
{
printf(“i_input[%d] = %d\n”,
idynum_cols+idx,
i_input[idy
num_cols+idx]);

  printf("i_output[%d] = %d\n",
     idy*num_cols+idx,
     i_output[idy*num_cols+idx]);
}

}

=====================================================================================
gpu.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <kernels.h>
#include <csv.h>
#include <gpu.h>
//Global Variables
dim3 blockSize, gridSize;
int *d_kernel;
pgm d_image_i;
pgm d_image_o;
char *o_debug;

void gpu_setup()
{
//0. Allocate memory on the device.
cudaError_t err;

err = cudaMalloc((void**)&d_kernel,
h_kern_rowsh_kern_colssizeof(int));

if (err != cudaSuccess)
{
printf(“Can’t allocate cuda memory\n”);
exit(1);
}

err = cudaMalloc((void**)&d_image_i.image,
h_i_image.widthh_i_image.heightsizeof(char));

if (err != cudaSuccess)
{
printf(“Can’t allocate cuda memory\n”);
exit(1);
}

err = cudaMalloc((void**)&d_image_o.image,
h_i_image.widthh_i_image.heightsizeof(char));

if (err != cudaSuccess)
{
printf(“Can’t allocate cuda memory.\n”);
exit(1);
}

//1. Copy arrays to the device.
//a. Copy the kernel to the device.
err = cudaMemcpy(d_kernel,
h_kernel,
h_kern_rowsh_kern_colssizeof(int),
cudaMemcpyHostToDevice);

//printf(“err = %d\n”, err);
//printf(“cudaSuccess = %d\n”, cudaSuccess);

if (err != cudaSuccess)
{
printf(“Error: Copying memory from host to device…\n”);
exit(1);
}

//b. Copy the host input image to the device.
err = cudaMemcpy(d_image_i.image,
h_i_image.image,
h_i_image.widthh_i_image.heightsizeof(char),
cudaMemcpyHostToDevice);

if (err != cudaSuccess)
{
printf(“Error: Copying memory from host to device.\n”);
exit(1);
}

//2. Calculate blockSize and gridSize.
//unsigned int threads_block_dim[2], block_grid_dim[2];
//a. Query the devcie properties.
struct cudaDeviceProp gpu_property;
int gpu_count;

//i. Get the GPU Count
cudaGetDeviceCount(&gpu_count);
cudaGetDeviceProperties(&gpu_property, 0);

//ii. Get the threads_block_dim and block_grid_dim.
/*
unsigned char i;
for (i = 0; i != 2; i++)
{
threads_block_dim[i] = gpu_property.maxThreadsDim[i];
block_grid_dim[i] = gpu_property.maxGridSize[i];
}
*/

//ii. Get the GPU Arch.
printf(“Arch = %d%d\n”,
gpu_property.major,
gpu_property.minor);

//iii. GPU Block and Grid Allocation Method.
if (h_i_image.widthh_i_image.height >
gpu_property.maxThreadsPerBlock)
{
blockSize.x = sqrt(gpu_property.maxThreadsPerBlock);
blockSize.y = sqrt(gpu_property.maxThreadsPerBlock);
gridSize.x =
ceil(sqrt((float)ceil((float)h_i_image.width
h_i_image.height/
gpu_property.maxThreadsPerBlock)));
gridSize.y =
ceil(sqrt((float)ceil((float)h_i_image.width*h_i_image.height/
gpu_property.maxThreadsPerBlock)));
}

if (h_i_image.widthh_i_image.height <
gpu_property.maxThreadsPerBlock)
{
blockSize.x = ceil((float)sqrt((float)h_i_image.width
h_i_image.height));
blockSize.y = ceil((float)sqrt((float)h_i_image.width*h_i_image.height));
gridSize.x = 1;
gridSize.y = 1;
}

printf(“blockSize = %d,%d\n”, blockSize.x, blockSize.y);
printf(“gridSize = %d,%d\n”, gridSize.x, gridSize.y);

}

void gpu_run(dim3 gSize, dim3 bSize)
{

//Variables
cudaError_t err;

//1. Run the 1D Kernel.
if (h_kern_rows == 1)
{
printf(“Run the 1D kernel…\n”);
printf(“blockSize = %d,%d\n”, bSize.x, bSize.y);
printf(“gridSize = %d,%d\n”, gSize.x, gSize.y);
d1convolution<<<gSize, bSize>>>(d_kernel, d_image_i.image, d_image_o.image,
h_kern_rows, h_kern_cols,
h_i_image.height, h_i_image.width);

 err = cudaGetLastError();
 if (err != cudaSuccess)
   {
	printf("Failed to launch kernel.\n %s \n", cudaGetErrorString(err));
	exit(1);
   }
 cudaDeviceSynchronize();
}

/*
//2. Run the 2D Kernel.
if (h_kern_rows > 1)
{
d2convolution<<<gSize, bSize>>>(d_kernel,
d_image,
d_image,
num_rows,
num_cols);
err = cudaGetLastError();
if (err != cudaSuccess)
{
printf(“Failed to launch kernel.\n %s \n”, cudaGetErrorString(err));
exit(1);
}
}
*/
}

void gpu_results()
{
//0. Allocate memory on host to store the image.
h_o_image.image = (char*)malloc(h_i_image.widthh_i_image.heightsizeof(char));
//o_debug = (char*)malloc(h_i_image.widthh_i_image.heightsizeof(char));
/*
if (h_o_image.image == NULL)
{
printf(“Error: Failed to allocate memory on host.\n”);
exit(1);
}
*/

//1. Copy result array from the device.
cudaError_t err;
//->->THIS IS WHERE THE ISSUE IS!<-<-
err = cudaMemcpy(h_o_image.image,
d_image_o.image,
h_o_image.widthh_o_image.heightsizeof(char),
cudaMemcpyDeviceToHost);

if (err != cudaSuccess)
{
printf(“Error: Can’t copy memory from gpu to host.\n”);
exit(1);
}

err = cudaGetLastError();

if (err != cudaSuccess)
{
printf(“Failed to copy memory from device. \n %s \n”, cudaGetErrorString(err));
exit(1);
}

//Debug
printf(“Device Address on Host = %p\n”, d_image_o.image);
//End Debug

//Debug
int i;
printf(“Debug GPU Results…\n”);
for (i = 0; i != 10; i++)
{
printf(“%d = %d\n”, h_i_image.image[i], h_o_image.image[i]);
}
//End Debug
}

=====================================================================================
Example Program Output

i_image.magic_number = P5
i_image.width = 256
i_image.height = 256
i_image.max_val = 255
h_kern_rows = 1
h_kern_cols = 3
Arch = 30
blockSize = 32,32
gridSize = 8,8
Run the 1D kernel…
blockSize = 32,32
gridSize = 8,8
Kernel Running…
num_rows = 256 and num_cols = 256
Address of i_output = 0x1f01d90000
i_input[0] = -95
i_input[1] = -95
i_input[2] = -94
i_input[3] = -94
i_input[4] = -93
i_input[5] = -94
i_input[6] = -96
i_input[7] = -98
i_input[8] = -98
i_input[9] = -97
i_output[0] = 2
i_output[1] = 2
i_output[2] = 2
i_output[3] = 2
i_output[4] = 2
i_output[5] = 2
i_output[6] = 2
i_output[7] = 2
i_output[8] = 2
i_output[9] = 2
Device Address on Host = 0x1f01d90000
Debug GPU Results…
-95 = 0
-95 = 0
-94 = 0
-94 = 0
-93 = 0
-94 = 0
-96 = 0
-98 = 0
-98 = 0
-97 = 0

When you run the program under control of cuda-memcheck, are any errors reported?

I have run the program using cuda-memcheck and no errors are reported.

cuda-memcheck ./lab2 kernel1.csv test-images/Lenna_Images/Lenna_org_256.pgm out.pgm

========= CUDA-MEMCHECK

i_image.magic_number = P5
i_image.width = 256
i_image.height = 256
i_image.max_val = 255
h_kern_rows = 1
h_kern_cols = 3
Arch = 30
blockSize = 32,32
gridSize = 8,8
Run the 1D kernel…
blockSize = 32,32
gridSize = 8,8
Kernel Running…
num_rows = 256 and num_cols = 256
Address of i_output = 0x1f01d90000
i_input[0] = -95
i_input[1] = -95
i_input[2] = -94
i_input[3] = -94
i_input[4] = -93
i_input[5] = -94
i_input[6] = -96
i_input[7] = -98
i_input[8] = -98
i_input[9] = -97
i_output[0] = 2
i_output[1] = 2
i_output[2] = 2
i_output[3] = 2
i_output[4] = 2
i_output[5] = 2
i_output[6] = 2
i_output[7] = 2
i_output[8] = 2
i_output[9] = 2
Device Address on Host = 0x1f01d90000
Debug GPU Results…
-95 = -40
-95 = 36
-94 = -55
-94 = 32
-93 = 122
-94 = 127
-96 = 0
-98 = 0
-98 = -40
-97 = 36
========= ERROR SUMMARY: 0 errors

Whats interesting is when I run the program a second time it obtain different results in the h_o_image.image array. Below are results from running the program a second time.

cuda-memcheck ./lab2 kernel1.csv test-images/Lenna_Images/Lenna_org_256.pgm out.pgm
========= CUDA-MEMCHECK

i_image.magic_number = P5
i_image.width = 256
i_image.height = 256
i_image.max_val = 255
h_kern_rows = 1
h_kern_cols = 3
Arch = 30
blockSize = 32,32
gridSize = 8,8
Run the 1D kernel…
blockSize = 32,32
gridSize = 8,8
Kernel Running…
num_rows = 256 and num_cols = 256
Address of i_output = 0x1f01d90000
i_input[0] = -95
i_input[1] = -95
i_input[2] = -94
i_input[3] = -94
i_input[4] = -93
i_input[5] = -94
i_input[6] = -96
i_input[7] = -98
i_input[8] = -98
i_input[9] = -97
i_output[0] = 2
i_output[1] = 2
i_output[2] = 2
i_output[3] = 2
i_output[4] = 2
i_output[5] = 2
i_output[6] = 2
i_output[7] = 2
i_output[8] = 2
i_output[9] = 2
Device Address on Host = 0x1f01d90000
Debug GPU Results…
-95 = 0
-95 = 0
-94 = 0
-94 = 0
-93 = 0
-94 = 0
-96 = 0
-98 = 0
-98 = 0
-97 = 0
========= ERROR SUMMARY: 0 errors

The issue was pretty simple. I didn’t assign h_o_image.width and h_o_image.height to the correct values before calling cudaMemcpy(). I moved these assignments in main() we the program is working correctly.

=====================================================
main.cu

#include <stdio.h>
#include <stdlib.h>
#include <pgm.h>
#include <gpu.h>
#include <csv.h>

int main(int argv, char *argc)
{

//Main Program

//Debug Program Parts
//0. Check for correct usage.
if (argv != 4)
{
printf(“Usage: ./lab2 kernel.csv input.pgm output.pgm\n”);
exit(1);
}
//1. Read in pgm file on command line.
read_pgm(argc[2]);

//2. Read in the kernel from file.
csv_read(argc[1]);

//3. Run the user selected operation on the image.
h_o_image.width = h_i_image.width;
h_o_image.height = h_i_image.height;
gpu_setup();
gpu_run(gridSize, blockSize);
gpu_results();

//3. Print out the image.
//a. Assign input image parameters to output.
int i;
for (i = 0; i != 2; i++)
{
h_o_image.magic_number[i] = h_i_image.magic_number[i];
}
write_pgm(argc[3]);

return 0;
}