cudaMemcpy() returns success and copy incorrect data

allanoverst · March 4, 2017, 4:40pm

I have been having an issue with cudaMemcpy() where cudaSuccess is returned and incorrect data is copied to the host. I have double check the host/device pointers and am pretty sure they are correct. I have run this code on a GTX970 and K20m with the same results. The directory structure of this project is,

And some useful code,

=====================================================================================
makefile

lab2: main.o pgm.o gpu.o kernels.o csv.o
nvcc -G -g -o lab2 build/main.o build/pgm.o build/gpu.o
build/kernels.o build/csv.o

main.o: src/main.cu
nvcc -G -g -c src/main.cu -o build/main.o -I ./include

pgm.o: lib/pgm.cu
nvcc -G -g -c lib/pgm.cu -o build/pgm.o -I ./include

gpu.o: lib/gpu.cu
nvcc -G -g -c lib/gpu.cu -o build/gpu.o -I ./include

kernels.o: lib/kernels.cu
nvcc -gencode arch=compute_30,code=sm_30 -G -g -c lib/kernels.cu -o build/kernels.o -I ./include

csv.o: lib/csv.cu
nvcc -G -g -c lib/csv.cu -o build/csv.o -I ./include

clean:
rm -r ./build/*.o

Header Files

pgm.h

//Prevent multiple inclusions
#ifndef PGMLIB_H
#define PGMLIB_H

//Global Variables
typedef struct
{
char magic_number[2];
unsigned int width;
unsigned int height;
unsigned int max_val;
char *image;
} pgm;

extern pgm h_i_image;
extern pgm h_o_image;
extern char *fp_input;
//extern unsigned int fp_length = 0;

//Functions
void read_pgm(char *filename);
void write_pgm(char *filename);
void parse_pgm();
#endif

=====================================================================================
csv.h

//Preven multiple inclusions
#ifndef CSVLIB_H
#define CSVLIB_H

//Global Vars
extern unsigned char memory_type;
extern unsigned char memory_check;
extern unsigned char dim_check;
extern int *h_kernel;
extern int h_kern_rows;
extern int h_kern_cols;

//Functions
void csv_read(char *filename);

#endif

=====================================================================================
gpu.h

#ifndef LIBGPU_H
#define LIBGPU_H

#include <cuda.h>
#include <cuda_runtime.h>
#include <pgm.h>
//Global Variables
extern dim3 blockSize, gridSize;
extern int *d_kernel;
extern pgm d_image_i;
extern pgm d_image_o;

//Functions
void gpu_setup();
void gpu_run(dim3 gSize, dim3 bSize);
void gpu_results();

#endif

=====================================================================================
kernels.h

#ifndef LIBKERN_H
#define LIBKERN_H

//Kernel Functions
global void d1convolution(int *kern, char *i_input, char *i_output,
unsigned int kern_rows, unsigned int kern_cols,
unsigned int num_rows, unsigned int num_cols);
#endif

Source Files

main.cu

#include <stdio.h>
#include <stdlib.h>
#include <pgm.h>
#include <gpu.h>
#include <csv.h>

int main(int argv, char *argc)
{

//Main Program

//Debug Program Parts
//0. Check for correct usage.
if (argv != 4)
{
printf(“Usage: ./lab2 kernel.csv input.pgm output.pgm\n”);
exit(1);
}
//1. Read in pgm file on command line.
read_pgm(argc[2]);

//2. Read in the kernel from file.
csv_read(argc[1]);

//3. Run the user selected operation on the image.
gpu_setup();
gpu_run(gridSize, blockSize);
gpu_results();

//3. Print out the image.
//a. Assign input image parameters to output.
h_o_image.width = h_i_image.width;
h_o_image.height = h_i_image.height;
int i;
for (i = 0; i != 2; i++)
{
h_o_image.magic_number[i] = h_i_image.magic_number[i];
}
write_pgm(argc[3]);

return 0;
}

=====================================================================================
pgm.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <pgm.h>

//Global Variables
char *fp_input;
unsigned int fp_length = 0;
pgm h_i_image;
pgm h_o_image;

//Functions
void read_pgm(char *filename)
{
//0. Opem pgm file for reading.
FILE *fp;
const char *mode = “r”;
fp = fopen(filename, mode);
if (fp == NULL)
{
printf(“Can’t open file, %s\n”, filename);
exit(1);
}

//1. Allocate memory to read in the pgm file.
while (fgetc(fp) != EOF)
{
fp_length++;
}
rewind(fp);

fp_input = (char )malloc(fp_lengthsizeof(char));
fread(fp_input, sizeof(char), fp_length, fp);

//2. Parse the image data.
parse_pgm();
}

void parse_pgm()
{
unsigned int i,j;
unsigned int line_count = 0;
char *buffer;
buffer = (char )malloc(100sizeof(char));
unsigned int buffer_index = 0;
unsigned int mgk_num_i = 0;
unsigned int image_index = 0;
char memory_allocated = 0;
for (i = 0; i != fp_length; i++)
{
//a. Get the header information.
if (line_count <= 3)
{
//i. Get the image magick number.
if (line_count == 0)
{
if (fp_input[i] != ‘\n’)
{
//Magic Number to i_mage struct.
h_i_image.magic_number[mgk_num_i] = fp_input[i];
mgk_num_i++;
}
if (fp_input[i] == ‘\n’)
{
//Increae the line_count;
line_count++;
i++;
}
}

  //iii. Get the width and height.
  if (line_count == 1)
    {
      if (fp_input[i] != ' ' & fp_input[i] != '\n')
	{
	  buffer[buffer_index] = fp_input[i];
	  buffer_index++;
	}
      if (fp_input[i] == ' ')
	{
	  //Assign buffer to width integer.
	  h_i_image.width = atoi(buffer);
	  //printf("width buffer = %s\n", buffer);
	  //Reset the buffer and buffer_index.
	  for (j = 0; j != 100; j++)
	    {
	      buffer[j] = 0;
	    }
	  buffer_index = 0;
	}
      if (fp_input[i] == '\n')
	{
	  //Assign buffer to height integer.
	  h_i_image.height = atoi(buffer);
	  //printf("height buffer = %s\n", buffer);
	  //Reset the buffer_index.
	  buffer_index = 0;
	  for (j = 0; j != 100; j++)
	    {
	      buffer[j] = 0;
	    }
	  //Move to next line.
	  line_count++;
	  i++;
	}
    }
  
  //iv. Get the image max_val.
  if (line_count == 2)
    {
      if (fp_input[i] != '\n')
	{
	  buffer[buffer_index] = fp_input[i];
	  buffer_index++;
	}
      if (fp_input[i] == '\n')
	{
	  //Assign max_val to integer.
	  h_i_image.max_val = atoi(buffer);
	  //Move to the image.
	  line_count++;
	  //Reset the buffer index.
	  buffer_index = 0;
	  for (j = 0; j != 100; j++)
	    {
	      buffer[j] = 0;
	    }
	  i++;
	}
    }
}
  
  //b. Read in the image.
  if (line_count == 3)
{
  //printf("index = %d\n", i);
  if (memory_allocated == 0)
    {
      h_i_image.image = (char *)malloc(h_i_image.width*
				       h_i_image.height*
				       sizeof(char));
      memory_allocated = 1;
    }
  h_i_image.image[image_index] = fp_input[i];
  image_index++;
}
}

///*
//Debug
printf(“\n”);
printf(“i_image.magic_number = %s\n”, h_i_image.magic_number);
printf(“i_image.width = %d\n”, h_i_image.width);
printf(“i_image.height = %d\n”, h_i_image.height);
printf(“i_image.max_val = %d\n”, h_i_image.max_val);
//int w_index = 0;
/*
for (i = 0; i != h_i_image.width*h_i_image.height; i++)
{
printf(“%d “, h_i_image.image[i]);
w_index++;
if (w_index == h_i_image.width)
{
printf(”\n”);
w_index = 0;
}
}
printf(“\n”);
printf(“%d #\n”, ‘#’);
//End Debug
*/
}

void write_pgm(char *filename)
{
FILE *fp;
const char *mode = “w”;
fp = fopen(filename, mode);
if (fp == NULL)
{
printf(“Can’t open file %s”, filename);
exit(1);
}

fprintf(fp, “%s\n”, h_o_image.magic_number);
fprintf(fp, “%d %d\n”, h_o_image.width, h_o_image.height);
fprintf(fp, “%d\n”, h_o_image.max_val);
fwrite(h_o_image.image, 1, h_o_image.width*h_o_image.height, fp);
fclose(fp);
}

=====================================================================================
csv.cu

#include <stdio.h>
#include <stdlib.h>
#include <gpu.h>

//Global Host Variables
int *h_kernel;
unsigned int h_kern_rows;
unsigned int h_kern_cols;
void csv_read(char *filename)
{
//Args
//filename → filename of the csv file to readin
//matrix → character of the input matrix (a or b).
char *fp_input;
unsigned int fp_length=0;
//0. Open the file.
FILE *fp;
fp = fopen(filename, “r”);
if (fp == NULL)
{
printf(“Can’t open file %s\n”, filename);
exit(1);
}

//1. Find the length of the file.
while(fgetc(fp) != EOF)
{
fp_length++;
}
rewind(fp);

//2. Allocate memory to store the file.
fp_input = (char )malloc(fp_lengthsizeof(char));

//3. Read in the file.
fread(fp_input, sizeof(char), fp_length, fp);
fclose(fp);
//4. Parse the file for array size

unsigned int i;

for (i = 0; i != fp_length; i++)
{
if (fp_input[i] == ‘\n’)
h_kern_rows++;
if (fp_input[i] == ‘,’)
h_kern_cols++;
}
//
if (h_kern_rows != 0)
{
h_kern_cols = h_kern_cols/h_kern_rows+1;
//printf(“h_kern_cols = %d\n”, h_h_kern_cols);
}

if (h_kern_rows == 0)
{
h_kern_cols++;
h_kern_rows++;
printf(“h_kern_rows = %d\n”, h_kern_rows);
printf(“h_kern_cols = %d\n”, h_kern_cols);
}

//5. Allocate memory to store the array.

h_kernel = (int )malloc(h_kern_rowsh_kern_cols*sizeof(int));

//6. Parse the file for data and store.

char *buffer;
unsigned int buffer_index=0;
unsigned int k_i=0; //Row index.
unsigned int k_j=0; //Col index.

buffer = (char *)calloc(100,sizeof(char));

unsigned int j;

for (i = 0; i != fp_length; i++)
{
if (fp_input[i] != ‘,’ && fp_input[i] != ‘\n’)
{
//Add characters to the buffer.
buffer[buffer_index] = fp_input[i];
//Increase the buffer index
buffer_index++;
}

 if (fp_input[i] == ',')
   {
		   //Reset the buffer index.
	buffer_index = 0;
	//Convert the value on the buffer.
	//kernel[k_i*h_kern_cols+k_j] = atoi(buffer);
	//Increase the col index.
	k_j++;
	//Clean the buffer.
	for (j = 0; j != 100; j++)
	  {
	    buffer[j] = 0;
	  }
   }
 
 if (fp_input[i] == '\n')
   {
	//Reset the buffer index.
	buffer_index = 0;
	//Convert the value on the buffer.
	//kernel[k_i*h_kern_cols+k_j] = atoi(buffer);
	//Reset the col. index.
	k_j=0;
	//Increase the row index.
	k_i++;
	//Clean the buffer.
	for (j = 0; j != 100; j++)
	  {
	    buffer[j] = 0;
	  }
   }
}

}

=====================================================================================
kernels.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

global void d1convolution(int *kern, char *i_input, char *i_output,
unsigned int kern_rows, unsigned int kern_cols,
unsigned int num_rows, unsigned int num_cols)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;

unsigned int k;
//printf(“Kernel Running\n”);
if (idx < num_cols && idy < num_rows)
{
if (idx == 0 & idy == 0)
{
printf(“Kernel Running…\n”);
printf(“num_rows = %d and num_cols = %d\n”, num_rows, num_cols);
printf(“Address of i_output = %p\n”, (void *)i_output);
}

  //i_output[idy*num_cols+idx] = 0;
 for (k = 0; k != kern_cols; k++)
   {
     //i_output[idy*num_cols+idx] += kern[k]*i_input[idy*num_cols+idx];
   }
 i_output[idy*num_cols+idx] = 2;
 /*
 printf("i_output[%d] = %d\n",
	idy*num_cols+idx,
	i_output[idy*num_cols+idx]);
 */
}

if (idx < 10 && idy == 0)
{
printf(“i_input[%d] = %d\n”,
idynum_cols+idx,
i_input[idynum_cols+idx]);

  printf("i_output[%d] = %d\n",
     idy*num_cols+idx,
     i_output[idy*num_cols+idx]);
}

}

=====================================================================================
gpu.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <kernels.h>
#include <csv.h>
#include <gpu.h>
//Global Variables
dim3 blockSize, gridSize;
int *d_kernel;
pgm d_image_i;
pgm d_image_o;
char *o_debug;

void gpu_setup()
{
//0. Allocate memory on the device.
cudaError_t err;

err = cudaMalloc((void**)&d_kernel,
h_kern_rowsh_kern_colssizeof(int));

if (err != cudaSuccess)
{
printf(“Can’t allocate cuda memory\n”);
exit(1);
}

err = cudaMalloc((void**)&d_image_i.image,
h_i_image.widthh_i_image.heightsizeof(char));

if (err != cudaSuccess)
{
printf(“Can’t allocate cuda memory\n”);
exit(1);
}

err = cudaMalloc((void**)&d_image_o.image,
h_i_image.widthh_i_image.heightsizeof(char));

if (err != cudaSuccess)
{
printf(“Can’t allocate cuda memory.\n”);
exit(1);
}

//1. Copy arrays to the device.
//a. Copy the kernel to the device.
err = cudaMemcpy(d_kernel,
h_kernel,
h_kern_rowsh_kern_colssizeof(int),
cudaMemcpyHostToDevice);

//printf(“err = %d\n”, err);
//printf(“cudaSuccess = %d\n”, cudaSuccess);

if (err != cudaSuccess)
{
printf(“Error: Copying memory from host to device…\n”);
exit(1);
}

//b. Copy the host input image to the device.
err = cudaMemcpy(d_image_i.image,
h_i_image.image,
h_i_image.widthh_i_image.heightsizeof(char),
cudaMemcpyHostToDevice);

if (err != cudaSuccess)
{
printf(“Error: Copying memory from host to device.\n”);
exit(1);
}

//2. Calculate blockSize and gridSize.
//unsigned int threads_block_dim[2], block_grid_dim[2];
//a. Query the devcie properties.
struct cudaDeviceProp gpu_property;
int gpu_count;

//i. Get the GPU Count
cudaGetDeviceCount(&gpu_count);
cudaGetDeviceProperties(&gpu_property, 0);

//ii. Get the threads_block_dim and block_grid_dim.
/*
unsigned char i;
for (i = 0; i != 2; i++)
{
threads_block_dim[i] = gpu_property.maxThreadsDim[i];
block_grid_dim[i] = gpu_property.maxGridSize[i];
}
*/

//ii. Get the GPU Arch.
printf(“Arch = %d%d\n”,
gpu_property.major,
gpu_property.minor);

//iii. GPU Block and Grid Allocation Method.
if (h_i_image.widthh_i_image.height >
gpu_property.maxThreadsPerBlock)
{
blockSize.x = sqrt(gpu_property.maxThreadsPerBlock);
blockSize.y = sqrt(gpu_property.maxThreadsPerBlock);
gridSize.x =
ceil(sqrt((float)ceil((float)h_i_image.widthh_i_image.height/
gpu_property.maxThreadsPerBlock)));
gridSize.y =
ceil(sqrt((float)ceil((float)h_i_image.width*h_i_image.height/
gpu_property.maxThreadsPerBlock)));
}

if (h_i_image.widthh_i_image.height <
gpu_property.maxThreadsPerBlock)
{
blockSize.x = ceil((float)sqrt((float)h_i_image.widthh_i_image.height));
blockSize.y = ceil((float)sqrt((float)h_i_image.width*h_i_image.height));
gridSize.x = 1;
gridSize.y = 1;
}

printf(“blockSize = %d,%d\n”, blockSize.x, blockSize.y);
printf(“gridSize = %d,%d\n”, gridSize.x, gridSize.y);

}

void gpu_run(dim3 gSize, dim3 bSize)
{

//Variables
cudaError_t err;

//1. Run the 1D Kernel.
if (h_kern_rows == 1)
{
printf(“Run the 1D kernel…\n”);
printf(“blockSize = %d,%d\n”, bSize.x, bSize.y);
printf(“gridSize = %d,%d\n”, gSize.x, gSize.y);
d1convolution<<<gSize, bSize>>>(d_kernel, d_image_i.image, d_image_o.image,
h_kern_rows, h_kern_cols,
h_i_image.height, h_i_image.width);

 err = cudaGetLastError();
 if (err != cudaSuccess)
   {
	printf("Failed to launch kernel.\n %s \n", cudaGetErrorString(err));
	exit(1);
   }
 cudaDeviceSynchronize();
}

/*
//2. Run the 2D Kernel.
if (h_kern_rows > 1)
{
d2convolution<<<gSize, bSize>>>(d_kernel,
d_image,
d_image,
num_rows,
num_cols);
err = cudaGetLastError();
if (err != cudaSuccess)
{
printf(“Failed to launch kernel.\n %s \n”, cudaGetErrorString(err));
exit(1);
}
}
*/
}

void gpu_results()
{
//0. Allocate memory on host to store the image.
h_o_image.image = (char*)malloc(h_i_image.widthh_i_image.heightsizeof(char));
//o_debug = (char*)malloc(h_i_image.widthh_i_image.heightsizeof(char));
/*
if (h_o_image.image == NULL)
{
printf(“Error: Failed to allocate memory on host.\n”);
exit(1);
}
*/

//1. Copy result array from the device.
cudaError_t err;
//->->THIS IS WHERE THE ISSUE IS!<-<-
err = cudaMemcpy(h_o_image.image,
d_image_o.image,
h_o_image.widthh_o_image.heightsizeof(char),
cudaMemcpyDeviceToHost);

if (err != cudaSuccess)
{
printf(“Error: Can’t copy memory from gpu to host.\n”);
exit(1);
}

err = cudaGetLastError();

if (err != cudaSuccess)
{
printf(“Failed to copy memory from device. \n %s \n”, cudaGetErrorString(err));
exit(1);
}

//Debug
printf(“Device Address on Host = %p\n”, d_image_o.image);
//End Debug

//Debug
int i;
printf(“Debug GPU Results…\n”);
for (i = 0; i != 10; i++)
{
printf(“%d = %d\n”, h_i_image.image[i], h_o_image.image[i]);
}
//End Debug
}

=====================================================================================
Example Program Output

i_image.magic_number = P5
i_image.width = 256
i_image.height = 256
i_image.max_val = 255
h_kern_rows = 1
h_kern_cols = 3
Arch = 30
blockSize = 32,32
gridSize = 8,8
Run the 1D kernel…
blockSize = 32,32
gridSize = 8,8
Kernel Running…
num_rows = 256 and num_cols = 256
Address of i_output = 0x1f01d90000
i_input[0] = -95
i_input[1] = -95
i_input[2] = -94
i_input[3] = -94
i_input[4] = -93
i_input[5] = -94
i_input[6] = -96
i_input[7] = -98
i_input[8] = -98
i_input[9] = -97
i_output[0] = 2
i_output[1] = 2
i_output[2] = 2
i_output[3] = 2
i_output[4] = 2
i_output[5] = 2
i_output[6] = 2
i_output[7] = 2
i_output[8] = 2
i_output[9] = 2
Device Address on Host = 0x1f01d90000
Debug GPU Results…
-95 = 0
-95 = 0
-94 = 0
-94 = 0
-93 = 0
-94 = 0
-96 = 0
-98 = 0
-98 = 0
-97 = 0

njuffa · March 4, 2017, 9:05pm

When you run the program under control of cuda-memcheck, are any errors reported?

allanoverst · March 4, 2017, 9:10pm

I have run the program using cuda-memcheck and no errors are reported.

cuda-memcheck ./lab2 kernel1.csv test-images/Lenna_Images/Lenna_org_256.pgm out.pgm

========= CUDA-MEMCHECK

i_image.magic_number = P5
i_image.width = 256
i_image.height = 256
i_image.max_val = 255
h_kern_rows = 1
h_kern_cols = 3
Arch = 30
blockSize = 32,32
gridSize = 8,8
Run the 1D kernel…
blockSize = 32,32
gridSize = 8,8
Kernel Running…
num_rows = 256 and num_cols = 256
Address of i_output = 0x1f01d90000
i_input[0] = -95
i_input[1] = -95
i_input[2] = -94
i_input[3] = -94
i_input[4] = -93
i_input[5] = -94
i_input[6] = -96
i_input[7] = -98
i_input[8] = -98
i_input[9] = -97
i_output[0] = 2
i_output[1] = 2
i_output[2] = 2
i_output[3] = 2
i_output[4] = 2
i_output[5] = 2
i_output[6] = 2
i_output[7] = 2
i_output[8] = 2
i_output[9] = 2
Device Address on Host = 0x1f01d90000
Debug GPU Results…
-95 = -40
-95 = 36
-94 = -55
-94 = 32
-93 = 122
-94 = 127
-96 = 0
-98 = 0
-98 = -40
-97 = 36
========= ERROR SUMMARY: 0 errors

Whats interesting is when I run the program a second time it obtain different results in the h_o_image.image array. Below are results from running the program a second time.

cuda-memcheck ./lab2 kernel1.csv test-images/Lenna_Images/Lenna_org_256.pgm out.pgm
========= CUDA-MEMCHECK

i_image.magic_number = P5
i_image.width = 256
i_image.height = 256
i_image.max_val = 255
h_kern_rows = 1
h_kern_cols = 3
Arch = 30
blockSize = 32,32
gridSize = 8,8
Run the 1D kernel…
blockSize = 32,32
gridSize = 8,8
Kernel Running…
num_rows = 256 and num_cols = 256
Address of i_output = 0x1f01d90000
i_input[0] = -95
i_input[1] = -95
i_input[2] = -94
i_input[3] = -94
i_input[4] = -93
i_input[5] = -94
i_input[6] = -96
i_input[7] = -98
i_input[8] = -98
i_input[9] = -97
i_output[0] = 2
i_output[1] = 2
i_output[2] = 2
i_output[3] = 2
i_output[4] = 2
i_output[5] = 2
i_output[6] = 2
i_output[7] = 2
i_output[8] = 2
i_output[9] = 2
Device Address on Host = 0x1f01d90000
Debug GPU Results…
-95 = 0
-95 = 0
-94 = 0
-94 = 0
-93 = 0
-94 = 0
-96 = 0
-98 = 0
-98 = 0
-97 = 0
========= ERROR SUMMARY: 0 errors

allanoverst · March 4, 2017, 11:49pm

The issue was pretty simple. I didn’t assign h_o_image.width and h_o_image.height to the correct values before calling cudaMemcpy(). I moved these assignments in main() we the program is working correctly.

=====================================================
main.cu

#include <stdio.h>
#include <stdlib.h>
#include <pgm.h>
#include <gpu.h>
#include <csv.h>

int main(int argv, char *argc)
{

//Main Program

//Debug Program Parts
//0. Check for correct usage.
if (argv != 4)
{
printf(“Usage: ./lab2 kernel.csv input.pgm output.pgm\n”);
exit(1);
}
//1. Read in pgm file on command line.
read_pgm(argc[2]);

//2. Read in the kernel from file.
csv_read(argc[1]);

//3. Run the user selected operation on the image.
h_o_image.width = h_i_image.width;
h_o_image.height = h_i_image.height;
gpu_setup();
gpu_run(gridSize, blockSize);
gpu_results();

//3. Print out the image.
//a. Assign input image parameters to output.
int i;
for (i = 0; i != 2; i++)
{
h_o_image.magic_number[i] = h_i_image.magic_number[i];
}
write_pgm(argc[3]);

return 0;
}

Topic		Replies	Views
program crash when copying from device to host <br /> CUDA Programming and Performance	11	1903	March 31, 2009
cudaMemcpy does not copy data from the host to device CUDA Programming and Performance	6	7053	June 20, 2012
cudaMemcpy problem problem with using cudaMemcpy CUDA Programming and Performance	12	5947	January 5, 2010
Device Memeroy allocation and data transfer Data transfer between host and device CUDA Programming and Performance	5	2595	June 16, 2011
Newbie problem with copying between host and device My image is just gr{a\|e}y CUDA Programming and Performance	8	5305	May 30, 2011
cudaMemCpy wrong results CUDA Programming and Performance	3	4801	May 26, 2011
cuda malloc and mecpy error or is it? CUDA Programming and Performance	1	2601	May 23, 2009
Should I worry about these invalid reads? Valgrind invalid reads reported for Memcpy2D CUDA Programming and Performance	2	7404	April 21, 2009
cudaMemcpy copying wrong data set back from GPU CUDA Programming and Performance	2	1402	January 3, 2016
cudaMemcpy not working? CUDA Programming and Performance	3	4333	May 27, 2009

cudaMemcpy() returns success and copy incorrect data

===================================================================================== makefile

Header Files

pgm.h

===================================================================================== csv.h

===================================================================================== gpu.h

===================================================================================== kernels.h

Source Files

main.cu

===================================================================================== pgm.cu

===================================================================================== csv.cu

===================================================================================== kernels.cu

===================================================================================== gpu.cu

===================================================================================== Example Program Output

===================================================== main.cu

Related topics