cudaMemcpy does not copy data from the host to device

Hello all,

So after a few hours of different tests, I found that my cudaMemcpy, although returning successful, is not actually copying data from the host to the device.

Here’s the code:

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <cstdio>

#include <fstream>

#include <windows.h>

using namespace std;

/***********************************************

* Variables that may be changed before running *

************************************************/

//TESTING TYPE

//Timing test:    true

//Centroid test:  false

bool timing                                     = false;

bool cudaTest									= false;

//Accuracy changing here

typedef float decimalAccuracy;

//Size of the image in pixels

const int height								= 500;

const int width	   								= 500;

//Search box specifications

const int numberOfSearches				        = 1;

const decimalAccuracy initialSearchBoxRadius  	= 10;

//Itterative Specifications

decimalAccuracy searchBoxMultiplier			    = 1.0;

int numberOfItterations                         = 1;

//Timing testing

int  timing_repetitions                         = 25;

//Input file names:

//Data file

const char *fileName       = "blah.dat";

//Reference coordinates

const char *fileNameRef    = "blah2.dat";

//Output file names:

//Result output file name

const char *outputFileName = "out.dat";

//Timing data output file name

const char *timingfileName = "time_out.dat";

__global__ void centroidKernel   (decimalAccuracy* device_reference_coordinates_x, 

								  decimalAccuracy* device_reference_coordinates_y,

								  decimalAccuracy* device_working_reference_coordinates_x, 

							      decimalAccuracy* device_working_reference_coordinates_y,

								  decimalAccuracy* device_image_to_centroid,

					              decimalAccuracy  initialSearchBoxRadius,

								  int              width,

								  int              height,

								  decimalAccuracy  searchBoxMultiplier,

								  int              desiredItterations)

{

	decimalAccuracy  workingSearchBoxMultiplier = searchBoxMultiplier;

	//Create "perminate" references for each side as to not leave that area

	decimalAccuracy left   = device_reference_coordinates_x[blockIdx.x] - initialSearchBoxRadius - 0.5;

	decimalAccuracy right  = device_reference_coordinates_x[blockIdx.x] + initialSearchBoxRadius + 0.5;

	decimalAccuracy top    = device_reference_coordinates_y[blockIdx.x] - initialSearchBoxRadius - 0.5;

	decimalAccuracy bottom = device_reference_coordinates_y[blockIdx.x] + initialSearchBoxRadius + 0.5;

	//Loop Start here

	//Create "perminate" references for each side as to not leave that area

	decimalAccuracy working_left   = device_working_reference_coordinates_x[blockIdx.x] - workingSearchBoxMultiplier - 0.5;

	decimalAccuracy working_right  = device_working_reference_coordinates_x[blockIdx.x] + workingSearchBoxMultiplier + 0.5;

	decimalAccuracy working_top    = device_working_reference_coordinates_y[blockIdx.x] - workingSearchBoxMultiplier - 0.5;

	decimalAccuracy working_bottom = device_working_reference_coordinates_y[blockIdx.x] + workingSearchBoxMultiplier + 0.5;

	//Create Variables for centroiding

	//Column = x

	//Row    = y

	__shared__ decimalAccuracy row_sum,

							   column_sum,

							   intensity_sum;

	row_sum       = 0;

	column_sum    = 0;

	intensity_sum = 0;

	__syncthreads();

	decimalAccuracy threadIntensity = device_image_to_centroid[blockIdx.x * width * height + ((int)floor(top) + threadIdx.y * width) + (int)floor(left) + threadIdx.x];

	//Using boolean math, determine the value of the "pixel" or thread

	//Thread index is of the search box, not the entire image, left is entire image. Convert

	decimalAccuracy x_fraction = (abs(left + threadIdx.x - working_left)  < 0.5) * ((ceil(working_left)   + working_left)  / 2)        //If left fraction

		                        +(abs(left + threadIdx.x - working_right) < 0.5) * ((floor(working_right) + working_right) / 2)        //If right fraction

								+(working_left  <= threadIdx.x + left) * (left + threadIdx.x + 0.5 <= working_right);                  // Whole Pixels  

	decimalAccuracy y_fraction = (abs(top + threadIdx.y - working_top) <    0.5) * ((ceil(working_top)     + working_top)    / 2)      //If left fraction

		                        +(abs(top + threadIdx.y - working_bottom) < 0.5) * ((floor(working_bottom) + working_bottom) / 2)      //If right fraction

								+(working_top <=threadIdx.y + top)     * (top + threadIdx.y + 0.5 <= working_bottom);                  // Whole Pixels  

	

	__syncthreads();

	//Centroid Sums Based on thread index

	atomicAdd(&row_sum      , threadIdx.y * y_fraction * threadIntensity);

	atomicAdd(&column_sum   , threadIdx.x * x_fraction * threadIntensity);

	atomicAdd(&intensity_sum, threadIntensity);

	__syncthreads();

	//Store the new reference coordinates

	if(threadIdx.x == 0 && threadIdx.y == 0){

		printf("%f, %f\t\n", row_sum, intensity_sum);

		device_working_reference_coordinates_x[blockIdx.x] = row_sum    / intensity_sum;

		device_working_reference_coordinates_y[blockIdx.x] = column_sum / intensity_sum;

	}

}

int main()

{

  	

	/************************************************************

	* Pulls in the large image and stores it to a 1-D, pagelocked

	* array.

	*************************************************************/

	decimalAccuracy* image_to_centroid;

	cudaHostAlloc((void**)&image_to_centroid, width * height * sizeof(decimalAccuracy), cudaHostAllocDefault);

	//Loads the image data from a file and stores in the "testImage"

	ifstream imageData(fileName);

	if(imageData.is_open()){

		for(int y = 0; y < height; y++){

			for(int x = 0; x < width; x++){	

				imageData >> image_to_centroid[y*width+x];

			}

		}

	}else{

		//Displays a message if the file does not exist

		printf("File is not open!\n\n");

	}

	//Closes the file

	imageData.close();

	/************************************************************

	* Pulls in the reference Coordinates and stores them to two

	* 1-D, pagelocked arrays.

	*************************************************************/

	//loads the reference coordinates from a file

	decimalAccuracy* reference_coordinates_x;

	decimalAccuracy* reference_coordinates_y;

	cudaHostAlloc((void**)&reference_coordinates_x, numberOfSearches * sizeof(decimalAccuracy), cudaHostAllocDefault);

	cudaHostAlloc((void**)&reference_coordinates_y, numberOfSearches * sizeof(decimalAccuracy), cudaHostAllocDefault);

	ifstream refs(fileNameRef);

	

	//Fills the vector with SingleLenslets

	if(refs.is_open()){

		for(int currentReferenceCoordinate = 0; currentReferenceCoordinate < numberOfSearches; currentReferenceCoordinate++){

			refs >> reference_coordinates_x[currentReferenceCoordinate];

			refs >> reference_coordinates_y[currentReferenceCoordinate];

		}

	//Displays a message if the file does not exist

	}else{

		printf("File is not open!\n\n");

	}

	//Closes the file

	refs.close();

	/******************************************************************

	* Copy data to the device. This data includes:

	* - Array for the large image

	* - Arrays for the reference coordinates

	*******************************************************************/

	//Create each array

	decimalAccuracy* device_reference_coordinates_x;

	decimalAccuracy* device_reference_coordinates_y;

	decimalAccuracy* device_working_reference_coordinates_x;

	decimalAccuracy* device_working_reference_coordinates_y;

	decimalAccuracy* device_image_to_centroid;

	//Allocate memory on the device for each array

	//And prints the error messages

	cudaMalloc((void**) &device_reference_coordinates_x        , numberOfSearches * sizeof(decimalAccuracy));

	cudaMalloc((void**) &device_reference_coordinates_y        , numberOfSearches * sizeof(decimalAccuracy));

	cudaMalloc((void**) &device_working_reference_coordinates_x, numberOfSearches * sizeof(decimalAccuracy));

	cudaMalloc((void**) &device_working_reference_coordinates_y, numberOfSearches * sizeof(decimalAccuracy));

	cudaMalloc((void**) &device_image_to_centroid              , width * height   * sizeof(decimalAccuracy));

	printf("Device Memory Allocation:\t%s\n", cudaGetErrorString(cudaGetLastError()));

	//Copy all of the data

	cudaMemcpy(device_reference_coordinates_x, reference_coordinates_x, numberOfSearches * sizeof(decimalAccuracy), cudaMemcpyHostToDevice);

	cudaMemcpy(device_reference_coordinates_y, reference_coordinates_y, numberOfSearches * sizeof(decimalAccuracy), cudaMemcpyHostToDevice);

	

	cudaMemcpy(device_working_reference_coordinates_x, reference_coordinates_x, numberOfSearches * sizeof(decimalAccuracy), cudaMemcpyHostToDevice);

	cudaMemcpy(device_working_reference_coordinates_y, reference_coordinates_y, numberOfSearches * sizeof(decimalAccuracy), cudaMemcpyHostToDevice);

	

	cudaMemcpy(device_image_to_centroid,       image_to_centroid,       width * height   * sizeof(decimalAccuracy), cudaMemcpyHostToDevice);

	printf("Device Variable Copying:\t%s\n", cudaGetErrorString(cudaGetLastError()));

	/******************************************************************

	* Launches the kernel with the following parameters:

	* - Array for the large image

	* - Arrays for the reference coordinates

	* - Arrays for working reference coordinates

	* - Search box radius

	* - Image width

	* - Image height

	* - Search Box Multiplier

	* - Number of itterations

	********************************************************************/

	dim3 threadDimensions(2 * initialSearchBoxRadius + 1, 2 * initialSearchBoxRadius + 1, 1);

	centroidKernel<<<numberOfSearches, threadDimensions>>>(device_reference_coordinates_x, 

														   device_reference_coordinates_y,

														   device_working_reference_coordinates_x, 

														   device_working_reference_coordinates_y,

														   device_image_to_centroid,

														   initialSearchBoxRadius,

														   width,

														   height,

														   searchBoxMultiplier,

														   numberOfItterations);

	printf("Kernel Launch:\t\t\t%s\n", cudaGetErrorString(cudaGetLastError()));

	//Sync device

	cudaDeviceSynchronize();

	printf("Kernel Return:\t\t\t%s\n", cudaGetErrorString(cudaGetLastError()));

	/*******************************************************************

	* Coppies back the following data:

	* - Arrays for the new reference coordinates

	********************************************************************/

	decimalAccuracy* working_reference_coordinates_x;

	decimalAccuracy* working_reference_coordinates_y;

	//Malloced due to not being accesed on the GPU

	working_reference_coordinates_x = (decimalAccuracy*) malloc( numberOfSearches * sizeof(decimalAccuracy));

	working_reference_coordinates_y = (decimalAccuracy*) malloc( numberOfSearches * sizeof(decimalAccuracy));

	cudaMemcpy(working_reference_coordinates_x, device_working_reference_coordinates_x, numberOfSearches * sizeof(decimalAccuracy), cudaMemcpyDeviceToHost);

	cudaMemcpy(working_reference_coordinates_y, device_working_reference_coordinates_x, numberOfSearches * sizeof(decimalAccuracy), cudaMemcpyDeviceToHost);

	printf("Device Variable Return Copying:\t%s\n", cudaGetErrorString(cudaGetLastError()));

	

	/*******************************************************************

	* Outputs the spot shift to a file

	********************************************************************/

	ofstream out(outputFileName);

	//Output configurations

	out.width(16); 

	out << scientific;

	out.precision(7);

	for(int currentSpotShift = 0; currentSpotShift < numberOfSearches; currentSpotShift++){

		out << right << reference_coordinates_x[currentSpotShift] - working_reference_coordinates_x[currentSpotShift] << " ";

		out << right << reference_coordinates_y[currentSpotShift] - working_reference_coordinates_y[currentSpotShift] << endl;

		

	}

	out.close();

			

	//Waits for button press to enure you are aware it is finished

	system("PAUSE");

	return 0;

}

Any Ideas?

How do you know the cudaMemcpy() calls are successful? Nowhere in your code the return code is checked…

It’s right under it. I use printf so I can check it each time it runs.

cudaMemcpy(device_image_to_centroid,       image_to_centroid,       width * height   * sizeof(decimalAccuracy), cudaMemcpyHostToDevice);

        printf("Device Variable Copying:\t%s\n", cudaGetErrorString(cudaGetLastError()));

Just to add a little more detail, it simply fills the new array with 0’s, no memory leaks or anything either.

Since I don’t have the input files, I cannot run the code, which makes it hard to determine what happens without any additional information, as I do not know the output of the code. I did notice that you are actually copying the same vector twice in your code

cudaMemcpy(working_reference_coordinates_x, device_working_reference_coordinates_x, numberOfSearches * sizeof(decimalAccuracy), cudaMemcpyDeviceToHost);

cudaMemcpy(working_reference_coordinates_y, device_working_reference_coordinates_x, numberOfSearches * sizeof(decimalAccuracy), cudaMemcpyDeviceToHost);

could this be your problem?

That part has no real issue. Its the image file that it takes in. I would give you input files but it would take far too long to explain them all.

It copied the data if i make them all something simple like 7.0 but if it’s the data pulled from the file (which i printf’ed as well and it was correct) it just copies an entire array of 0’s.

I’m not really sure where to look for errors at this point.

You’re not error checking after every call! Why??