Hello all,
So after a few hours of different tests, I found that my cudaMemcpy, although returning successful, is not actually copying data from the host to the device.
Here’s the code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cstdio>
#include <fstream>
#include <windows.h>
using namespace std;
* Variables that may be changed before running *
//Timing test: true
//Centroid test: false
bool timing = false;
bool cudaTest = false;
//Accuracy changing here
typedef float decimalAccuracy;
//Size of the image in pixels
const int height = 500;
const int width = 500;
//Search box specifications
const int numberOfSearches = 1;
const decimalAccuracy initialSearchBoxRadius = 10;
//Itterative Specifications
decimalAccuracy searchBoxMultiplier = 1.0;
int numberOfItterations = 1;
//Timing testing
int timing_repetitions = 25;
//Input file names:
//Data file
const char *fileName = "blah.dat";
//Reference coordinates
const char *fileNameRef = "blah2.dat";
//Output file names:
//Result output file name
const char *outputFileName = "out.dat";
//Timing data output file name
const char *timingfileName = "time_out.dat";
__global__ void centroidKernel (decimalAccuracy* device_reference_coordinates_x,
decimalAccuracy* device_reference_coordinates_y,
decimalAccuracy* device_working_reference_coordinates_x,
decimalAccuracy* device_working_reference_coordinates_y,
decimalAccuracy* device_image_to_centroid,
decimalAccuracy initialSearchBoxRadius,
int width,
int height,
decimalAccuracy searchBoxMultiplier,
int desiredItterations)
decimalAccuracy workingSearchBoxMultiplier = searchBoxMultiplier;
//Create "perminate" references for each side as to not leave that area
decimalAccuracy left = device_reference_coordinates_x[blockIdx.x] - initialSearchBoxRadius - 0.5;
decimalAccuracy right = device_reference_coordinates_x[blockIdx.x] + initialSearchBoxRadius + 0.5;
decimalAccuracy top = device_reference_coordinates_y[blockIdx.x] - initialSearchBoxRadius - 0.5;
decimalAccuracy bottom = device_reference_coordinates_y[blockIdx.x] + initialSearchBoxRadius + 0.5;
//Loop Start here
//Create "perminate" references for each side as to not leave that area
decimalAccuracy working_left = device_working_reference_coordinates_x[blockIdx.x] - workingSearchBoxMultiplier - 0.5;
decimalAccuracy working_right = device_working_reference_coordinates_x[blockIdx.x] + workingSearchBoxMultiplier + 0.5;
decimalAccuracy working_top = device_working_reference_coordinates_y[blockIdx.x] - workingSearchBoxMultiplier - 0.5;
decimalAccuracy working_bottom = device_working_reference_coordinates_y[blockIdx.x] + workingSearchBoxMultiplier + 0.5;
//Create Variables for centroiding
//Column = x
//Row = y
__shared__ decimalAccuracy row_sum,
row_sum = 0;
column_sum = 0;
intensity_sum = 0;
decimalAccuracy threadIntensity = device_image_to_centroid[blockIdx.x * width * height + ((int)floor(top) + threadIdx.y * width) + (int)floor(left) + threadIdx.x];
//Using boolean math, determine the value of the "pixel" or thread
//Thread index is of the search box, not the entire image, left is entire image. Convert
decimalAccuracy x_fraction = (abs(left + threadIdx.x - working_left) < 0.5) * ((ceil(working_left) + working_left) / 2) //If left fraction
+(abs(left + threadIdx.x - working_right) < 0.5) * ((floor(working_right) + working_right) / 2) //If right fraction
+(working_left <= threadIdx.x + left) * (left + threadIdx.x + 0.5 <= working_right); // Whole Pixels
decimalAccuracy y_fraction = (abs(top + threadIdx.y - working_top) < 0.5) * ((ceil(working_top) + working_top) / 2) //If left fraction
+(abs(top + threadIdx.y - working_bottom) < 0.5) * ((floor(working_bottom) + working_bottom) / 2) //If right fraction
+(working_top <=threadIdx.y + top) * (top + threadIdx.y + 0.5 <= working_bottom); // Whole Pixels
//Centroid Sums Based on thread index
atomicAdd(&row_sum , threadIdx.y * y_fraction * threadIntensity);
atomicAdd(&column_sum , threadIdx.x * x_fraction * threadIntensity);
atomicAdd(&intensity_sum, threadIntensity);
//Store the new reference coordinates
if(threadIdx.x == 0 && threadIdx.y == 0){
printf("%f, %f\t\n", row_sum, intensity_sum);
device_working_reference_coordinates_x[blockIdx.x] = row_sum / intensity_sum;
device_working_reference_coordinates_y[blockIdx.x] = column_sum / intensity_sum;
int main()
* Pulls in the large image and stores it to a 1-D, pagelocked
* array.
decimalAccuracy* image_to_centroid;
cudaHostAlloc((void**)&image_to_centroid, width * height * sizeof(decimalAccuracy), cudaHostAllocDefault);
//Loads the image data from a file and stores in the "testImage"
ifstream imageData(fileName);
for(int y = 0; y < height; y++){
for(int x = 0; x < width; x++){
imageData >> image_to_centroid[y*width+x];
//Displays a message if the file does not exist
printf("File is not open!\n\n");
//Closes the file
* Pulls in the reference Coordinates and stores them to two
* 1-D, pagelocked arrays.
//loads the reference coordinates from a file
decimalAccuracy* reference_coordinates_x;
decimalAccuracy* reference_coordinates_y;
cudaHostAlloc((void**)&reference_coordinates_x, numberOfSearches * sizeof(decimalAccuracy), cudaHostAllocDefault);
cudaHostAlloc((void**)&reference_coordinates_y, numberOfSearches * sizeof(decimalAccuracy), cudaHostAllocDefault);
ifstream refs(fileNameRef);
//Fills the vector with SingleLenslets
for(int currentReferenceCoordinate = 0; currentReferenceCoordinate < numberOfSearches; currentReferenceCoordinate++){
refs >> reference_coordinates_x[currentReferenceCoordinate];
refs >> reference_coordinates_y[currentReferenceCoordinate];
//Displays a message if the file does not exist
printf("File is not open!\n\n");
//Closes the file
* Copy data to the device. This data includes:
* - Array for the large image
* - Arrays for the reference coordinates
//Create each array
decimalAccuracy* device_reference_coordinates_x;
decimalAccuracy* device_reference_coordinates_y;
decimalAccuracy* device_working_reference_coordinates_x;
decimalAccuracy* device_working_reference_coordinates_y;
decimalAccuracy* device_image_to_centroid;
//Allocate memory on the device for each array
//And prints the error messages
cudaMalloc((void**) &device_reference_coordinates_x , numberOfSearches * sizeof(decimalAccuracy));
cudaMalloc((void**) &device_reference_coordinates_y , numberOfSearches * sizeof(decimalAccuracy));
cudaMalloc((void**) &device_working_reference_coordinates_x, numberOfSearches * sizeof(decimalAccuracy));
cudaMalloc((void**) &device_working_reference_coordinates_y, numberOfSearches * sizeof(decimalAccuracy));
cudaMalloc((void**) &device_image_to_centroid , width * height * sizeof(decimalAccuracy));
printf("Device Memory Allocation:\t%s\n", cudaGetErrorString(cudaGetLastError()));
//Copy all of the data
cudaMemcpy(device_reference_coordinates_x, reference_coordinates_x, numberOfSearches * sizeof(decimalAccuracy), cudaMemcpyHostToDevice);
cudaMemcpy(device_reference_coordinates_y, reference_coordinates_y, numberOfSearches * sizeof(decimalAccuracy), cudaMemcpyHostToDevice);
cudaMemcpy(device_working_reference_coordinates_x, reference_coordinates_x, numberOfSearches * sizeof(decimalAccuracy), cudaMemcpyHostToDevice);
cudaMemcpy(device_working_reference_coordinates_y, reference_coordinates_y, numberOfSearches * sizeof(decimalAccuracy), cudaMemcpyHostToDevice);
cudaMemcpy(device_image_to_centroid, image_to_centroid, width * height * sizeof(decimalAccuracy), cudaMemcpyHostToDevice);
printf("Device Variable Copying:\t%s\n", cudaGetErrorString(cudaGetLastError()));
* Launches the kernel with the following parameters:
* - Array for the large image
* - Arrays for the reference coordinates
* - Arrays for working reference coordinates
* - Search box radius
* - Image width
* - Image height
* - Search Box Multiplier
* - Number of itterations
dim3 threadDimensions(2 * initialSearchBoxRadius + 1, 2 * initialSearchBoxRadius + 1, 1);
centroidKernel<<<numberOfSearches, threadDimensions>>>(device_reference_coordinates_x,
printf("Kernel Launch:\t\t\t%s\n", cudaGetErrorString(cudaGetLastError()));
//Sync device
printf("Kernel Return:\t\t\t%s\n", cudaGetErrorString(cudaGetLastError()));
* Coppies back the following data:
* - Arrays for the new reference coordinates
decimalAccuracy* working_reference_coordinates_x;
decimalAccuracy* working_reference_coordinates_y;
//Malloced due to not being accesed on the GPU
working_reference_coordinates_x = (decimalAccuracy*) malloc( numberOfSearches * sizeof(decimalAccuracy));
working_reference_coordinates_y = (decimalAccuracy*) malloc( numberOfSearches * sizeof(decimalAccuracy));
cudaMemcpy(working_reference_coordinates_x, device_working_reference_coordinates_x, numberOfSearches * sizeof(decimalAccuracy), cudaMemcpyDeviceToHost);
cudaMemcpy(working_reference_coordinates_y, device_working_reference_coordinates_x, numberOfSearches * sizeof(decimalAccuracy), cudaMemcpyDeviceToHost);
printf("Device Variable Return Copying:\t%s\n", cudaGetErrorString(cudaGetLastError()));
* Outputs the spot shift to a file
ofstream out(outputFileName);
//Output configurations
out << scientific;
for(int currentSpotShift = 0; currentSpotShift < numberOfSearches; currentSpotShift++){
out << right << reference_coordinates_x[currentSpotShift] - working_reference_coordinates_x[currentSpotShift] << " ";
out << right << reference_coordinates_y[currentSpotShift] - working_reference_coordinates_y[currentSpotShift] << endl;
//Waits for button press to enure you are aware it is finished
return 0;
Any Ideas?