Hi, I’ve been interested in CUDA for a couple of years but only just got myself a decent CUDA G/card, so only now starting to code! woo!
i’m having and issue with the very simple code below, the atomicAdd() function should increment an int in global memory, for each thread that has a threadIdx.x == 1. however, when i run this code and enter the same arbitrary range input everytime, the accumulated value of d_perfect_elements[0] changes between runs.
(i.e i put the number range from 1 to 6, and sometimes d_perfect_elements[0] will == 15, and sometimes it will == 6).
I’ve tried stripping the code right down and it seems to be something to do with the while loop that i’ve commented against. my code is below:
#include <iostream>
#include <vector>
#include <ctime>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
const int ELEMENTS = 64;
using namespace std;
// GPU kernal
__global__ void perfectNumbersGPU(int *min, int *max, int *d_current_number, int *d_perfect_elements, int *d_perfect_index_line) {
if ((threadIdx.x + blockIdx.x * blockDim.x) < *max ) {
// Kernal prep work //////////////////////////////
__shared__ int s_divisors[32];
__shared__ int s_current_number;
d_perfect_elements[0] = 0;
if ((threadIdx.x + blockIdx.x * blockDim.x) == 1) {
*d_current_number = *min;
}
// launch into the loop thats having issues ////////////////////////
while (*d_current_number < ( *max + 1 )) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (threadIdx.x == 1) {
s_current_number = *d_current_number;
atomicAdd(&d_perfect_elements[0], 1); // atomic operation of interest!
}
if ((threadIdx.x + blockIdx.x * blockDim.x) == 1) { // thread no.1 increment the current no.
*d_current_number += 1;
}
}
__syncthreads();
}
};
int main () {
int min, max;
cout << "\n\nPerfect Number V0.1\n";
cout << "=======================" << endl;
cout << "\nEnter a starting number: ";
cin >> min;
cout << "Enter a finishing number: ";
cin >> max;
cout << endl;
if (min == 0) min++;
if (max < min) {
cout << "\nERROR: starting number is larger than finishing number. Exiting...!\n\n" << endl;
return 1;
}
clock_t start = clock();
// GPU CODE /////////////////////////
cout << "\nGPU: Starting...\n";
int *d_min, *d_max;
int *h_min, *h_max;
int *d_perfect_elements = new int [ELEMENTS];
int *h_perfect_elements = new int [ELEMENTS];
int *d_perfect_index_line;
int *d_current_number;
int *d_current_array;
start = clock();
cudaMalloc( (void**) &d_min, sizeof(int) );
cudaMalloc( (void**) &d_max, sizeof(int) );
cudaMalloc( (void**) &d_perfect_elements, ELEMENTS * sizeof(int) );
cudaMalloc( (void**) &d_perfect_index_line, sizeof(int));
cudaMalloc( (void**) &d_current_number, sizeof(int) );
stop = clock();
cout << "GPU: Memory preperation duration: " << (stop - start) << "ms" << endl;
h_perfect_elements[0] = 0;
cudaMemcpy( d_perfect_elements, h_perfect_elements , ELEMENTS * sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy( d_max, &max , sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy( d_min, &min , sizeof(int), cudaMemcpyHostToDevice );
// Run the kernal ///////////////////////////////////////////////////////////////////////////////////
start = clock();
perfectNumbersGPU<<< 4, 4 >>> ( d_min, d_max, d_current_number, d_perfect_elements, d_perfect_index_line ); // kernal launch here
stop = clock();
cout << "GPU: Time Taken: " << (stop - start) << "ms" << endl;
cudaMemcpy( h_perfect_elements, d_perfect_elements , ELEMENTS * sizeof(int), cudaMemcpyDeviceToHost );
cout << "RESULT: " << h_perfect_elements[0] << endl;
cudaFree( d_min );
cudaFree( d_max );
cudaFree( d_perfect_elements );
cudaFree( d_perfect_index_line );
cudaFree( d_current_number );
delete [] h_perfect_elements;
cout << "GPU: Memory deallocation duration: " << (stop - start) << "ms\n" << endl;
return 0;
}
If anyone can point out the problem i would greatly appreciate it. i’m sure its very simple but i just can’t see it!
Thanks for helping this noob!