Hi:
I was doing some testing on CUDA, and I found an interesting thing regarding the thread scheduling… So, I have this “dummy” program that finds first prime factor for a given number
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "conio.h"
#include "stdio.h"
#include "string.h"
#include "stdlib.h"
#include "time.h"
__global__ void addKernel(int* x, const int *n, const int o, int* r);
__global__ void addKernel(int* x, const int *n, const int o, int* r){
while (*r == 0) {
//THIS PRINTF ON PARTICULAR!
printf("Trying with: %i result: %i\n", x[blockIdx.x * blockDim.x + threadIdx.x], *n % x[blockIdx.x * blockDim.x + threadIdx.x]);
if (*n % x[blockIdx.x * blockDim.x + threadIdx.x] == 0 && x[blockIdx.x * blockDim.x + threadIdx.x] > 1) {
memcpy(r, &x[blockIdx.x * blockDim.x + threadIdx.x], sizeof(int));
}
x[blockIdx.x * blockDim.x + threadIdx.x] += o;
}
}
int main(int argc, char* argv[]) {
//kernel control
int cn;
int cm;
//aux data
int i;
if (argc > 1) {
cn = atoi(argv[1]);
cm = atoi(argv[2]);
} else {
cn = 4;
cm = 4;
}
//heap memory control
size_t maxMem;
//loop control
int ex = 0;
cudaError_t cuerr;
//measurement
float str;
float end;
float freq;
//HOST data
int* hx = (int*)malloc(sizeof(int) * cn * cm);
int* hr = (int*)malloc(sizeof(int));
int* hn = (int*)malloc(sizeof(int));
//DEVICE data
int* x;
int* r;
int* n;
cudaDeviceGetLimit(&maxMem, cudaLimitMallocHeapSize);
cudaDeviceSetLimit(cudaLimitMallocHeapSize, maxMem);
printf("Heap memory set result: %s\n", cudaGetErrorString(cudaGetLastError()));
printf("Kernel <<<%i, %i>>> will be launched\n", cn, cm);
//init DEVICE data
cudaMalloc(&x, sizeof(int) * cn * cm);
cudaMalloc(&r, sizeof(int));
cudaMalloc(&n, sizeof(int));
if (hx == NULL || hr == NULL || hn == NULL)
exit(-1);
//init HOST data
for (i = 0; i < cn * cm; i++)
hx[i] = i;
*hn = 131 * 313; //test data
*hr = 0;
//copy from HOST to DEVICE
cudaMemcpy(x, hx, sizeof(int) * cn * cm, cudaMemcpyHostToDevice);
cudaMemcpy(n, hn, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(r, hr, sizeof(int), cudaMemcpyHostToDevice);
printf("Calculation - Loop Start @%i\n", clock());
addKernel << <cn, cm >> > (x, n, (cn * cm), r);
cuerr = cudaDeviceSynchronize();
printf("Calculation - Loop Ends @%i\n", clock());
//copy r to HOST
cudaMemcpy(hr, r, sizeof(int), cudaMemcpyDeviceToHost);
printf("Result: %i\n", *hr);
return 0;
}
So, if I run it wtih first printf
uncommented (the one that says “THIS PRINTF ON PARTICULAR”, the program ends quickly (as expected), and returns the lowest prime factor of the number
[...]
Trying with: 158 result: 81
Trying with: 159 result: 140
Calculation - Loop Ends @1245
Result: 131
On the other hand, if I comment that particular printf
, program lasts considerably more and retrieves the highest primer factor of the number
Heap memory set result: no error
Kernel <<<4, 4>>> will be launched
Calculation - Loop Start @1107
Calculation - Loop Ends @48101
Result: 313
I don’t know why the behaviour changes so much… I even tried to use __shared__
data on the kernel, but similar result
__global__ void addKernel(int* x, const int *n, const int o, int* r){
__shared__ int sr;
sr = 0;
__syncthreads(); //ensure all threads have initial value of 0
while (sr == 0) {
//printf("Trying with: %i result: %i\n", x[blockIdx.x * blockDim.x + threadIdx.x], *n % x[blockIdx.x * blockDim.x + threadIdx.x]);
if (*n % x[blockIdx.x * blockDim.x + threadIdx.x] == 0 && x[blockIdx.x * blockDim.x + threadIdx.x] > 1) {
//memcpy(r, &x[blockIdx.x * blockDim.x + threadIdx.x], sizeof(int));
sr = x[blockIdx.x * blockDim.x + threadIdx.x];
}
x[blockIdx.x * blockDim.x + threadIdx.x] += o;
__syncthreads(); //sync so every thread gets updated sr value
}
__syncthreads();
memcpy(r, &x[blockIdx.x * blockDim.x + threadIdx.x], sizeof(int));
}
EDIT: after a few rounds of testing, I found that the way to make it work is by asking for block data specifically
__global__ void addKernel(int* x, const int n, const int o, int* r, const int cn){
if (blockIdx.x > 0) {
printf("Block %i activated...\n", blockIdx.x); //kind of "hack" to make it work"
}
while (*r == 0) {
if (n % x[blockIdx.x * blockDim.x + threadIdx.x] == 0 && x[blockIdx.x * blockDim.x + threadIdx.x] > 1) {
memcpy(r, &x[blockIdx.x * blockDim.x + threadIdx.x], sizeof(int));
}
x[blockIdx.x * blockDim.x + threadIdx.x] += o;
}
}
Is there maybe some compiler optimization that is changing some values when I remove the printf
?
Thanks.