Sorry, it dint get attached in the last post. I am not able to attach it . I shall post it as code
#include "stdio.h"
#include "sys/time.h"
#include "cutil.h"
double elapsed_seconds;
void time_int(int print) {
static struct timeval prevTime;
static struct timeval t2; /* var of current time stamp */
struct timezone tzp;
if (gettimeofday(&t2, &tzp) == -1)
exit(0);
if (print == 1) {
elapsed_seconds=(double)(t2.tv_sec - prevTime.tv_sec) + ((double)(t2.tv_usec
- prevTime.tv_usec))/1000000;
printf("Time spent [%.2fs] \n", elapsed_seconds);
}
prevTime = t2;
}
__device__ bool cuda_go_2_next_element_c(int3& iterator, int3 array_size) {
iterator.x++;
if(iterator.x >= array_size.x)
iterator.x = 0;
else
return true;
iterator.y++;
if(iterator.y >= array_size.y)
iterator.y = 0;
else
return true;
iterator.z++;
if(iterator.z >= array_size.z){
iterator.x = array_size.x - 1;
iterator.y = array_size.y - 1;
iterator.z = array_size.z - 1;
return false;
}
else
return true;
}
__device__ float cuda_getSingleton(int value, int label, float* mean, float* variance, float* var_) {
float val = powf((float) value - mean[blockIdx.x*3 + label], 2)/(2.0 * variance[blockIdx.x*3 + label]);
val = var_[blockIdx.x*3 + label] + val;
return val;
}
__device__ void cuda_getMinMaxCoord(int& minRet, int& maxRet, int sitePos, int radius, int start, int end){
minRet = sitePos - radius;
maxRet = sitePos + radius;
if (minRet < start) {
maxRet = maxRet - minRet;
minRet = start;
if (maxRet >= end)
maxRet = end - 1;
} else {
if (maxRet >= end) {
minRet = minRet - (maxRet - end) - 1;
maxRet = end - 1;
if (minRet < start)
minRet = start;
}
}
}
__device__ int cuda_getRand(int3 curr_indx, int no_regions) {
return (curr_indx.x + curr_indx.y + curr_indx.z) % no_regions;
}
__global__ void cuda_while_loop(int* class_label_array,int* data_array, int* size, int* checkValue,
bool mmd, float T, float E, float E_old, int no_regions, float kszi, int K,
float* mean, float* variance, float* var_) {
bool cuda_metropolis = true;
float summa_deltaE = 0.0;
int multiples;
//int finalLabel = 0;
int currLabel = 0;
int currValue = 0;
int r = 0;
float currEnergy = 0.0;
float newEnergy = 0.0;
bool my_c_data_class_label_changed;
//uint classCount[3];
int3 array_start; //start position for each block of the class label volume
int3 array_iterator_size; // end position for each block of the class label volume
int3 array_size; // holds the size of the volume
int3 iterator; // used to iterate through the volume
int3 curr_indx; // current position in the volume
/* Initializing class count and array_size */
for(int i=0;i<3;i++){
//classCount[i]=0;
checkValue[blockIdx.x*3 + i] = 0;
checkValue[24 + blockIdx.x*3 + i] = 0;
}
array_size.x = 512;
array_size.y = 512;
array_size.z = 76;
multiples = array_size.x * array_size.y; /* 512 * 512 - used for the accesing elemnts in a class label array */
/* z coordinatie is divided between the blocks and each iterator is assigned itz initial
value */
iterator.x = 0;
iterator.y = 0;
iterator.z = (array_size.z /8) * blockIdx.x;
array_start.x = iterator.x;
array_start.y = iterator.y;
array_start.z = iterator.z;
/* array_iterator - this array is used in host_go_2_next_element_c . It holds the voulme through which each block si going to iterate
iterator is start position for each block and array_iterator_size is the end position for each block */
array_iterator_size.x = array_size.x;
array_iterator_size.y = array_size.y;
/* When the number of slices is not exactly divisible by 8 , making the last block iterate until the end*/
if(blockIdx.x==7)
array_iterator_size.z = array_size.z;
else
array_iterator_size.z = (((int)(array_size.z/8)*(blockIdx.x+1)));
array_iterator_size.z = array_start.z + 1;
do {
curr_indx = iterator;
currLabel = class_label_array[curr_indx.x + (curr_indx.y * array_iterator_size.x) + (curr_indx.z * multiples)];
currValue = data_array[curr_indx.x + (curr_indx.y * array_iterator_size.x) + (curr_indx.z * multiples)];
if (no_regions == 2)
r = 1 - currLabel;
else {
r = cuda_getRand(curr_indx, no_regions);
if (r == currLabel)
if (currLabel + 1 == no_regions )
r = currLabel - 1;
else
r = currLabel + 1;
}
if (!mmd)
kszi = logf(0.1);
if (true){
currEnergy = cuda_getSingleton(currValue, currLabel, mean, variance, var_);
int label = currLabel;
float betaInClass = -1;
float betaOutClass = -4;
int3 min;
int3 max;
int radius = 1;
int3 curr;
float energy = 0.0;
cuda_getMinMaxCoord(min.x, max.x, curr_indx.x, radius, array_start.x, array_iterator_size.x);
cuda_getMinMaxCoord(min.y, max.y, curr_indx.y, radius, array_start.y, array_iterator_size.y);
cuda_getMinMaxCoord(min.z, max.z, curr_indx.z, radius, array_start.z, array_iterator_size.z);
int currPixel = 1;
for (curr.x = min.x; curr.x <= max.x; curr.x++)
for (curr.y = min.y; curr.y <= max.y; curr.y++)
for (curr.z = min.z; curr.z <= max.z; curr.z++) {
currPixel = class_label_array[curr.x + (curr.y * array_iterator_size.x) + (curr.z * multiples)];
if (true){
if (label == currPixel)
energy = energy - betaInClass;
else
energy = energy + betaOutClass;
}
}
currEnergy = currEnergy + energy;
}
if (true){
newEnergy = cuda_getSingleton(currValue, r, mean, variance, var_);
int label = r;
float betaInClass = -1;
float betaOutClass = -4;
int3 min;
int3 max;
int radius = 1;
int3 curr;
float energy = 0.0;
cuda_getMinMaxCoord(min.x, max.x, curr_indx.x, radius, array_start.x, array_iterator_size.x);
cuda_getMinMaxCoord(min.y, max.y, curr_indx.y, radius, array_start.y, array_iterator_size.y);
cuda_getMinMaxCoord(min.z, max.z, curr_indx.z, radius, array_start.z, array_iterator_size.z);
int currPixel = 1;
for (curr.x = min.x; curr.x <= max.x; curr.x++)
for (curr.y = min.y; curr.y <= max.y; curr.y++)
for (curr.z = min.z; curr.z <= max.z; curr.z++) {
currPixel = class_label_array[curr.x + (curr.y * array_iterator_size.x) + (curr.z * multiples)];
if (true){
if (label == currPixel)
energy = energy - betaInClass;
else
energy = energy + betaOutClass;
}
}
newEnergy = newEnergy + energy;
}
float diffE = currEnergy - newEnergy;
int finalLabel = currLabel;
if (kszi <= (diffE) / T) {
summa_deltaE += fabsf(diffE);
E_old = E = E_old - diffE;
if (r != currLabel) {
my_c_data_class_label_changed = true;
int indx = curr_indx.x + (curr_indx.y * array_size.x) + (curr_indx.z * 262144);
class_label_array[indx] = r;
finalLabel = r;
my_c_data_class_label_changed = true;
}
} else {
if ((newEnergy != newEnergy) && (K == 1))
if (newEnergy != newEnergy) {}
}
//classCount[finalLabel]++;
cuda_metropolis = cuda_go_2_next_element_c(iterator, array_iterator_size);
}while(cuda_metropolis);
checkValue[blockIdx.x*3 + 0] = (int)currEnergy;
checkValue[blockIdx.x*3 + 1] = (int)newEnergy;
checkValue[blockIdx.x*3 + 2] = curr_indx.z;
checkValue[24 + blockIdx.x*3 + 0] = array_iterator_size.x;
checkValue[24 + blockIdx.x*3 + 1] = array_iterator_size.y;
checkValue[24 + blockIdx.x*3 + 2] = array_iterator_size.z;
}
int main(int argc, char* argv[]){
double mean[3], variance[3], var_[3];
double T = 3.92, E = 0.0, E_old = -382031186.172914, summa_deltaE = 0.0, kszi = -2.302585;
int K = 0;
int no_regions = 3;
int classCount[3];
time_int(0);
int size = 512*512*76;
CUT_DEVICE_INIT();
int blocksize = 8;
/* ASSIGNING THE MEAN VARIANCE VAR_ VALUES*/
mean[0] = 0.000000, mean[1] = 100.054820, mean[2] = 87.107318;
variance[0] = 0.000000, variance[1] = 8744.017037, variance[2] = 149.997697;
var_[0] = -10.593987, var_[1] = 5.457001, var_[2] = 3.424249;
int* host_class_label_array;
host_class_label_array = (int*) malloc(sizeof(int) * size);
printf("start copying class labels\n");
/* Copying the class label values from file */
FILE *fp;
fp = fopen("/home/vbalu2/temp/class_label_host.txt","r");
for(int i = 0; i<size; i++){
fscanf(fp,"%d\n", &host_class_label_array[i]);
//host_class_label_array[i] = 1;
}
fclose(fp);
printf("start copying data array\n");
int* host_data_array;
host_data_array = (int*) malloc(sizeof(int) * size);
/* copying the data array values from a file*/
fp = fopen("/home/vbalu2/temp/data_array.txt","r");
for(int i = 0; i<size; i++){
fscanf(fp,"%d\n", &host_data_array[i]);
//host_data_array[i] = 2;
}
fclose(fp);
/* allocating a copy of mean variance and var_ for each block */
float mean_h[ 8* 3], variance_h[8 * 3], var_h[8 * 3];
for(int j=0; j<blocksize; j++) {
for(int i=0; i<3; i++) {
mean_h[j*3 + i] = (float)mean[i];
variance_h[j*3 + i] = (float)variance[i];
var_h[j*3 + i] = (float)var_[i];
}
}
/* pointers to device memory */
float *mean_d, *variance_d, *var_d;
/* allocating mean , variance, var_ on the deivce*/
CUDA_SAFE_CALL(cudaMalloc((void **)&mean_d, sizeof(float)*blocksize*3));
CUDA_SAFE_CALL(cudaMalloc((void **)&variance_d, sizeof(float)*blocksize*3));
CUDA_SAFE_CALL(cudaMalloc((void **)&var_d, sizeof(float)*blocksize*3));
/* copying from host to device */
CUDA_SAFE_CALL(cudaMemcpy(mean_d,mean_h, sizeof(float)*blocksize*3, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(variance_d, variance_h, sizeof(float)*blocksize*3, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(var_d,var_h, sizeof(float)*blocksize*3 , cudaMemcpyHostToDevice));
/* Allocating class label array on cuda and copying data from host to device */
int* my_device_class_label_array;
CUDA_SAFE_CALL(cudaMalloc ((void **) &(my_device_class_label_array), sizeof(int)*size));
CUDA_SAFE_CALL(cudaMemcpy(my_device_class_label_array, host_class_label_array ,
sizeof(int)*size, cudaMemcpyHostToDevice));
/* Allocating data array on cuda and copying data from host to device */
int* my_device_data_array;
CUT_SAFE_MALLOC(cudaMalloc ((void **) &(my_device_data_array), sizeof(int)*size));
CUDA_SAFE_CALL(cudaMemcpy(my_device_data_array, host_data_array ,
sizeof(int)*size, cudaMemcpyHostToDevice));
/* Allocation and initialzing the array size values for host and device */
int host_array_size[3];
host_array_size[0] = 512;
host_array_size[1] = 512;
host_array_size[2] = 76;
int* my_device_array_size;
CUDA_SAFE_CALL(cudaMalloc ((void **) &(my_device_array_size), sizeof(int)*3));
CUDA_SAFE_CALL(cudaMemcpy(my_device_array_size, host_array_size,
sizeof(int)*3, cudaMemcpyHostToDevice));
/* To test intermediate values on the device for debugging */
int testhost[48];
int *testdevice;
CUDA_SAFE_CALL(cudaMalloc ((void **) &testdevice, sizeof(int)* 48));
/* execution configuraton */
dim3 dimBlock(1);
dim3 dimGrid (blocksize);
printf("Calling Kernel\n");
/* while loop of metropolis*/
cuda_while_loop<<<dimGrid, dimBlock>>>(my_device_class_label_array, my_device_data_array,
my_device_array_size, testdevice,true,(float)T,(float)E,(float)E_old,no_regions, (float)kszi,K,
mean_d,variance_d,var_d);
printf("Returning from kernel\n");
/* Copying data from device to host */
int data[3];
CUDA_SAFE_CALL(cudaMemcpy(testhost,testdevice, sizeof(int) * 48, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(host_class_label_array, my_device_class_label_array, sizeof(int)*size,
cudaMemcpyDeviceToHost));
/* checking device array size if it has been copied correctly*/
CUDA_SAFE_CALL(cudaMemcpy(data, my_device_array_size, sizeof(int)*3,
cudaMemcpyDeviceToHost));
printf("Writing device values back into a file\n");
/* Writing the class label values to file to check it values*/
fp = fopen("/home/vbalu2/temp/class_label_device.txt","w");
for(int i = 0; i<size; i++)
fprintf(fp,"%d\n", host_class_label_array[i]);
fclose(fp);
for(int i=0; i<blocksize*3;i++)
printf("test device value[%d] = %d, %d \n", i, testhost[i], testhost[i + 24]);
for(int i=0;i<3;i++){
printf("i=%d, cuda_array_size=%d, orig_array_size=%d \n", i, data[i], host_array_size[i]);
}
CUDA_SAFE_CALL(cudaFree(my_device_class_label_array));
CUDA_SAFE_CALL(cudaFree(my_device_data_array));
CUDA_SAFE_CALL(cudaFree(my_device_array_size));
CUDA_SAFE_CALL(cudaFree(testdevice));
CUDA_SAFE_CALL(cudaFree(mean_d));
CUDA_SAFE_CALL(cudaFree(variance_d));
CUDA_SAFE_CALL(cudaFree(var_d));
free(host_class_label_array);
free(host_data_array);
time_int(1);
}