I’m trying to create a program to compute the potential energy of a system of points. The constraint is that the program must place the points into a cell in a 100x100x100 virtual grid and then compute the energy of all points in the same cell, ajoining cells, and then against a condensed point in the center of every other cell. When I try to run this program with one block per data point and only one thread per block, I get the wrong answer but it at least does something. However when I try to use a multidimensional thread block to get a better answer and to speed up the program, the program doesn’t do anything inside my kernel. What’s the cause of this?
#define REALLY_PRINT_TIME
//#define PRINTTIMECLOCK
#include "mycuda.h"
#include <iostream>
#include <string>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
using namespace std;
const int n=35947;
const float xmin=-0.1;
const float xmax=.08;
const float xinc=(xmax-xmin)/100.0;
const float ymin=0.02;
const float ymax=0.2;
const float yinc=(ymax-ymin)/100.0;
const float zmin=-.08;
const float zmax=.06;
const float zinc=(zmax-zmin)/100.0;
__device__ __managed__ float data[n][3];
__device__ __managed__ int location[n][3];
__device__ __managed__ float weight[100][100][100];
__device__ float distance( float x1, float y1, float z1, float x2, float y2, float z2) {
float squaresum=(x2-x1)*(x2-x1)+(y2-y1)*(y2-y1)+(z2-z1)*(z2-z1);
return sqrt(squaresum);
}
__global__ void potential(double *U) {
const int i=blockIdx.x;
printf("i==%d\n", i);
if(i<n-1) {
double isum=0.0;
//unsigned int j=blockIdx.x+threadIdx.x+threadIdx.y*blockDim.y;
//if(j<n) {
for(unsigned int j=i+1; j<n; j++) {
if( ( location[j][0] >= location[i][0] - 1 && location[j][0] <= location[i][0] + 1 )
&& ( location[j][1] >= location[i][1] - 1 && location[j][1] <= location[i][1] + 1 )
&& ( location[j][2] >= location[i][2] - 1 && location[j][2] <= location[i][2] + 1 ) ) {
isum+=distance(data[i][0],data[j][0],data[i][1],data[j][1],data[i][2],data[j][2]);
}
}
for(unsigned int x=0; x<100; x++) {
for(unsigned int y=0; y<100; y++) {
for(unsigned int z=0; z<100; z++) {
if( ( x < location[i][0] - 1 || x > location[i][0] + 1 )
&& ( y < location[i][1] - 1 || y > location[i][1] + 1 )
&& ( z < location[i][2] - 1 || z > location[i][2] + 1 ) ) {
isum+=weight[x][y][z]*distance(data[i][0],(xmin+(x+.5)*xinc),
data[i][1],(ymin+(y+.5)*yinc),
data[i][2],(zmin+(z+.5)*zinc));
} else continue;
}
}
}
//printf("isum=%.2f\n", isum);
__syncthreads();
atomicAdd(U,1.0/isum);
}
}
void cudamul2() {
double *U;
dim3 block_size(1024,1024,10);
cudaMallocManaged(&U,sizeof(double));
*U=0.0;
potential<<<n,1024>>>(U);
cudaError_t cudaerr=cudaDeviceSynchronize();
if(cudaerr != cudaSuccess) cout << "CUDA Error! "<< cudaGetErrorString(cudaerr) << endl;
cout << "Potential Energy is " << -*U << endl;
}
void cudamul() {
CT(cudamul2());
}
void point_sorter() {
for(unsigned int i=0; i<100; i++) {
for(unsigned int j=0; j<n; j++) {
if( data[j][0] > (xmin+xinc*i) && data[j][0] < ( xmin+xinc*(i+1) ) ) {
location[j][0]=i;
// cout << i << " " ;
}
}
}
for(unsigned int i=0; i<100; i++) {
for(unsigned int j=0; j<n; j++) {
if( data[j][1] > (ymin+yinc*i) && data[j][1] < ( ymin+yinc*(i+1) ) ) {
location[j][1]=i;
// cout << i << " ";
}
}
}
for(unsigned int i=0; i<100; i++) {
for(unsigned int j=0; j<n; j++) {
if(data[j][2] > (zmin+zinc*i) && data[j][2] < ( zmin+zinc*(i+1) ) ) {
location[j][2]=i;
// cout << i << " " << endl;
}
}
}
}
void weight_calculator() {
for(unsigned int i=0; i<n; i++) {
weight[ (location[i][0]) ][ (location[i][1]) ][ (location[i][2]) ] += 1.0;
}
}
int main(void) {
cout << PRINTN(n);
string word;
int i=0;
ifstream datafile("/parallel-class/data/bunny");
while(datafile) {
datafile >> word;
//if(i==0) cout << word << endl;
data[i][0]=strtof(word.c_str(),0);
//if(i==0) cout << data[i][0] << endl;
datafile >> word;
data[i][1]=strtof(word.c_str(),0);
datafile >> word;
data[i][2]=strtof(word.c_str(),0);
++i;
}
//cout << i << endl;
//cout<<"data[0][0]="<<data[0][0]<<endl;
TIME(point_sorter());
TIME(weight_calculator());
TIME(cudamul());
}