Cuda runs no threads

I’m trying to create a program to compute the potential energy of a system of points. The constraint is that the program must place the points into a cell in a 100x100x100 virtual grid and then compute the energy of all points in the same cell, ajoining cells, and then against a condensed point in the center of every other cell. When I try to run this program with one block per data point and only one thread per block, I get the wrong answer but it at least does something. However when I try to use a multidimensional thread block to get a better answer and to speed up the program, the program doesn’t do anything inside my kernel. What’s the cause of this?

#define REALLY_PRINT_TIME
//#define PRINTTIMECLOCK
#include "mycuda.h"
#include <iostream>
#include <string>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>

using namespace std;

const int n=35947;
const float xmin=-0.1;
const float xmax=.08;
const float xinc=(xmax-xmin)/100.0;
const float ymin=0.02;
const float ymax=0.2;
const float yinc=(ymax-ymin)/100.0;
const float zmin=-.08;
const float zmax=.06;
const float zinc=(zmax-zmin)/100.0;
__device__ __managed__  float data[n][3];
__device__ __managed__  int location[n][3];
__device__ __managed__  float weight[100][100][100];

__device__ float distance( float x1, float y1, float z1, float x2, float y2, float z2) {
	float squaresum=(x2-x1)*(x2-x1)+(y2-y1)*(y2-y1)+(z2-z1)*(z2-z1);
	return sqrt(squaresum);
}

__global__ void potential(double *U) {
	const int i=blockIdx.x;
	printf("i==%d\n", i);
	if(i<n-1) {
		double isum=0.0;
		//unsigned int j=blockIdx.x+threadIdx.x+threadIdx.y*blockDim.y;
		//if(j<n) {
		for(unsigned int j=i+1; j<n; j++) {
			if( ( location[j][0] >= location[i][0] - 1 && location[j][0] <= location[i][0] + 1 ) 
			 && ( location[j][1] >= location[i][1] - 1 && location[j][1] <= location[i][1] + 1 )
			 && ( location[j][2] >= location[i][2] - 1 && location[j][2] <= location[i][2] + 1 ) ) {
				isum+=distance(data[i][0],data[j][0],data[i][1],data[j][1],data[i][2],data[j][2]);
			}
		}
		for(unsigned int x=0; x<100; x++) {
			for(unsigned int y=0; y<100; y++) {
				for(unsigned int z=0; z<100; z++) {
					if( ( x < location[i][0] - 1 || x > location[i][0] + 1 ) 
					 && ( y < location[i][1] - 1 || y > location[i][1] + 1 )
					 && ( z < location[i][2] - 1 || z > location[i][2] + 1 ) ) {
						isum+=weight[x][y][z]*distance(data[i][0],(xmin+(x+.5)*xinc),
							       data[i][1],(ymin+(y+.5)*yinc),
							       data[i][2],(zmin+(z+.5)*zinc));
						
					} else continue;
				}
			}
		}
		//printf("isum=%.2f\n", isum);
		__syncthreads();
		atomicAdd(U,1.0/isum);
	}
}

void cudamul2() {
	double *U;
	dim3 block_size(1024,1024,10);
	cudaMallocManaged(&U,sizeof(double));
	*U=0.0;
	potential<<<n,1024>>>(U);
	cudaError_t cudaerr=cudaDeviceSynchronize();
	if(cudaerr != cudaSuccess) cout << "CUDA Error! "<< cudaGetErrorString(cudaerr) << endl;
	cout << "Potential Energy is " << -*U << endl;
}

void cudamul() {
  CT(cudamul2());
}

void point_sorter() {	
	for(unsigned int i=0; i<100; i++) {
		for(unsigned int j=0; j<n; j++) {
			if( data[j][0] > (xmin+xinc*i) && data[j][0] < ( xmin+xinc*(i+1) ) ) {
				location[j][0]=i;
//				cout << i << " " ;
			}
		}
	}
	for(unsigned int i=0; i<100; i++) {
		for(unsigned int j=0; j<n; j++) {
			if( data[j][1] > (ymin+yinc*i) && data[j][1] < ( ymin+yinc*(i+1) ) ) {
				location[j][1]=i;
//				cout << i << " ";
			}
		}
	}
	for(unsigned int i=0; i<100; i++) {
		for(unsigned int j=0; j<n; j++) {
			if(data[j][2] > (zmin+zinc*i) && data[j][2] < ( zmin+zinc*(i+1) ) ) {
				location[j][2]=i;
//				cout << i  << " " << endl;
			}
		}
	}
}

void weight_calculator() {
	for(unsigned int i=0; i<n; i++) {
		weight[ (location[i][0]) ][ (location[i][1]) ][ (location[i][2]) ] += 1.0;
	}
} 

int main(void) {
	cout << PRINTN(n);
	string word;
	int i=0;
	ifstream datafile("/parallel-class/data/bunny");
	while(datafile) {
		datafile >> word;
		//if(i==0) cout << word << endl;
		data[i][0]=strtof(word.c_str(),0);
		//if(i==0) cout << data[i][0] << endl;
		datafile >> word;
		data[i][1]=strtof(word.c_str(),0);
		datafile >> word;
		data[i][2]=strtof(word.c_str(),0);
		++i;
	}
	//cout << i << endl;
	//cout<<"data[0][0]="<<data[0][0]<<endl;
	TIME(point_sorter());
	TIME(weight_calculator());
	TIME(cudamul());
}