cuda memory allocation problem segmentation fault before kernel launch

Could anybody spot why the following wraper function creates a segmentation fault…?

float compute_e_bonds(MMFF94S_bond_calcs_t* bondsIn, int gradients,
Bond_LookUp_t lookUp_bonds, int numAtoms, float* gradient_pointers) {

float e_bonds = 0.0f;
cudaGetDeviceProperties(&prop, 0);
blocksPerGrid = 2 * prop.multiProcessorCount;
threadsPerBlock = (numAtoms + blocksPerGrid - 1) / blocksPerGrid;

float* e, *dev_e, *dev_gradPointers;
int *dev_lookUp_bonds, *dev_lengths, *dev_offsets;
MMFF94S_bond_calcs_t *dev_bonds;


//allocate MMFFBONDS
cudaMalloc(
		(void**) &dev_bonds,
		lookUp_bonds.bonds_struct_length
				* sizeof(struct MMFF94S_bond_calcs));
cudaMemcpy(
		dev_bonds,
		bondsIn,
		lookUp_bonds.bonds_struct_length
				* sizeof(struct MMFF94S_bond_calcs), cudaMemcpyHostToDevice);

//allocate int* lookup_bonds
cudaMalloc((void**) &dev_lookUp_bonds,
		lookUp_bonds.bond_pointer_length * sizeof(int));
cudaMemcpy(dev_lookUp_bonds, lookUp_bonds.bonds,
		lookUp_bonds.bond_pointer_length * sizeof(int),
		cudaMemcpyHostToDevice);

//allocate and copy int* lookup_bonds lengths
cudaMalloc((void**) &dev_lengths, numAtoms * sizeof(int));
cudaMemcpy(dev_lengths, lookUp_bonds.length, numAtoms * sizeof(int),
		cudaMemcpyHostToDevice);
//allocate and copy int* lookup_bonds offsets
cudaMalloc((void**) &dev_offsets, numAtoms * sizeof(int));
cudaMemcpy(dev_offsets, lookUp_bonds.offset, numAtoms * sizeof(int),
		cudaMemcpyHostToDevice);
//allocate and copy grad pointers
cudaMalloc((void**) &dev_gradPointers, 3 * numAtoms * sizeof(float));
cudaMemcpy(dev_gradPointers, gradient_pointers,
		3 * numAtoms * sizeof(float), cudaMemcpyHostToDevice);
//alocate e
cudaMalloc((void**) &dev_e, numAtoms * sizeof(float));

printf("Good up to kernel  : %d\n", sizeof(struct MMFF94S_bond_calcs));
compute_bonds_energy<<<blocksPerGrid, threadsPerBlock>>>(dev_bonds,dev_e, dev_lookUp_bonds,dev_offsets, dev_lengths, numAtoms, dev_gradPointers);

cudaFree(dev_bonds);
cudaFree(dev_gradPointers);
cudaFree(dev_lookUp_bonds);
cudaFree(dev_offsets);
cudaFree(dev_lengths);

cudaMemcpy(e, dev_e, numAtoms*sizeof(float),
		cudaMemcpyDeviceToHost);
cudaFree(dev_e);
cudaThreadExit();
    for ( int i = 0; i < numAtoms; i++) {

	e_bonds += e[i];
	printf("Atom : %d | e = %lf \n", i+1, e[i]);
}
free(e);
return e_bonds;

}

All the structs and pointers i pass into the class have been checked and they are fine. Why do i get that?
Also the kernel was working before i make a small change : return e* from the function and then reduce it to float e_bonds;

Thanks in advance,
thanasio

You never malloc for float* e on the host?