JNI in eclipse(ubuntu)

Hi, I am working on JNI based CUDA program for which i hava a java class that has main func i.e. jclass.java(containing native func jniEntry() declaration)
and jclass.h which is generated from javah . I have the JNI bridge cEntry.c which contains the native function implementation declared as
JNIEXPORT void JNICALL Java_jclass_jniEntry(JNIEnv* env, jobject thisObj)
the above function calls the CUDA host function i.e. jniEntry() in cudaprogram.h. The jniEntry() function then calls the device func contained in cudaprogram.cu

I can’t seem generate the .so file from the generated .o files i.e. cudaprogram.o from cudaprogram.cu and cEntry.o from cEntry.c(which is the bridge for JNI i.e. jclass.java → jclass.class(from javac) & jclass.h(from javah -jni)
My makefile is :

INCLUDES	:= -I$(CUDASDK_PATH)/inc -I$(CUDA_PATH)/include  -I$(JDK_PATH)/include -I$(JDK_PATH)/include/linux -I.
LIBRARIES	:= -lrt -lm  -lcudart -lcufft -lcublas -L$(CUDA_PATH)/lib64  -L.
JAVASRC_PATH := ../ 
NATIVESRC_PATH := ./ 
NVCC := /opt/cuda-6.5//bin/nvcc -ccbin g++ 


cujni1: cEntry.o cudaprog.o makefile jclass.h cudaprog.h
	g++  $(INCLUDES) $(LIBRARIES) -v -m64 -shared -fPIC -o libcujni1.so cEntry.o cudaprog.o 
 
cEntry.o: cEntry.c jclass.h cudaprog.o 
	gcc $(INCLUDES) -v  -m64 -fPIC -o $@  cEntry.c -c
	
cudaprog.o: cudaprog.cu jclass.h  cudaprog.h
	$(NVCC) $(INCLUDES) -v -m64  -o $@ -c cudaprog.cu   

run: build
	$(EXEC) ./cujni1

jclass.h:jclass.class
	javah -jni -classpath $(JAVASRC_PATH) jclass
	
jclass.class: 
	javac $(JAVASRC_PATH)/jclass.java

the files that are being generated without errors are jclass.class, jclass.h, cudaprogram.o, cEntry.o but the libcujni1.so is not getting generated as i get the errors something like

/usr/bin/ld: cudaprog.o: relocation R_X86_64_32 against `.rodata' can not be used when making a shared object; recompile with -fPIC
cudaprog.o: error adding symbols: Bad value
collect2: error: ld returned 1 exit status
make: *** [cujni1] Error 1

as you can see i am using nvcc to compile the .cu files so can’t used -fPIC option because it returns error of |unknown option -fPIC"

For reference in needed i am attaching the other source files as well
jclass.java:

public class jclass {
	
	static {
		System.loadLibrary("cujni1");
		}
	
	private native void jniEntry();

	public static void main(String[] args){
		
		System.out.print("1:Hello" + "JNI CUder\n");
		new jclass().jniEntry();
	}
}

cEntry.c:

#include <jni.h>
#include "jclass.h"
#include "cudaprog.h"

JNIEXPORT void JNICALL Java_jclass_jniEntry(JNIEnv* env, jobject thisObj)
{
	   printf("2:cEntry.c-->Java_jclass_jniEntry!\n");
	jniEntry();
	return;
}

the generated jclass.h:

#ifndef CUDAPROG_H_
#define CUDAPROG_H_

#ifdef __cplusplus
	extern "C" {
#endif
	void jniEntry();
#ifdef __cplusplus
	}
#endif

#endif /* CUDAPROG_H_ */

cudaprogram.cu:

// includes, system
#include <string.h>
#include <math.h>

#include "jclass.h"
#include "cudaprog.h"

#include <stdio.h>
#include <iostream>
#include <stdlib.h>     /* srand, rand */
#include <time.h>       /* time */
#include <ctime>
// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
#ifdef __cplusplus
extern "C"
{
#endif

#define LO -100.0f
#define HI 100.0f
#define BlockSize 16
#define VECTORLENGTH 100
#define MATRIXLENGTH 4000


__global__ void
calDistanceMatrixCUDA(float *Out, float *In)
{
	// Block index
//	int bx = blockIdx.x;
//	int by = blockIdx.y;

	// Thread index
//	int tx = threadIdx.x;
//	int ty = threadIdx.y;

	int i = blockIdx.x * blockDim.x + threadIdx.x;
	int j = blockIdx.y * blockDim.y + threadIdx.y;


	if (i < MATRIXLENGTH && j < MATRIXLENGTH)
	{

		float fDim = 0.0f;
		float fDist = 0.0f;
		float(&InM)[4000][100] = *reinterpret_cast<float(*)[4000][100]>(In);
		float(&OutM)[4000][4000] = *reinterpret_cast<float(*)[4000][4000]>(Out);
		for (int k = 0; k < VECTORLENGTH; k++){//not blockSize because numElements = 100 < 128
			fDim = InM[i][k] - InM[j][k];
			fDim *= fDim;
			fDist += fDim;
		}
		fDist = sqrt(fDist);
		OutM[i][j] = fDist;

	}

}

#ifdef __cplusplus
}
#endif

#ifdef __cplusplus
extern "C"
{
#endif

void jniEntry()
{

	clock_t time1, time2, time3, time4;
	double tDiff1, tDiff2, tDiff3, tDiff4;

	unsigned int numElements = VECTORLENGTH;//dims
	unsigned int numVectors = MATRIXLENGTH;

	dim3 dimsVector(VECTORLENGTH, 1, 1);
	dim3 dimsVectorArray(MATRIXLENGTH, VECTORLENGTH, 1);
	dim3 dimsDistMatrix(MATRIXLENGTH, MATRIXLENGTH, 1);

	size_t sizeVector = VECTORLENGTH * sizeof(float);
	size_t sizeVectorArray = sizeVector * MATRIXLENGTH;
	size_t sizeMatrix = MATRIXLENGTH * MATRIXLENGTH * sizeof(float);

	unsigned int nSizeVector = dimsVector.x * dimsVector.y;
	unsigned int mem_SizeVector = sizeof(float) * nSizeVector;
	unsigned int nSizeVectorArray = dimsVectorArray.x * dimsVectorArray.y;
	unsigned int mem_SizeVectorArray = sizeof(float) * nSizeVectorArray;
	unsigned int nSizeDistMatrix = dimsDistMatrix.x * dimsDistMatrix.y;
	unsigned int mem_SizeDistMatrix = sizeof(float) * nSizeDistMatrix;

	float *distMatrix = (float *)malloc(mem_SizeDistMatrix);///Destination
	/////////////////////////////////////////
	///initialize Vector
	time1 = clock();

	float *featureV100 = (float *)malloc(mem_SizeVectorArray);
	for (int i = 0; i < nSizeVectorArray; ++i)
	{
		featureV100[i] = LO + static_cast <float> (rand()) / (static_cast <float> (RAND_MAX / (HI - LO)));;
//		printf("i:%d, == %5.2f\n", i, featureV100[i]);
	}
	time2 = clock();

	///////////////////////////
	float *d_featureV100, *d_DistMatrix;

	cudaError_t error;
	error = cudaMalloc((void **)&d_featureV100, mem_SizeVectorArray);
	if (error != cudaSuccess)
	{
		printf("cudaMalloc d_featureV100 returned error code %d, line(%d)\n", error, __LINE__);
		exit(EXIT_FAILURE);
	}

	error = cudaMalloc((void **)&d_DistMatrix, mem_SizeDistMatrix);
	if (error != cudaSuccess)
	{
		printf("cudaMalloc d_DistMatrix returned error code %d, line(%d)\n", error, __LINE__);
		exit(EXIT_FAILURE);
	}


	error = cudaMemcpy(d_featureV100, featureV100, mem_SizeVectorArray, cudaMemcpyHostToDevice);
	if (error != cudaSuccess)
	{
		printf("cudaMemcpy (d_featureV100,featureV100) returned error code %d, line(%d)\n", error, __LINE__);
		exit(EXIT_FAILURE);
	}

	//////////////////////
	// Allocate CUDA events that we'll use for timing
	cudaEvent_t start;
	error = cudaEventCreate(&start);

	if (error != cudaSuccess)
	{
		fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
		exit(EXIT_FAILURE);
	}

	cudaEvent_t stop;
	error = cudaEventCreate(&stop);

	if (error != cudaSuccess)
	{
		fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
		exit(EXIT_FAILURE);
	}
	// Record the start event
	error = cudaEventRecord(start, NULL);

	if (error != cudaSuccess)
	{
		fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
		exit(EXIT_FAILURE);
	}

	// Setup execution parameters
//	int threads = /*128*/512; //sufficient for vector of 100 elements
	dim3 threads(512); //sufficient for vector of 100 elements
//	dim3 grid(MATRIXLENGTH / threads, MATRIXLENGTH / threads);
	dim3 grid(512);

	calDistanceMatrixCUDA<<<grid, threads>>>(d_DistMatrix, d_featureV100);

	// Record the stop event
	error = cudaEventRecord(stop, NULL);

	if (error != cudaSuccess)
	{
		fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
		exit(EXIT_FAILURE);
	}

	// Wait for the stop event to complete
	error = cudaEventSynchronize(stop);

	if (error != cudaSuccess)
	{
		fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
		exit(EXIT_FAILURE);
	}

	float msecTotal = 0.0f;
	error = cudaEventElapsedTime(&msecTotal, start, stop);

	if (error != cudaSuccess)
	{
		fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
		exit(EXIT_FAILURE);
	}

	// Compute and print the performance
	float msec = msecTotal ;
	printf(
		"Performance= Time= %.3f msec, WorkgroupSize= %d,%d,%d threads/block & %d,%d,%d blocks/grid\n",
		msec,
		threads.x,threads.y,threads.z,
		grid.x,grid.y,grid.z);

	error = cudaGetLastError();

	if (error != cudaSuccess)
	{
		fprintf(stderr, "Failed to launch calDistanceMatrixCUDA (error code %s)!\n", cudaGetErrorString(error));
		exit(EXIT_FAILURE);
	}


	error = cudaMemcpy(distMatrix, d_DistMatrix, mem_SizeDistMatrix, cudaMemcpyDeviceToHost);

	if (error != cudaSuccess)
	{
		fprintf(stderr, "Failed to copy d_DistMatrix from device to host distMatrix (error code %s)!\n", cudaGetErrorString(error));
		exit(EXIT_FAILURE);
	}


	cudaFree(d_featureV100);
	cudaFree(d_DistMatrix);
	free(featureV100);
	free(distMatrix);

	error = cudaDeviceReset();

	if (error != cudaSuccess)
	{
		fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(error));
		exit(EXIT_FAILURE);
	}

	printf("Done\n");

}

#ifdef __cplusplus
}
#endif

Needless to say the above cudaprogam.cu runs coorectly without errors when run as a CUDA application i.e. without JNI
Please guide me in regards to using the correct options in the makefile as i am a newbie in creating makefiles. Thanks.