Why I use ILP can't improve the performance ?How should I use ILP appropriately?

Hello everyone,I just want to use “ILP” to improve the performance of a function named “image to column” ,but the result shows that adding “ILP” is more slower than no ILP. I can’t find the the reason resulting above.
Here is the code:

i)im2col

#define ILP 2
#define BLOCK 32
__global__ void im2col_gpu_kernel(const int n, const float* data_im,
        const int height, const int width, const int ksize,
        const int pad,
        const int stride,
        const int height_col, const int width_col,
        float *data_col) {
		int index = blockIdx.x*blockDim.x+threadIdx.x;
		for(; index < n; index += blockDim.x*gridDim.x){

			int w_out = index % width_col;
			int h_index = index / width_col;
			int h_out = h_index % height_col;
			int channel_in = h_index / height_col;
			int channel_out = channel_in * ksize * ksize;
			int h_in = h_out * stride - pad;
			int w_in = w_out * stride - pad;
			float* data_col_ptr = data_col;

			data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out;
			const float* data_im_ptr = data_im;
			data_im_ptr += (channel_in * height + h_in) * width + w_in;
			//----------------------------------------------------//

#pragma unroll
			for (int i = 0; i < ksize; ++i) {
#pragma unroll
				for (int j = 0; j <ksize; ++j) {
					int h = h_in + i;
					int w = w_in + j;
					*data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?	data_im_ptr[i * width + j] : 0;
					data_col_ptr += height_col * width_col;

				}
			}

		}
}

ii)im2col_ILP

__global__ void im2col_gpu_kernel_ILP(const int  n, const float* __restrict__ data_im,
        const int height, const int width, const int ksize,
        const int pad,
        const int stride,
        const int height_col, const int width_col,
        float * __restrict__ data_col) {

	int index[ILP];
    index[0] = blockIdx.x*blockDim.x*ILP+threadIdx.x;
//#if option
    index[1]= blockIdx.x*blockDim.x*ILP+threadIdx.x + BLOCK/2;
//#endif
    for(; index[0] < n; index[0] += blockDim.x*gridDim.x, index[1] += blockDim.x*gridDim.x){
    	//printf("index is %d\n",index);
    	//printf("index1 is %d\n",index1);
    	int w_out[ILP];
#pragma unroll
    	for(int l(0); l<ILP; l++)
    		w_out[l] = index[l] % width_col;

    	int h_index[ILP];
#pragma unroll
    	for(int l(0); l<ILP; l++)
            h_index[l] = index[l] / width_col;
    	int h_out[ILP];
#pragma unroll
    	for(int l(0); l<ILP; l++)
    		h_out[l] = h_index[l] % height_col;
    	int channel_in[ILP];
#pragma unroll
    	for(int l(0); l<ILP; l++)
        	channel_in[l] = h_index[l] / height_col;

    	int channel_out[ILP];
#pragma unroll
    	for(int l(0); l<ILP; l++)
        	channel_out[l] = channel_in[l] * ksize * ksize;

    	int h_in[ILP];
#pragma unroll
    	for(int l(0); l<ILP; l++)
        	h_in[l] = h_out[l] * stride - pad;

    	int w_in[ILP];
#pragma unroll
    	for(int l(0); l<ILP; l++)
        	w_in[l] = w_out[l] * stride - pad;

    	float *data_col_ptr[ILP];
#pragma unroll
    	for(int l(0); l<ILP; l++)
        	data_col_ptr[l] = data_col;

#pragma unroll
    	for(int l(0); l<ILP; l++)
    		data_col_ptr[l] += (channel_out[l] * height_col + h_out[l]) * width_col + w_out[l];//w_out+h_out*width_col+channel_out*height_col*width_col 
    	const float* data_im_ptr[ILP];
#pragma unroll
    	for(int l(0); l<ILP; l++)
    		data_im_ptr[l] = data_im;

#pragma unroll
    	for(int l(0); l<ILP; l++)
    		data_im_ptr[l] += (channel_in[l] * height + h_in[l]) * width + w_in[l];//w_in+h_in*width+channel_in*height*width 

#pragma unroll
        	for (int i = 0; i < ksize; ++i) {
#pragma unroll
        		for (int j = 0; j < ksize; ++j) {
        			int h[ILP];
#pragma unroll
        			for(int l(0); l<ILP; l++)
        				h[l] = h_in[l] + i;

        			int w[ILP];
#pragma unroll
        			for(int l(0); l<ILP; l++)
        				w[l] = w_in[l] + j;

        			bool tmp[ILP];
#pragma unroll
        			for(int l(0); l<ILP; l++)
        				tmp[l] = (h[l] >= 0 && w[l] >= 0 && h[l] < height && w[l] < width);
        			float dataTmp[ILP];
#pragma unroll
        			for(int l(0); l<ILP; l++)
        				dataTmp[l] = (data_im_ptr[l])[i * width + j];

#pragma unroll
        			for(int l(0); l<ILP; l++)
        				*(data_col_ptr[l]) = tmp[l] ? dataTmp[l] : 0;


#pragma unroll
        			for(int l(0); l<ILP; l++)
        				data_col_ptr[l] += height_col * width_col;

        		}
        	}



    }
}

The result of Compute in NVVP is that :
im2col_gpu_kernel_ILP cost 152.646ms ,accounting for 79.2%, and 41 Registers/Thread
im2col_kernel cost 18.032ms, accounting for 20.8% ,and 34 Registers/Thread

Unlike many modern CPUs, GPUs are, by and large, not designed to exploit instruction-level parallelism. Instead they exploit other forms of parallelism.

Therefore, attempts to increase ILP are likely to have minimal to zero positive effect of performance, while the code changes needed to increase ILP may well have a noticeable negative effect on other forms of parallelism, such as when more complicated code increases dynamic instruction count, or increases register pressure, leading to lower occupancy.