Kernel runtime, need explanation

This is my first post, so I'd like to say hello to everyone, and I hope you'll forgive my bad English :).

I'm trying to figure out why Kernel1 is twice as fast as Kernel2. Can anyone tell me why that is?

I don't think there are any bank conflicts in either of these kernels.

#define BLOCKDIM 128

__global__ void Kernel1()
{
	// Each thread writes 16 elements, spaced BLOCKDIM apart,
	// so every iteration the half-warp writes 16 consecutive words.
	__shared__ int mem[BLOCKDIM * 16];
	for (int i = 0; i < BLOCKDIM * 16; i += BLOCKDIM)
		mem[threadIdx.x + i] = i;
}

__global__ void Kernel2()
{
	// Each thread writes its own row of 16 elements, visiting the
	// columns in an order rotated by threadIdx.x.
	__shared__ TYPE2 mem[BLOCKDIM * 16];
	for (int i = 0; i < 16; i++)
		mem[threadIdx.x * 16 + (threadIdx.x + i) % 16] = i;
}

My graphics card is a GeForce 9600 GT (compute capability 1.1).
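
For reference, this is the quick host-side check I used to convince myself there are no conflicts. It is only a sketch: it assumes 16 banks of 4-byte words (as on compute capability 1.1), that TYPE2 is 4 bytes wide, and NUM_BANKS is just my own name here. It prints the bank index each thread of a half-warp writes to in the first iteration of each kernel.

/* Sketch: compute shared-memory bank indices on the host,
   assuming 16 banks of 4-byte words (compute capability 1.1). */
#include <stdio.h>

#define BLOCKDIM  128
#define NUM_BANKS 16

int main(void)
{
	/* Kernel1: address = threadIdx.x + i, with i a multiple of BLOCKDIM,
	   so the bank is just threadIdx.x % 16 -> all 16 banks distinct. */
	printf("Kernel1, i = 0: ");
	for (int tid = 0; tid < 16; tid++)
		printf("%d ", tid % NUM_BANKS);
	printf("\n");

	/* Kernel2: address = threadIdx.x * 16 + (threadIdx.x + i) % 16;
	   the tid * 16 term is a multiple of 16, so the bank reduces to
	   (tid + i) % 16 -> again all 16 banks distinct within a half-warp. */
	printf("Kernel2, i = 0: ");
	for (int tid = 0; tid < 16; tid++)
		printf("%d ", (tid * 16 + (tid + 0) % 16) % NUM_BANKS);
	printf("\n");
	return 0;
}

Both lines print the banks 0 through 15 exactly once per half-warp, which is why I believe neither kernel has bank conflicts.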
