Hi,

I have changed the sdk transpose kernel as follows to deal with non-square matrices:

```
extern "C"
__global__ void transposek(double* array, double* output, int size, int lda){
__shared__ double tile[TILE_DIM][TILE_DIM+1];
int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;
int index_in = xIndex + (yIndex)*lda;
xIndex = blockIdx.y * TILE_DIM + threadIdx.x;
yIndex = blockIdx.x * TILE_DIM + threadIdx.y;
int index_out = xIndex + (yIndex)*size/lda;
for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS) {
xIndex = index_in+i*size/lda;
if(xIndex < size)
tile[threadIdx.y+i][threadIdx.x] = array[xIndex];
}
__syncthreads();
for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS) {
if(xIndex < size)
output[index_out+i*lda] = tile[threadIdx.x][threadIdx.y+i];
}
__syncthreads(); //I tried with and without this line, same problem
}
```

And the test code is very simple, as follows:

```
#define TILE_DIM 32
#define BLOCK_ROWS 32
#define THREAD_BLOCK 512
#define REQ(size,tile) (size-1)/tile+1
void transpose(double* array, double* output, int size, int lda){
dim3 grid(REQ(size/lda, TILE_DIM), REQ(lda, TILE_DIM)), threads(TILE_DIM,BLOCK_ROWS);
transposek<<<grid, threads>>>(array, output, size, lda);
}
using namespace std;
void printArray(double* array, int size){
for(int i = 0; i < size; i++) cout << array[i] << ", ";
cout << endl;
}
void randomInit(double* array, int size){
for(int i = 0; i< size; i++) array[i]=i+1;
}
void fixInit(double* array, int size, double value){
for(int i = 0; i< size; i++) array[i]=value;
}
int main(int argc, char** argv){
unsigned int uiWA = 4;
unsigned int uiHA = 2;
unsigned int size_A = uiWA * uiHA;
unsigned int mem_size_A = sizeof(double) * size_A;
double* h_A = (double*)malloc(mem_size_A); randomInit(h_A, size_A);
double* d_A;
cudaMalloc((void**) &d_A, mem_size_A);
double* d_AT;
cudaMalloc((void**) &d_AT, mem_size_A);
// copy host memory to device
cudaMemcpy(d_A, h_A, mem_size_A,cudaMemcpyHostToDevice);
printArray(h_A, size_A);
transpose(d_A, d_AT, size_A, uiHA);
cudaMemcpy(h_A, d_AT, mem_size_A, cudaMemcpyDeviceToHost);
printArray(h_A, size_A);
using namespace std;{
cout << REQ(size_A/uiHA, TILE_DIM) << "," << REQ(uiHA, TILE_DIM) << endl;
cout << TILE_DIM << "," << BLOCK_ROWS << endl;
}
}
```

So half the time the result is right, 1 3 5 7 2 4 6 8

But the other half of the time the result is 1 3 5 7 0 0 0 0 or 1 3 5 7 x x x x, where x is some junk value (like uninitialized var)

What is going on?

I have GTX460 and I compiled the code both with sm_20 and sm_21.