Tiled partitioning

Hi all,
Here is a code for performing vector and matrix product. It uses shared memory using tiled partion concept.

  I get garbage values for this code. I dont understand the mistake i have made. This happens everytime I write a code on my own.. :( :(.. I refer to many books, documents, understand and then write. Even then I dont get result and everything i do goes waste.. :(.. Please help me with this code..

#include<stdio.h>
#include<cuda.h>
#define TILE_WIDTH 32
global void matvectmul(float *A, float *B, float *C, int len, int col)
{
shared float As[TILE_WIDTH];
int tx = threadIdx.x;
int tid = blockIdx.x * blockDim.x + tx;
float val = 0;
if(tid < len)
{
for(int m=0; m<len/TILE_WIDTH; m++)
{
As[tx] = A[m * TILE_WIDTH + tx];
__syncthreads();
for(int k=0; k<TILE_WIDTH; k++)
val += As[tx] * B[(tid * col) + (m * TILE_WIDTH + k)];
__syncthreads();
}
C[ tid ] = val;
}
}

int main(int argc, char *argv)
{
int row = atoi(argv[1]);
int col = atoi(argv[2]);

float *hostA, *hostB, *hostC;
float *devA, *devB, *devC;

hostA = (float *)malloc(row * sizeof(float));
hostB = (float *)malloc(row * col * sizeof(float));
hostC = (float *)malloc(col * sizeof(float));
for(int i=0; i<row; i++)
{
//hostA[i] = rand() + (RAND_MAX - rand())/RAND_MAX;
hostA[i] = 2.00f;
for(int j=0; j<col; j++)
{
//hostB[i * col + j] = rand() + (RAND_MAX - rand())/RAND_MAX;
hostB[i * col + j] = 2.00f;
}
}

cudaMalloc((void **)&devA, row * sizeof(float));
cudaMalloc((void **)&devB, row * col * sizeof(float));
cudaMalloc((void **)&devC, col * sizeof(float));

cudaMemcpy((void*)devA,(void*)hostA,(row * sizeof(float)),cudaMemcpyHostToDevice);
cudaMemcpy((void*)devB,(void*)hostB,(row * col * sizeof(float)),cudaMemcpyHostToDevice);

int m = col /(TILE_WIDTH * TILE_WIDTH);
dim3 threadsperblock(TILE_WIDTH, TILE_WIDTH);
if(col % (TILE_WIDTH * TILE_WIDTH) !=0)
m = m + 1;
dim3 blockspergrid(m,1);

matvectmul<<<blockspergrid, threadsperblock>>>(devA, devB, devC, row, col);

cudaMemcpy((void*)hostC,(void*)devC,(col * sizeof(float)),cudaMemcpyDeviceToHost);

for(int i=0;i<col;i++)
printf("%f ",hostC[i]);
cudaFree(devA);
cudaFree(devB);
cudaFree(devC);

free(hostA);
free(hostB);
free(hostC);
}