memory function does not see the memory, although the device array is copied

#include "stdio.h"

#define BLOCK_SIZE 32

#define N 3

__global__ void Muld(float *A, float *B, int wA, int wB, float *C) {

	int bx = blockIdx.x; // Block index

	int by = blockIdx.y;

	int tx = threadIdx.x; // Thread index

	int ty = threadIdx.y;

	int aBegin = wA * BLOCK_SIZE * by; // Index of the first sub-matrix of A processed by the block

	int aEnd = aBegin + wA - 1; // Index of the last sub-matrix of A processed by the block

	int aStep = BLOCK_SIZE; // Step size used to iterate through the sub-matrices of A

	int bBegin = BLOCK_SIZE * bx; // Index of the first sub-matrix of B processed by the block

	int bStep = BLOCK_SIZE * wB; // Step size used to iterate through the sub-matrices of B

	float Csub = 0; // The element of the block sub-matrix that is computed by the thread

	for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {

		   // Shared memory for the sub-matrix of A

		   __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

		   // Shared memory for the sub-matrix of B

		   __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

	As[ty][tx] = A[a + wA * ty + tx]; // Load the matrices from global memory to shared memory;

	Bs[ty][tx] = B[b + wB * ty + tx]; // each thread loads one element of each matrix

	__syncthreads(); // Synchronize to make sure the matrices are loaded

	// Multiply the two matrices together;

	// each thread computes one element

	// of the block sub-matrix

	for (int k = 0; k < BLOCK_SIZE; ++k)

		   Csub += As[ty][k] * Bs[k][tx];

	// Synchronize to make sure that the preceding

	// computation is done before loading two new

	// sub-matrices of A and B in the next iteration

	__syncthreads();

	}

	// Write the block sub-matrix to global memory;

	// each thread writes one element

	int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;

	C[c + wB * ty + tx] = Csub;

	C[1]=12;

}

int main(){

	int numBytes = N * N * sizeof ( float );

	float A[]={1, 2, 3,

			 3, 4, 5,

			 3, 4, 5},

		  B[]={1, 2, 3,

			 3, 4, 5,

			 3, 4, 5},

		  C[9];

	

	float * adev = NULL;

	float * bdev = NULL;

	float * cdev = NULL;

	

	cudaMalloc ((void**)&adev, numBytes );

	cudaMalloc ((void**)&bdev, numBytes );

	cudaMalloc ((void**)&cdev, numBytes );

	cudaMemcpy(adev, A, numBytes, cudaMemcpyHostToDevice);

	cudaMemcpy(bdev, B, numBytes, cudaMemcpyHostToDevice);

	cudaMemcpy(cdev, C, numBytes, cudaMemcpyHostToDevice);

	

	dim3  threads(32,32), blocks(N / threads.x, N / threads.y,1);

	Muld <<< blocks, threads >>> (adev,bdev,N,N,cdev);

	

	cudaMemcpy(C, cdev, numBytes, cudaMemcpyDeviceToHost);

	

	for (int i=0; i<9; i++) printf("%0.2f ",C[i]);

	printf("\n");

	

	return 0;

}

Integer divisions are floored, so

blocks(N / threads.x, N / threads.y,1) == blocks(3 / 32, 3 / 32,1) == blocks(0,0,1)

N.

thanks