Increasing the printf buffer size does not help. Try this test code:
#include <stdio.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include "cuprintf.cu"
#include "cuPrintf.cuh"
// Arithmetic helper macros. Arguments and results are fully parenthesized so
// that compound arguments such as mul(a + b, c) expand with the intended
// precedence instead of a + b*c.
#define add(A, B) ((A) + (B))
#define mul(A, B) ((A) * (B))
// per-block tile extents along the n and m dimensions
#define BLK_N 2
#define BLK_M 2
// size of thread block for calculating C (innermost loop)
#define DIM_X 2
#define DIM_Y 2
/**
 * Debug kernel: updates C in place as
 *     C[offsC] = alpha * A[coord_dCm + coord_dCn] + beta * C[offsC]
 * for every in-range coordinate pair, tracing each step with cuPrintf.
 *
 * For a 2D launch, every (coord_dCm, coord_dCn) pair is visited by exactly one
 * thread: each thread starts at its global position and advances by the total
 * number of threads in that dimension, so no two threads perform the
 * read-modify-write on the same C element. (The previous formula,
 * bly*BLK_N + n*DIM_Y + idy with n running up to N, let several blocks update
 * the same element concurrently whenever more than one block was launched,
 * which made the result timing-dependent.)
 *
 * C is read before it is written (beta * C), so the caller must initialize it.
 *
 * @param M     extent of the m dimension; also used as the row stride of C
 * @param N     extent of the n dimension
 * @param C     device buffer updated in place, indexed by coord_dCn*M + coord_dCm
 * @param A     device input, indexed by coord_dCm + coord_dCn
 *              NOTE(review): A is indexed by the coordinate sum, not by offsC - confirm intent
 * @param alpha scale applied to the value read from A
 * @param beta  scale applied to the previous value of C
 */
__global__ void kernel_name (int M, int N, float *C,float *A, float alpha, float beta)
{
    int idx = threadIdx.x; // thread's m dimension
    int idy = threadIdx.y; // thread's n dimension
    int blx = blockIdx.x; // block's m dimension
    int bly = blockIdx.y; // block's n dimension
    //printf ( "idx,idy,blx,bly %d %d %d %d \n",idx,idy,blx,bly); // --> bad results
    cuPrintf ( "idx,idy,blx,bly %d %d %d %d \n",idx,idy,blx,bly); // --> good results

    // Nothing to update; this also keeps the unsigned loop counters below from
    // being compared against a negative bound (which would wrap to a huge value).
    if (M <= 0 || N <= 0) {
        return;
    }

    // Global position of this thread and the total thread count per dimension.
    const int firstM = blx * blockDim.x + idx;
    const int firstN = bly * blockDim.y + idy;
    const int strideM = gridDim.x * blockDim.x;
    const int strideN = gridDim.y * blockDim.y;

    // Update C in device memory
    #pragma unroll
    for (unsigned int n = 0; n < (unsigned int)N; n++) {
        int coord_dCn = (int)(firstN + n * strideN);
        cuPrintf ( "n coord_dCn %u %d \n",n,coord_dCn);
        #pragma unroll
        for (unsigned int m = 0; m < (unsigned int)M; m++) {
            int coord_dCm = (int)(firstM + m * strideM);
            cuPrintf ( "n m %u %u \n",n,m);
            if (coord_dCm < M && coord_dCn < N) {
                // Row stride is M (was a hard-coded 4, which only matched M == 4).
                int offsC = coord_dCn*M + coord_dCm;
                cuPrintf ( "coord_dCm offsC %d %d \n",coord_dCm,offsC);
                float &regC = A[coord_dCm + coord_dCn];
                float &memC = C[offsC];
                memC = add(mul(alpha, regC), mul(beta, memC));
                cuPrintf("&memC %x memC %10.4f offsC %3d C[offsC] %10.4f \n",&memC,memC,offsC,C[offsC]);
            }
        }
    }
}
/**
 * Fills the first `size` entries of `data` with pseudo-random values obtained
 * from rand() and scaled by RAND_MAX, i.e. values in [0, 1].
 * A non-positive `size` leaves `data` untouched.
 */
void randomInit(float *data, int size)
{
    const float scale = (float)RAND_MAX;
    int filled = 0;
    while (filled < size) {
        data[filled] = rand() / scale;
        ++filled;
    }
}
/**
 * Host driver for the test: fills an M x N matrix `a` with random values,
 * runs kernel_name on it, prints the cuPrintf trace and the resulting `c`.
 *
 * Every CUDA runtime call is checked, the launch is checked with
 * cudaGetLastError(), and the device is synchronized before the host reads
 * `c`, because the copies are asynchronous and target pinned host memory.
 */
int main(int argc, char **argv)
{
    int cuda_device = 0;
    cuda_device = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDevice(&cuda_device));

    // Enlarge the device-side printf FIFO.
    size_t buf = 1000000;
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitPrintfFifoSize, buf));

    int M = 4, N = 4;
    // Derived from the matrix extents instead of a hard-coded 16 elements.
    size_t msize = (size_t)M * N * sizeof(float);

    // allocate host memory
    float *a = 0; // pointer to the array data in host memory
    float *c = 0; // pointer to the array data in host memory
    checkCudaErrors(cudaMallocHost((void **)&a, msize));
    checkCudaErrors(cudaMallocHost((void **)&c, msize));

    // allocate device memory
    float *d_a = 0; // pointers to data and init value in the device memory
    float *d_c = 0; // pointers to data and init value in the device memory
    checkCudaErrors(cudaMalloc((void **)&d_a, msize));
    checkCudaErrors(cudaMalloc((void **)&d_c, msize));
    // The kernel reads C (beta * C) before writing it, so it must not be left
    // uninitialized. A byte-wise zero fill is a valid 0.0f for every element.
    checkCudaErrors(cudaMemset(d_c, 0, msize));

    // x is the m dimension and y the n dimension in the kernel's index math.
    dim3 bloc(DIM_X, DIM_Y, 1);
    // Ceil-div so that extents that are not a multiple of the tile are covered.
    dim3 grid((M + BLK_M - 1) / BLK_M, (N + BLK_N - 1) / BLK_N);

    // initialize host memory
    randomInit( a,M*N );
    for ( int i = 0; i < M*N ; i++)
        printf ( "i %d a[i] %f \n",i,a[i]);

    checkCudaErrors(cudaPrintfInit());

    // copy host memory to device
    checkCudaErrors(cudaMemcpyAsync(d_a, a, msize, cudaMemcpyHostToDevice));
    kernel_name<<< grid, bloc, 0>>>(M,N, d_c,d_a, 1.1f, 0.10f);
    // Launch-configuration errors are only reported through cudaGetLastError().
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaMemcpyAsync(c,d_c,msize,cudaMemcpyDeviceToHost));
    // Wait for the kernel and the asynchronous copy before the host touches c;
    // this also surfaces any fault raised while the kernel was running.
    checkCudaErrors(cudaDeviceSynchronize());

    checkCudaErrors(cudaPrintfDisplay(stdout,false));
    for ( int i = 0; i < M*N ; i++)
        printf ( "i %d c[i] %f \n",i,c[i]);

    cudaPrintfEnd();
    checkCudaErrors(cudaFreeHost(a));
    checkCudaErrors(cudaFree(d_a));
    checkCudaErrors(cudaFreeHost(c));
    checkCudaErrors(cudaFree(d_c));
    checkCudaErrors(cudaDeviceReset());
    return 0;
}