I wonder whether the maximum number of threads per block really limits the number of threads in each block.

I have found that the CUDA device info reports a maximum number of threads per block. However, when I ran a CUDA file that exceeds it, it seemed to work fine!

This is the device information about the maximum number of threads per block:

Maximum number of threads per multiprocessor:  2048
Maximum number of threads per block:           1024
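
For reference, these limits can also be queried in code; a minimal sketch using cudaGetDeviceProperties (device 0 assumed):

#include <stdio.h>
#include <cuda_runtime.h>

int main(void){
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);  // query device 0
    printf("Maximum number of threads per multiprocessor: %d\n", prop.maxThreadsPerMultiProcessor);
    printf("Maximum number of threads per block:          %d\n", prop.maxThreadsPerBlock);
    printf("Maximum block dimensions: (%d, %d, %d)\n",
           prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    return 0;
}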

This is the code of my CUDA file:

dim3 threadPB(1024, 1024, 64);
add<<<1, threadPB>>>(dev_a, dev_b, dev_c);

As you can see, the number of threads I allocate is 1024 × 1024 × 64 = 67,108,864, which far exceeds 1024 (the maximum number of threads per block).

Does allocating more threads per block than the device allows actually cause a problem?

Do you really allocate 67,108,864 threads in a single block, or do you allocate 64 x 1024 x 1024 threads - that is, a thread block of 1024 threads that is run 1024 x 64 times?

As a simple test, try to launch a kernel with:

dim3 dB(1024*1024*64, 1, 1);

or even:

dim3 dB(2048, 1, 1);

You can easily step the program in the debugger, suspend it, and note which kernel thread blocks are resident, along with their dimensions.
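
A minimal version of that test without the debugger: check the launch status directly (an empty kernel noop is assumed here):

#include <stdio.h>
#include <cuda_runtime.h>

__global__ void noop(){ }

int main(void){
    dim3 dB(2048, 1, 1);                  // exceeds the 1024-thread block limit
    noop<<<1, dB>>>();
    cudaError_t err = cudaGetLastError(); // launch failures are reported here
    printf("launch: %s\n", cudaGetErrorString(err));
    // on a device with a 1024-thread limit this prints "invalid configuration argument"
    return 0;
}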

This is my source code:

#include <stdio.h>
#include <sys/time.h>

#define N 100000

__global__ void add(int *a, int *b, int *c){
        // linearize the 3-D thread index within the block
        int tid = threadIdx.x + blockDim.x * threadIdx.y + blockDim.y * blockDim.x * threadIdx.z;
        printf("[%d] x=%d, y=%d, z=%d\n", tid, threadIdx.x, threadIdx.y, threadIdx.z);
        //while(tid < N){
                c[tid] = a[tid] + b[tid];
        //      tid += blockDim.x * gridDim.x;
        //}
}



int main(){
        struct timeval tv1, tv2, ctv1, ctv2;
        //gettimeofday(&tv1,NULL);

        int a[N], b[N], c[N];
        int *dev_a, *dev_b, *dev_c;

        cudaMalloc((void **)&dev_a, N * sizeof(int));
        cudaMalloc((void **)&dev_b, N * sizeof(int));
        cudaMalloc((void **)&dev_c, N * sizeof(int));

        int i;
        for(i = 0; i < N; i++){
                a[i] = -i;
                b[i] = i * i;
        }

        gettimeofday(&ctv1, NULL);
        cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
        gettimeofday(&tv1, NULL);

        dim3 blockPG(2147483647, 65535, 65535);  // declared but never used
        dim3 threadPB(16, 16, 4);                // 16*16*4 = 1024 threads per block
        add<<<1, threadPB>>>(dev_a, dev_b, dev_c);
        gettimeofday(&tv2, NULL);

        cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
        gettimeofday(&ctv2, NULL);
        //for(i=0;i<N;i++) printf("%d + %d = %d\n",a[i] ,b[i], c[i]);
        for(i = 0; i < N; i++){
                if((a[i] + b[i]) != c[i]){
                        printf("Wrong in c[%d]\n", i);
                        break;
                }
        }

        cudaFree(dev_a);
        cudaFree(dev_b);
        cudaFree(dev_c);
        return 0;
}

The CUDA file in the post above works well; it uses (16, 16, 4) threads per block, i.e. 1024 threads.

However, when the number of threads per block exceeds the device's limit, it does not print the thread sequence, even when the result appears correct.

This is the output when I set 1024 threads per block:

[727] x=7, y=13, z=2
[728] x=8, y=13, z=2
[729] x=9, y=13, z=2
[730] x=10, y=13, z=2
[731] x=11, y=13, z=2
[732] x=12, y=13, z=2
[733] x=13, y=13, z=2
[734] x=14, y=13, z=2
[735] x=15, y=13, z=2

This is the output when I set 4096 threads per block:

Wrong in c[0]

It neither prints an error message nor produces a correct result.

Does the CUDA compiler check whether the threads per block in a CUDA file exceed the device's limit?

Check CUDA return codes for errors, and (for a test) zero the result space before launching the kernel.

CUDA does not clear allocated memory, so you are very likely to see the (correct) results of a previous successful kernel launch. This is a common source of confusion for new CUDA users.
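
A minimal sketch of both suggestions, using the buffer and kernel names from the post above:

// zero the result buffer so stale output from an earlier successful
// launch cannot masquerade as a fresh result
cudaMemset(dev_c, 0, N * sizeof(int));

add<<<1, threadPB>>>(dev_a, dev_b, dev_c);

// an invalid launch configuration is reported here, not at compile time
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    printf("launch failed: %s\n", cudaGetErrorString(err));
err = cudaDeviceSynchronize();  // catches errors raised during kernel execution
if (err != cudaSuccess)
    printf("kernel failed: %s\n", cudaGetErrorString(err));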

You can run it, but the result will be wrong; you can check this with my code below.

/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and 
 * proprietary rights in and to this software and related documentation. 
 * Any use, reproduction, disclosure, or distribution of this software 
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA) 
 * associated with this source code for terms and conditions that govern 
 * your use of this NVIDIA software.
 * 
 */
#include <stdio.h>

static void HandleError( cudaError_t err,
                         const char *file,
                         int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
                file, line );
        exit( EXIT_FAILURE );
    }
}

#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

#define N   1024
// #define N   1025

__global__ void add( int *a, int *b, int *c ) {
    int tid = threadIdx.x;    // this thread handles the data at its thread id
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

int main( void ) {
    int a[N], b[N], c[N];
    int *dev_a, *dev_b, *dev_c;

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b, N * sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_c, N * sizeof(int) ) );

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR( cudaMemcpy( dev_a, a, N * sizeof(int),
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemcpy( dev_b, b, N * sizeof(int),
                              cudaMemcpyHostToDevice ) );

    dim3 threadsPerBlock( N, 1, 1 );

    add<<<1,threadsPerBlock>>>( dev_a, dev_b, dev_c );  // note: the launch itself is not error-checked

    // copy the array 'c' back from the GPU to the CPU
    HANDLE_ERROR( cudaMemcpy( c, dev_c, N * sizeof(int),
                              cudaMemcpyDeviceToHost ) );

    // display the results
    for (int i=0; i<N; i++) {
        if (i==N-1)
            printf( "%d + %d = %d\n", a[i], b[i], c[i] );
    }

    // free the memory allocated on the GPU
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_c ) );

    return 0;
}

Once you change N to 1025 (or any value bigger than 1024), you will find the computed result is wrong.
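
If you also wrap the launch itself in error checks, the failure stops being silent; a minimal addition after the kernel call above:

    add<<<1,threadsPerBlock>>>( dev_a, dev_b, dev_c );
    HANDLE_ERROR( cudaGetLastError() );       // reports "invalid configuration argument" when N > 1024
    HANDLE_ERROR( cudaDeviceSynchronize() );  // surfaces errors from kernel execution itself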