2D Matrix copy problem

Hello,

I have written a simple program that performs floating-point matrix multiplication on both the CPU and the GPU and compares the results. The problem is with copying the matrices to the device. Here is the code:

#include <math.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stdio.h>

int main()
{
    int Dim=5;
    int test=0;
    int i=0,j=0,k=0,c=0,d=0;
    float Matrix1[Dim][Dim], Matrix2[Dim][Dim],Result_ACC[Dim][Dim],Result_OMP[Dim][Dim],Diff=0,sum=0;
    double tstart, tstop;

    for (i=0;i<Dim;i++){
        for(j=0;j<Dim;j++){
            Result_ACC[i][j]=0;
            Result_OMP[i][j]=0;
            Matrix1[i][j]=((float)(rand()%50)/15);
            Matrix2[i][j]=((float)(rand()%70)/17);
        }
    }

    for (i=0; i<Dim; i++){
        for (c = 0; c < Dim; c++) {
            printf("Result_ACC[%d][%d]= %G \n", i,c, Result_ACC[i][c]);
        }
    }

    printf ("Going to GPU.test is %d \n\n",test);

    struct timeval mytime1;
    gettimeofday(&mytime1,(struct timezone*)0);
    tstart = (double)(mytime1.tv_sec + mytime1.tv_usec*1.0e-6);

    #pragma acc kernels pcopyout(Result_ACC[Dim][Dim],test) copyin(Dim,sum,Matrix1[Dim][Dim], Matrix2[Dim][Dim],c,d,k)
    for (c = 0; c < Dim; c++) {
        for (d = 0; d < Dim; d++) {
            for (k = 0; k < Dim; k++) {
                sum = sum + Matrix1[c][k]*Matrix2[k][d];
                test=10;
            }
            Result_ACC[c][d] = sum;
            sum = 0;
        }
    }

    printf ("Back from GPU.test is %d \n\n",test);

    for (i=0; i<Dim; i++){
        for (c = 0; c < Dim; c++) {
            printf("Result_ACC[%d][%d]= %G \n", i,c, Result_ACC[i][c]);
        }
    }

    struct timeval mytime2;
    gettimeofday(&mytime2,(struct timezone*)0);
    tstop = (double)(mytime2.tv_sec + mytime2.tv_usec*1.0e-6);
    printf ("Timer Stoped.Dim is %d Sum is %G \n", Dim, sum);
    sum=0;
    for (c = 0; c < Dim; c++) {
        for (d = 0; d < Dim; d++) {
            for (k = 0; k < Dim; k++) {
                sum = sum + Matrix1[c][k]*Matrix2[k][d];
            }
            Result_OMP[c][d] = sum;
            sum = 0;
        }
    }

    for (i=0; i<Dim; i++){
        for (c =0; c < Dim; c++) {
            printf("Result_OMP[%d][%d]= %G \n",i,c, Result_OMP[i][c]);
        }
    }

    for (i=0; i<Dim; i++){
        for (c = 0; c < Dim; c++) {
            Diff+= fabs(Result_OMP[i][c]-Result_ACC[i][c]);
        }
    }

    printf ("The average difference is %0.15f and it took %G seconds.\n",Diff/(Dim*Dim), tstop-tstart);
    return 0;
}

The matrix dimension is 5*5, so I am expecting 5*5*4 = 100 bytes to be copied to the device for each matrix. But when I run the code I get this report on the terminal:

Result_ACC[0][0]= 0 
Result_ACC[0][1]= 0 
Result_ACC[0][2]= 0 
Result_ACC[0][3]= 0 
Result_ACC[0][4]= 0 
Result_ACC[1][0]= 0 
Result_ACC[1][1]= 0 
Result_ACC[1][2]= 0 
Result_ACC[1][3]= 0 
Result_ACC[1][4]= 0 
Result_ACC[2][0]= 0 
Result_ACC[2][1]= 0 
Result_ACC[2][2]= 0 
Result_ACC[2][3]= 0 
Result_ACC[2][4]= 0 
Result_ACC[3][0]= 0 
Result_ACC[3][1]= 0 
Result_ACC[3][2]= 0 
Result_ACC[3][3]= 0 
Result_ACC[3][4]= 0 
Result_ACC[4][0]= 0 
Result_ACC[4][1]= 0 
Result_ACC[4][2]= 0 
Result_ACC[4][3]= 0 
Result_ACC[4][4]= 0 
Going to GPU.test is 0 

upload CUDA data  file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=39 device=0 variable=Dim bytes=4
upload CUDA data  file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=39 device=0 variable=sum bytes=4
upload CUDA data  file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=39 device=0 variable=Matrix1 bytes=20
upload CUDA data  file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=39 device=0 variable=Matrix2 bytes=20
upload CUDA data  file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=39 device=0 variable=c bytes=4
upload CUDA data  file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=39 device=0 variable=d bytes=4
upload CUDA data  file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=39 device=0 variable=k bytes=4
launch CUDA kernel  file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=40 device=0 num_gangs=1 num_workers=1 vector_length=1 grid=1 block=1
download CUDA data  file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=48 device=0 variable=test bytes=4
download CUDA data  file=/home/jooya/Shark/examples/OpenMP_OpenAcc/main.cpp function=main line=48 device=0 variable=Result_ACC bytes=20
Back from GPU.test is 2139095040 

Result_ACC[0][0]= 5.95294 
Result_ACC[0][1]= 0.647059 
Result_ACC[0][2]= 7.75686 
Result_ACC[0][3]= 0.843137 
Result_ACC[0][4]= 4.01176 
Result_ACC[1][0]= 0 
Result_ACC[1][1]= 0 
Result_ACC[1][2]= 0 
Result_ACC[1][3]= 0 
Result_ACC[1][4]= 0 
Result_ACC[2][0]= 0 
Result_ACC[2][1]= 0 
Result_ACC[2][2]= 0 
Result_ACC[2][3]= 0 
Result_ACC[2][4]= 0 
Result_ACC[3][0]= 0 
Result_ACC[3][1]= 0 
Result_ACC[3][2]= 0 
Result_ACC[3][3]= 0 
Result_ACC[3][4]= 0 
Result_ACC[4][0]= 0 
Result_ACC[4][1]= 0 
Result_ACC[4][2]= 0 
Result_ACC[4][3]= 0 
Result_ACC[4][4]= 0 
Timer Stoped.Dim is 5 Sum is 0 
Result_OMP[0][0]= 31.7647 
Result_OMP[0][1]= 25.3098 
Result_OMP[0][2]= 26.9882 
Result_OMP[0][3]= 22.6235 
Result_OMP[0][4]= 16.4392 
Result_OMP[1][0]= 22.1255 
Result_OMP[1][1]= 21.7059 
Result_OMP[1][2]= 20.2118 
Result_OMP[1][3]= 18.9843 
Result_OMP[1][4]= 8.85098 
Result_OMP[2][0]= 14.1412 
Result_OMP[2][1]= 10.4118 
Result_OMP[2][2]= 13.5686 
Result_OMP[2][3]= 9.68628 
Result_OMP[2][4]= 6.45882 
Result_OMP[3][0]= 20.7412 
Result_OMP[3][1]= 18.1843 
Result_OMP[3][2]= 19.902 
Result_OMP[3][3]= 15.6431 
Result_OMP[3][4]= 7.77255 
Result_OMP[4][0]= 23.8588 
Result_OMP[4][1]= 20.6627 
Result_OMP[4][2]= 23.0275 
Result_OMP[4][3]= 18.0549 
Result_OMP[4][4]= 8.82745 
Diff is 25.8118 
Diff is 24.6627 
Diff is 19.2314 
Diff is 21.7804 
Diff is 12.4275 
Diff is 22.1255 
Diff is 21.7059 
Diff is 20.2118 
Diff is 18.9843 
Diff is 8.85098 
Diff is 14.1412 
Diff is 10.4118 
Diff is 13.5686 
Diff is 9.68628 
Diff is 6.45882 
Diff is 20.7412 
Diff is 18.1843 
Diff is 19.902 
Diff is 15.6431 
Diff is 7.77255 
Diff is 23.8588 
Diff is 20.6627 
Diff is 23.0275 
Diff is 18.0549 
Diff is 8.82745 
The average difference is 17.069334030151367 and it took 0.254893 seconds.

Only 20 bytes of each matrix are copied to the device, and I don't know what is wrong here. Even the value of the test variable that is copied back from the device is not correct: I assign 10 to test inside the kernel, but I read a garbage value on the CPU. I guess it has to do with the 2D arrays in the code.

Thanks,
Ali

Hi Ali,

Looking at the compiler feedback messages (-Minfo=accel), I see that the issue is that the compiler is not generating a parallel kernel because of the "live-out" variables sum and test.

% pgcc -acc -Minfo=accel test.c -V15.7
main:
     35, Generating copyout(Result_ACC[:Dim][:Dim],test)
         Generating copyin(Dim,sum,Matrix1[:Dim][:Dim],Matrix2[:Dim][:Dim])
     36, Loop carried scalar dependence for sum at line 40
         Scalar last value needed after loop for sum,test at line 62
         Accelerator restriction: scalar variable live-out from loop: test,sum
     38, Loop carried scalar dependence for sum at line 40
         Scalar last value needed after loop for sum,test at line 62
         Accelerator restriction: scalar variable live-out from loop: test,sum
         Accelerator kernel generated
         Generating Tesla code
     39, Scalar last value needed after loop for test at line 48
         Accelerator restriction: scalar variable live-out from loop: test

Your use of "test" is problematic since it inhibits parallelization: every thread updates the same "test", so the final value depends on which thread updated it last (it's non-deterministic). In this particular case it doesn't matter, but in general it's a problem.

A reduction variable must be private to each thread. However, by printing "sum" after the loop you've forced the compiler to make it a global (shared) variable, which inhibits the reduction. So besides the live-out issue, every thread will try to update the same "sum", causing collisions.
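
For illustration, here's a sketch of one way to make the accumulator private explicitly: declare it inside the loop body so each (c,d) iteration gets its own copy, and put an explicit reduction on the k loop. The function name matmul_acc and the variable psum are made up just for this sketch; the fix below doesn't need them:

void matmul_acc(int Dim, float A[Dim][Dim], float B[Dim][Dim], float C[Dim][Dim])
{
   int c, d, k;
   #pragma acc kernels copyin(A[0:Dim][0:Dim], B[0:Dim][0:Dim]) copyout(C[0:Dim][0:Dim])
   {
      #pragma acc loop independent
      for (c = 0; c < Dim; c++) {
         #pragma acc loop independent
         for (d = 0; d < Dim; d++) {
            float psum = 0.0f;                 /* private: one copy per (c,d) iteration */
            #pragma acc loop reduction(+:psum) /* explicit reduction over the dot product */
            for (k = 0; k < Dim; k++) {
               psum += A[c][k] * B[k][d];
            }
            C[c][d] = psum;
         }
      }
   }
}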

I would advise not using "test" here, and not printing "sum" after the loop, since neither use is parallel. See my modified version below.

Hope this helps,
Mat

% cat test.c
#include <math.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stdio.h>

int main()
{

      int Dim=5;
      int test=0;
      int i=0,j=0,k=0,c=0,d=0;
      float Matrix1[Dim][Dim], Matrix2[Dim][Dim],Result_ACC[Dim][Dim],Result_OMP[Dim][Dim],Diff=0,sum=0;
      double tstart, tstop;
      for (i=0;i<Dim;i++){
         for(j=0;j<Dim;j++){
            Result_ACC[i][j]=0;
            Result_OMP[i][j]=0;
            Matrix1[i][j]=((float)(rand()%50)/15);
            Matrix2[i][j]=((float)(rand()%70)/17);
          }
      }

      for (i=0; i<Dim; i++){
         for (c = 0; c < Dim; c++) {
            printf("Result_ACC[%d][%d]= %G \n", i,c, Result_ACC[i][c]);
         }
      }

      printf ("Going to GPU.test\n\n");

      struct timeval mytime1;
      gettimeofday(&mytime1,(struct timezone*)0);
      tstart = (double)(mytime1.tv_sec + mytime1.tv_usec*1.0e-6);

      #pragma acc kernels pcopyout(Result_ACC[Dim][Dim]) copyin(Dim,sum,Matrix1[Dim][Dim], Matrix2[Dim][Dim])
      for (c = 0; c < Dim; c++) {
         for (d = 0; d < Dim; d++) {
            sum = 0;
            for (k = 0; k < Dim; k++) {
               sum = sum + Matrix1[c][k]*Matrix2[k][d];
            }
            Result_ACC[c][d] = sum;
         }
      }

      printf ("Back from GPU.\n\n");


      for (i=0; i<Dim; i++){
         for (c = 0; c < Dim; c++) {
            printf("Result_ACC[%d][%d]= %G \n", i,c, Result_ACC[i][c]);
         }
      }



      struct timeval mytime2;
      gettimeofday(&mytime2,(struct timezone*)0);
      tstop = (double)(mytime2.tv_sec + mytime2.tv_usec*1.0e-6);
      sum=0;
      for (c = 0; c < Dim; c++) {
         for (d = 0; d < Dim; d++) {
            for (k = 0; k < Dim; k++) {
               sum = sum + Matrix1[c][k]*Matrix2[k][d];
            }
            Result_OMP[c][d] = sum;
            sum = 0;
         }
      }

      for (i=0; i<Dim; i++){
         for (c =0; c < Dim; c++) {
            printf("Result_OMP[%d][%d]= %G \n",i,c, Result_OMP[i][c]);
         }
      }


      for (i=0; i<Dim; i++){
         for (c = 0; c < Dim; c++) {
            Diff+= fabs(Result_OMP[i][c]-Result_ACC[i][c]);
         }
      }

      printf ("The average difference is %0.15f and it took %G seconds.\n",Diff/(Dim*Dim), tstop-tstart);
      return 0;
}

% pgcc -acc -Minfo=accel test.c -V15.7 ; a.out
main:
     35, Generating copyout(Result_ACC[:Dim][:Dim])
         Generating copyin(Dim,sum,Matrix1[:Dim][:Dim],Matrix2[:Dim][:Dim])
     36, Loop is parallelizable
     37, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         36, #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
         37, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
     39, Loop is parallelizable
Result_ACC[0][0]= 0
Result_ACC[0][1]= 0
Result_ACC[0][2]= 0
Result_ACC[0][3]= 0
Result_ACC[0][4]= 0
Result_ACC[1][0]= 0
Result_ACC[1][1]= 0
Result_ACC[1][2]= 0
Result_ACC[1][3]= 0
Result_ACC[1][4]= 0
Result_ACC[2][0]= 0
Result_ACC[2][1]= 0
Result_ACC[2][2]= 0
Result_ACC[2][3]= 0
Result_ACC[2][4]= 0
Result_ACC[3][0]= 0
Result_ACC[3][1]= 0
Result_ACC[3][2]= 0
Result_ACC[3][3]= 0
Result_ACC[3][4]= 0
Result_ACC[4][0]= 0
Result_ACC[4][1]= 0
Result_ACC[4][2]= 0
Result_ACC[4][3]= 0
Result_ACC[4][4]= 0
Going to GPU.test

Back from GPU.

Result_ACC[0][0]= 31.7647
Result_ACC[0][1]= 25.3098
Result_ACC[0][2]= 26.9882
Result_ACC[0][3]= 22.6235
Result_ACC[0][4]= 16.4392
Result_ACC[1][0]= 22.1255
Result_ACC[1][1]= 21.7059
Result_ACC[1][2]= 20.2118
Result_ACC[1][3]= 18.9843
Result_ACC[1][4]= 8.85098
Result_ACC[2][0]= 14.1412
Result_ACC[2][1]= 10.4118
Result_ACC[2][2]= 13.5686
Result_ACC[2][3]= 9.68628
Result_ACC[2][4]= 6.45882
Result_ACC[3][0]= 20.7412
Result_ACC[3][1]= 18.1843
Result_ACC[3][2]= 19.902
Result_ACC[3][3]= 15.6431
Result_ACC[3][4]= 7.77255
Result_ACC[4][0]= 23.8588
Result_ACC[4][1]= 20.6627
Result_ACC[4][2]= 23.0275
Result_ACC[4][3]= 18.0549
Result_ACC[4][4]= 8.82745
Result_OMP[0][0]= 31.7647
Result_OMP[0][1]= 25.3098
Result_OMP[0][2]= 26.9882
Result_OMP[0][3]= 22.6235
Result_OMP[0][4]= 16.4392
Result_OMP[1][0]= 22.1255
Result_OMP[1][1]= 21.7059
Result_OMP[1][2]= 20.2118
Result_OMP[1][3]= 18.9843
Result_OMP[1][4]= 8.85098
Result_OMP[2][0]= 14.1412
Result_OMP[2][1]= 10.4118
Result_OMP[2][2]= 13.5686
Result_OMP[2][3]= 9.68628
Result_OMP[2][4]= 6.45882
Result_OMP[3][0]= 20.7412
Result_OMP[3][1]= 18.1843
Result_OMP[3][2]= 19.902
Result_OMP[3][3]= 15.6431
Result_OMP[3][4]= 7.77255
Result_OMP[4][0]= 23.8588
Result_OMP[4][1]= 20.6627
Result_OMP[4][2]= 23.0275
Result_OMP[4][3]= 18.0549
Result_OMP[4][4]= 8.82745
The average difference is 0.000000457763662 and it took 0.543943 seconds.

Thanks Mat. That was a great help.

One more question: if I launch the same kernel from different cores at the same time, do the kernels run on the GPU in parallel? Consider the code below (the same code; I have just added OpenMP pragmas to run it on multiple cores). I have set the grid size to 1 and the thread-block size to 256*4 to make sure that each kernel runs on a single SMX (on a K20c NVIDIA GPU).

#include <math.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
int main(int argc, char *argv[])
{



      int ii;



      if(argc != 3) {
         fprintf(stderr,"Use: %s size nIter\n",argv[0]);
         return -1;
      }

      int Dim=atoi(argv[1]);
      int itr=atoi(argv[2]);

      if(itr <= 0) {
         fprintf(stderr,"%s: Invalid nIter (%d)\n",argv[0],itr);
         return -1;
      }

      omp_set_num_threads(itr);

      #pragma omp parallel for
      for (ii=0; ii<itr; ii++){

      //int Dim=400;
      int i=0,j=0,k=0,c=0,d=0;
      float Matrix1[Dim][Dim], Matrix2[Dim][Dim],Result_ACC[Dim][Dim],Result_OMP[Dim][Dim],Diff=0,sum=0;
      double tstart, tstop;
      for (i=0;i<Dim;i++){
         for(j=0;j<Dim;j++){
            Result_ACC[i][j]=0;
            Result_OMP[i][j]=0;
            Matrix1[i][j]=((float)(rand()%50)/15);
            Matrix2[i][j]=((float)(rand()%70)/17);
          }
      }

      tstart = omp_get_wtime();
      //#pragma acc data create(Result_ACC[0:Dim][0:Dim]), copyin(Dim,sum,Matrix1[0:Dim][0:Dim], Matrix2[0:Dim][0:Dim])
      #pragma acc kernels loop gang(1), vector(256), worker(32) pcopyout(Result_ACC[0:Dim][0:Dim]) copyin(Dim,sum,Matrix1[0:Dim][0:Dim], Matrix2[0:Dim][0:Dim])
      for (c = 0; c < Dim; c++) {
         #pragma acc loop gang(1), vector
         for (d = 0; d < Dim; d++) {
            sum = 0;
            for (k = 0; k < Dim; k++) {
               sum = sum + Matrix1[c][k]*Matrix2[k][d];
            }
            Result_ACC[c][d] = sum;
         }
      }

      tstop = omp_get_wtime();
      printf ("Thread %d: %G seconds on GPU.\n",omp_get_thread_num(), tstop-tstart);

      }
      return 0;
}

And the output for ./a.out 1000 10 is as follows:

Thread 4: 12.9037 seconds on GPU.
Thread 7: 12.9815 seconds on GPU.
Thread 9: 32.032 seconds on GPU.
Thread 3: 31.9044 seconds on GPU.
Thread 8: 31.8053 seconds on GPU.
Thread 6: 31.8045 seconds on GPU.
Thread 5: 31.8285 seconds on GPU.
Thread 0: 31.8414 seconds on GPU.
Thread 1: 31.9472 seconds on GPU.
Thread 2: 31.8153 seconds on GPU.

I assume that threads with the same run time (4 and 7, for example) are running in parallel. Is this a correct assumption? For a smaller matrix (100*100), even with 24 cores, the run times of the kernels are almost the same.
When I check "nvidia-smi", it shows only one ./a.out running on the GPU. When launching multiple kernels from different cores, should I expect "nvidia-smi" to show multiple ./a.out processes?


Thanks,
Ali

"If I launch the same kernel from different cores at the same time, do they run on GPU in parallel?"

Compute regions will execute on the default GPU unless you call the routine "acc_set_device_num" or set the environment variable ACC_DEVICE_NUM. For OpenMP, what I'd advise is to get the number of available devices by calling "acc_get_num_devices", create an omp parallel region, and then assign each thread to a device. If you have more threads than devices, round-robin the assignment.
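
A minimal, self-contained sketch of that pattern (the program structure and the names ndev and tid are only illustrative):

#include <openacc.h>
#include <omp.h>
#include <stdio.h>

int main(void)
{
   /* How many NVIDIA devices does the OpenACC runtime see? */
   int ndev = acc_get_num_devices(acc_device_nvidia);
   if (ndev < 1) {
      fprintf(stderr, "No NVIDIA devices found\n");
      return 1;
   }

   #pragma omp parallel
   {
      int tid = omp_get_thread_num();
      /* Round-robin the assignment: thread 0 -> device 0, thread 1 -> device 1, ... */
      acc_set_device_num(tid % ndev, acc_device_nvidia);

      /* Any compute regions launched by this thread now target the device chosen above. */
   }
   return 0;
}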

They may or may not run in parallel on the same device. This will depend upon the type of device you have and how many resources each kernel takes.

On newer Tesla cards like the K20c, multiple kernels can run on the same card at the same time. However, the card can also be configured in exclusive mode, which allows only one host process at a time (not an issue for OpenMP, but it can cause problems with MPI). When kernels do run at the same time, the first kernel takes as many multiprocessors (SMs) as it needs; only as the first kernel releases SMs can the second kernel start using them.

So if you have several kernels that each use only a few SMs, they will probably run in parallel. More likely, though, they will overlap but not run completely in parallel.
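
Overlap is only possible if the host side doesn't block on each launch. As a sketch, two independent kernels can be put on separate async queues and waited on together (the function name scale_two, the arrays, and the queue numbers are only illustrative):

void scale_two(int n, float *restrict a, float *restrict b)
{
   int i;
   /* Each kernel goes on its own async queue, so the host does not wait
      for the first launch to finish before issuing the second. */
   #pragma acc parallel loop async(1) copy(a[0:n])
   for (i = 0; i < n; i++) a[i] *= 2.0f;

   #pragma acc parallel loop async(2) copy(b[0:n])
   for (i = 0; i < n; i++) b[i] *= 3.0f;

   #pragma acc wait   /* block here until both queues have finished */
}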

Mat

Thanks for the reply Mat.

I have two GPUs in the system, but I would like to use only the K20c, which is device number zero. I have already set ACC_DEVICE_NUM to zero, so all the kernels launched from the OpenMP threads target the K20c.

To prevent a kernel from utilizing all of the GPU's SMs, I have set the number of gangs (thread blocks) to one:

      #pragma acc data pcopyin(Dim,Matrix1[0:Dim][0:Dim], Matrix2[0:Dim][0:Dim])
      tstart = 0;
      #pragma omp barrier
      tstart = omp_get_wtime();

      #pragma acc kernels loop gang(1), vector(512) create(Result_ACC[0:Dim][0:Dim],sum)
      for (c = 0; c < Dim; c++) {
         #pragma acc loop gang(1), vector
         for (d = 0; d < Dim; d++) {
            sum = 0;
            for (k = 0; k < Dim; k++) {
               sum = sum + Matrix1[c][k]*Matrix2[k][d];
            }
            Result_ACC[c][d] = sum;
         }
      }

This generates a kernel with num_gangs=1, num_workers=512, vector_length=1, grid=1, block=1x512, so it should run on one SM only (please correct me if I am wrong).
In the code, I first transfer all the data to the GPU, then synchronize all the threads, start my timer, and launch the kernels. When the GPU part is done I stop the timer, so I am timing only the kernel execution time. When I set the number of OpenMP threads to 10 and the matrix size to 100, I get the following timings for each kernel:

0.00646305
0.0123739
0.0182931
0.0241401
0.029995
0.035836
0.0416801
0.047524
0.053375
0.0592122

This suggests that the kernels are running serially, not in parallel. Since each kernel needs only one SM and the K20c has more than 10 SMs, I was expecting to see more or less the same execution time for every kernel (I know that they all share the same L2 cache and memory bandwidth).
Any suggestions?

Thanks,
Ali

OK, it looks like our runtime is blocking on the launch as well. I removed the data movement and added "async", and now I see what looks like (mostly) parallel execution on the device.

% cat test.c
#include <math.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
int main(int argc, char *argv[])
{



      int ii;



      if(argc != 3) {
         fprintf(stderr,"Use: %s size nIter\n",argv[0]);
         return -1;
      }

      int Dim=atoi(argv[1]);
      int itr=atoi(argv[2]);

      if(itr <= 0) {
         fprintf(stderr,"%s: Invalid nIter (%d)\n",argv[0],itr);
         return -1;
      }

      omp_set_num_threads(itr);

      #pragma omp parallel for
      for (ii=0; ii<itr; ii++){

      //int Dim=400;
      int i=0,j=0,k=0,c=0,d=0;
      float Matrix1[Dim][Dim], Matrix2[Dim][Dim],Result_ACC[Dim][Dim],Result_OMP[Dim][Dim],Diff=0,sum=0;
      double tstart, tstop;
      for (i=0;i<Dim;i++){
         for(j=0;j<Dim;j++){
            Result_ACC[i][j]=0;
            Result_OMP[i][j]=0;
            Matrix1[i][j]=((float)(rand()%50)/15);
            Matrix2[i][j]=((float)(rand()%70)/17);
          }
      }
      #pragma acc data create(Result_ACC[0:Dim][0:Dim]) create(Dim,sum,Matrix1[0:Dim][0:Dim], Matrix2[0:Dim][0:Dim])
      {
         for (int iii=0;iii<3;++iii) {
            tstart = omp_get_wtime();
            #pragma acc kernels loop gang(1) vector(256) private(sum) async(ii)
            for (c = 0; c < Dim; c++) {
               #pragma acc loop seq
               for (d = 0; d < Dim; d++) {
                  sum = 0;
                  for (k = 0; k < Dim; k++) {
                     sum = sum + Matrix1[c][k]*Matrix2[k][d];
                  }
                  Result_ACC[c][d] = sum;
               }
            }
            #pragma acc wait(ii)
            tstop = omp_get_wtime();
            printf ("Thread %d %d: %G seconds on GPU.\n",omp_get_thread_num(),iii, tstop-tstart);
         }
      }
      }
      return 0;
}
sbe02:/home/colgrove/tmp% pgcc -acc -mp -fast test.c -Minfo=accel
main:
     45, Generating create(Result_ACC[:Dim][:Dim],Dim,sum,Matrix1[:Dim][:Dim],Matrix2[:Dim][:Dim])
     51, Loop is parallelizable
     53, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         51, #pragma acc loop vector(256) /* threadIdx.x */
     55, Loop is parallelizable
sbe02:/home/colgrove/tmp% time a.out 1000 10
Thread 7 0: 3.42758 seconds on GPU.
Thread 3 0: 3.42793 seconds on GPU.
Thread 5 0: 3.41498 seconds on GPU.
Thread 8 0: 3.41316 seconds on GPU.
Thread 6 0: 3.40709 seconds on GPU.
Thread 4 0: 3.40272 seconds on GPU.
Thread 2 0: 3.40173 seconds on GPU.
Thread 9 0: 3.39376 seconds on GPU.
Thread 2 1: 3.27525 seconds on GPU.
Thread 7 1: 3.29679 seconds on GPU.
Thread 6 1: 3.32013 seconds on GPU.
Thread 4 1: 3.33027 seconds on GPU.
Thread 1 0: 6.73814 seconds on GPU.
Thread 0 0: 6.76195 seconds on GPU.
Thread 3 1: 3.37307 seconds on GPU.
Thread 5 1: 3.40607 seconds on GPU.
Thread 7 2: 3.38571 seconds on GPU.
Thread 8 1: 6.70588 seconds on GPU.
Thread 2 2: 3.44913 seconds on GPU.
Thread 9 1: 6.73799 seconds on GPU.
Thread 3 2: 3.38182 seconds on GPU.
Thread 5 2: 3.36001 seconds on GPU.
Thread 6 2: 3.45831 seconds on GPU.
Thread 4 2: 3.48083 seconds on GPU.
Thread 0 1: 6.76845 seconds on GPU.
Thread 1 1: 6.78861 seconds on GPU.
Thread 8 2: 6.74302 seconds on GPU.
Thread 9 2: 6.71886 seconds on GPU.
Thread 1 2: 6.63563 seconds on GPU.
Thread 0 2: 6.65436 seconds on GPU.
171.509u 66.667s 0:24.57 969.3% 0+0k 0+0io 0pf+0w

That is exactly what I was trying to achieve. Thank you so much.