CUDA never stops

Hello everyone!

First of all, my operating system is Linux 2.6.18-194.17.4.el5 #1 SMP Mon Oct 25 15:50:53 EDT 2010 x86_64 x86_64 x86_64 GNU/Linux.

I am seeing some very odd behaviour with my CUDA code: it never stops. Here it is:

/* ising.cu

   Single spin-flip Metropolis simulation of a two-dimensional
   ferromagnetic Ising model on graphics processing units (GPUs)
   using the NVIDIA CUDA framework.

   Copyright (C) 2009, 2010 Martin Weigel

   This program is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free Software
   Foundation; either version 2 of the License, or (at your option) any later
   version.

   This program is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
   details.

   You should have received a copy of the GNU General Public License along with
   this program; if not, see <http://www.gnu.org/licenses/>.

   Please check (and, if relevant, cite) the related preprint arXiv:1006.3865 at
   http://arxiv.org/abs/1006.3865
 */

#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>

#define DIM 2
#define L 1200                  // lattice edge length; must be a multiple of 12
#define BLOCKL 12
#define GRIDL (L/BLOCKL)
#define BLOCKS ((GRIDL*GRIDL)/2)
#define THREADS ((BLOCKL*BLOCKL)/3)
#define N (L*L)
#define SPINS_PER_BLOCK (N/3)
#define TOTTHREADS (BLOCKS*THREADS)
#define SWEEPS_EQUI 2000
#define SWEEPS_GLOBAL 1000
#define SWEEPS_LOCAL 1
#define SWEEPS_EMPTY 1
#define TOT_SWEEPS (SWEEPS_GLOBAL*SWEEPS_LOCAL*SWEEPS_EMPTY)
#define BETA 0.4
#define A  1664525
#define C  1013904223
#define AA A
#define CC C
#define MULT 2.328306437080797e-10f
#define MULT2 4.6566128752457969e-10f
#define sS(x,y) sS[(y+1)*(BLOCKL+2)+x+1]
#define s(x,y) s[y*L+x]
#define RAN(n) (n = AA*n + CC)  // linear congruential random number generator
//#define CPU
#define MEAS

typedef int spin_t;

texture<float,1,cudaReadModeElementType> boltzT;

__global__ void metro_sublattice(spin_t *s, int *ranvec, int *result, unsigned int boffset)
{
	unsigned int n = threadIdx.y*BLOCKL+threadIdx.x;
	unsigned int xoffset = blockIdx.x*BLOCKL;
	// boffset selects which checkerboard half of the tiles is updated
	unsigned int yoffset = (2*blockIdx.y+(blockIdx.x+boffset)%2)*BLOCKL;
	unsigned int ran = ranvec[(blockIdx.y*GRIDL+blockIdx.x)*THREADS+n];
#ifdef MEAS
	int ie = 0;
#endif
	unsigned int x = threadIdx.x + xoffset;
	// each thread updates three rows, one per phase, offset by threadIdx.x%3
	unsigned int y1 = (threadIdx.x%3)+3*threadIdx.y + yoffset;
	unsigned int y2 = ((threadIdx.x+1)%3)+3*threadIdx.y + yoffset;
	unsigned int y3 = ((threadIdx.x+2)%3)+3*threadIdx.y + yoffset;
	for(int i = 0; i < SWEEPS_LOCAL; ++i) {
		// first nearest neighbours
		int ide = s(x,y1)*(s((x-1+L)%L,y1)+s(x,(y1-1+L)%L)+s((x+1)%L,y1)+s(x,(y1+1)%L));
		// next nearest neighbours energy
		ide += s(x,y1)*(s((x-2+L)%L,y1)+s(x,(y1-2+L)%L)+s((x+2)%L,y1)+s(x,(y1+2)%L));
		if(MULT*(*(unsigned int*)(&RAN(ran))) < tex1Dfetch(boltzT, ide+2*DIM)) {
			s(x,y1) = -s(x,y1);
#ifdef MEAS
			ie -= 2*ide;
#endif
		}
		__syncthreads();
		// first nearest neighbours
		ide = s(x,y2)*(s((x-1+L)%L,y2)+s(x,(y2-1+L)%L)+s((x+1)%L,y2)+s(x,(y2+1)%L));
		// next nearest neighbours energy
		ide += s(x,y2)*(s((x-2+L)%L,y2)+s(x,(y2-2+L)%L)+s((x+2)%L,y2)+s(x,(y2+2)%L));
		if(MULT*(*(unsigned int*)(&RAN(ran))) < tex1Dfetch(boltzT, ide+2*DIM)) {
			s(x,y2) = -s(x,y2);
#ifdef MEAS
			ie -= 2*ide;
#endif
		}
		__syncthreads();
		// first nearest neighbours
		ide = s(x,y3)*(s((x-1+L)%L,y3)+s(x,(y3-1+L)%L)+s((x+1)%L,y3)+s(x,(y3+1)%L));
		// next nearest neighbours energy
		ide += s(x,y3)*(s((x-2+L)%L,y3)+s(x,(y3-2+L)%L)+s((x+2)%L,y3)+s(x,(y3+2)%L));
		if(MULT*(*(unsigned int*)(&RAN(ran))) < tex1Dfetch(boltzT, ide+2*DIM)) {
			s(x,y3) = -s(x,y3);
#ifdef MEAS
			ie -= 2*ide;
#endif
		}
		__syncthreads();
	}
	ranvec[(blockIdx.y*GRIDL+blockIdx.x)*THREADS+n] = ran;
#ifdef MEAS
	// tree reduction of the per-thread energy changes in shared memory
	// (caution: THREADS = 48 is not a power of two, so this reduction
	// drops deltaE[2] in its final step)
	__shared__ int deltaE[THREADS];
	deltaE[n] = ie;
	for(int stride = THREADS>>1; stride > 0; stride >>= 1) {
		__syncthreads();
		if(n < stride) deltaE[n] += deltaE[n+stride];
	}
	if(n == 0) result[blockIdx.y*GRIDL+blockIdx.x] += deltaE[0];
#endif
}

int cpu_energy(spin_t *s)
{
	int ie = 0;
	for(int x = 0; x < L; ++x)
		for(int y = 0; y < L; ++y) {
			// first nearest neighbours energy
			ie += s(x,y)*(s((x-1+L)%L,y)+s(x,(y-1+L)%L)+s((x+1)%L,y)+s(x,(y+1)%L));
			// next nearest neighbours energy
			ie += s(x,y)*(s((x-2+L)%L,y)+s(x,(y-2+L)%L)+s((x+2)%L,y)+s(x,(y+2)%L));
		}
	return ie/2;    // each bond was counted twice
}

void simulate()
{
	spin_t *s, *sD;
	int *ranvec, *ranvecD, *result, *resultD;
	float boltzGPU[4*DIM+1], boltzCPU[2*DIM+1];
	dim3 grid(GRIDL, GRIDL/2);
	dim3 block(BLOCKL, BLOCKL/3);
	srand48(3145627);
	s = (spin_t*)malloc(N*sizeof(spin_t));
	ranvec = (int*)malloc(TOTTHREADS*sizeof(int));
	result = (int*)malloc(BLOCKS*sizeof(int));
	// random initial spin configuration and per-thread RNG seeds
	for(int i = 0; i < N; ++i) s[i] = (drand48() < 0.5) ? 1 : -1;
	ranvec[0] = 1;
	for(int i = 1; i < TOTTHREADS; ++i) ranvec[i] = 16807*ranvec[i-1];
	for(int i = 0; i < BLOCKS; ++i) result[i] = 0;
	// tabulated Metropolis acceptance probabilities
	for(int i = -2*DIM; i <= 2*DIM; ++i) boltzGPU[i+2*DIM] = min(1.0,exp(-2*BETA*i));
	for(int i = 0; i <= 2*DIM; ++i) boltzCPU[i] = exp(-2*BETA*i);
	float *boltztext;
	CUDA_SAFE_CALL(cudaMalloc((void**)&boltztext, (4*DIM+1)*sizeof(float)));
	CUDA_SAFE_CALL(cudaMemcpy(boltztext, boltzGPU, (4*DIM+1)*sizeof(float), cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaBindTexture(NULL, boltzT, boltztext, (4*DIM+1)*sizeof(float)));
	CUDA_SAFE_CALL(cudaMalloc((void**)&sD, N*sizeof(spin_t)));
	CUDA_SAFE_CALL(cudaMemcpy(sD, s, N*sizeof(spin_t), cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaMalloc((void**)&ranvecD, TOTTHREADS*sizeof(int)));
	CUDA_SAFE_CALL(cudaMemcpy(ranvecD, ranvec, TOTTHREADS*sizeof(int), cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaMalloc((void**)&resultD, BLOCKS*sizeof(int)));
	CUDA_SAFE_CALL(cudaMemcpy(resultD, result, BLOCKS*sizeof(int), cudaMemcpyHostToDevice));
	int ie = cpu_energy(s);
	fprintf(stderr,"equilibration\n");
	for(int i = 0; i < SWEEPS_EQUI; ++i) {
		printf("Step %i of %i... ", i+1, SWEEPS_EQUI);
		for(int j = 0; j < SWEEPS_EMPTY; ++j) {
			metro_sublattice<<<grid, block>>>(sD, ranvecD, resultD, 0);
			metro_sublattice<<<grid, block>>>(sD, ranvecD, resultD, 1);
		}
		printf("done.\n");
	}
#ifdef MEAS
	//FILE *fp = fopen("gpu_en.dat","w");
#endif
	printf("GPU simulation\n");
	cudaThreadSynchronize();
	unsigned int timer;
	cutCreateTimer(&timer);
	cutStartTimer(timer);
	for(int i = 0; i < SWEEPS_GLOBAL; ++i) {
		for(int j = 0; j < SWEEPS_EMPTY; ++j) {
			metro_sublattice<<<grid, block>>>(sD, ranvecD, resultD, 0);
			metro_sublattice<<<grid, block>>>(sD, ranvecD, resultD, 1);
		}
#ifdef MEAS
		int sum = ie;
		cudaMemcpy(result, resultD, BLOCKS*sizeof(int), cudaMemcpyDeviceToHost);
		for(int j = 0; j < BLOCKS; ++j) sum += result[j];
		//fwrite(&sum, sizeof(int), 1, fp);
		printf("GPU Energy : %i\n",sum);
#endif
	}
#ifdef MEAS
	//fclose(fp);
#endif
	CUT_CHECK_ERROR("Kernel execution failed");
	cudaThreadSynchronize();
	cutStopTimer(timer);
	printf("GPU time: %f ms, %f ns per spin flip\n",cutGetTimerValue(timer),1000000/((float)N)*cutGetTimerValue(timer)/TOT_SWEEPS);
	CUDA_SAFE_CALL(cudaMemcpy(s, sD, N*sizeof(spin_t), cudaMemcpyDeviceToHost));
	CUDA_SAFE_CALL(cudaMemcpy(ranvec, ranvecD, TOTTHREADS*sizeof(int), cudaMemcpyDeviceToHost));
	cudaFree(sD);
	cudaFree(ranvecD);
	cudaFree(resultD);
#ifdef CPU
	int ran = 16807;
#ifdef MEAS
	ie = cpu_energy(s);
	//fp = fopen("cpu_en.dat","w");
#endif
	printf("CPU simulation\n");
	cutCreateTimer(&timer);
	cutStartTimer(timer);
	for(int i = 0; i < SWEEPS_GLOBAL; ++i) {
		for(int j = 0; j < SWEEPS_EMPTY*SWEEPS_LOCAL; ++j) {
			for(int y = 0; y < L; ++y) {
				for(int x = 0; x < L; ++x) {
					// first nearest neighbours energy
					int ide = s(x,y)*(s((x-1+L)%L,y)+s(x,(y-1+L)%L)+s((x+1)%L,y)+s(x,(y+1)%L));
					// next nearest neighbours energy
					ide += s(x,y)*(s((x-2+L)%L,y)+s(x,(y-2+L)%L)+s((x+2)%L,y)+s(x,(y+2)%L));
					if(ide <= 0 || fabs(RAN(ran)*MULT2) < boltzCPU[ide]) {
						s[L*y+x] = -s[L*y+x];
#ifdef MEAS
						ie -= 2*ide;
#endif
					}
				}
			}
		}
#ifdef MEAS
		//fwrite(&ie, sizeof(int), 1, fp);
		printf("CPU Energy : %i\n",ie);
#endif
	}
	cutStopTimer(timer);
#ifdef MEAS
	//fclose(fp);
#endif
	printf("CPU time: %f ms, %f ns per spin flip\n",cutGetTimerValue(timer),1000000/((float)N)*cutGetTimerValue(timer)/TOT_SWEEPS);
#endif
	free(s);
	free(ranvec);
	free(result);
}

int main(int argc, char *argv[])
{
	CUT_DEVICE_INIT(argc,argv);
	simulate();
}

Or, if you prefer: on codepad.

My code never stops even if I disable all the loops (by setting the SWEEPS_* constants to 1), and I really don't know where the problem is. The code should be fairly simple to understand: it is a 2D spin model with second-nearest-neighbour interactions. I have divided my grid as in http://arxiv.org/pdf/1101.1427v2 (page 7), except that I have three different kinds of threads instead of the two shown in the picture, i.e. each block is divided into three sublattices, as the sketch below illustrates.
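To make the decomposition concrete, here is a small standalone sketch (not part of the simulation, purely an illustration) that prints which sites of a single 12x12 tile are updated in each of the three phases, using the same row formula as y1/y2/y3 in the kernel:

#include <stdio.h>

#define BLOCKL 12

/* Illustration only: mark the sites of one BLOCKL x BLOCKL tile that
   each of the three update phases touches. The row formula matches
   y1/y2/y3 in metro_sublattice: y = (tx + phase) % 3 + 3 * ty. */
int main(void)
{
	for (int phase = 0; phase < 3; ++phase) {
		char tile[BLOCKL][BLOCKL + 1];
		for (int y = 0; y < BLOCKL; ++y) {
			for (int x = 0; x < BLOCKL; ++x) tile[y][x] = '.';
			tile[y][BLOCKL] = '\0';
		}
		for (int ty = 0; ty < BLOCKL / 3; ++ty)
			for (int tx = 0; tx < BLOCKL; ++tx)
				tile[(tx + phase) % 3 + 3 * ty][tx] = 'X';
		printf("phase %d:\n", phase);
		for (int y = 0; y < BLOCKL; ++y) printf("  %s\n", tile[y]);
	}
	return 0;
}

Each site is touched in exactly one phase, and within a phase no two sites updated by the same block share a nearest-neighbour or axis-aligned next-nearest-neighbour bond.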

Thanks a lot for your help!

I use CUDA 4.0.

First, a disclaimer… I last wrote C over 10 years ago, so my knowledge of C syntax may be out of date, especially if this is really C++ code, but here goes…

s is a pointer to an int.

Lots of your code does this type of thing:

	ide += s(x,y)*(s(x,y-2) + s(x,1));

which looks like you are calling a function called s.

So, are you inadvertently jumping to some memory address?
You’d think the compiler would warn you.

Yes, I have a #define for this: #define s(x,y) s[(y)*(L)+x]. It's easier to read that way.
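To be safe, though, macro arguments usually want parentheses. Here is a minimal standalone illustration (with renamed macros, purely for demonstration) of why the posted form happens to work but is fragile:

#include <stdio.h>

#define L 1200
#define IDX_UNSAFE(x,y) (y*L+x)     /* index arithmetic of the posted macro */
#define IDX_SAFE(x,y)   ((y)*L+(x)) /* both arguments parenthesised */

int main(void)
{
	int x = 3, y = 5;
	/* An already-parenthesised argument works in both forms, because
	   % and * have equal precedence and associate left to right: */
	printf("%d %d\n", IDX_UNSAFE(x,(y+1)%L), IDX_SAFE(x,(y+1)%L)); /* 7203 7203 */
	/* A bare expression expands to y+1*L+x in the unsafe form and
	   silently yields the wrong index: */
	printf("%d %d\n", IDX_UNSAFE(x,y+1), IDX_SAFE(x,y+1));         /* 1208 7203 */
	return 0;
}

In the posted kernel every macro argument is either a plain variable or already wrapped in parentheses, so the expansion happens to be correct there.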

I have tested my code several times, and it actually hangs most of the time around this part:

	for(int i = 0; i < SWEEPS_EQUI; ++i) {
		for(int j = 0; j < SWEEPS_EMPTY; ++j) {
			metro_sublattice<<<grid, block>>>(sD, ranvecD, resultD, 0);
			metro_sublattice<<<grid, block>>>(sD, ranvecD, resultD, 1);
		}
	}

I was wrong to say that my code never stops; it is actually just very slow. The program uses a huge amount of memory after a few steps in the loop, and I don't know where this comes from.

I see in the CUDA C Programming Guide that kernel launches are asynchronous, so your loop is launching all of your kernels very quickly. Maybe a wait after each pair, using cudaDeviceSynchronize(), is needed.
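Something like this (a rough sketch of the suggestion, reusing the equilibration loop from the post above):

	for(int i = 0; i < SWEEPS_EQUI; ++i) {
		for(int j = 0; j < SWEEPS_EMPTY; ++j) {
			metro_sublattice<<<grid, block>>>(sD, ranvecD, resultD, 0);
			metro_sublattice<<<grid, block>>>(sD, ranvecD, resultD, 1);
		}
		// Block the host until both launches have completed, so the
		// driver's launch queue cannot grow without bound.
		cudaDeviceSynchronize();
	}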

Sadly, it does not help!

After exactly 512 steps the program begins to be very slow!
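That magic number makes me suspect the driver's asynchronous launch queue: launches cost almost nothing until the queue fills at some fixed depth, after which every new launch blocks until a queued kernel finishes, so the first ~512 "fast" steps may only be measuring launch overhead. A small timing sketch (hypothetical; it uses CUDA events, which synchronise at the measurement point, and drops into simulate() in place of the equilibration loop) to check whether each kernel pair really takes this long from the start:

	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	for(int i = 0; i < SWEEPS_EQUI; ++i) {
		cudaEventRecord(start, 0);
		metro_sublattice<<<grid, block>>>(sD, ranvecD, resultD, 0);
		metro_sublattice<<<grid, block>>>(sD, ranvecD, resultD, 1);
		cudaEventRecord(stop, 0);
		cudaEventSynchronize(stop);        // wait until this pair has finished
		float ms;
		cudaEventElapsedTime(&ms, start, stop);
		printf("step %i: %f ms\n", i, ms); // should be roughly constant
	}
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

If every step reports a similar time, the kernels themselves are simply slow for this lattice size, and the early steps were only hiding that behind the asynchronous queue.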