Hello everyone !
First of all, my operating system is Linux 2.6.18-194.17.4.el5 #1 SMP Mon Oct 25 15:50:53 EDT 2010 x86_64 x86_64 x86_64 GNU/Linux.
I have some very odd behaviour with my cuda code; it never stops. Here it is :
/* ising.cu
Single spin-flip Metropolis simulation of a two-dimensional
ferromagnetic Ising model on graphics processing units (GPUs)
using the NVIDIA CUDA framework.
Copyright (C) 2009, 2010 Martin Weigel
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.
You should have received a copy of the GNU General Public License along with
this program; if not, see <http://www.gnu.org/licenses/>.
Please check (and, if relevant, cite) the related preprint arXiv:1006.3865 at
http://arxiv.org/abs/1006.3865
*/
#include <stdio.h>
#include <stdlib.h>
#include <cutil.h>
#define DIM 2
#define L 1200 // multiple de 12
#define BLOCKL 12
#define GRIDL (L/BLOCKL)
#define BLOCKS ((GRIDL*GRIDL)/2)
#define THREADS ((BLOCKL*BLOCKL)/3)
#define N (L*L)
#define SPINS_PER_BLOCK (N/3)
#define TOTTHREADS (BLOCKS*THREADS)
#define SWEEPS_EQUI 2000
#define SWEEPS_GLOBAL 1000
#define SWEEPS_LOCAL 1
#define SWEEPS_EMPTY 1
#define TOT_SWEEPS (SWEEPS_GLOBAL*SWEEPS_LOCAL*SWEEPS_EMPTY)
#define BETA 0.4
#define A 1664525
#define C 1013904223
#define AA A
#define CC C
#define MULT 2.328306437080797e-10f
#define MULT2 4.6566128752457969e-10f
#define sS(x,y) sS[(y+1)*(BLOCKL+2)+x+1]
#define s(x,y) s[y*L+x]
#define RAN(n) (n = AA*n + CC)
//#define CPU
#define MEAS
typedef int spin_t;
texture<float,1,cudaReadModeElementType> boltzT;
__global__ void metro_sublattice(spin_t *s, int *ranvec, int *result, unsigned int boffset)
{
unsigned int n = threadIdx.y*BLOCKL+threadIdx.x;
unsigned int xoffset = blockIdx.x*BLOCKL;
unsigned int yoffset = (2*blockIdx.y+(blockIdx.x+boffset)%2)*BLOCKL;
unsigned int ran = ranvec[(blockIdx.y*GRIDL+blockIdx.x)*THREADS+n];
#ifdef MEAS
int ie = 0;
#endif
unsigned int x = threadIdx.x + xoffset;
unsigned int y1 = (threadIdx.x%3)+3*threadIdx.y + yoffset;
unsigned int y2 = ((threadIdx.x+1)%3)+3*threadIdx.y + yoffset;
unsigned int y3 = ((threadIdx.x+2)%3)+3*threadIdx.y + yoffset;
for(int i = 0; i < SWEEPS_LOCAL; ++i) {
// first nearest neighbours
int ide = s(x,y1)*(s((x-1+L)%L,y1)+s(x,(y1-1+L)%L)+s((x+1)%L,y1)+s(x,(y1+1)%L));
// next nearest neighbours energy
ide += s(x,y1)*(s((x-2+L)%L,y1)+s(x,(y1-2+L)%L)+s((x+2)%L,y1)+s(x,(y1+2)%L));
if(MULT*(*(unsigned int*)(&RAN(ran))) < tex1Dfetch(boltzT, ide+2*DIM)) {
s(x,y1) = -s(x,y1);
#ifdef MEAS
ie -= 2*ide;
#endif
}
__syncthreads();
// first nearest neighbours
ide = s(x,y2)*(s((x-1+L)%L,y2)+s(x,(y2-1+L)%L)+s((x+1)%L,y2)+s(x,(y2+1)%L));
// next nearest neighbours energy
ide += s(x,y2)*(s((x-2+L)%L,y2)+s(x,(y2-2+L)%L)+s((x+2)%L,y2)+s(x,(y2+2)%L));
if(MULT*(*(unsigned int*)(&RAN(ran))) < tex1Dfetch(boltzT, ide+2*DIM)) {
s(x,y2) = -s(x,y2);
#ifdef MEAS
ie -= 2*ide;
#endif
}
__syncthreads();
// first nearest neighbours
ide = s(x,y3)*(s((x-1+L)%L,y3)+s(x,(y3-1+L)%L)+s((x+1)%L,y3)+s(x,(y3+1)%L));
// next nearest neighbours energy
ide += s(x,y3)*(s((x-2+L)%L,y3)+s(x,(y3-2+L)%L)+s((x+2)%L,y3)+s(x,(y3+2)%L));
if(MULT*(*(unsigned int*)(&RAN(ran))) < tex1Dfetch(boltzT, ide+2*DIM)) {
s(x,y3) = -s(x,y3);
#ifdef MEAS
ie -= 2*ide;
#endif
}
__syncthreads();
}
ranvec[(blockIdx.y*GRIDL+blockIdx.x)*THREADS+n] = ran;
#ifdef MEAS
__shared__ int deltaE[THREADS];
deltaE[n] = ie;
for(int stride = THREADS>>1; stride > 0; stride >>= 1) {
__syncthreads();
if(n < stride) deltaE[n] += deltaE[n+stride];
}
if(n == 0) result[blockIdx.y*GRIDL+blockIdx.x] += deltaE[0];
#endif
}
int cpu_energy(spin_t *s)
{
int ie = 0;
for(int x = 0; x < L; ++x)
for(int y = 0; y < L; ++y) {
// first nearest neighbours energy
ie += s(x,y)*(s((x-1+L)%L,y)+s(x,(y-1+L)%L)+s((x+1)%L,y)+s(x,(y+1)%L));
// next nearest neighbours energy
ie += s(x,y)*(s((x-2+L)%L,y)+s(x,(y-2+L)%L)+s((x+2)%L,y)+s(x,(y+2)%L));
}
return ie/2;
}
void simulate()
{
spin_t *s, *sD;
int *ranvec, *ranvecD, *result, *resultD;
float boltzGPU[4*DIM+1], boltzCPU[2*DIM+1];
dim3 grid(GRIDL, GRIDL/2);
dim3 block(BLOCKL, BLOCKL/3);
srand48(3145627);
s = (spin_t*)malloc(N*sizeof(spin_t));
ranvec = (int*)malloc(TOTTHREADS*sizeof(int));
result = (int*)malloc(BLOCKS*sizeof(int));
for(int i = 0; i < N; ++i) s[i] = (drand48() < 0.5) ? 1 : -1;
ranvec[0] = 1;
for(int i = 1; i < TOTTHREADS; ++i) ranvec[i] = 16807*ranvec[i-1];
for(int i = 0; i < BLOCKS; ++i) result[i] = 0;
for(int i = -2*DIM; i <= 2*DIM; ++i) boltzGPU[i+2*DIM] = min(1.0,exp(-2*BETA*i));
for(int i = 0; i <= 2*DIM; ++i) boltzCPU[i] = exp(-2*BETA*i);
float *boltztext;
CUDA_SAFE_CALL(cudaMalloc((void**)&boltztext, (4*DIM+1)*sizeof(float)));
CUDA_SAFE_CALL(cudaMemcpy(boltztext, boltzGPU, (4*DIM+1)*sizeof(float), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaBindTexture(NULL, boltzT, boltztext, (4*DIM+1)*sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void**)&sD, N*sizeof(spin_t)));
CUDA_SAFE_CALL(cudaMemcpy(sD, s, N*sizeof(spin_t), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMalloc((void**)&ranvecD, TOTTHREADS*sizeof(int)));
CUDA_SAFE_CALL(cudaMemcpy(ranvecD, ranvec, TOTTHREADS*sizeof(int), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMalloc((void**)&resultD, BLOCKS*sizeof(int)));
CUDA_SAFE_CALL(cudaMemcpy(resultD, result, BLOCKS*sizeof(int), cudaMemcpyHostToDevice));
int ie = cpu_energy(s);
fprintf(stderr,"equilibration\n");
for(int i = 0; i < SWEEPS_EQUI; ++i) {
printf("Step %i of %i... ", i+1, SWEEPS_EQUI);
for(int j = 0; j < SWEEPS_EMPTY; ++j) {
metro_sublattice<<<grid, block>>>(sD, ranvecD, resultD, 0);
metro_sublattice<<<grid, block>>>(sD, ranvecD, resultD, 1);
}
printf("done.\n");
}
#ifdef MEAS
//FILE *fp = fopen("gpu_en.dat","w");
#endif
printf("GPU simulation\n");
cudaThreadSynchronize();
unsigned int timer;
cutCreateTimer(&timer);
cutStartTimer(timer);
for(int i = 0; i < SWEEPS_GLOBAL; ++i) {
for(int j = 0; j < SWEEPS_EMPTY; ++j) {
metro_sublattice<<<grid, block>>>(sD, ranvecD, resultD, 0);
metro_sublattice<<<grid, block>>>(sD, ranvecD, resultD, 1);
}
#ifdef MEAS
int sum = ie;
cudaMemcpy(result, resultD, BLOCKS*sizeof(int), cudaMemcpyDeviceToHost);
for(int j = 0; j < BLOCKS; ++j) sum += result[j];
//fwrite(&sum, sizeof(int), 1, fp);
printf("GPU Energy : %i\n",sum);
#endif
}
#ifdef MEAS
//fclose(fp);
#endif
CUT_CHECK_ERROR("Kernel execution failed");
cudaThreadSynchronize();
cutStopTimer(timer);
printf("GPU time: %f ms, %f ns per spin flip\n",cutGetTimerValue(timer),1000000/((float)N)*cutGetTimerValue(timer)/TOT_SWEEPS);
CUDA_SAFE_CALL(cudaMemcpy(s, sD, N*sizeof(spin_t), cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(ranvec, ranvecD, TOTTHREADS*sizeof(int), cudaMemcpyDeviceToHost));
cudaFree(sD);
cudaFree(ranvecD);
cudaFree(resultD);
#ifdef CPU
int ran = 16807;
#ifdef MEAS
ie = cpu_energy(s);
//fp = fopen("cpu_en.dat","w");
#endif
printf("CPU simulation\n");
cutCreateTimer(&timer);
cutStartTimer(timer);
for(int i = 0; i < SWEEPS_GLOBAL; ++i) {
for(int j = 0; j < SWEEPS_EMPTY*SWEEPS_LOCAL; ++j) {
for(int y = 0; y < L; ++y) {
for(int x = 0; x < L; ++x) {
// first nearest neighbours energy
int ide = s(x,y)*(s((x-1+L)%L,y)+s(x,(y-1+L)%L)+s((x+1)%L,y)+s(x,(y+1)%L));
// next nearest neighbours energy
ide += s(x,y)*(s((x-2+L)%L,y)+s(x,(y-2+L)%L)+s((x+2)%L,y)+s(x,(y+2)%L));
if(ide <= 0 || fabs(RAN(ran)*MULT2) < boltzCPU[ide]) {
s[L*y+x] = -s[L*y+x];
#ifdef MEAS
ie -= 2*ide;
#endif
}
}
}
}
#ifdef MEAS
//fwrite(&ie, sizeof(int), 1, fp);
printf("CPU Energy : %i\n",ie);
#endif
}
cutStopTimer(timer);
#ifdef MEAS
//fclose(fp);
#endif
printf("CPU time: %f ms, %f ns per spin flip\n",cutGetTimerValue(timer),1000000/((float)N)*cutGetTimerValue(timer)/TOT_SWEEPS);
#endif
free(s);
free(ranvec);
free(result);
}
int main(int argc, char *argv[])
{
CUT_DEVICE_INIT(argc,argv);
simulate();
}
Or there if you prefer : on codepad
My code never stops even if I disable all the loops (by putting SWEEP_* to 1), and I really don’t know where is the problem. The code should be pretty simple to understand, it’s a 2D spin model with second nearest neighbour interactions. I have divided my grid like this : http://arxiv.org/pdf/1101.1427v2 at page 7, except that I have three different kind of threads, not two like on the picture i.e. my blocks are divided by three.
Thanks a lot for your help !
I use CUDA 4.0.