Thread ID as loop condition: using the thread ID as an upper bound within a for loop causes a crash

I actually posted this in the General Forum but I think that may have been the wrong one; it is probably better suited here.

Hi Guys,

I have been working on a matrix-vector multiplication routine. Due to the properties of the matrix, I can split it into its diagonal and a single off-diagonal value per column (all entries in a given column are the same apart from the diagonal). These are stored as 1-D arrays in constant memory. I then multiply them by a vector stored in shared memory to obtain the full matrix-vector product. I initially wrote the following code to solve this.

[codebox]static __device__ void MatrixVecMult(float *z, float *x)
{
    int tid = threadIdx.x;
    int i;

    // diagonal contribution
    x[tid] = L_d[tid] * z[tid];

    // accumulate the off-diagonal terms below the diagonal
    for (i = 0; i < tid; i++)
        x[tid] += L[i] * z[i];
}[/codebox]

Here z and x are declared in shared memory and are of size blockDim.x, while L and L_d hold the off-diagonal values and the diagonal respectively and are sized the same way. This code does not work for a thread count over 32, which suggests to me something to do with warps, but I am not sure. The following code performs exactly the same operations, yet it works correctly and is stable.

[codebox]static __device__ void MatrixVecMult(float *z, float *x)
{
    int tid = threadIdx.x;
    int i;

    // diagonal contribution
    x[tid] = L_d[tid] * z[tid];

    // same accumulation, but with a fixed loop bound and an early exit
    for (i = 0; i < blockDim.x; i++) {
        if (i == tid)
            break;
        x[tid] += L[i] * z[i];
    }
}[/codebox]

As you can see, the algorithms are the same. Is there a technical issue I am missing here? Something to do with warps and memory access? Is it a bad idea to have the thread ID as a loop condition? Any help on this would be much appreciated, as I am quite at a loss as to why one works and the other does not.
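In case it helps, below is a stripped-down sketch of how the arrays are declared and how the routine is called. This is not my exact code; the block size, the kernel name, and the calling kernel are simplified for illustration.

[codebox]#define BLOCKDIM 128   // illustrative block size, not the real value

// off-diagonal values and diagonal entries, one per column, in constant memory
static __device__ __constant__ float L[BLOCKDIM];
static __device__ __constant__ float L_d[BLOCKDIM];

static __global__ void solveStep(float *z_glob)   // simplified calling kernel
{
    __shared__ float z[BLOCKDIM];   // input vector
    __shared__ float x[BLOCKDIM];   // result vector

    int tid = threadIdx.x;
    z[tid] = z_glob[tid];           // stage the input in shared memory
    __syncthreads();

    MatrixVecMult(z, x);            // the routine in question, defined above
}[/codebox]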

Further info: running on Mac OS X with a GeForce GT 120 (compute capability 1.1), fully updated CUDA drivers, toolkit etc.

Thanks in advance

What exactly does “does not work” mean?

The card freezes and the error checking reports an unspecified launch failure. Also, the function works fine as a standalone function, i.e. when I was initially writing it I tested it on its own to check the results and it gave the expected output for a given input. As part of a larger program it becomes incredibly unstable: I may run the code once, then running it again immediately afterwards will cause the card to freeze and/or another unspecified launch failure.

When I say the other code “works” I mean the result is correct and the card functions correctly. I am just worried there is some underlying issue I am not aware of. The other functions perform correctly and are stable.

That usually means an out-of-bounds memory access in your kernel code, the equivalent of a segmentation violation in host code. Try cuda-memcheck on it and see what it reports.
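For example, assuming the application builds to a binary called matvec and the kernel launch looks as shown (both are placeholders, not your actual names), you can run the whole program under cuda-memcheck and add an explicit check right after the launch:

[codebox]// from the command line (placeholder binary name):
//     cuda-memcheck ./matvec

// in host code, right after the kernel launch (placeholder kernel and launch config):
myKernel<<<1, BLOCKDIM>>>(d_z);
cudaError_t err = cudaGetLastError();      // catches launch configuration errors
if (err == cudaSuccess)
    err = cudaThreadSynchronize();         // catches errors raised while the kernel runs
if (err != cudaSuccess)
    printf("kernel failed: %s\n", cudaGetErrorString(err));[/codebox]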

I don’t see anything wrong with the first code.

I tried it on Win7 with a GTX 275, compiled for sm_11 and sm_13, and got no error. I admit I didn’t check the outcomes or produce a serious program, just one block, however…

[codebox]#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cutil_inline.h>

#define BLOCKDIM 128
#define REPETITIONS 100

static __device__ __constant__ float L[BLOCKDIM], L_d[BLOCKDIM];

static __device__ void MatrixVecMult(float *z, float *x)
{
    int tid = threadIdx.x;
    int i;

    x[tid] = L_d[tid] * z[tid];

    for (i = 0; i < tid; i++)
        x[tid] += L[i] * z[i];
}

static __global__ void matvecmul(float *z)
{
    __shared__ float x1[BLOCKDIM], z1[BLOCKDIM];
    int tid = threadIdx.x;

    z1[tid] = z[tid];
    __syncthreads();

    MatrixVecMult(z1, x1);   // z1 is the input vector, x1 receives the result
}

int main()
{
    float *d_z, *h_z, *h_L, *h_L_d;

    for (int k = 0; k < REPETITIONS; k++)
    {
        h_z   = (float*)malloc(BLOCKDIM * sizeof(*h_z));
        h_L   = (float*)malloc(BLOCKDIM * sizeof(*h_L));
        h_L_d = (float*)malloc(BLOCKDIM * sizeof(*h_L_d));
        cudaMalloc(&d_z, BLOCKDIM * sizeof(*d_z));

        for (int n = 0; n < BLOCKDIM; n++)
        {
            h_z[n]   = (float)rand() * 10.f / RAND_MAX;
            h_L[n]   = (float)rand() * 10.f / RAND_MAX;
            h_L_d[n] = (float)rand() * 10.f / RAND_MAX;
        }

        cutilSafeCall(cudaMemcpyToSymbol("L", h_L, sizeof(L)));
        cutilSafeCall(cudaMemcpyToSymbol("L_d", h_L_d, sizeof(L_d)));
        cudaMemcpy(d_z, h_z, BLOCKDIM * sizeof(*d_z), cudaMemcpyHostToDevice);

        matvecmul<<<1, BLOCKDIM>>>(d_z);
        cudaThreadSynchronize();

        // copy back (the result only lives in shared memory in this quick test,
        // so this just retrieves the unchanged input)
        cudaMemcpy(h_z, d_z, BLOCKDIM * sizeof(*d_z), cudaMemcpyDeviceToHost);

        cudaFree(d_z);
        free(h_z); free(h_L); free(h_L_d);
    }

    return 0;
}[/codebox]

It must be that I have some underlying memory issue that I’m not taking care of, overstepping bounds or something, though I can’t see where or why it’s happening. Nothing jumps out offhand, as everything seems to be working as normal. I will look into it in more depth. Probably just inexperience.
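One thing I will try, along the lines of the sketch below (illustrative only; the explicit length parameter n is something my real code would have to pass in), is guarding against threads that fall outside the arrays, in case the block is ever launched larger than they are.

[codebox]static __device__ void MatrixVecMult(float *z, float *x, int n)
{
    int tid = threadIdx.x;

    if (tid >= n)                     // block launched larger than the arrays:
        return;                       // do nothing rather than access out of bounds

    x[tid] = L_d[tid] * z[tid];       // diagonal contribution

    for (int i = 0; i < tid; i++)     // off-diagonal terms below the diagonal
        x[tid] += L[i] * z[i];
}[/codebox]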

Thanks Guys.
