correct way to make 2-loops in a kernel code?

Since maybe I will use large matrix, I followed the way in “cuda by example” to translate a CPUMatches in to GPUMatches, however, my translation works bad. Any help? thanks

this is the output, they should be same!
cpu matched=1057707

gpu matched=258048

[code]
#define _CRT_SECURE_NO_DEPRECATE

#include
#include

void CPUMatches(int picnum, int TotalPtL, int *host_match)
{
int PointIdxL=0, j=0;
for (PointIdxL=0; PointIdxL

#define _CRT_SECURE_NO_DEPRECATE

#include <stdio.h>
#include <stdlib.h>

void CPUMatches(int picnum, int TotalPtL, int *host_match)
{
    int PointIdxL=0, j=0;
    for (PointIdxL=0; PointIdxL<TotalPtL; PointIdxL++)
    {
        for (j=0; j<picnum; j++)                
        {
            host_match[j+PointIdxL*picnum]=12;                        
        }        
    }    
}

__global__ void GPUMatches(int picnum, int TotalPtL, int *dev_match)
{
    int PointIdxL=threadIdx.x+blockIdx.x*blockDim.x;
    int j=threadIdx.y+blockIdx.y*blockDim.y;
    while (PointIdxL<TotalPtL)
    {
        while (j<picnum)                
        {
            dev_match[j+PointIdxL*picnum]=12;            
            j+=blockDim.y*gridDim.y;
        }
        PointIdxL+=blockDim.x*gridDim.x;
    }    
}

int CheckMatch(int picnumR, int TotalPtL, int *match)
{
    int matched=0, i=0, j=0;
    for (i=0;i<TotalPtL;i++)
        for(j=0;j<picnumR;j++)
            if (match[j+i*picnumR]==12)
                matched++;
    return matched;
}

int main(void)
{
    int i=0, j=0, TotalPtL=16789, picnumR=63, matched=0;
   
    int *host_match=NULL;
    int *dev_match=NULL;    

    host_match=(int *)calloc(TotalPtL*picnumR, sizeof(int));
    cudaMalloc((void**)&dev_match, TotalPtL*picnumR*sizeof(int));
    
    for (i=0;i<TotalPtL;i++)
        for(j=0;j<picnumR;j++)
            host_match[j+i*picnumR]=-1;
          
    cudaMemcpy(dev_match, host_match, TotalPtL*picnumR*sizeof(int), cudaMemcpyHostToDevice);
    
    CPUMatches(picnumR, TotalPtL, host_match);
    matched=CheckMatch(picnumR, TotalPtL, host_match);
    printf("cpu matched=%i

", matched);

dim3 grids(256, 256);
    dim3 threads(16, 16);
    GPUMatches<<<grids, threads>>> (picnumR, TotalPtL, dev_match);
     
    cudaMemcpy(host_match, dev_match, 
                                        TotalPtL*picnumR*sizeof(int), 
                                        cudaMemcpyDeviceToHost);
    
    matched=CheckMatch(picnumR, TotalPtL, host_match);
    printf("gpu matched=%i

", matched);   
    
    free(host_match);
    cudaFree(dev_match);
}

anybody, any help?

Hi,

You just need to move your j initialization inside the first while loop in the GPU code. j is not being re-initialized as it is in the for loop of your CPU code, so the inner j loop only runs for the first value of PointIdxL in the GPU code.

Hope that helps,
Thomas