CUDA atomicAdd() goes wrong with a big array under Fedora 11

Fedora 11 x64, gcc 3.4

cudadriver_2.3_linux_64_190.18.run

cudasdk_2.3_linux.run

4x Tesla C1060

24 GB RAM

Intel Core i7 965

I am using CUDA to analyse network packets. Here is the code with the problem.

[codebox]
#include "udp_kernel_cu.h"
#include "cudatrans_cu.h"

#define THREAD 256
#define UR 197131

__global__ void udp_analyse_on_gpu(packetblock *packets_g_e,
                                   packetresult *result,
                                   char *resultnext,
                                   clock_t *time_gpu_6,
                                   u_int32_t *resultcopier_gpu)
{
    __shared__ u_int32_t sharememory[11];

    const int blid = blockIdx.x;
    const int thid = threadIdx.x;

    if (blid == 0 && thid == 0) {
        *time_gpu_6 = clock();
        for (int i = 0; i < UR; i++) {
            result[i].block  = 0;
            result[i].anzahl = 0;
            result[i].offset = 0;
        }
    }

    if (thid == 0) {
        // the "warm-up" loop discussed below: adds 0 to every counter
        for (int i = 0; i < UR; i++) {
            atomicAdd(&(result[i].anzahl), 0);
        }
        for (int i = 0; i < 11; i++) {
            sharememory[i] = 0;
        }
    }
    __syncthreads();

    if (resultnext[blid * THREAD * 3 + thid * 3 + 0] == 'u'
        && resultnext[blid * THREAD * 3 + thid * 3 + 1] == 'd'
        && resultnext[blid * THREAD * 3 + thid * 3 + 2] == 'p') {

        //resultcopier_gpu[blid] = thid;
        size_t offset = packets_g_e[blid * THREAD + thid].offset; // get the current offset from the packetblock

        // extract the UDP header from the packet
        udpheader _udpheader;
        _udpheader.psrc = packets_g_e[blid * THREAD + thid]._data[offset] << 8
                        | packets_g_e[blid * THREAD + thid]._data[offset + 1];
        offset += 2;
        _udpheader.pdst = packets_g_e[blid * THREAD + thid]._data[offset] << 8
                        | packets_g_e[blid * THREAD + thid]._data[offset + 1];
        offset += 2;
        _udpheader.length = packets_g_e[blid * THREAD + thid]._data[offset] << 8
                          | packets_g_e[blid * THREAD + thid]._data[offset + 1];
        offset += 2;
        _udpheader.checksum = packets_g_e[blid * THREAD + thid]._data[offset] << 8
                            | packets_g_e[blid * THREAD + thid]._data[offset + 1];
        offset += 2;

        // source port
        if (_udpheader.psrc > 0 && _udpheader.psrc < ID_UDP_SRC_PORT_NUM) {
            atomicAdd(&(result[_udpheader.psrc].anzahl), 1);
        }

        // ---- begin first section that always goes wrong ----
        if (_udpheader.psrc < UDP_PORT_WELL) {
            atomicAdd(&sharememory[0], 1); // ID_UDP_SRC_PORT_WELLKNOWN
        }
        else if (_udpheader.psrc < UDP_PORT_REG) { //&& _udpheader.psrc >= UDP_PORT_WELL
            atomicAdd(&sharememory[1], 1); // ID_UDP_SRC_PORT_REGISTERED
        }
        else { //if (_udpheader.psrc >= UDP_PORT_REG && _udpheader.psrc < ID_UDP_DST_PORT_NUM)
            atomicAdd(&sharememory[2], 1); // ID_UDP_SRC_PORT_DYNAMIC
        }
        // ---- end first section that always goes wrong ----

        // block 3
        if (_udpheader.psrc >= 1024 && _udpheader.pdst >= 1024) {
            atomicAdd(&sharememory[7], 1);  // ID_UDP_PORT_SRC_AND_DEST_ABOVE_1023
        }
        if (_udpheader.psrc < 1024 && _udpheader.pdst < 1024) {
            atomicAdd(&sharememory[8], 1);  // ID_UDP_PORT_SRC_AND_DEST_BELOW_1024
        }
        if (_udpheader.psrc >= 1024 && _udpheader.pdst < 1024) {
            atomicAdd(&sharememory[9], 1);  // ID_UDP_PORT_SRC_ABOVE_1023_DEST_BELOW_1024
        }
        if (_udpheader.psrc < 1024 && _udpheader.pdst >= 1024) {
            atomicAdd(&sharememory[10], 1); // ID_UDP_PORT_SRC_BELOW_1024_DEST_ABOVE_1023
        }

        // destination port
        // the same warm-up loop again
        for (int i = 0; i < UR; i++) {
            atomicAdd(&(result[i].anzahl), 0);
        }
        if (_udpheader.pdst < ID_UDP_DST_PORT_NUM) {
            atomicAdd(&(result[ID_UDP_DST_PORT + _udpheader.pdst].anzahl), 1);
        }

        // ---- begin second section that always goes wrong ----
        if (_udpheader.pdst < UDP_PORT_WELL) {
            atomicAdd(&sharememory[3], 1); // ID_UDP_DST_PORT_WELLKNOWN
        }
        else if (_udpheader.pdst < UDP_PORT_REG) { //&& _udpheader.pdst >= UDP_PORT_WELL
            atomicAdd(&sharememory[4], 1); // ID_UDP_DST_PORT_REGISTERED
        }
        else { //if (_udpheader.pdst >= UDP_PORT_REG && _udpheader.pdst < ID_UDP_DST_PORT_NUM)
            atomicAdd(&sharememory[5], 1); // ID_UDP_DST_PORT_DYNAMIC
        }
        // ---- end second section that always goes wrong ----

        { // length
            u_int16_t length = _udpheader.length;
            atomicAdd(&(result[131591 + length].anzahl), 1); // block 2
            length >>= 7;
            if (length < ID_UDP_LENGTH_NUM) {
                atomicAdd(&(result[131078 + length].anzahl), 1);
            }
        }

        // Skype V2.x
        if (_udpheader.length == 19) {
            if ((packets_g_e[blid * THREAD + thid]._data[offset + 30] & 0x8f) == 7) {
                atomicAdd(&sharememory[6], 1); // 131590 + ID_UDP_SKYPEV2
            }
        }

        /*
        if (dstport == 4500 || srcport == 4500) { // additionally IKEv2 and IKEv1
            // when sent on UDP port 4500, IKE messages have four octets of zero prepended
            if (packets_g_e[blid * THREAD + thid].offset + 4 > packets_g_e[blid * THREAD + thid].length) {
                //IAS::Logs::Logger::ERRLOG("UDP/IKE header fragmented or truncated");
            }
            offset += 4;
        }
        */

        packets_g_e[blid * THREAD + thid].offset = offset; // at the end, save the offset back to the packetblock

        resultnext[blid * THREAD * 3 + thid * 3 + 0] = '\0';
        resultnext[blid * THREAD * 3 + thid * 3 + 1] = '\0';
        resultnext[blid * THREAD * 3 + thid * 3 + 2] = '\0';
    }
    __syncthreads();

    // note: these loops run on every thread of every block
    for (int i = 0; i < 131590; i++) {
        result[i].block  = 0;
        result[i].offset = i;
    }
    result[131590].block  = 1;
    result[131590].offset = 0;
    for (int i = 0; i < 65536; i++) {
        result[i + 131591].block  = 2;
        result[i + 131591].offset = i;
    }
    for (int i = 0; i < 4; i++) {
        result[i + 197127].block  = 3;
        result[i + 197127].offset = i;
    }

    if (thid == 0) {
        for (int i = 0; i < 6; i++) {
            atomicAdd(&(result[i + 131072].anzahl), sharememory[i]);
        }
        atomicAdd(&(result[132590].anzahl), sharememory[6]);
        for (int i = 7; i < 11; i++) {
            atomicAdd(&(result[i + 197127 - 7].anzahl), sharememory[i]);
        }
    }
    __syncthreads();

    if (blid == 0 && thid == 0) *time_gpu_6 = clock() - (*time_gpu_6);
}
[/codebox]

The two sections marked "always goes wrong" in the code above (they were bold and italic in the original post) never produce correct results. I tried printing out the result for every block and found that some blocks did not execute that code at all. Then I simply put:

[codebox]
for (int i = 0; i < UR; i++) {
    atomicAdd(&(result[i].anzahl), 0);
}
[/codebox]

into my code, and after that all the threads do the right thing and the results are correct.
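For reference, this is roughly how I check the results on the host: copy the table back and dump the counters. A minimal sketch, assuming `result_d` is the device pointer for `result`:

[codebox]
// copy the result table back to the host and print the shared-memory counters
packetresult *result_h = (packetresult *)malloc(UR * sizeof(packetresult));
cudaMemcpy(result_h, result_d, UR * sizeof(packetresult),
           cudaMemcpyDeviceToHost);

for (int i = 131072; i < 131072 + 6; i++) {
    printf("result[%d].anzahl = %u\n", i, result_h[i].anzahl);
}
free(result_h);
[/codebox]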

Can anybody tell me why this happens? Is it a bug in CUDA under Linux?

[codebox]
for (int i = 0; i < UR; i++) {
    atomicAdd(&(result[i].anzahl), 0);
}
[/codebox]

This code does no useful work on the GPU; it just makes the threads touch the addresses. But with it, everything works.
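If all that loop does is touch the memory, the initialisation could also be done from the host before the launch instead of from thread 0 of block 0. A minimal sketch of what I mean, assuming `result_d`, `packets_d`, etc. are the device pointers and `blocks` is the grid size (these names are just for illustration):

[codebox]
// zero the whole result table on the device before the kernel starts;
// this also zeroes block and offset, which the kernel rewrites anyway
cudaMemset(result_d, 0, UR * sizeof(packetresult));

udp_analyse_on_gpu<<<blocks, THREAD>>>(packets_d, result_d, resultnext_d,
                                       time_gpu_6_d, resultcopier_gpu_d);
cudaThreadSynchronize(); // CUDA 2.3 API: wait for the kernel to finish
[/codebox]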

Is that a bug in CUDA? This "warm-up" really costs a lot of time!
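To quantify how much the warm-up actually costs, the kernel can also be timed from the host with CUDA events instead of the in-kernel clock(); a minimal sketch, using the same hypothetical launch names as above:

[codebox]
cudaEvent_t start, stop;
float ms = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
udp_analyse_on_gpu<<<blocks, THREAD>>>(packets_d, result_d, resultnext_d,
                                       time_gpu_6_d, resultcopier_gpu_d);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);

cudaEventElapsedTime(&ms, start, stop); // elapsed GPU time in milliseconds
printf("kernel time: %f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);
[/codebox]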

Bumping this up.