fedora 11 x64 gcc34
cudadriver_2.3_linux_64_190.18.run
cudasdk_2.3_linux.run
tesla c1060 x 4
24G Memory
Intel i7 965
i am using CUDA to analyse the networkpackets. here is my code with problem.
[codebox]
#include “udp_kernel_cu.h”
#include “cudatrans_cu.h”
#define THREAD 256
#define UR 197131
global void udp_analyse_on_gpu(packetblock *packets_g_e,
packetresult* result,
char *resultnext,
clock_t *time_gpu_6 ,
u_int32_t* resultcopier_gpu)
{
shared u_int32_t sharememory[11];
const int blid = blockIdx.x;
const int thid = threadIdx.x;
if (blid == 0 && thid == 0){
*time_gpu_6 = clock();
for (int i=0; i<UR; i++){
result[i].block = 0;
result[i].anzahl = 0;
result[i].offset = 0;
}
}
if (thid == 0){
for (int i=0; i<UR;i++){
atomicAdd(&(result[i].anzahl),0);
}
for (int i=0; i<11;i++){
sharememory[i]=0;
}
}
__syncthreads();
if (resultnext[blid * THREAD * 3 + thid * 3 + 0] == ‘u’
&& resultnext[blid * THREAD * 3 + thid * 3 + 1] == 'd'
&& resultnext[blid * THREAD * 3 + thid * 3 + 2] == 'p'){
//resultcopier_gpu[blid] = thid;
size_t offset = packets_g_e[blid * THREAD + thid].offset;//get the current offset from packetblock
//get the udpheader from packet
udpheader _udpheader;
_udpheader.psrc = packets_g_e[blid * THREAD + thid]._data[offset] << 8
| packets_g_e[blid * THREAD + thid]._data[offset+1];
offset += 2;
_udpheader.pdst = packets_g_e[blid * THREAD + thid]._data[offset] << 8
| packets_g_e[blid * THREAD + thid]._data[offset+1];
offset += 2;
_udpheader.length = packets_g_e[blid * THREAD + thid]._data[offset] << 8
| packets_g_e[blid * THREAD + thid]._data[offset+1];
offset += 2;
_udpheader.checksum = packets_g_e[blid * THREAD + thid]._data[offset] << 8
| packets_g_e[blid * THREAD + thid]._data[offset+1];
offset += 2;
//source port
if( _udpheader.psrc > 0 && _udpheader.psrc < ID_UDP_SRC_PORT_NUM) {
atomicAdd(&(result[_udpheader.psrc].anzahl),1);
}
[b][i]if(_udpheader.psrc < UDP_PORT_WELL) {
atomicAdd(&sharememory[0], 1);//ID_UDP_SRC_PORT_WELLKNOWN
}
else if (_udpheader.psrc < UDP_PORT_REG ){//&& _udpheader.psrc >= UDP_PORT_WELL) {
atomicAdd(&sharememory[1], 1);//ID_UDP_SRC_PORT_REGISTERED
}
else {//if (_udpheader.psrc >= UDP_PORT_REG && _udpheader.psrc < ID_UDP_DST_PORT_NUM) {
atomicAdd(&sharememory[2], 1);//ID_UDP_SRC_PORT_DYNAMIC
}[/i][/b]
//block3
if (_udpheader.psrc >= 1024 && _udpheader.pdst >= 1024) {
atomicAdd(&sharememory[7], 1);//ID_UDP_PORT_SRC_AND_DEST_ABOVE_1023
}
if (_udpheader.psrc < 1024 && _udpheader.pdst < 1024) {
atomicAdd(&sharememory[8], 1);//ID_UDP_PORT_SRC_AND_DEST_BELOW_1024
}
if (_udpheader.psrc >= 1024 && _udpheader.pdst < 1024) {
atomicAdd(&sharememory[9], 1);//ID_UDP_PORT_SRC_ABOVE_1023_DEST_BELOW_1024
}
if (_udpheader.psrc < 1024 && _udpheader.pdst >= 1024) {
atomicAdd(&sharememory[10], 1);//ID_UDP_PORT_SRC_BELOW_1024_DEST_ABOVE_1023
}
//destination port
for (int i=0; i<UR;i++){
atomicAdd(&(result[i].anzahl),0);
}
if( _udpheader.pdst < ID_UDP_DST_PORT_NUM ) {
atomicAdd(&(result[ID_UDP_DST_PORT + _udpheader.pdst].anzahl), 1);
}
[i][b]if(_udpheader.pdst < UDP_PORT_WELL) {
atomicAdd(&sharememory[3], 1);//ID_UDP_DST_PORT_WELLKNOWN
}
else if (_udpheader.pdst < UDP_PORT_REG ){//&& _udpheader.pdst >= UDP_PORT_WELL) {
atomicAdd(&sharememory[4], 1);//ID_UDP_DST_PORT_REGISTERED
}
else {//if (_udpheader.pdst >= UDP_PORT_REG && _udpheader.pdst < ID_UDP_DST_PORT_NUM) {
atomicAdd(&sharememory[5], 1);//ID_UDP_DST_PORT_DYNAMIC
}[/b][/i]
{ //length
u_int16_t length = _udpheader.length;
atomicAdd(&(result[131591 + length].anzahl), 1);//block 2
length >>= 7;
if(length < ID_UDP_LENGTH_NUM) {
atomicAdd(&(result[131078 + length].anzahl), 1);
}
}
// Skype V2.x
if (_udpheader.length == 19) {
if (packets_g_e[blid * THREAD + thid]._data[offset + 30] & 0x8f == 7) {
atomicAdd(&sharememory[6], 1);//131590 + ID_UDP_SKYPEV2
}
}
/*
if (dstport == 4500 || srcport == 4500) { // additionally IKEv2 and IKEv1
// When sent on UDP port 4500, IKE messages have prepended four octets of zero
if(packets_g_e[blid * THREAD + thid].offset + 4 > packets_g_e[blid * THREAD + thid].length) {
//IAS::Logs::Logger::ERRLOG("UDP/IKE header fragmented or truncated");
}
offset += 4;
}
*/
packets_g_e[blid * THREAD + thid].offset = offset;// at the end, save the offset back to packetblock
resultnext[blid * THREAD * 3 + thid * 3 + 0] = ‘\0’;
resultnext[blid * THREAD * 3 + thid * 3 + 1] = '\0';
resultnext[blid * THREAD * 3 + thid * 3 + 2] = '\0';
}
__syncthreads();
for (int i=0;i< 131590; i++){
result[i].block = 0;
result[i].offset = i;
}
result[131590].block = 1;
result[131590].offset = 0;
for (int i=0;i<65536; i++){
result[i + 131591].block = 2;
result[i + 131591].offset = i;
}
for (int i=0; i<4; i++){
result[i + 197127].block = 3;
result[i + 197127].offset = i;
}
if (thid == 0){
for (int i=0 ; i < 6 ; i++){
atomicAdd(&(result[i + 131072].anzahl), sharememory[i]);
}
atomicAdd(&(result[132590].anzahl), sharememory[6]);
for (int i=7; i<11 ;i++ ){
atomicAdd(&(result[i + 197127 - 7].anzahl), sharememory[i]);
}
}
__syncthreads();
if (blid == 0 && thid == 0) *time_gpu_6 = clock()- (*time_gpu_6);
}
[/codebox]
the codes with bold and Italic marked always do wrong. then i try to print out the result of every block. then i find that some blocks did not do the code. then i just put :
[codebox]
for (int i=0; i<UR;i++){
atomicAdd(&(result[i].anzahl),0);
}
[/codebox]
into my code, then all the threads do the right things, and the results are right.
can anybody tell me, why it happend? is it a bug with cuda under Linux.