#!/usr/local/bin/perl -w
#
#
#
####################################################################
# This script is purely experimental, and by no means should the
# user assume the output to be correct. In fact, the output is
# almost certainly incorrect.
#
# There are fundamental flaws with this method of netflow record
# deduplication such as flows duplicated not within the same second,
# and flows duplicated prior to or after the current input data
# set. (Those are just two reasons, there are more.)
#
# I wrote this script to further familiarize myself with the netflow
# v7 record data, and as an investigative exercise to see whether
# or not this deduplication method provides any value.
#
# I'm interested in hearing what other ideas are out there on the
# topic of netflow record deduplication, so please feel free to
# email me with your comments/suggestions/criticisms. ;)
####################################################################
#
#
# The problem:
# When receiving netflow data from multiple exporters there is a
# chance of ending up with multiple unique flow records for the same
# individual traffic flow. This is possible when the same packet
# or packets traverse two different routers that are both sending
# netflow data to a given collector. Often, the only difference
# between these records lies in the following netflow v7
# variables: $exaddr, $unix_nsecs, $sysuptime, $nexthop, $input,
# and $output.
#
# One common way to deduplicate flows is to use flow-nfilter and
# filter on $exaddr. While this works relatively well, it does
# have problems such as if the same packet traverses one $exaddr
# multiple times (a load balancer going through multiple vlans on
# the same switch can lead to this quite easily). Additionally,
# this method adds overall complexity and requires significant
# additional computing resources.
#
# Depending on clock accuracy, latency between the collector and
# the different exaddr's, and a bunch of other factors, $unix_secs
# can be different, although given the lack of precision (one
# second) it is less likely. Thus, this script was conceived.
#
#
#
#
# flow-dedup.pl
# An experimental netflow record deduplicator, by gvolk@gvolk.com
#
# Usage: flow-cat | flow-export -f2 | flow-dedup.pl
#
# This deduplicator works off the assumption that if a subset of
# flow record variables are the same for a given set of flow
# records, then a duplicate has been found, and will be
# consolidated via a hash table.
#
# The subset implemented here is $unix_secs, $dpkts, $doctets,
# $srcaddr, $dstaddr, $srcport, $dstport, and $prot.
#
# This script will store each flow record in a hash table keyed
# on the above vars which effectively eliminates those records
# that have an identical set of the above vars. After all the
# input data has been exhausted, the deduplicated flow records
# will be printed out in "flow-export -f2" format, and a summary
# of how many flows were eliminated will also be produced.
#
# In its current form, this script will allocate memory
# equivalent to 5.8 times the input data.
#
#

use strict;
use warnings;

# Number of comma-separated fields in one netflow v7 record, in order:
#   0 unix_secs     1 unix_nsecs   2 sysuptime   3 exaddr     4 dpkts
#   5 doctets       6 first        7 last        8 engine_type
#   9 engine_id    10 srcaddr     11 dstaddr    12 nexthop   13 input
#  14 output       15 srcport     16 dstport    17 prot      18 tos
#  19 tcp_flags    20 src_mask    21 dst_mask   22 src_as    23 dst_as
#  24 router_sc
my $FIELD_COUNT = 25;

# Zero-based positions of the fields that form the deduplication key:
# unix_secs, dpkts, doctets, srcaddr, dstaddr, srcport, dstport, prot.
my @KEY_FIELDS = (0, 4, 5, 10, 11, 15, 16, 17);

# dedup_flows($in, $out)
#
# Reads comma-separated flow records from the filehandle $in, collapses
# records that share the same key fields, and writes the surviving
# records (one per line) to the filehandle $out.
#
# Empty lines and lines whose first non-blank character is '#' are
# ignored. When several records share a key, the one read last is kept.
# Records are written in hash order, i.e. in no particular order.
#
# Returns a two-element list: (records read, records written).
# Dies if a write to $out fails.
sub dedup_flows {
    my ($in, $out) = @_;

    my %flowhash;        # dedup key => full record text
    my $precount = 0;    # records read (comments and blanks excluded)

    LINE:
    while (my $line = <$in>) {
        next LINE if $line =~ /^$/;       # skip null lines
        next LINE if $line =~ /^\s*#/;    # skip comment lines
        chomp $line;                      # strip the trailing newline

        # A negative limit keeps trailing empty fields, so a record
        # whose last columns are empty is not mistaken for a short one.
        my @rec = split /,/, $line, -1;

        if (@rec < $FIELD_COUNT) {
            warn "flow-dedup: line $.: expected $FIELD_COUNT fields, got "
                . scalar(@rec) . "; padding with empty fields\n";
        }

        # Normalise to exactly $FIELD_COUNT fields: missing ones become
        # empty strings, anything beyond the last known field is dropped.
        my @fields = map { defined $_ ? $_ : '' } @rec[0 .. $FIELD_COUNT - 1];

        # Store each entire record, keyed on the key fields only.
        # This is where we lose the "duplicated" flow records.
        my $key = join ',', @fields[@KEY_FIELDS];
        $flowhash{$key} = join ',', @fields;

        $precount++;    # count each original record
    }

    # Print out each flow record that is in the hash.
    my $postcount = 0;
    for my $record (values %flowhash) {
        print {$out} "$record\n"
            or die "flow-dedup: write failed: $!\n";
        $postcount++;    # count each surviving record
    }

    return ($precount, $postcount);
}

# And so it begins...
my ($precount, $postcount) = dedup_flows(\*STDIN, \*STDOUT);

# How many records did we eliminate?
my $duplicates = $precount - $postcount;

# Summarize...
print "precount=$precount postcount=$postcount duplicated records=$duplicates\n";