Appendixes > Record Statistics with Statistics Server > Sample Files > Sample XML Event Script

Sample XML Event Script
#!/usr/bin/perl
 
use strict;
use warnings;
use File::Tail;
use Encode;
---------------------------------------------------
 
# Script to monitor a given logfile ($ARGV[0]) generated by any IDOL servers, constantly tailing it. When a query appears in the logs it gathers the appropriate information and sends it to the Stats server at the location below.
#
# The log file has the following format:
# <date> <time> [<thread number>] <information>
#
# For example:
# (...)
# 12/12/2007 08:22:21 [7] /action=suggest&maxresults=6&reference=doc_123123 (127.0.0.1)
# 12/12/2007 08:22:23 [7] Returning 6 matches
# 12/12/2007 08:22:23 [7] Suggest complete
# (...)
#
# Parameters: Four main parameters are required to run the script.
#
# - logfile name: Passed to the script on the command line. The full path must be specified.
# - stathost: Host name of the stats server.
# - statport: Event port of the Stats server to which the script can send XML events.
# - idolname: To use in Mutiple IDOL server mode. IDOL server name generating the logfile that you want to monitor. The value must be the same one specified in the Stats server configuration file when setting the "IDOLName" stat parameter.
#
# For example: idolname = "myIDOLServer"
 
# If only one single IDOL server is used, you can set $idolname=""
#
# Run the script as follows:
#
#    perl XMLscript.pl <logfile name>
#
#----------------------------------------------------------------
 
 
if (!defined $ARGV[0]) {
    print "Syntax: $0 [logfile]\n";
    exit(0);
}
 
my $name = $ARGV[0];  # The log file to monitor
my $statshost = "127.0.0.1";  # Host of the Stats server
my $statsport = 19871;  # Event port of the statsserver
my $idolname = "";    # IDOL server name. Set an IDOL server name if Stat server is running in Multiple IDOL server mode.
my $BATCHSIZE=1; # Wait until we have this many events and send them all at once
my $tailn=0; # Start tailing from the end
my $resetTail=0; #Start tailing after the file has been automatically closed and reopened
my $tail; # Structure returned while taililng a file
my %threadqueries; # The 'current query' on each thread
my %threadips; # The 'current ip' on each thread
my %threadtime;  # The 'current time' on each thread
my $total = 0;   # Number of XML events generated by the script
 
sub queryinfo($$$$$@);  # return event XML for a given query with matches and terms
sub uri_unescape($);  # Unescape a URI-escaped string
sub uri_escape($);  # Escape a URI string
sub postevent($$$);  # Send XML events to the Stats server
 
#--------------------------------
# Main loop
#--------------------------------
for (;;)
{
    eval {
    # Tail the given logfile
    $tail = new File::Tail(name=>$name,maxinterval=>0.1,tail=>$tailn);
    $tailn=0;
    
    # Event Creating loop
    for (;;)
    {
        # Create the header of the XML event
        my $data="<?xml version='1.0' encoding='ISO-8859-1' ?>\n<events>\n";
 
        # Count number of events to send to the Stats server
        my $events = 0;
 
    # Event Writing loop
    for (;;)
    {
        # Wait for new input in the log file.
        my ($nfound,$timeleft,@pending) =         File::Tail::select(undef,undef,undef,0.01,$tail);
 
        # Exit if no new line is found
        last if $events > 0 && !$nfound;
 
        # Returns one line from the log file
        my $line = $tail->read;
        # Check whether the line contains the following format (i.e. 16/09/2008 14:20:00 [1] ....)
        if ($line =~ m,^(\d\d/\d\d/\d{4} \d\d:\d\d:\d\d) \[(\d+)\] (.*),)
        {
        # A log line (beginning with a time)
        my $timeLog = $1; # Get the time when the IDOL server received the query
        my $thread=$2; # The thread number the query is on
        my $message=$3; # The rest of the log line
 
        # Check whether the message contains a query (action=termgetbest&...)
        if ($message =~ m,^/?(a.*?=.+) \(([\d\.]+)\)$,)
        {
        # The initial 'action received'
        $threadqueries{$thread}=$1; # Get the query
        $threadips{$thread}=$2; # Get IP address of the IDOL server that sent the query
        $threadtime{$thread}=$timeLog; # Get the query time
        }
        # Check whether the message contains the number of hits, returned matches or completed query
        elsif ( $threadqueries{$thread} && ( $message =~ m:^Completed Action, returning (\d+) hits$: ||
 
        $message =~ m:^Returning (\d+) match: ||
 
        $message =~ m:^.* complete$:))
        {
        # The end of the query. Form the event xml and save
        my $matches = $1 || 0;
        my $query = $threadqueries{$thread};
        my $ip = $threadips{$thread};
 
        # If the query format is correct, generate the data for the XML event if (defined $query && defined $ip && $query =~ m,/?a.*?=(\w+)([?&].*(?<=[?&])text=([^?&]*))?,)
        {
            $events++;
            my $terms = $3 || "";
            $data.=queryinfo($idolname, $query,$1,$matches,$ip,split(/ /,uri_unescape($terms)));
        }
 
            print STDERR "$threadtime{$thread} $threadqueries{$thread}\n";
 
            # Reset
            $threadqueries{$thread} = "";
            $threadips{$thread} = "";
            $threadtime{$thread}="";
            } # end of the action
            } # end of the line
 
        # Check if the script has to send all events to the Stats server now or later
            last if $events >= $BATCHSIZE;
 
    } #end of the Event Writing loop
 
            # End of the XML event
            $data.="</events>\n";
 
        my $lnData = length($data);
        #print STDERR "Data buffer length = $lnData\n";
        print STDERR "DATA:\n".$data."\n";
 
        # End of the current batch. Post the response to the statsserver.
            if ($events)
            {
            $total+=$events;
            #print STDERR "Events to send: $events $total\n";
                eval { postevent($statshost, $statsport, $data); };
                print "Posting events failed: $@" if $@;
                $data = "";
                if (!($total%1000)) {print STDERR "Total events: $total\n";}
                $data="";
            }
        } #end of the Event Creating loop
    }; # end of eval
 
    warn unless $tailn;
    $tailn=-1;
    sleep 0.1;
} #end of the Main loop
 
 
sub strip_control_chars($) {
    my $x=shift;
    $x=~tr/\x00-\x1F//d;
    return $x;
}
 
#-----------------------------------------------------
# Unescape a URI-escaped string
#-----------------------------------------------------
sub uri_unescape($)
{
    my %escapes;
    # Build a char->hex map
    for (0..255) {
        $escapes{chr($_)} = sprintf("%%%02X", $_);
    }
    # Note from RFC1630: "Sequences which start with a percent sign but are not followed by two hexadecimal characters are reserved for future extension"
    my $str = shift;
    if (@_ && wantarray) {
        # not executed for the common case of a single argument
        my @str = @_; # need to copy
        return map { s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg } $str, @str;
    }
    $str =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
    return $str;
} # End sub uri_unescape
 
#-----------------------------------------------------
# Escape a URI string
#-----------------------------------------------------
sub uri_escape($)
{
    my %escapes;
    # Build a char->hex map
    for (0..255) {
        $escapes{chr($_)} = sprintf("%%%02X", $_);
    }
    my($text) = @_;
    # Default unsafe characters. (RFC 2396 ^uric)
    $text =~ s/([^;\/?:@&=+\$,A-Za-z0-9\-_.!~*'()])/$escapes{$1}/g;
    $text;
} # End sub uri_escape
 
 
#--------------------------------------------------------------------
# Extract information from a query and build the XML to send
#--------------------------------------------------------------------
sub queryinfo($$$$$@) {
    my $idolname = shift;
    my $query = shift;
    my $action = shift;
    my $matches = shift;
    my $ip = shift;
    my @terms = @_;
    my $ret = ""; # The xml packet to be sent
    $ret.="<queryinfo>\n<ver>0.1</ver>\n<url><![CDATA[$query]]></url>\n";
    $ret.="<action>$action</action>\n";
    $ret.="<terms><term>".uri_escape(strip_control_chars($_))."</term></terms>\n" for @terms;
    $ret.="<numhits>$matches</numhits>\n";
    $ret.="<ip>$ip</ip>\n";
    if($idolname){
    $ret.="<idolname>$idolname</idolname>\n";
    }
    $ret.="</queryinfo>\n";
 
    return $ret;
}
 
#--------------------------------------------------------------------
# Post an event to the given host and port
#--------------------------------------------------------------------
sub postevent($$$) {
    my $host = shift;
    my $port = shift;
    my $data = shift;
    my $nConnectTry=3;
    use Socket;
 
    socket(INDEXSOCK, Socket::PF_INET, Socket::SOCK_STREAM, getprotobyname('tcp'))
    || print STDERR "Socket Failure: $!";
    my $inet_addr = Socket::inet_aton($host)
    || print STDERR "Internet_addr Failure: $1\n";
    my $paddr= Socket::sockaddr_in($port, $inet_addr)
    || print STDERR "Sockaddr_in Failure: $!\n";
    while(!connect(INDEXSOCK, $paddr) && $nConnectTry>0){
    $nConnectTry--;
    }
    $nConnectTry>0 or die "Connect problem: $|\n";
 
    #connect(INDEXSOCK, $paddr) || print STDERR "Connect problem: $!\n";
    select INDEXSOCK; $| =1;
    select STDOUT;
    print INDEXSOCK "POST ";
    print INDEXSOCK "/stats";
    print INDEXSOCK " HTTP/1.0\r\n";
    print INDEXSOCK "Content-Length: ".length($data)."\r\n\r\n";
    print INDEXSOCK "$data";
 
    my $buffer;
    read(INDEXSOCK,$buffer,100);
    #$buffer =~ /INDEXID=(\d+)/;
    close INDEXSOCK;
    return(1);
}