#!/usr/bin/perl -w
use strict;

#use diagnostics;

use XML::Filter::BufferText;     # to buffer all character events
use XML::SAX::Machines qw(:all); # to pipe the 2 SAX handlers

use FindBin qw($Bin);
use lib $Bin;
use wtr2_base;

my $DEBUG=0;

init_db();

my $CAN_OUTPUT= 0;

my @files= @ARGV || (<$dir{invoices}/*.xml>);

foreach my $file (@files)
  { my $handler= wtr2_handler->new();

    my $pipeline = Pipeline( XML::Filter::BufferText->new(),
                             $handler,
                           );
    my $data = $pipeline->parse_uri ( $file);
    
    my $errors= check_invoice( $data);

    if( !@$errors)
      { store_invoice( $data); }
    else
      { print "ERROR in $file\n  ", join( "\n  ", @$errors), "\n";
        if( $CAN_OUTPUT) 
          { my $rejected_file= rejected( $file);
            print "adding errors in $rejected_file\n" if( $DEBUG);
            add_errors( $pipeline, $data, $errors);         # the parser is passed so it can be re-used 
            output_doc_to_check(  $rejected_file, $data);
          }
      };
  }

sub check_invoice
  { my( $data)= @_;
    my $errors=[];  # array ref, holds the error messages

    check_buyer(  $data->{BuyerPartyIdentifier},
                  $data->{BuyerOrganisationName},
                  $errors
               );
    check_po(     $data->{OrderIdentifier}, $errors);

    my @rows= @{$data->{InvoiceRow}};

    reset_default_row_id();
    
    foreach my $row( @rows)
      { # this does not cope well with broken row numbers

        my $row_id= $row->{RowIdentifier} || default_row_id();
        
        print "checking row $row_id\n" if $DEBUG;

        check_qtty( $row_id,
                    $row->{DeliveredQuantity}, 
                    $row->{DeliveredQuantityUC}, 
                    $row->{OrderedQuantity}, 
                    $row->{OrderedQuantityUC},
                    $errors
                  );
      }
    
    return $errors;
  }


sub store_invoice
  { my( $xml_data)= @_; 
    print "storing invoice $xml_data->{InvoiceNumber}\n";

    # build the various data structures
    my $data;

    $data->{invoice} = { number         => $xml_data->{InvoiceNumber},
                         date           => $xml_data->{InvoiceDate},
                         po             => $xml_data->{OrderIdentifier},
                         amount_no_tax  => $xml_data->{InvoiceTotalVatExcludedAmount},
                         tax            => $xml_data->{InvoiceTotalVatAmount},
                         amount         => $xml_data->{InvoiceTotalVatIncludedAmount},
                         payment_status => $xml_data->{PaymentStatusCode},
                       };

    $data->{seller}  = { identifier      => $xml_data->{SellerPartyIdentifier},
                         name            => $xml_data->{SellerOrganisationName},
                         tax_code        => $xml_data->{SellerOrganisationTaxCode},
                       };

    $data->{address} = { street         => $xml_data->{SellerStreetName},
                         town           => $xml_data->{SellerTownName},
                         zip            => $xml_data->{SellerPostCodeIdentifier},
                         country_code   => $xml_data->{CountryCode},
                         po_box         => $xml_data->{SellerPostOfficeBoxIdentifier},
                       };

    $data->{contact} = { name           => $xml_data->{SellerContactPersonName},
                         phone          => $xml_data->{SellerPhoneNumberIdentifier},
                         email          => $xml_data->{SellerEmailaddressIdentifier},
                       };

    $data->{invoicerow} ||= [];
    reset_default_row_id();

    foreach my $invoicerow (@{$xml_data->{InvoiceRow}})
      { push @{$data->{invoicerow}},
          { row_id        => $invoicerow->{RowIdentifier} || default_row_id(),
            sku           => $invoicerow->{ArticleIdentifier},
            name          => $invoicerow->{ArticleName},
            qty           => $invoicerow->{DeliveredQuantity},
            qty_unit      => $invoicerow->{DeliveredQuantityUC},
            unit_price    => $invoicerow->{UnitPriceAmount},
            amount_no_tax => $invoicerow->{RowVatExcludedAmount},
            tax           => $invoicerow->{RowVatAmount},
            amount        => $invoicerow->{RowAmount},
          }
      }

    store_all( $data);
  }



package wtr2_handler;

use base qw(XML::SAX::Base);

use Getopt::Long;                # to process actions associated with SAX events

# all those could be stored as part of the parser object
# but IMHO this implies potential name collision

my( $content, $start);

BEGIN {

# declare which element content and attributes we want to store
# note that this works fine because elements are not re-used, the element
# name always gives enough information to figure out what to do with it

# note that the little languages used in the action part depends on the fact
# that - is not a valid characters at the start of an XML identifier

# actions for start_element handler
# 
# actions format is:
#   actions  : action (';' action)* # only one used here
#   action   : command options+
#   command : ('--create' | '--store_att' <attribute name>)
#   options  : option+
#   option   : --in <sub_record>    # sub_record must be have been defined
#              --as <field_name>    # name of the field in the hash 
#                                   # (defaults to the attribute name)
#              --parent <name>      # trigger only when in proper parent
#              --create only supports the -as option (not used)

$start =  { # elements that create a new sub_record for repeated content
            InvoiceRow => '--create',
            # attributes to be stored
            # format is #store_att
            DeliveredQuantity => '--store_att QuantityUnitCode --in InvoiceRow --as DeliveredQuantityUC',
            OrderedQuantity   => '--store_att QuantityUnitCode --in InvoiceRow --as OrderedQuantityUC',
          };
          
                      
# actions for characters handler
# actions format is: 
#   actions  : action (';' action)* # only one used here
#   action   : command options+
#   command : ('--store')
#   options  : option+
#   option   : --in <sub_record>    # sub_record must be have been defined
#              --as <field_name>    # not used here
#              --parent <name>      # trigger only when in proper parent

$content= { # content that needs to be stored once per invoice
            BuyerPartyIdentifier          => '--store',
            BuyerOrganisationName         => '--store',
            InvoiceNumber                 => '--store',
            InvoiceDate                   => '--store',
            OrderIdentifier               => '--store',
            InvoiceTotalVatExcludedAmount => '--store',
            InvoiceTotalVatAmount         => '--store',
            InvoiceTotalVatIncludedAmount => '--store',
            PaymentStatusCode             => '--store',
            SellerPartyIdentifier         => '--store',
            SellerOrganisationName        => '--store',
            SellerOrganisationTaxCode     => '--store',
            SellerStreetName              => '--store',
            SellerTownName                => '--store',
            SellerPostCodeIdentifier      => '--store',
            CountryCode                   => '--store --parent SellerPostalAddressDetails',
            SellerPostOfficeBoxIdentifier => '--store',
            SellerContactPersonName       => '--store',
            SellerPhoneNumberIdentifier   => '--store',
            SellerEmailaddressIdentifier  => '--store',
            # repeated content
            RowIdentifier                 => '--store --in InvoiceRow',
            ArticleIdentifier             => '--store --in InvoiceRow',
            ArticleName                   => '--store --in InvoiceRow',
            DeliveredQuantity             => '--store --in InvoiceRow',
            OrderedQuantity               => '--store --in InvoiceRow',
            UnitPriceAmount               => '--store --in InvoiceRow',
            RowVatExcludedAmount          => '--store --in InvoiceRow',
            RowVatAmount                  => '--store --in InvoiceRow',
            RowAmount                     => '--store --in InvoiceRow', 
          };

} 

# very complex new! it's a hash so we can add the state data needed
sub new
  { my $class = shift;
    my $self= bless {}, $class;
  }

# reset data for each invoice
sub start_document 
  { $_[0]->{data}    = {}; # stored data
    $_[0]->{context} = []; # element name stack
  }

# 
sub start_element
  { my( $p, $elt)= @_;
          
    my $name= $elt->{Name};
    
    # store the context
    push @{$p->{context}}, $name;
    
    # process actions for the relevant elements
    if( my $actions= $start->{$name})
      { my @actions= split /;/, $actions; # actions are ; separated (not used here)
        foreach my $action (@actions)
          { # use Getopt::Long to parse the action
            local @ARGV= split /\s+/, $action;
            my %options;
            GetOptions( \%options, "create", "store_att=s", "in=s", "as=s", "parent=s");

            if( $options{parent} and ($p->{context}->[-2] ne $options{parent})) { next; }

            # process each type of action
            if( $options{create})
              { # create a new sub_record
                if( $options{store_att}) { die "can't use --store_att and --create in $name => '$action'\n"; }
                if( $options{in})        { die "can't use --in with --create in $name => '$action'\n"; }
                $options{as} ||= $name; # the sub_record name defaults to the element name
                $p->{data}->{$name} ||= [];
                push @{$p->{data}->{$name}}, {};
              }
            elsif( my $att= $options{store_att}) 
              { # store an attribute
                my $att_clarkian= "{}".$att;  # attributes are indexed using the clarkian notation 
                my $value= $elt->{Attributes}->{$att_clarkian}->{Value}; 
                store( $p->{data}, $options{in}, $options{as} || $name, $value); 
              }
            else
              { die "no valid start action found in $name => '$action'\n"; }
          }
      }
  }
              

sub characters
  { my( $p, $characters)= @_;
    my $name= $p->{context}->[-1];
    if( my $actions= $content->{$name})
      { my @actions= split /;/, $actions;
        foreach my $action (@actions)
          { local @ARGV= split /\s+/, $action;
            my %options;
            GetOptions( \%options, "store", "in=s", "as=s", "parent=s");
            if( $options{parent} and ($p->{context}->[-2] ne $options{parent})) { next; }
            if( $options{store})
              { store( $p->{data}, $options{in}, $options{as} || $name, $characters->{Data}); }
            else
              { die "no valid contentt action found in $name => '$action'\n"; }
          }
      }
  }

sub end_element { pop @{$_[0]->{context}}; }

sub end_document { return $_[0]->{data}; }

sub store
  { my( $data, $in, $as, $value)= @_;
    if( my $sub_record= $in)
      { # create it in a sub_record
        $data->{$sub_record}->[-1]->{$as}= $value; 
      }
    else
      { # create at top-level
        $data->{$as}= $value;
      }
  }

1;

__END__

=head1 NAME

wtr2_sax_base

=head1 SYNOPSYS

  perl wtr2_sax_base

=head1 DESCRIPTION

This code uses SAX to extract the data from the invoices. It parses the
invoice and extract the relevant data into a Perl data structure that is then
used to check the invoice and update the data base.

The first problem to solve when using SAX is that the content of elements
can be broken in different calls to the C<characters> handler. So I needed to
buffer the content. Luckily enough, Robin Berjon's C<XML::Filter::BufferText>
does just that! 

So I used a SAX machine (using L<SAX::Machines>) to pipe the 2 handlers, first
L<XML::Filter::BufferText>, then my own handler: L<wtr2_handler>. 
Note that C<SAX::Machines> takes care of 

C<wtr2_handler> extracts all the information needed to check the invoice, then
store it in the data base. The resulting data (returned by the L<end_document> 
handler) is then used by L<check_invoice> and L<store_invoice>.

As this is something that is likely to be quite common and as there are few 
SAX modules that do this, I decided to go generic: I created a small language
to describe how to extract the data and store it in my custom data structure.

The idea is to give an element name (no namespaces are used in this DTD, so 
there is no need to get fancy) and associate an action to it. Actions can be
associated with the start of an element or with its content.
At the start of an element it is possible to store attributes or to create
new sub-records, for repeatable data in the document, such as C<InvoiceRow>

The content of an element can be stored, either as top-level data, for
non-repeatable data, or in a sub-record, for repeatable data.

The easiest way I found to parse these actions was to use L<Getopt::Long>
Overall this is slightly overkill for this problem, but could be re-used in
other cases, so I thought it would be worth it to show it here.

In order to know in which element the parser is from the C<characters> handler
I used a stack of element names: the C<start_element> handler pushes the
current element name on the stack and the C<end_element> handler pops it. This
is the only way to get access to the parent name, needed for the C<--parent>
option.

Overall the code was quite a pain to write, especially as the default parser,
L<XML::LibXML::SAX::Parser> had a problem during my tests, as once again I had
upgraded C<libxml2> but not the Perl module. The hardest part was designing a
way to express what I wanted to extract from the XML document and how to store
it, without resorting with one of those long lists of C<if>s that I find make
code such a pain to maintain.

=head1 AUTHOR

Michel Rodriguez <mirod@xmltwig.com>

=head1 LICENSE

This code is Copyright (c) 2003 Michel Rodriguez. All rights reserved.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

Comments can be sent to mirod@xmltwig.com

=head1 SEE ALSO

XML::SAX XML::SAX::Machines XML::Filter::BufferText

Ways to Rome 2 - Kourallinen Dollareita: http://www.xmltwig.com/article/ways_to_rome_2/
   



