#!/usr/bin/perl -w
# using Perl Regular Expressions
# Author: Ingo Macherius <macherius@gmd.de>
# modified by Michel Rodriguez <mirod@xmltwig.com>
use strict;

# Read the whole input document into memory.
# A lexical handle and three-argument open are used so the file name can
# never be interpreted as an open mode, and $/ is only undefined inside the
# do block so the rest of the script keeps the default record separator.
my $file = "REC-xml-19980210.xml";
open( my $rec, '<', $file) or die "cannot open $file: $!";
my $doc = do { local $/; <$rec> };
close( $rec);

# remove comments NOW
# NOTE: the substitution is purely textual, so a "<!--" that sits inside an
# attribute value is treated as the start of a comment too.
$doc =~ s{<!--.*?-->}{}sg;

# A semi-generic way to get the entities: start from a hand-written table,
# then add every entity declared in the document whose value is a quoted
# string on a single line.
# Fails miserably for entities using other entities; the trailing comments
# below mark the hand-written entries whose definitions use other entities.
my %ent = (
    amp => '&', quot => '"', apos => "'", lt => '<', gt => '>',
    xmlpio => "'<?xml'",             # uses &lt;
    hcro   => "&#x",                 # uses &amp;
    nbsp   => ' ', '#160' => ' ',    # def is commented out in the REC
);

# The first definition of an entity wins, so the hand-written entries above
# are never overwritten.  exists() is used instead of ||= so that an entity
# whose value is '0' or the empty string is not replaced by a later
# declaration of the same name.
while ( $doc =~ /<!ENTITY\s+(\w+)\s+(["'])(.*?)\2\s*>/g ) {
    $ent{$1} = $3 unless exists $ent{$1};
}

# Print every production found in the document, one per line, as
#   [n] lhs ::= rhs
# where n counts the productions in document order.
my $i = 0;

# The opening tag must be exactly "prod" (optionally followed by attributes),
# so that an element whose name merely starts with "prod" is not taken as the
# beginning of a production.
foreach my $prod ( $doc =~ m{<prod(?:\s[^>]*)?>.*?</prod>}gs ) {
    # /s lets the left-hand side span several lines, like the rhs below
    my ($lhs) = $prod =~ m{<lhs>(.*?)</lhs>}s
        or die "no lhs in prod $prod";

    # a production may carry several rhs elements: concatenate them in order
    my $rhs = '';
    while ( $prod =~ m{<rhs.*?>(.*?)</rhs>}sg ) {
        $rhs .= $1;
    }

    $rhs =~ s{</?nt.*?>}{}sg;          # remove nt tags
    #$rhs =~ s{<com.*?>.*?</com>}{}sg; # remove com elements, not needed here

    $i++;
    print clean( "[$i] $lhs ::= $rhs"), "\n";
}

# clean( $string )
#
# Expands entity references in $string using the file-level %ent table and
# normalises its whitespace.
#
# Returns the string with
#   - every &name; / &#NNN; reference that has an entry in %ent replaced by
#     that entry's value,
#   - each \xc2\xa0 byte pair replaced by a plain space,
#   - every run of whitespace collapsed to one space and a trailing space
#     removed.
#
# Dies with "unknown entity NAME" when a named reference has no entry in
# %ent.  A '#'-prefixed (numeric) reference without an entry is left as is.
sub clean {
    my ($string) = @_;

    # Yes, you have to replace the entities yourself.
    # The optional '#' lets a numeric reference such as &#160; reach its
    # %ent entry (a bare \w+ can never match '#160'), and exists() keeps an
    # entity whose value is '0' or '' from being reported as unknown.
    $string =~ s{&(#?\w+);}{
        my $name = $1;
        exists $ent{$name} ? $ent{$name}
      : $name =~ m{\A#}    ? "&$name;"
      :                      die "unknown entity $name";
    }eg;

    $string =~ s{\xc2\xa0}{ }g;    # weird character in source

    # collapse whitespace runs, then drop the single space left at the end
    $string =~ s{\s+}{ }g;
    $string =~ s{\s$}{};

    return $string;
}

