XML, the Perl Way

File Coverage

File: blib/lib/XML/Twig.pm
Coverage: 93.1%

line stmt bran cond sub pod time code
1
262
262
262
262
691847
249
13395
use strict;
2
256
256
256
256
852
228
13403
use warnings; # > perl 5.5
3
4 # This is created in the caller's space
5 # I realize (now!) that it's not clean, but it's been there for 10+ years...
6 BEGIN
7
230
6
230
6
1995
64
{ sub ::PCDATA { '#PCDATA' } ## no critic (Subroutines::ProhibitNestedSubs);
8
5
5
34
  sub ::CDATA { '#CDATA' } ## no critic (Subroutines::ProhibitNestedSubs);
9 }
10
11
230
230
226
226
57901
6374
6460
use UNIVERSAL();
12
13 ## if a sub returns a scalar, it better not bloody disappear in list context
14 ## no critic (Subroutines::ProhibitExplicitReturnUndef);
15
16 my $perl_version;
17 my $parser_version;
18
19 ######################################################################
20 package XML::Twig;
21 ######################################################################
22
23 require 5.004;
24
25
226
226
219
219
49922
5157
666
use utf8; # > perl 5.5
26
27
219
219
211
211
4660
3916
7758
use vars qw($VERSION @ISA %valid_option);
28
29
211
211
211
211
556
3017
7566
use Carp;
30
211
211
211
211
557
3037
1323
use File::Spec;
31
211
211
208
208
2727
3040
10516
use File::Basename;
32
33 *isa= *UNIVERSAL::isa;
34
35 # flag, set to true if the weaken sub is available
36
208
208
206
206
591
2555
75658
use vars qw( $weakrefs);
37
38 # flag set to true if the version of expat seems to be 1.95.2, which has annoying bugs
39 # wrt doctype handling. This is global for performance reasons.
40 my $expat_1_95_2=0;
41
42 # a slight non-xml mod: # is allowed as a first character
43 my $REG_TAG_FIRST_LETTER;
44 #$REG_TAG_FIRST_LETTER= q{(?:[^\W\d]|[:#_])}; # < perl 5.6 - does not work for leading non-ascii letters
45 $REG_TAG_FIRST_LETTER= q{(?:[[:alpha:]:#_])}; # >= perl 5.6
46
47 my $REG_TAG_LETTER= q{(?:[\w_.-]*)};
48
49 # a simple name (no colon)
50 my $REG_NAME_TOKEN= qq{(?:$REG_TAG_FIRST_LETTER$REG_TAG_LETTER*)};
51
52 # a tag name, possibly including namespace
53 my $REG_NAME= qq{(?:(?:$REG_NAME_TOKEN:)?$REG_NAME_TOKEN)};
54
55 # tag name (leading # allowed)
56 # first line is for perl 5.005, second line for modern perl, that accept character classes
57 my $REG_TAG_NAME=$REG_NAME;
58
59 # name or wildcard (* or '') (leading # allowed)
60 my $REG_NAME_W = qq{(?:$REG_NAME|[*])};
61
62 # class and ids are deliberatly permissive
63 my $REG_NTOKEN_FIRST_LETTER;
64 #$REG_NTOKEN_FIRST_LETTER= q{(?:[^\W\d]|[:_])}; # < perl 5.6 - does not work for leading non-ascii letters
65 $REG_NTOKEN_FIRST_LETTER= q{(?:[[:alpha:]:_])}; # >= perl 5.6
66
67 my $REG_NTOKEN_LETTER= q{(?:[\w_:.-]*)};
68
69 my $REG_NTOKEN= qq{(?:$REG_NTOKEN_FIRST_LETTER$REG_NTOKEN_LETTER*)};
70 my $REG_CLASS = $REG_NTOKEN;
71 my $REG_ID = $REG_NTOKEN;
72
73 # allow <tag> #<tag> (private elt) * <tag>.<class> *.<class> <tag>#<id> *#<id>
74 my $REG_TAG_PART= qq{(?:$REG_NAME_W(?:[.]$REG_CLASS|[#]$REG_ID)?|[.]$REG_CLASS)};
75
76 my $REG_REGEXP = q{(?:/(?:[^\\/]|\\.)*/[eimsox]*)}; # regexp
77 my $REG_MATCH = q{[!=]~}; # match (or not)
78 my $REG_STRING = q{(?:"(?:[^\\"]|\\.)*"|'(?:[^\\']|\\.)*')}; # string (simple or double quoted)
79 my $REG_NUMBER = q{(?:\d+(?:\.\d*)?|\.\d+)}; # number
80 my $REG_VALUE = qq{(?:$REG_STRING|$REG_NUMBER)}; # value
81 my $REG_OP = q{==|!=|>|<|>=|<=|eq|ne|lt|gt|le|ge|=}; # op
82 my $REG_FUNCTION = q{(?:string|text)\(\s*\)};
83 my $REG_STRING_ARG = qq{(?:string|text)\\(\\s*$REG_NAME_W\\s*\\)};
84 my $REG_COMP = q{(?:>=|<=|!=|<|>|=)};
85
86 my $REG_TAG_IN_PREDICATE= $REG_NAME_W . q{(?=\s*(?i:and\b|or\b|\]|$))};
87
88 # keys in the context stack, chosen not to interfere with att names, even private (#-prefixed) ones
89 my $ST_TAG = '##tag';
90 my $ST_ELT = '##elt';
91 my $ST_NS = '##ns' ;
92
93 # used in the handler trigger code
94 my $REG_NAKED_PREDICATE= qq{((?:"[^"]*"|'[^']*'|$REG_STRING_ARG|$REG_FUNCTION|\@$REG_NAME_W|$REG_MATCH\\s*$REG_REGEXP|[\\s\\d><=!()+.-]|(?i:and)|(?i:or)|$REG_TAG_IN_PREDICATE)*)};
95 my $REG_PREDICATE= qq{\\[$REG_NAKED_PREDICATE\\]};
96
97 # not all axis, only supported ones (in get_xpath)
98 my @supported_axis= ( 'ancestor', 'ancestor-or-self', 'child', 'descendant', 'descendant-or-self',
99                       'following', 'following-sibling', 'parent', 'preceding', 'preceding-sibling', 'self'
100                     );
101 my $REG_AXIS = "(?:" . join( '|', @supported_axis) .")";
102
103 # only used in the "xpath"engine (for get_xpath/findnodes) for now
104 my $REG_PREDICATE_ALT = qr{\[(?:(?:string\(\s*\)|\@$REG_TAG_NAME)\s*$REG_MATCH\s*$REG_REGEXP\s*|[^\]]*)\]};
105
106 # used to convert XPath tests on strings to the perl equivalent
107 my %PERL_ALPHA_TEST= ( '=' => ' eq ', '!=' => ' ne ', '>' => ' gt ', '>=' => ' ge ', '<' => ' lt ', '<=' => ' le ');
108
109 my( $FB_HTMLCREF, $FB_XMLCREF);
110
111 my $NO_WARNINGS= $perl_version >= 5.006 ? 'no warnings' : 'local $^W=0';
112
113 # default namespaces, both ways
114 my %DEFAULT_NS= ( xml => "http://www.w3.org/XML/1998/namespace",
115                   xmlns => "http://www.w3.org/2000/xmlns/",
116                 );
117 my %DEFAULT_URI2NS= map { $DEFAULT_NS{$_} => $_ } keys %DEFAULT_NS;
118
119 # constants
120 my( $PCDATA, $CDATA, $PI, $COMMENT, $ENT, $ELT, $TEXT, $ASIS, $EMPTY, $BUFSIZE);
121
122 # used when an HTML doc only has a PUBLIC declaration, to generate the SYSTEM one
123 # this should really be done by HTML::TreeBuilder, but as of HTML::TreeBuilder 4.2 it isn't
124 # the various declarations are taken from http://en.wikipedia.org/wiki/Document_Type_Declaration
125 my %HTML_DECL= ( "-//W3C//DTD HTML 4.0 Transitional//EN" => "http://www.w3.org/TR/REC-html40/loose.dtd",
126                  "-//W3C//DTD HTML 4.01//EN" => "http://www.w3.org/TR/html4/strict.dtd",
127                  "-//W3C//DTD HTML 4.01 Transitional//EN" => "http://www.w3.org/TR/html4/loose.dtd",
128                  "-//W3C//DTD HTML 4.01 Frameset//EN" => "http://www.w3.org/TR/html4/frameset.dtd",
129                  "-//W3C//DTD XHTML 1.0 Strict//EN" => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd",
130                  "-//W3C//DTD XHTML 1.0 Transitional//EN" => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
131                  "-//W3C//DTD XHTML 1.0 Frameset//EN" => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd",
132                  "-//W3C//DTD XHTML 1.1//EN" => "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd",
133                  "-//W3C//DTD XHTML Basic 1.0//EN" => "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd",
134                  "-//W3C//DTD XHTML Basic 1.1//EN" => "http://www.w3.org/TR/xhtml-basic/xhtml-basic11.dtd",
135                  "-//WAPFORUM//DTD XHTML Mobile 1.0//EN" => "http://www.wapforum.org/DTD/xhtml-mobile10.dtd",
136                  "-//WAPFORUM//DTD XHTML Mobile 1.1//EN" => "http://www.openmobilealliance.org/tech/DTD/xhtml-mobile11.dtd",
137                  "-//WAPFORUM//DTD XHTML Mobile 1.2//EN" => "http://www.openmobilealliance.org/tech/DTD/xhtml-mobile12.dtd",
138                  "-//W3C//DTD XHTML+RDFa 1.0//EN" => "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd",
139                );
140
141 my $DEFAULT_HTML_TYPE= "-//W3C//DTD HTML 4.0 Transitional//EN";
142
143 my $SEP= qr/\s*(?:$|\|)/;
144
145 BEGIN
146 {
147
204
203
61105
$VERSION = '3.47';
148
149
206
206
204
204
59459
927726
1319
use XML::Parser;
150
204
2183
my $needVersion = '2.23';
151
203
431
($parser_version= $XML::Parser::VERSION)=~ s{_\d+}{}; # remove _<n> from version so numeric tests do not warn
152
203
100
1233
croak "need at least XML::Parser version $needVersion" unless $parser_version >= $needVersion;
153
154
202
2003
($perl_version= $])=~ s{_\d+}{};
155
156
201
50
482
if( $perl_version >= 5.008)
157
201
187
187
187
201
9063
47386
1046721
36400
  { eval "use Encode qw( :all)";
158
201
1968
    $FB_XMLCREF = 0x0400; # Encode::FB_XMLCREF;
159
201
251
    $FB_HTMLCREF = 0x0200; # Encode::FB_HTMLCREF;
160   }
161
162 # test whether we can use weak references
163 # set local empty signal handler to trap error messages
164
201
201
186
2121
{ local $SIG{__DIE__};
165
194
50
0
33
6517
  if( eval( 'require Scalar::Util') && defined( \&Scalar::Util::weaken))
166
194
194
6328
993
    { import Scalar::Util( 'weaken'); $weakrefs= 1; }
167   elsif( eval( 'require WeakRef'))
168
4
4
13
3
    { import WeakRef; $weakrefs= 1; }
169   else
170
4
446
    { $weakrefs= 0; }
171 }
172
173
191
864
import XML::Twig::Elt;
174
191
749
import XML::Twig::Entity;
175
191
1141
import XML::Twig::Entity_list;
176
177 # used to store the gi's
178 # should be set for each twig really, at least when there are several
179 # the init ensures that special gi's are always the same
180
181 # constants: element types
182
191
351
$PCDATA = '#PCDATA';
183
191
175
$CDATA = '#CDATA';
184
191
595
$PI = '#PI';
185
191
173
$COMMENT = '#COMMENT';
186
191
19432
$ENT = '#ENT';
187
188 # element classes
189
191
6289
$ELT = '#ELT';
190
190
4099
$TEXT = '#TEXT';
191
192 # element properties
193
190
2217
$ASIS = '#ASIS';
194
190
1192
$EMPTY = '#EMPTY';
195
196 # used in parseurl to set the buffer size to the same size as in XML::Parser::Expat
197
190
173
$BUFSIZE = 32768;
198
199
200 # gi => index
201
190
1481
%XML::Twig::gi2index=( '', 0, $PCDATA => 1, $CDATA => 2, $PI => 3, $COMMENT => 4, $ENT => 5);
202 # list of gi's
203
190
989
@XML::Twig::index2gi=( '', $PCDATA, $CDATA, $PI, $COMMENT, $ENT);
204
205 # gi's under this value are special
206
190
606
$XML::Twig::SPECIAL_GI= @XML::Twig::index2gi;
207
208
190
477
%XML::Twig::base_ent= ( '>' => '&gt;', '<' => '&lt;', '&' => '&amp;', "'" => '&apos;', '"' => '&quot;',);
209
190
562
566
1876
foreach my $c ( "\n", "\r", "\t") { $XML::Twig::base_ent{$c}= sprintf( "&#x%02x;", ord( $c)); }
210
211 # now set some aliases
212
188
335
*find_nodes = *get_xpath; # same as XML::XPath
213
188
278
*findnodes = *get_xpath; # same as XML::LibXML
214
188
174
*getElementsByTagName = *descendants;
215
188
162
*descendants_or_self = *descendants; # valid in XML::Twig, not in XML::Twig::Elt
216
188
590
*find_by_tag_name = *descendants;
217
187
159
*getElementById = *elt_id;
218
187
564
*getEltById = *elt_id;
219
187
148
*toString = *sprint;
220
187
134891
*create_accessors = *att_accessors;
221
222 }
223
224 @ISA = qw(XML::Parser);
225
226 # fake gi's used in twig_handlers and start_tag_handlers
227 my $ALL = '_all_'; # the associated function is always called
228 my $DEFAULT= '_default_'; # the function is called if no other handler has been
229
230 # some defaults
231 my $COMMENTS_DEFAULT= 'keep';
232 my $PI_DEFAULT = 'keep';
233
234
235 # handlers used in regular mode
236 my %twig_handlers=( Start => \&_twig_start,
237                     End => \&_twig_end,
238                     Char => \&_twig_char,
239                     Entity => \&_twig_entity,
240                     XMLDecl => \&_twig_xmldecl,
241                     Doctype => \&_twig_doctype,
242                     Element => \&_twig_element,
243                     Attlist => \&_twig_attlist,
244                     CdataStart => \&_twig_cdatastart,
245                     CdataEnd => \&_twig_cdataend,
246                     Proc => \&_twig_pi,
247                     Comment => \&_twig_comment,
248                     Default => \&_twig_default,
249                     ExternEnt => \&_twig_extern_ent,
250       );
251
252 # handlers used when twig_roots is used and we are outside of the roots
253 my %twig_handlers_roots=
254   ( Start => \&_twig_start_check_roots,
255     End => \&_twig_end_check_roots,
256     Doctype => \&_twig_doctype,
257     Char => undef, Entity => undef, XMLDecl => \&_twig_xmldecl,
258     Element => undef, Attlist => undef, CdataStart => undef,
259     CdataEnd => undef, Proc => undef, Comment => undef,
260     Proc => \&_twig_pi_check_roots,
261     Default => sub {}, # hack needed for XML::Parser 2.27
262     ExternEnt => \&_twig_extern_ent,
263   );
264
265 # handlers used when twig_roots and print_outside_roots are used and we are
266 # outside of the roots
267 my %twig_handlers_roots_print_2_30=
268   ( Start => \&_twig_start_check_roots,
269     End => \&_twig_end_check_roots,
270     Char => \&_twig_print,
271     Entity => \&_twig_print_entity,
272     ExternEnt => \&_twig_print_entity,
273     DoctypeFin => \&_twig_doctype_fin_print,
274     XMLDecl => sub { _twig_xmldecl( @_); _twig_print( @_); },
275     Doctype => \&_twig_print_doctype, # because recognized_string is broken here
276     # Element => \&_twig_print, Attlist => \&_twig_print,
277     CdataStart => \&_twig_print, CdataEnd => \&_twig_print,
278     Proc => \&_twig_pi_check_roots, Comment => \&_twig_print,
279     Default => \&_twig_print_check_doctype,
280     ExternEnt => \&_twig_extern_ent,
281   );
282
283 # handlers used when twig_roots, print_outside_roots and keep_encoding are used
284 # and we are outside of the roots
285 my %twig_handlers_roots_print_original_2_30=
286   ( Start => \&_twig_start_check_roots,
287     End => \&_twig_end_check_roots,
288     Char => \&_twig_print_original,
289     # I have no idea why I should not be using this handler!
290     Entity => \&_twig_print_entity,
291     ExternEnt => \&_twig_print_entity,
292     DoctypeFin => \&_twig_doctype_fin_print,
293     XMLDecl => sub { _twig_xmldecl( @_); _twig_print_original( @_) },
294     Doctype => \&_twig_print_original_doctype, # because original_string is broken here
295     Element => \&_twig_print_original, Attlist => \&_twig_print_original,
296     CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original,
297     Proc => \&_twig_pi_check_roots, Comment => \&_twig_print_original,
298     Default => \&_twig_print_original_check_doctype,
299   );
300
301 # handlers used when twig_roots and print_outside_roots are used and we are
302 # outside of the roots
303 my %twig_handlers_roots_print_2_27=
304   ( Start => \&_twig_start_check_roots,
305     End => \&_twig_end_check_roots,
306     Char => \&_twig_print,
307     # if the Entity handler is set then it prints the entity declaration
308     # before the entire internal subset (including the declaration!) is output
309     Entity => sub {},
310     XMLDecl => \&_twig_print, Doctype => \&_twig_print,
311     CdataStart => \&_twig_print, CdataEnd => \&_twig_print,
312     Proc => \&_twig_pi_check_roots, Comment => \&_twig_print,
313     Default => \&_twig_print,
314     ExternEnt => \&_twig_extern_ent,
315   );
316
317 # handlers used when twig_roots, print_outside_roots and keep_encoding are used
318 # and we are outside of the roots
319 my %twig_handlers_roots_print_original_2_27=
320   ( Start => \&_twig_start_check_roots,
321     End => \&_twig_end_check_roots,
322     Char => \&_twig_print_original,
323     # for some reason original_string is wrong here
324     # this can be a problem if the doctype includes non ascii characters
325     XMLDecl => \&_twig_print, Doctype => \&_twig_print,
326     # if the Entity handler is set then it prints the entity declaration
327     # before the entire internal subset (including the declaration!) is output
328     Entity => sub {},
329     #Element => undef, Attlist => undef,
330     CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original,
331     Proc => \&_twig_pi_check_roots, Comment => \&_twig_print_original,
332     Default => \&_twig_print, # _twig_print_original does not work
333     ExternEnt => \&_twig_extern_ent,
334   );
335
336
337 my %twig_handlers_roots_print= $parser_version > 2.27
338                                ? %twig_handlers_roots_print_2_30
339                                : %twig_handlers_roots_print_2_27;
340 my %twig_handlers_roots_print_original= $parser_version > 2.27
341                                ? %twig_handlers_roots_print_original_2_30
342                                : %twig_handlers_roots_print_original_2_27;
343
344
345 # handlers used when the finish_print method has been called
346 my %twig_handlers_finish_print=
347   ( Start => \&_twig_print,
348     End => \&_twig_print, Char => \&_twig_print,
349     Entity => \&_twig_print, XMLDecl => \&_twig_print,
350     Doctype => \&_twig_print, Element => \&_twig_print,
351     Attlist => \&_twig_print, CdataStart => \&_twig_print,
352     CdataEnd => \&_twig_print, Proc => \&_twig_print,
353     Comment => \&_twig_print, Default => \&_twig_print,
354     ExternEnt => \&_twig_extern_ent,
355   );
356
357 # handlers used when the finish_print method has been called and the keep_encoding
358 # option is used
359 my %twig_handlers_finish_print_original=
360   ( Start => \&_twig_print_original, End => \&_twig_print_end_original,
361     Char => \&_twig_print_original, Entity => \&_twig_print_original,
362     XMLDecl => \&_twig_print_original, Doctype => \&_twig_print_original,
363     Element => \&_twig_print_original, Attlist => \&_twig_print_original,
364     CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original,
365     Proc => \&_twig_print_original, Comment => \&_twig_print_original,
366     Default => \&_twig_print_original,
367   );
368
369 # handlers used within ignored elements
370 my %twig_handlers_ignore=
371   ( Start => \&_twig_ignore_start,
372     End => \&_twig_ignore_end,
373     Char => undef, Entity => undef, XMLDecl => undef,
374     Doctype => undef, Element => undef, Attlist => undef,
375     CdataStart => undef, CdataEnd => undef, Proc => undef,
376     Comment => undef, Default => undef,
377     ExternEnt => undef,
378   );
379
380
381 # those handlers are only used if the entities are NOT to be expanded
382 my %twig_noexpand_handlers= ( ExternEnt => undef, Default => \&_twig_default );
383
384 my @saved_default_handler;
385
386 my $ID= 'id'; # default value, set by the Id argument
387 my $css_sel=0; # set through the css_sel option to allow .class selectors in triggers
388
389 # all allowed options
390 %valid_option=
391     ( # XML::Twig options
392       TwigHandlers => 1, Id => 1,
393       TwigRoots => 1, TwigPrintOutsideRoots => 1,
394       StartTagHandlers => 1, EndTagHandlers => 1,
395       ForceEndTagHandlersUsage => 1,
396       DoNotChainHandlers => 1,
397       IgnoreElts => 1,
398       Index => 1,
399       AttAccessors => 1,
400       EltAccessors => 1,
401       FieldAccessors => 1,
402       CharHandler => 1,
403       TopDownHandlers => 1,
404       KeepEncoding => 1, DoNotEscapeAmpInAtts => 1,
405       ParseStartTag => 1, KeepAttsOrder => 1,
406       LoadDTD => 1, DTDHandler => 1,
407       DoNotOutputDTD => 1, NoProlog => 1,
408       ExpandExternalEnts => 1,
409       DiscardSpaces => 1, KeepSpaces => 1, DiscardAllSpaces => 1,
410       DiscardSpacesIn => 1, KeepSpacesIn => 1,
411       PrettyPrint => 1, EmptyTags => 1,
412       EscapeGt => 1,
413       Quote => 1,
414       Comments => 1, Pi => 1,
415       OutputFilter => 1, InputFilter => 1,
416       OutputTextFilter => 1,
417       OutputEncoding => 1,
418       RemoveCdata => 1,
419       EltClass => 1,
420       MapXmlns => 1, KeepOriginalPrefix => 1,
421       SkipMissingEnts => 1,
422       # XML::Parser options
423       ErrorContext => 1, ProtocolEncoding => 1,
424       Namespaces => 1, NoExpand => 1,
425       Stream_Delimiter => 1, ParseParamEnt => 1,
426       NoLWP => 1, Non_Expat_Options => 1,
427       Xmlns => 1, CssSel => 1,
428       UseTidy => 1, TidyOptions => 1,
429       OutputHtmlDoctype => 1,
430     );
431
432 my $active_twig; # last active twig,for XML::Twig::s
433
434 # predefined input and output filters
435
187
187
187
201
629
174
1040717
use vars qw( %filter);
436 %filter= ( html => \&html_encode,
437            safe => \&safe_encode,
438            safe_hex => \&safe_encode_hex,
439          );
440
441
442 # trigger types (used to sort them)
443 my ($LEVEL_TRIGGER, $REGEXP_TRIGGER, $XPATH_TRIGGER)=(1..3);
444
445 sub new
446
3137
1
11024513
  { my ($class, %args) = @_;
447
3137
2337
    my $handlers;
448
449     # change all nice_perlish_names into nicePerlishNames
450
3137
5126
    %args= _normalize_args( %args);
451
452     # check options
453
3137
100
5556
    unless( $args{MoreOptions})
454
3136
4931
      { foreach my $arg (keys %args)
455
5448
100
8753
        { carp "invalid option $arg" unless $valid_option{$arg}; }
456       }
457
458     # a twig is really an XML::Parser
459     # my $self= XML::Parser->new(%args);
460
3137
2749
    my $self;
461
3137
8831
    $self= XML::Parser->new(%args);
462
463
3137
71193
    bless $self, $class;
464
465
3137
4962
    $self->{_twig_context_stack}= [];
466
467     # allow tag.class selectors in handler triggers
468
3137
100
7852
    $css_sel= $args{CssSel} || 0;
469
470
471
3137
100
4729
    if( exists $args{TwigHandlers})
472
230
243
      { $handlers= $args{TwigHandlers};
473
230
415
        $self->setTwigHandlers( $handlers);
474
223
307
        delete $args{TwigHandlers};
475       }
476
477     # take care of twig-specific arguments
478
3130
100
4234
    if( exists $args{StartTagHandlers})
479
30
65
      { $self->setStartTagHandlers( $args{StartTagHandlers});
480
30
47
        delete $args{StartTagHandlers};
481       }
482
483
3130
100
4070
    if( exists $args{DoNotChainHandlers})
484
1
2
      { $self->{twig_do_not_chain_handlers}= $args{DoNotChainHandlers}; }
485
486
3130
100
4104
    if( exists $args{IgnoreElts})
487       { # change array to hash so you can write ignore_elts => [ qw(foo bar baz)]
488
11
1
2
1
100
32
2
5
2
        if( isa( $args{IgnoreElts}, 'ARRAY')) { $args{IgnoreElts}= { map { $_ => 1 } @{$args{IgnoreElts}} }; }
489
11
27
        $self->setIgnoreEltsHandlers( $args{IgnoreElts});
490
11
13
        delete $args{IgnoreElts};
491       }
492
493
3130
100
3969
    if( exists $args{Index})
494
2
2
      { my $index= $args{Index};
495         # we really want a hash name => path, we turn an array into a hash if necessary
496
2
100
4
        if( ref( $index) eq 'ARRAY')
497
1
2
1
4
          { my %index= map { $_ => $_ } @$index;
498
1
2
            $index= \%index;
499           }
500
2
5
        while( my( $name, $exp)= each %$index)
501
3
4
4
4
4
12
3
7
8
          { $self->setTwigHandler( $exp, sub { push @{$_[0]->{_twig_index}->{$name}}, $_; 1; }); }
502       }
503
504
3130
100
8078
    $self->{twig_elt_class}= $args{EltClass} || 'XML::Twig::Elt';
505
3130
75
100
100
4702
117
    if( defined( $args{EltClass}) && $args{EltClass} ne 'XML::Twig::Elt') { $self->{twig_alt_elt_class}=1; }
506
3130
76
100
3981
126
    if( exists( $args{EltClass})) { delete $args{EltClass}; }
507
508
3130
100
3646
    if( exists( $args{MapXmlns}))
509
22
24
      { $self->{twig_map_xmlns}= $args{MapXmlns};
510
22
23
        $self->{Namespaces}=1;
511
22
25
        delete $args{MapXmlns};
512       }
513
514
3130
100
4187
    if( exists( $args{KeepOriginalPrefix}))
515
4
5
      { $self->{twig_keep_original_prefix}= $args{KeepOriginalPrefix};
516
4
4
        delete $args{KeepOriginalPrefix};
517       }
518
519
3130
2932
    $self->{twig_dtd_handler}= $args{DTDHandler};
520
3130
2651
    delete $args{DTDHandler};
521
522
3130
100
3563
    if( $args{ExpandExternalEnts})
523
5
11
      { $self->set_expand_external_entities( 1);
524
5
6
        $self->{twig_expand_external_ents}= $args{ExpandExternalEnts};
525
5
9
        $self->{twig_read_external_dtd}= 1; # implied by ExpandExternalEnts
526
5
100
13
        if( $args{ExpandExternalEnts} == -1)
527
2
3
          { $self->{twig_extern_ent_nofail}= 1;
528
2
12
            $self->setHandlers( ExternEnt => \&_twig_extern_ent_nofail);
529           }
530
5
47
        delete $args{LoadDTD};
531
5
6
        delete $args{ExpandExternalEnts};
532       }
533     else
534
3125
4577
      { $self->set_expand_external_entities( 0); }
535
536
3130
50
0
5999
    if( !$args{NoLWP} && ! _use( 'URI') && ! _use( 'URI::File') && ! _use( 'LWP'))
537
0
0
      { $self->{twig_ext_ent_handler}= \&XML::Parser::initial_ext_ent_handler }
538     else
539
3130
4210
      { $self->{twig_ext_ent_handler}= \&XML::Parser::file_ext_ent_handler }
540
541
3130
100
3947
    if( $args{DoNotEscapeAmpInAtts})
542
1
1
      { $self->set_do_not_escape_amp_in_atts( 1);
543
1
1
        $self->{twig_do_not_escape_amp_in_atts}=1;
544       }
545     else
546
3129
4210
      { $self->set_do_not_escape_amp_in_atts( 0);
547
3129
2725
        $self->{twig_do_not_escape_amp_in_atts}=0;
548       }
549
550     # deal with TwigRoots argument, a hash of elements for which
551     # subtrees will be built (and associated handlers)
552
553
3130
100
3910
    if( $args{TwigRoots})
554
101
207
      { $self->setTwigRoots( $args{TwigRoots});
555
99
133
        delete $args{TwigRoots};
556       }
557
558
3128
100
3781
    if( $args{EndTagHandlers})
559
15
100
100
51
      { unless ($self->{twig_roots} || $args{ForceEndTagHandlersUsage})
560
1
153
          { croak "you should not use EndTagHandlers without TwigRoots\n",
561                   "if you want to use it anyway, normally because you have ",
562                   "a start_tag_handlers that calls 'ignore' and you want to ",
563                   "call an ent_tag_handlers at the end of the element, then ",
564                   "pass 'force_end_tag_handlers_usage => 1' as an argument ",
565                   "to new";
566           }
567
568
14
26
        $self->setEndTagHandlers( $args{EndTagHandlers});
569
14
164
        delete $args{EndTagHandlers};
570       }
571
572
3127
100
3709
    if( $args{TwigPrintOutsideRoots})
573
63
100
314
      { croak "cannot use twig_print_outside_roots without twig_roots"
574           unless( $self->{twig_roots});
575         # if the arg is a filehandle then store it
576
62
100
99
        if( _is_fh( $args{TwigPrintOutsideRoots}) )
577
59
102
          { $self->{twig_output_fh}= $args{TwigPrintOutsideRoots}; }
578
62
93
        $self->{twig_default_print}= $args{TwigPrintOutsideRoots};
579       }
580
581     # space policy
582
3126
100
3492
    if( $args{KeepSpaces})
583
49
100
195
      { croak "cannot use both keep_spaces and discard_spaces" if( $args{DiscardSpaces});
584
48
100
170
        croak "cannot use both keep_spaces and discard_all_spaces" if( $args{DiscardAllSpaces});
585
47
100
184
        croak "cannot use both keep_spaces and keep_spaces_in" if( $args{KeepSpacesIn});
586
46
50
        $self->{twig_keep_spaces}=1;
587
46
58
        delete $args{KeepSpaces};
588       }
589
3123
100
3487
    if( $args{DiscardSpaces})
590       {
591
5
100
106
        croak "cannot use both discard_spaces and keep_spaces_in" if( $args{KeepSpacesIn});
592
4
100
97
        croak "cannot use both discard_spaces and discard_all_spaces" if( $args{DiscardAllSpaces});
593
3
100
98
        croak "cannot use both discard_spaces and discard_spaces_in" if( $args{DiscardSpacesIn});
594
2
2
        $self->{twig_discard_spaces}=1;
595
2
2
        delete $args{DiscardSpaces};
596       }
597
3120
100
3442
    if( $args{KeepSpacesIn})
598
9
100
111
      { croak "cannot use both keep_spaces_in and discard_spaces_in" if( $args{DiscardSpacesIn});
599
8
100
107
        croak "cannot use both keep_spaces_in and discard_all_spaces" if( $args{DiscardAllSpaces});
600
7
7
        $self->{twig_discard_spaces}=1;
601
7
10
        $self->{twig_keep_spaces_in}={};
602
7
7
7
17
        my @tags= @{$args{KeepSpacesIn}};
603
7
9
10
15
        foreach my $tag (@tags) { $self->{twig_keep_spaces_in}->{$tag}=1; }
604
7
14
        delete $args{KeepSpacesIn};
605       }
606
607
3118
100
3416
    if( $args{DiscardAllSpaces})
608       {
609
2
100
100
        croak "cannot use both discard_all_spaces and discard_spaces_in" if( $args{DiscardSpacesIn});
610
1
1
        $self->{twig_discard_all_spaces}=1;
611
1
2
        delete $args{DiscardAllSpaces};
612       }
613
614
3117
100
3285
    if( $args{DiscardSpacesIn})
615
4
6
      { $self->{twig_keep_spaces}=1;
616
4
6
        $self->{twig_discard_spaces_in}={};
617
4
4
2
9
        my @tags= @{$args{DiscardSpacesIn}};
618
4
6
5
9
        foreach my $tag (@tags) { $self->{twig_discard_spaces_in}->{$tag}=1; }
619
4
8
        delete $args{DiscardSpacesIn};
620       }
621     # discard spaces by default
622
3117
100
4746
    $self->{twig_discard_spaces}= 1 unless( $self->{twig_keep_spaces});
623
624
3117
67
5619
    $args{Comments}||= $COMMENTS_DEFAULT;
625
3117
4
100
100
100
5956
7
    if( $args{Comments} eq 'drop') { $self->{twig_keep_comments}= 0; }
626
2225
2082
    elsif( $args{Comments} eq 'keep') { $self->{twig_keep_comments}= 1; }
627
887
701
    elsif( $args{Comments} eq 'process') { $self->{twig_process_comments}= 1; }
628
1
82
    else { croak "wrong value for comments argument: '$args{Comments}' (should be 'drop', 'keep' or 'process')"; }
629
3116
3508
    delete $args{Comments};
630
631
3116
67
5188
    $args{Pi}||= $PI_DEFAULT;
632
3116
3
100
100
100
4829
4
    if( $args{Pi} eq 'drop') { $self->{twig_keep_pi}= 0; }
633
2226
1990
    elsif( $args{Pi} eq 'keep') { $self->{twig_keep_pi}= 1; }
634
886
739
    elsif( $args{Pi} eq 'process') { $self->{twig_process_pi}= 1; }
635
1
81
    else { croak "wrong value for pi argument: '$args{Pi}' (should be 'drop', 'keep' or 'process')"; }
636
3115
2493
    delete $args{Pi};
637
638
3115
100
2996
    if( $args{KeepEncoding})
639       {
640         # set it in XML::Twig::Elt so print functions know what to do
641
1020
1171
        $self->set_keep_encoding( 1);
642
1020
100
2330
        $self->{parse_start_tag}= $args{ParseStartTag} || \&_parse_start_tag;
643
1020
100
1323
        delete $args{ParseStartTag} if defined( $args{ParseStartTag}) ;
644
1020
849
        delete $args{KeepEncoding};
645       }
646     else
647
2095
2701
      { $self->set_keep_encoding( 0);
648
2095
100
2016
        if( $args{ParseStartTag})
649
2
5
          { $self->{parse_start_tag}= $args{ParseStartTag}; }
650         else
651
2093
1687
          { delete $self->{parse_start_tag}; }
652
2095
1454
        delete $args{ParseStartTag};
653       }
654
655
3115
100
2961
    if( $args{OutputFilter})
656
5
8
      { $self->set_output_filter( $args{OutputFilter});
657
5
6
        delete $args{OutputFilter};
658       }
659     else
660
3110
3612
      { $self->set_output_filter( 0); }
661
662
3115
100
3515
    if( $args{RemoveCdata})
663
1
1
      { $self->set_remove_cdata( $args{RemoveCdata});
664
1
1
        delete $args{RemoveCdata};
665       }
666     else
667
3114
3630
      { $self->set_remove_cdata( 0); }
668
669
3115
100
3061
    if( $args{OutputTextFilter})
670
5
8
      { $self->set_output_text_filter( $args{OutputTextFilter});
671
5
5
        delete $args{OutputTextFilter};
672       }
673     else
674
3110
3508
      { $self->set_output_text_filter( 0); }
675
676
3115
100
3406
    if( exists $args{KeepAttsOrder})
677
12
19
      { $self->{keep_atts_order}= $args{KeepAttsOrder};
678
12
100
15
        if( _use( 'Tie::IxHash'))
679
11
22
          { $self->set_keep_atts_order( $self->{keep_atts_order}); }
680         else
681
1
86
          { croak "Tie::IxHash not available, option keep_atts_order not allowed"; }
682       }
683     else
684
3103
3301
      { $self->set_keep_atts_order( 0); }
685
686
687
3114
72
100
3339
113
    if( $args{PrettyPrint}) { $self->set_pretty_print( $args{PrettyPrint}); }
688
3114
1
100
3126
2
    if( $args{EscapeGt}) { $self->escape_gt( $args{EscapeGt}); }
689
3114
12
100
3086
18
    if( $args{EmptyTags}) { $self->set_empty_tag_style( $args{EmptyTags}) }
690
691
3114
1
1
100
3470
5
1
    if( exists $args{Id}) { $ID= $args{Id}; delete $args{ID}; }
692
3114
4
4
100
3093
11
5
    if( $args{NoProlog}) { $self->{no_prolog}= 1; delete $args{NoProlog}; }
693
3114
3
3
100
3154
7
5
    if( $args{DoNotOutputDTD}) { $self->{no_dtd_output}= 1; delete $args{DoNotOutputDTD}; }
694
3114
9
9
100
3226
13
9
    if( $args{LoadDTD}) { $self->{twig_read_external_dtd}= 1; delete $args{LoadDTD}; }
695
3114
1
1
100
2944
3
1
    if( $args{CharHandler}) { $self->setCharHandler( $args{CharHandler}); delete $args{CharHandler}; }
696
697
3114
3
1
100
3078
6
1
    if( $args{InputFilter}) { $self->set_input_filter( $args{InputFilter}); delete $args{InputFilter}; }
698
3112
3
3
100
3032
76
36
    if( $args{NoExpand}) { $self->setHandlers( %twig_noexpand_handlers); $self->{twig_no_expand}=1; }
699
3112
2
2
100
3435
4
2
    if( my $output_encoding= $args{OutputEncoding}) { $self->set_output_encoding( $output_encoding); delete $args{OutputFilter}; }
700
701
3112
4
4
100
4432
3
3
    if( my $tdh= $args{TopDownHandlers}) { $self->{twig_tdh}=1; delete $args{TopDownHandlers}; }
702
703
3112
1
100
3411
2
    if( my $acc_a= $args{AttAccessors}) { $self->att_accessors( @$acc_a); }
704
3112
4
100
100
3105
14
    if( my $acc_e= $args{EltAccessors}) { $self->elt_accessors( isa( $acc_e, 'ARRAY') ? @$acc_e : $acc_e); }
705
3112
2
100
100
3304
9
    if( my $acc_f= $args{FieldAccessors}) { $self->field_accessors( isa( $acc_f, 'ARRAY') ? @$acc_f : $acc_f); }
706
707
3112
24
100
3037
238
    if( $args{UseTidy}) { $self->{use_tidy}= 1; }
708
3112
50
7240
    $self->{tidy_options}= $args{TidyOptions} || {};
709
710
3112
1
100
3397
1
    if( $args{OutputHtmlDoctype}) { $self->{html_doctype}= 1; }
711
712
3112
100
6777
    $self->set_quote( $args{Quote} || 'double');
713
714     # set handlers
715
3112
100
3014
    if( $self->{twig_roots})
716
99
100
139
      { if( $self->{twig_default_print})
717
62
100
84
          { if( $self->{twig_keep_encoding})
718
36
255
              { $self->setHandlers( %twig_handlers_roots_print_original); }
719             else
720
26
119
              { $self->setHandlers( %twig_handlers_roots_print); }
721           }
722         else
723
37
156
          { $self->setHandlers( %twig_handlers_roots); }
724       }
725     else
726
3013
10392
      { $self->setHandlers( %twig_handlers); }
727
728     # XML::Parser::Expat does not like these handler to be set. So in order to
729     # use the various sets of handlers on XML::Parser or XML::Parser::Expat
730     # objects when needed, these ones have to be set only once, here, at
731     # XML::Parser level
732
3112
133706
    $self->setHandlers( Init => \&_twig_init, Final => \&_twig_final);
733
734
3112
37252
    $self->{twig_entity_list}= XML::Twig::Entity_list->new;
735
736
3112
3243
    $self->{twig_id}= $ID;
737
3112
2838
    $self->{twig_stored_spaces}='';
738
739
3112
2440
    $self->{twig_autoflush}= 1; # auto flush by default
740
741
3112
2482
    $self->{twig}= $self;
742
3112
3090
100
3914
5164
    if( $weakrefs) { weaken( $self->{twig}); }
743
744
3112
6152
    return $self;
745   }
746
747 sub parse
748   {
749
3282
1
845397
    my $t= shift;
750     # if called as a class method, calls nparse, which creates the twig then parses it
751
3282
179
100
67
9080
387
    if( !ref( $t) || !isa( $t, 'XML::Twig')) { return $t->nparse( @_); }
752
753     # requires 5.006 at least (or the ${^UNICODE} causes a problem) # > perl 5.5
754     # trap underlying bug in IO::Handle (see RT #17500) # > perl 5.5
755     # croak if perl 5.8+, -CD (or PERL_UNICODE set to D) and parsing a pipe # > perl 5.5
756
3103
50
17
8790
    if( $perl_version>=5.008 && ${^UNICODE} && (${^UNICODE} & 24) && isa( $_[0], 'GLOB') && -p $_[0] ) # > perl 5.5
757
1
162
      { croak "cannot parse the output of a pipe when perl is set to use the UTF8 perlIO layer\n" # > perl 5.5
758               . "set the environment variable PERL_UNICODE or use the -C option (see perldoc perlrun)\n" # > perl 5.5
759               . "not to include 'D'"; # > perl 5.5
760       } # > perl 5.5
761
3102
3102
2600
7305
    $t= eval { $t->SUPER::parse( @_); };
762
763
3102
100
75
11150070
    if( !$t
764         && $@=~m{(syntax error at line 1, column 0, byte 0|not well-formed \(invalid token\) at line 1, column 1, byte 1)}
765         && -f $_[0]
766       )
767
1
84
      { croak "you seem to have used the parse method on a filename ($_[0]), you probably want parsefile instead"; }
768
3101
3715
    return _checked_parse_result( $t, $@);
769   }
770
771 sub parsefile
772
82
82
1
154
  { my $t= shift;
773
82
1
100
100
716
3
    if( -f $_[0] && ! -s $_[0]) { return _checked_parse_result( undef, "empty file '$_[0]'"); }
774
81
81
93
400
    $t= eval { $t->SUPER::parsefile( @_); };
775
81
1599
    return _checked_parse_result( $t, $@);
776   }
777
778 sub _checked_parse_result
779
3183
2748
  { my( $t, $returned)= @_;
780
3183
100
3735
    if( !$t)
781
28
100
67
165
      { if( isa( $returned, 'XML::Twig') && $returned->{twig_finish_now})
782
6
4
          { $t= $returned;
783
6
5
            delete $t->{twig_finish_now};
784
6
8
            return $t->_twig_final;
785           }
786         else
787
22
53
          { _croak( $returned, 0); }
788       }
789
790
3155
2184
    $active_twig= $t;
791
3155
4095
    return $t;
792   }
793
794
2
2
1
7
sub active_twig { return $active_twig; }
795
796 sub finish_now
797
6
6
1
9
  { my $t= shift;
798
6
6
    $t->{twig_finish_now}=1;
799
6
26
    die $t;
800   }
801
802
803
3
3
1
7
sub parsefile_inplace { shift->_parse_inplace( parsefile => @_); }
804
6
6
1
17
sub parsefile_html_inplace { shift->_parse_inplace( parsefile_html => @_); }
805
806 sub _parse_inplace
807
9
9
19
  { my( $t, $method, $file, $suffix)= @_;
808
9
50
14
    _use( 'File::Temp') || croak "need File::Temp to use inplace methods\n";
809
9
13
    _use( 'File::Basename');
810
811
812
9
433
    my $tmpdir= dirname( $file);
813
9
41
    my( $tmpfh, $tmpfile)= File::Temp::tempfile( DIR => $tmpdir);
814
9
2032
    my $original_fh= select $tmpfh;
815
816
9
50
33
47
    unless( $t->{twig_keep_encoding} || $perl_version < 5.006)
817
9
50
104241
      { if( grep /useperlio=define/, `$^X -V`) # we can only use binmode :utf8 if perl was compiled with useperlio
818
9
66
          { binmode( $tmpfh, ":utf8" ); }
819       }
820
821
9
160
    $t->$method( $file);
822
823
9
28
    select $original_fh;
824
9
3139
    close $tmpfh;
825
9
69
    my $mode= (stat( $file))[2] & oct(7777);
826
9
50
79
    chmod $mode, $tmpfile or croak "cannot change temp file mode to $mode: $!";
827
828
9
100
18
    if( $suffix)
829
6
10
      { my $backup;
830
6
3
100
38
29
        if( $suffix=~ m{\*}) { ($backup = $suffix) =~ s/\*/$file/g; }
831
3
12
        else { $backup= $file . $suffix; }
832
833
6
50
164
        rename( $file, $backup) or croak "cannot backup initial file ($file) to $backup: $!";
834       }
835
9
50
339
    rename( $tmpfile, $file) or croak "cannot rename temp file ($tmpfile) to initial file ($file): $!";
836
837
9
62
    return $t;
838   }
839
840
841 sub parseurl
842
11
11
1
8591
  { my $t= shift;
843
11
27
    $t->_parseurl( 0, @_);
844   }
845
846 sub safe_parseurl
847
9
9
1
17
  { my $t= shift;
848
9
34
    $t->_parseurl( 1, @_);
849   }
850
851 sub safe_parsefile_html
852
2
2
1
2
  { my $t= shift;
853
2
2
3
4
    eval { $t->parsefile_html( @_); };
854
2
50
8
    return $@ ? $t->_reset_twig_after_error : $t;
855   }
856
857 sub safe_parseurl_html
858
2
2
1
5
  { my $t= shift;
859
2
50
3
    _use( 'LWP::Simple') or croak "missing LWP::Simple";
860
2
2
4
5
    eval { $t->parse_html( LWP::Simple::get( shift()), @_); } ;
861
2
50
10
    return $@ ? $t->_reset_twig_after_error : $t;
862   }
863
864 sub parseurl_html
865
1
1
1
2
  { my $t= shift;
866
1
50
2
    _use( 'LWP::Simple') or croak "missing LWP::Simple";
867
1
4
    $t->parse_html( LWP::Simple::get( shift()), @_);
868   }
869
870
871 # uses eval to catch the parser's death
872 sub safe_parse_html
873
8
8
1
23
  { my $t= shift;
874
8
8
10
35
    eval { $t->parse_html( @_); } ;
875
8
50
72
    return $@ ? $t->_reset_twig_after_error : $t;
876   }
877
878 sub parsefile_html
879
10
10
1
37
  { my $t= shift;
880
10
26
    my $file= shift;
881
10
100
41
    my $indent= $t->{ErrorContext} ? 1 : 0;
882
10
123
    $t->set_empty_tag_style( 'html');
883
10
100
36
    my $html2xml= $t->{use_tidy} ? \&_tidy_html : \&_html2xml;
884
10
100
50
54
    my $options= $t->{use_tidy} ? $t->{tidy_options} || {} : { indent => $indent, html_doctype => $t->{html_doctype} };
885
10
39
    $t->parse( $html2xml->( _slurp( $file), $options), @_);
886
10
23
    return $t;
887   }
888
889 sub parse_html
890
56
56
1
65661
  { my $t= shift;
891
56
100
100
164
    my $options= ref $_[0] && ref $_[0] eq 'HASH' ? shift() : {};
892
56
100
102
    my $use_tidy= exists $options->{use_tidy} ? $options->{use_tidy} : $t->{use_tidy};
893
56
243
    my $content= shift;
894
56
50
87
    my $indent= $t->{ErrorContext} ? 1 : 0;
895
56
90
    $t->set_empty_tag_style( 'html');
896
56
100
87
    my $html2xml= $use_tidy ? \&_tidy_html : \&_html2xml;
897
56
100
50
139
    my $conv_options= $use_tidy ? $t->{tidy_options} || {} : { indent => $indent, html_doctype => $t->{html_doctype} };
898
56
100
4408
    $t->parse( $html2xml->( isa( $content, 'GLOB') ? _slurp_fh( $content) : $content, $conv_options), @_);
899
55
169
    return $t;
900   }
901
902 sub xparse
903
2007
1
1446
  { my $t= shift;
904
2007
1448
    my $to_parse= $_[0];
905
2007
2
100
100
100
100
100
9424
4
    if( isa( $to_parse, 'GLOB')) { $t->parse( @_); }
906
1987
100
4668
    elsif( $to_parse=~ m{^\s*<}) { $to_parse=~ m{<html}i ? $t->_parse_as_xml_or_html( @_)
907                                                                      : $t->parse( @_);
908                                              }
909
2
100
3
    elsif( $to_parse=~ m{^\w+://.*\.html?$}) { _use( 'LWP::Simple') or croak "missing LWP::Simple";
910
1
3
                                               $t->_parse_as_xml_or_html( LWP::Simple::get( shift()), @_);
911                                              }
912
5
100
7
    elsif( $to_parse=~ m{^\w+://}) { _use( 'LWP::Simple') or croak "missing LWP::Simple";
913
4
10
                                               my $doc= LWP::Simple::get( shift);
914
4
1
100
492823
6
                                               if( ! defined $doc) { $doc=''; }
915
4
22
                                               my $xml_parse_ok= $t->safe_parse( $doc, @_);
916
4
100
6
                                               if( $xml_parse_ok)
917
1
3
                                                 { return $xml_parse_ok; }
918                                                else
919
3
4
                                                 { my $diag= $@;
920
3
100
10
                                                   if( $doc=~ m{<html}i)
921
2
3
                                                     { $t->parse_html( $doc, @_); }
922                                                     else
923
1
94
                                                      { croak $diag; }
924                                                  }
925                                              }
926
2
3
    elsif( $to_parse=~ m{\.html?$}) { my $content= _slurp( shift);
927
2
4
                                               $t->_parse_as_xml_or_html( $content, @_);
928                                              }
929
9
29
    else { $t->parsefile( @_); }
930   }
931
932 sub _parse_as_xml_or_html
933
11
11
113823
  { my $t= shift;
934
11
100
20
    if( _is_well_formed_xml( $_[0]))
935
3
5
      { $t->parse( @_) }
936     else
937
8
50
18
      { my $html2xml= $t->{use_tidy} ? \&_tidy_html : \&_html2xml;
938
8
50
0
24
        my $options= $t->{use_tidy} ? $t->{tidy_options} || {} : { indent => 0, html_doctype => $t->{html_doctype} };
939
8
16
        my $html= $html2xml->( $_[0], $options, @_);
940
8
50
12
        if( _is_well_formed_xml( $html))
941
8
17
          { $t->parse( $html); }
942         else
943
0
0
          { croak $@; } # can't really test this because HTML::Parser or HTML::Tidy may change how they deal with bas HTML between versions
944       }
945   }
946
947 { my $parser;
948   sub _is_well_formed_xml
949
19
67
19
43
    { $parser ||= XML::Parser->new;
950
19
19
114
43
      eval { $parser->parse( $_[0]); };
951
19
100
2776
      return $@ ? 0 : 1;
952     }
953 }
954
955 sub nparse
956
2008
1
443408
  { my $class= shift;
957
2008
1525
    my $to_parse= pop;
958
2008
2881
    $class->new( @_)->xparse( $to_parse);
959   }
960
961
1
1
1
2
sub nparse_pp { shift()->nparse( pretty_print => 'indented', @_); }
962
4
4
1
62
sub nparse_e { shift()->nparse( error_context => 1, @_); }
963
1
1
1
2
sub nparse_ppe { shift()->nparse( pretty_print => 'indented', error_context => 1, @_); }
964
965
966 sub _html2xml
967
49
49
233
  { my( $html, $options)= @_;
968
49
100
71
    _use( 'HTML::TreeBuilder', '3.13') or croak "cannot parse HTML: missing HTML::TreeBuilder v >= 3.13\n";
969
48
199
    my $tree= HTML::TreeBuilder->new;
970
48
7106
    $tree->ignore_ignorable_whitespace( 0);
971
48
289
    $tree->ignore_unknown( 0);
972
48
215
    $tree->no_space_compacting( 1);
973
48
209
    $tree->store_comments( 1);
974
48
200
    $tree->store_pis(1);
975
48
880
    $tree->parse( $html);
976
48
1675799
    $tree->eof;
977
978
48
809
    my $xml='';
979
48
100
67
126
    if( $options->{html_doctype} && exists $tree->{_decl} )
980
1
2
      { my $decl= $tree->{_decl}->as_XML;
981
982         # first try to fix declarations that are missing the SYSTEM part
983
1
1
33
123
5
        $decl =~ s{^\s*<!DOCTYPE \s+ ((?i)html) \s+ PUBLIC \s+ "([^"]*)" \s* >}
984
1
4
                  { my $system= $HTML_DECL{$2} || $HTML_DECL{$DEFAULT_HTML_TYPE};
985                     qq{<!DOCTYPE $1 PUBLIC "$2" "$system">}
986
987                   }xe;
988
989         # then check that the declaration looks OK (so it parses), if not remove it,
990         # better to parse without the declaration than to die stupidly
991
1
50
33
7
        if( $decl =~ m{<!DOCTYPE \s+ (?i:HTML) (\s+ PUBLIC \s+ "[^"]*" \s+ (SYSTEM \s+)? "[^"]*")? \s*>}x # PUBLIC then SYSTEM
992             || $decl =~ m{<!DOCTYPE \s+ (?i:HTML) \s+ SYSTEM \s+ "[^"]*" \s*>}x # just SYSTEM
993           )
994
1
1
          { $xml= $decl; }
995       }
996
997
48
158
    $xml.= _as_XML( $tree);
998
999
1000
48
127
    _fix_xml( $tree, \$xml);
1001
1002
48
1
100
89
9
    if( $options->{indent}) { _indent_xhtml( \$xml); }
1003
48
124
    $tree->delete;
1004
48
99242
    $xml=~ s{\s+$}{}s; # trim end
1005
48
980
    return $xml;
1006   }
1007
1008 sub _tidy_html
1009
25
25
90
  { my( $html, $options)= @_;
1010
25
50
42
   _use( 'HTML::Tidy') or croak "cannot cleanup HTML using HTML::Tidy (required by the use_tidy option): $@\n"; ;
1011
25
258
    my $TIDY_DEFAULTS= { output_xhtml => 1, # duh!
1012                          tidy_mark => 0, # do not add the "generated by tidy" comment
1013                          numeric_entities => 1,
1014                          char_encoding => 'utf8',
1015                          bare => 1,
1016                          clean => 1,
1017                          doctype => 'transitional',
1018                          fix_backslash => 1,
1019                          merge_divs => 0,
1020                          merge_spans => 0,
1021                          sort_attributes => 'alpha',
1022                          indent => 0,
1023                          wrap => 0,
1024                          break_before_br => 0,
1025                        };
1026
25
50
53
    $options ||= {};
1027
25
158
    my $tidy_options= { %$TIDY_DEFAULTS, %$options};
1028
25
139
    my $tidy = HTML::Tidy->new( $tidy_options);
1029
25
2754
    $tidy->ignore( type => 1, type => 2 ); # 1 is TIDY_WARNING, 2 is TIDY_ERROR, not clean
1030
25
406
    my $xml= $tidy->clean( $html );
1031
25
124838
    return $xml;
1032   }
1033
1034
1035 { my %xml_parser_encoding;
1036   sub _fix_xml
1037
49
49
718
    { my( $tree, $xml)= @_; # $xml is a ref to the xml string
1038
1039
49
46
      my $max_tries=5;
1040
49
36
      my $add_decl;
1041
1042
49
67
69
      while( ! _check_xml( $xml) && $max_tries--)
1043         {
1044           # a couple of fixes for weird HTML::TreeBuilder errors
1045
3
100
50
50
50
27
          if( $@=~ m{^\s*xml (or text )?declaration not at start of (external )?entity}i)
1046
1
5
            { $$xml=~ s{<\?xml.*?\?>}{}g;
1047               #warn " fixed xml declaration in the wrong place\n";
1048             }
1049           elsif( $@=~ m{undefined entity})
1050
0
0
0
            { $$xml=~ s{&(amp;)?Amp;}{&amp;}g if $HTML::TreeBuilder::VERSION < 4.00;
1051
0
0
0
0
0
              if( _use( 'HTML::Entities::Numbered')) { $$xml=name2hex_xml( $$xml); }
1052
0
0
0
0
0
0
0
0
0
              $$xml=~ s{&(\w+);}{ my $ent= $1; if( $ent !~ m{^(amp|lt|gt|apos|quote)$}) { "&amp;$ent;" } }eg;
1053             }
1054           elsif( $@=~ m{&Amp; used in html})
1055             # if $Amp; is used instead of &amp; then HTML::TreeBuilder's as_xml is tripped (old version)
1056
0
0
0
            { $$xml=~ s{&(amp;)?Amp;}{&amp;}g if $HTML::TreeBuilder::VERSION < 4.00;
1057             }
1058           elsif( $@=~ m{^\s*not well-formed \(invalid token\)})
1059
2
100
5
            { if( $HTML::TreeBuilder::VERSION < 4.00)
1060
1
5
                { $$xml=~ s{&(amp;)?Amp;}{&amp;}g;
1061
1
11
                  $$xml=~ s{(<[^>]* )(\d+=)"}{$1a$2"}g; # <table 1> comes out as <table 1="1">, "fix the attribute
1062                 }
1063
2
4
              my $q= '<img "="&#34;" '; # extracted so vim doesn't get confused
1064
2
2
50
3
6
              if( _use( 'HTML::Entities::Numbered')) { $$xml=name2hex_xml( $$xml); }
1065
2
50
32
              if( $$xml=~ m{$q})
1066
0
0
                { $$xml=~ s{$q}{<img }g; # happens with <img src="foo.png"" ...
1067                 }
1068               else
1069
2
4
                { my $encoding= _encoding_from_meta( $tree);
1070
2
2
50
6
4
                  unless( keys %xml_parser_encoding) { %xml_parser_encoding= _xml_parser_encodings(); }
1071
1072
2
50
6
                  if( ! $add_decl)
1073
2
100
50
50
33
33
9
                    { if( $xml_parser_encoding{$encoding})
1074
1
1
                        { $add_decl=1; }
1075                       elsif( $encoding eq 'euc-jp' && $xml_parser_encoding{'x-euc-jp-jisx0221'})
1076
0
0
0
0
                        { $encoding="x-euc-jp-jisx0221"; $add_decl=1;}
1077                       elsif( $encoding eq 'shift-jis' && $xml_parser_encoding{'x-sjis-jisx0221'})
1078
0
0
0
0
                        { $encoding="x-sjis-jisx0221"; $add_decl=1;}
1079
1080
2
100
3
                      if( $add_decl)
1081
1
6
                        { $$xml=~ s{^(<\?xml.*?\?>)?}{<?xml version="1.0" encoding="$encoding"?>}s;
1082                           #warn " added decl (encoding $encoding)\n";
1083                         }
1084                       else
1085
1
2
                        { $$xml=~ s{^(<\?xml.*?\?>)?}{}s;
1086                           #warn " converting to utf8 from $encoding\n";
1087
1
2
                          $$xml= _to_utf8( $encoding, $$xml);
1088                         }
1089                     }
1090                   else
1091
0
0
                    { $$xml=~ s{^(<\?xml.*?\?>)?}{}s;
1092                       #warn " converting to utf8 from $encoding\n";
1093
0
0
                      $$xml= _to_utf8( $encoding, $$xml);
1094                     }
1095                 }
1096             }
1097         }
1098
1099       # some versions of HTML::TreeBuilder escape CDATA sections
1100
49
0
243
0
      $$xml=~ s{(&lt;!\[CDATA\[.*?\]\]&gt;)}{_unescape_cdata( $1)}eg;
1101
1102   }
1103
1104   sub _xml_parser_encodings
1105
2
2
4
    { my @encodings=( 'iso-8859-1'); # this one is included by default, there is no map for it in @INC
1106
2
2
      foreach my $inc (@INC)
1107
18
42
557
1288
        { push @encodings, map { basename( $_, '.enc') } glob( File::Spec->catdir( $inc => XML => Parser => Encodings => '*.enc')); }
1108
2
44
5
53
      return map { $_ => 1 } @encodings;
1109     }
1110 }
1111
1112
1113 sub _unescape_cdata
1114
1
1
220
  { my( $cdata)= @_;
1115
1
4
    $cdata=~s{&lt;}{<}g;
1116
1
2
    $cdata=~s{&gt;}{>}g;
1117
1
3
    $cdata=~s{&amp;}{&}g;
1118
1
2
    return $cdata;
1119   }
1120
1121 sub _as_XML {
1122
1123     # fork of HTML::Element::as_XML, which is a little too buggy and inconsistent between versions for my liking
1124
48
48
51
    my ($elt) = @_;
1125
48
42
    my $xml= '';
1126
48
84
    my $empty_element_map = $elt->_empty_element_map;
1127
1128
48
103
    my ( $tag, $node, $start ); # per-iteration scratch
1129     $elt->traverse(
1130         sub {
1131
36543
301506
            ( $node, $start ) = @_;
1132
36543
100
100
42774
            if ( ref $node )
1133               { # it's an element
1134
24447
18781
                $tag = $node->{'_tag'};
1135
24447
100
20021
                if ($start)
1136                   { # on the way in
1137
12237
43554
17250
64274
                    foreach my $att ( grep { ! m{^(_|/$)} } keys %$node )
1138                        { # fix attribute names instead of dying
1139
6034
4161
                         my $new_att= $att;
1140
6034
4
100
7708
4
                         if( $att=~ m{^\d}) { $new_att= "a$att"; }
1141
6034
6122
                         $new_att=~ s{[^\w\d:_-]}{}g;
1142
6034
50
5723
                         $new_att ||= 'a';
1143
6034
6
100
9727
12
                         if( $new_att ne $att) { $node->{$new_att}= delete $node->{$att}; }
1144                        }
1145
1146
12237
27
50
100
67
20239
99
                    if ( $empty_element_map->{$tag} and !@{ $node->{'_content'} || []} )
1147
27
51
                      { $xml.= $node->starttag_XML( undef, 1 ); }
1148                     else
1149
12210
18161
                      { $xml.= $node->starttag_XML(undef); }
1150                   }
1151                 else
1152                  { # on the way out
1153
12210
0
0
50
33
18862
0
                   unless ( $empty_element_map->{$tag} and !@{ $node->{'_content'} || [] } )
1154
12210
16487
                    { $xml.= $node->endtag_XML();
1155                     } # otherwise it will have been an <... /> tag.
1156                   }
1157               }
1158             elsif( $node=~ /<!\[CDATA\[/) # the content includes CDATA
1159
2
10
              { foreach my $chunk (split /(<!\[CDATA\[.*?\]\]>)/s, $node) # chunks are CDATA sections or normal text
1160
5
100
16
                  { $xml.= $chunk =~ m{<!\[CDATA\[} ? $chunk : _xml_escape( $chunk); }
1161               }
1162             else # it's just text
1163
12094
10833
              { $xml .= _xml_escape($node); }
1164
36543
250776
            1; # keep traversing
1165         }
1166
48
301
    );
1167
48
1114
  return $xml;
1168 }
1169
1170 sub _xml_escape
1171
12097
7956
  { my( $html)= @_;
1172
12097
6266
    $html =~ s{&(?! # An ampersand that isn't followed by...
1173                   ( \#[0-9]+; | # A hash mark, digits and semicolon, or
1174                     \#x[0-9a-fA-F]+; | # A hash mark, "x", hex digits and semicolon, or
1175                     [\w]+; # A valid unicode entity name and semicolon
1176                   )
1177                 )
1178               }
1179               {&amp;}gx if 0; # Needs to be escaped to amp
1180
1181
12097
8550
    $html=~ s{&}{&amp;}g;
1182
1183     # in old versions of HTML::TreeBuilder &amp; can come out as &Amp;
1184
12097
0
50
33
26576
0
    if( $HTML::TreeBuilder::VERSION && $HTML::TreeBuilder::VERSION <= 3.23) { $html=~ s{&Amp;}{&amp;}g; }
1185
1186     # simple character escapes
1187
12097
7379
    $html =~ s/</&lt;/g;
1188
12097
7002
    $html =~ s/>/&gt;/g;
1189
12097
6512
    $html =~ s/"/&quot;/g;
1190
12097
8809
    $html =~ s/'/&apos;/g;
1191
1192
12097
14273
    return $html;
1193   }
1194
1195
1196
1197
1198 sub _check_xml
1199
52
52
49
  { my( $xml)= @_; # $xml is a ref to the xml string
1200
52
52
49
193
    my $ok= eval { XML::Parser->new->parse( $$xml); };
1201     #if( $ok) { warn " parse OK\n"; }
1202
52
16080
    return $ok;
1203   }
1204
1205 sub _encoding_from_meta
1206
2
2
2
  { my( $tree)= @_;
1207
2
3
    my $enc="iso-8859-1";
1208
2
9
    my @meta= $tree->find( 'meta');
1209
2
75
    foreach my $meta (@meta)
1210
1
50
20
13
      { if( $meta->{'http-equiv'} && ($meta->{'http-equiv'} =~ m{^\s*content-type\s*}i)
1211             && $meta->{content} && ($meta->{content} =~ m{^\s*text/html\s*;\s*charset\s*=\s*(\S*)\s*}i)
1212           )
1213
1
2
          { $enc= lc $1;
1214             #warn " encoding from meta tag is '$enc'\n";
1215
1
1
            last;
1216           }
1217       }
1218
2
3
    return $enc;
1219   }
1220
1221 { sub _to_utf8
1222
2
2
8
    { my( $encoding, $string)= @_;
1223
2
8
      local $SIG{__DIE__};
1224
2
50
0
0
0
5
      if( _use( 'Encode'))
1225
2
10
        { Encode::from_to( $string, $encoding => 'utf8', 0x0400); } # 0x0400 is Encode::FB_XMLCREF
1226       elsif( _use( 'Text::Iconv'))
1227
0
0
0
0
        { my $converter = eval { Text::Iconv->new( $encoding => "utf8") };
1228
0
0
0
0
0
          if( $converter) { $string= $converter->convert( $string); }
1229         }
1230       elsif( _use( 'Unicode::Map8') && _use( 'Unicode::String'))
1231
0
0
        { my $map= Unicode::Map8->new( $encoding);
1232
0
0
          $string= $map->tou( $string)->utf8;
1233         }
1234
2
212
      $string=~ s{[\x00-\x08\x0B\x0C\x0E-\x1F]}{}g; # get rid of control chars, portable in 5.6
1235
2
9
    return $string;
1236   }
1237 }
1238
1239
1240 sub _indent_xhtml
1241
2
2
30
  { my( $xhtml)= @_; # $xhtml is a ref
1242
2
78
7
132
    my %block_tag= map { $_ => 1 } qw( html
1243                                          head
1244                                            meta title link script base
1245                                          body
1246                                            h1 h2 h3 h4 h5 h6
1247                                            p br address blockquote pre
1248                                            ol ul li dd dl dt
1249                                            table tr td th tbody tfoot thead col colgroup caption
1250                                            div frame frameset hr
1251                                      );
1252
1253
2
11
    my $level=0;
1254
2
27
2
100
100
100
100
67
100
24
177
2
    $$xhtml=~ s{( (?:<!(?:--.*?-->|[CDATA[.*?]]>)) # ignore comments and CDATA sections
1255
2
8
                  | <(\w+)((?:\s+\w+\s*=\s*(?:"[^"]*"|'[^']*'))*\s*/>) # empty tag
1256                   | <(\w+) # start tag
1257
11
17
                  |</(\w+) # end tag
1258
11
50
26
                )
1259
11
100
26
               }
1260
11
59
               { if( $2 && $block_tag{$2}) { my $indent= " " x $level;
1261                                                  "\n$indent<$2$3";
1262
11
11
10
45
                                               }
1263
3
12
                 elsif( $4 && $block_tag{$4}) { my $indent= " " x $level;
1264                                                  $level++ unless( $4=~ m{/>});
1265                                                  my $nl= $4 eq 'html' ? '' : "\n";
1266                                                  "$nl$indent<$4";
1267                                                }
1268                  elsif( $5 && $block_tag{$5}) { $level--; "</$5"; }
1269                  else { $1; }
1270                }xesg;
1271   }
1272
1273
1274 sub add_stylesheet
1275
2
2
1
4
  { my( $t, $type, $href)= @_;
1276
2
4
2
7
    my %text_type= map { $_ => 1 } qw( xsl css);
1277
2
3
    my $ss= $t->{twig_elt_class}->new( $PI);
1278
2
100
3
    if( $text_type{$type})
1279
1
3
      { $ss->_set_pi( 'xml-stylesheet', qq{type="text/$type" href="$href"}); }
1280     else
1281
1
119
      { croak "unsupported style sheet type '$type'"; }
1282
1283
1
2
    $t->_add_cpi_outside_of_root( leading_cpi => $ss);
1284
1
2
    return $t;
1285   }
1286
1287 { my %used; # module => 1 if require ok, 0 otherwise
1288   my %disallowed; # for testing, refuses to _use modules in this hash
1289
1290   sub _disallow_use ## no critic (Subroutines::ProhibitNestedSubs);
1291
6
6
146
    { my( @modules)= @_;
1292
6
23
      $disallowed{$_}= 1 foreach (@modules);
1293     }
1294
1295   sub _allow_use ## no critic (Subroutines::ProhibitNestedSubs);
1296
4
4
6
    { my( @modules)= @_;
1297
4
11
      $disallowed{$_}= 0 foreach (@modules);
1298     }
1299
1300   sub _use ## no critic (Subroutines::ProhibitNestedSubs);
1301
3411
79823
    { my( $module, $version)= @_;
1302
3411
100
6813
      $version ||= 0;
1303
3411
7
100
4215
325
      if( $disallowed{$module}) { return 0; }
1304
3404
3145
100
4698
6116
      if( $used{$module}) { return 1; }
1305
259
256
256
100
13819
1563705
2223
      if( eval "require $module") { import $module; $used{$module}= 1;
1306
256
100
521
                                    if( $version)
1307                                       {
1308                                         ## no critic (TestingAndDebugging::ProhibitNoStrict);
1309
187
187
187
194
681
157
1248673
                                        no strict 'refs';
1310
12
12
11
100
11
72
28
                                        if( ${"${module}::VERSION"} >= $version ) { return 1; }
1311
1
14
                                        else { return 0; }
1312                                       }
1313                                     else
1314
244
832
                                      { return 1; }
1315                                   }
1316
3
3
7
8
      else { $used{$module}= 0; return 0; }
1317     }
1318 }
1319
1320 # used to solve the [n] predicates while avoiding getting the entire list
1321 # needs a prototype to accept passing bare blocks
1322 sub _first_n(&$@) ## nocritic (Subroutines::ProhibitSubroutinePrototypes);
1323
106
106
81
  { my $coderef= shift;
1324
106
66
    my $n= shift;
1325
106
57
    my $i=0;
1326
106
100
100
109
    if( $n > 0)
1327
99
121
35
35
100
100
101
1518
24
457
      { foreach (@_) { if( &$coderef) { $i++; return $_ if( $i == $n); } } }
1328     elsif( $n < 0)
1329
6
13
11
11
100
100
6
105
8
62
      { foreach (reverse @_) { if( &$coderef) { $i--; return $_ if( $i == $n); } } }
1330     else
1331
1
85
      { croak "illegal position number 0"; }
1332
71
1085
    return undef;
1333   }
1334
1335 sub _slurp_uri
1336
11
11
2344
  { my( $uri, $base)= @_;
1337
11
1
1
100
60
6
4
    if( $uri=~ m{^\w+://}) { _use( 'LWP::Simple'); return LWP::Simple::get( $uri); }
1338
10
20
    else { return _slurp( _based_filename( $uri, $base)); }
1339   }
1340
1341 sub _based_filename
1342
27
27
93
  { my( $filename, $base)= @_;
1343     # cf. XML/Parser.pm's file_ext_ent_handler
1344
27
100
67
67
    if (defined($base) and not ($filename =~ m{^(?:[\\/]|\w+:)}))
1345
3
3
          { my $newpath = $base;
1346
3
9
            $newpath =~ s{[^\\/:]*$}{$filename};
1347
3
3
            $filename = $newpath;
1348           }
1349
27
175
    return $filename;
1350   }
1351
1352 sub _slurp
1353
23
23
59
  { my( $filename)= @_;
1354
23
25
    my $to_slurp;
1355
23
100
1303
    open( $to_slurp, "<$filename") or croak "cannot open '$filename': $!";
1356
19
92
    local $/= undef;
1357
19
223
    my $content= <$to_slurp>;
1358
19
76
    close $to_slurp;
1359
19
152
    return $content;
1360   }
1361
1362 sub _slurp_fh
1363
2
2
3
  { my( $fh)= @_;
1364
2
6
    local $/= undef;
1365
2
34
    my $content= <$fh>;
1366
2
8
    return $content;
1367   }
1368
1369 # I should really add extra options to allow better configuration of the
1370 # LWP::UserAgent object
1371 # this method forks (except on VMS!)
1372 # - the child gets the data and copies it to the pipe,
1373 # - the parent reads the stream and sends it to XML::Parser
1374 # the data is cut it chunks the size of the XML::Parser::Expat buffer
1375 # the method returns the twig and the status
1376 sub _parseurl
1377
20
20
38
  { my( $t, $safe, $url, $agent)= @_;
1378
20
50
46
    _use( 'LWP') || croak "LWP not available, needed to use parseurl methods";
1379
20
50
114
    if( $^O ne 'VMS')
1380
20
50
251
      { pipe( README, WRITEME) or croak "cannot create connected pipes: $!";
1381
20
100
8079
        if( my $pid= fork)
1382           { # parent code: parse the incoming file
1383
15
270
            close WRITEME; # no need to write
1384
15
100
387
            my $result= $safe ? $t->safe_parse( \*README) : $t->parse( \*README);
1385
15
147
            close README;
1386
15
100
94
            return $@ ? 0 : $t;
1387           }
1388         else
1389          { # child
1390
5
230
            close README; # no need to read
1391
5
223
            local $|=1;
1392
5
67
409
            $agent ||= LWP::UserAgent->new;
1393
5
2070
            my $request = HTTP::Request->new( GET => $url);
1394             # _pass_url_content is called with chunks of data the same size as
1395             # the XML::Parser buffer
1396             my $response = $agent->request( $request,
1397
5
4
4
21626
287003
                             sub { _pass_url_content( \*WRITEME, @_); }, $BUFSIZE);
1398
5
100
17789
            $response->is_success or croak "$url ", $response->message;
1399
4
79
            close WRITEME;
1400
4
318
            CORE::exit(); # CORE is there for mod_perl (which redefines exit)
1401           }
1402       }
1403     else
1404       { # VMS branch (hard to test!)
1405
0
0
        local $|=1;
1406
0
0
0
        $agent ||= LWP::UserAgent->new;
1407
0
0
        my $request = HTTP::Request->new( GET => $url);
1408
0
0
        my $response = $agent->request( $request);
1409
0
0
0
        $response->is_success or croak "$url ", $response->message;
1410
0
0
0
        my $result= $safe ? $t->safe_parse($response->content) : $t->parse($response->content);
1411
0
0
0
        return $@ ? 0 : $t;
1412      }
1413
1414   }
1415
1416 # get the (hopefully!) XML data from the URL and
1417 sub _pass_url_content
1418
4
4
12
  { my( $fh, $data, $response, $protocol)= @_;
1419
4
4
4
35
    print {$fh} $data;
1420   }
1421
1422 sub add_options
1423
1
1
1
1
23
3
  { my %args= map { $_, 1 } @_;
1424
1
8
    %args= _normalize_args( %args);
1425
1
1
3
2
    foreach (keys %args) { $valid_option{$_}++; }
1426   }
1427
1428
27
27
60
sub _pretty_print_styles { return XML::Twig::Elt::_pretty_print_styles(); }
1429
1430 sub _twig_store_internal_dtd
1431  {
1432    # warn " in _twig_store_internal_dtd...\n"; # DEBUG handler
1433
161
161
134
    my( $p, $string)= @_;
1434
161
101
    my $t= $p->{twig};
1435
161
3
100
189
7
    if( $t->{twig_keep_encoding}) { $string= $p->original_string(); }
1436
161
147
    $t->{twig_doctype}->{internal} .= $string;
1437
161
535
    return;
1438   }
1439
1440 sub _twig_stop_storing_internal_dtd
1441    { # warn " in _twig_stop_storing_internal_dtd...\n"; # DEBUG handler
1442
110
110
101
    my $p= shift;
1443
110
50
33
342
    if( @saved_default_handler && defined $saved_default_handler[1])
1444
110
224
      { $p->setHandlers( @saved_default_handler); }
1445     else
1446       {
1447
0
0
        $p->setHandlers( Default => undef);
1448       }
1449
110
1798
    $p->{twig}->{twig_doctype}->{internal}=~ s{^\s*\[}{};
1450
110
178
    $p->{twig}->{twig_doctype}->{internal}=~ s{\]\s*$}{};
1451
110
365
    return;
1452   }
1453
1454 sub _twig_doctype_fin_print
1455   { # warn " in _twig_doctype_fin_print...\n"; # DEBUG handler
1456
16
16
20
    my( $p)= shift;
1457
16
6
100
67
45
7
    if( $p->{twig}->{twig_doctype}->{has_internal} && !$expat_1_95_2) { print ' ]>'; }
1458
16
47
    return;
1459   }
1460
1461
1462 sub _normalize_args
1463
8311
5193
  { my %normalized_args;
1464
8311
11813
    while( my $key= shift )
1465
5514
7679
6934
11029
      { $key= join '', map { ucfirst } split /_/, $key;
1466         #$key= "Twig".$key unless( substr( $key, 0, 4) eq 'Twig');
1467
5514
10348
        $normalized_args{$key}= shift ;
1468       }
1469
8311
14873
    return %normalized_args;
1470   }
1471
1472
62
62
50
100
67
62
96
272
sub _is_fh { return unless $_[0]; return $_[0] if( isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar')); }
1473
1474 sub _set_handler
1475
757
757
766
  { my( $handlers, $whole_path, $handler)= @_;
1476
1477
757
6131
    my $H_SPECIAL = qr{($ALL|$DEFAULT|$COMMENT|$TEXT)};
1478
757
3366
    my $H_PI = qr{(\?|$PI)\s*(([^\s]*)\s*)};
1479
757
1068
    my $H_LEVEL = qr{level \s* \( \s* ([0-9]+) \s* \)}x;
1480
757
1083
    my $H_REGEXP = qr{\(\?([\^xism]*)(-[\^xism]*)?:(.*)\)}x;
1481
757
17966
    my $H_XPATH = qr{(/?/?$REG_TAG_PART? \s* ($REG_PREDICATE\s*)?)+}x;
1482
1483
757
606
    my $prev_handler;
1484
1485
757
609
    my $cpath= $whole_path;
1486     #warn "\$cpath: '$cpath\n";
1487
757
100
31315
    while( $cpath && $cpath=~ s{^\s*($H_SPECIAL|$H_PI|$H_LEVEL|$H_REGEXP|$H_XPATH)\s*($|\|)}{})
1488
770
1647
      { my $path= $1;
1489         #warn "\$cpath: '$cpath' - $path: '$path'\n";
1490
770
67
3471
        $prev_handler ||= $handlers->{handlers}->{string}->{$path} || undef; # $prev_handler gets the first found handler
1491
1492
770
100
100
1017
           _set_special_handler ( $handlers, $path, $handler, $prev_handler)
1493         || _set_pi_handler ( $handlers, $path, $handler, $prev_handler)
1494         || _set_level_handler ( $handlers, $path, $handler, $prev_handler)
1495         || _set_regexp_handler ( $handlers, $path, $handler, $prev_handler)
1496         || _set_xpath_handler ( $handlers, $path, $handler, $prev_handler)
1497         || croak "unrecognized expression in handler: '$whole_path'";
1498
1499         # this both takes care of the simple (gi) handlers and store
1500         # the handler code reference for other handlers
1501
764
2512
        $handlers->{handlers}->{string}->{$path}= $handler;
1502       }
1503
1504
751
2
100
870
201
    if( $cpath) { croak "unrecognized expression in handler: '$whole_path'"; }
1505
1506
749
2694
    return $prev_handler;
1507   }
1508
1509
1510 sub _set_special_handler
1511
770
770
828
  { my( $handlers, $path, $handler, $prev_handler)= @_;
1512
770
100
2708
    if( $path =~ m{^\s*($ALL|$DEFAULT|$COMMENT|$TEXT)\s*$}io )
1513
47
92
      { $handlers->{handlers}->{$1}= $handler;
1514
47
138
        return 1;
1515       }
1516     else
1517
723
1816
      { return 0; }
1518   }
1519
1520 sub _set_xpath_handler
1521
665
665
620
  { my( $handlers, $path, $handler, $prev_handler)= @_;
1522
665
100
733
    if( my $handler_data= _parse_xpath_handler( $path, $handler))
1523
659
935
      { _add_handler( $handlers, $handler_data, $path, $prev_handler);
1524
659
2149
        return 1;
1525       }
1526     else
1527
4
425
      { return 0; }
1528   }
1529
1530 sub _add_handler
1531
674
674
794
  { my( $handlers, $handler_data, $path, $prev_handler)= @_;
1532
1533
674
657
    my $tag= $handler_data->{tag};
1534
674
156
100
1189
281
    my @handlers= $handlers->{xpath_handler}->{$tag} ? @{$handlers->{xpath_handler}->{$tag}} : ();
1535
1536
674
64
176
100
813
84
311
    if( $prev_handler) { @handlers= grep { $_->{path} ne $path } @handlers; }
1537
1538
674
100
1201
    push @handlers, $handler_data if( $handler_data->{handler});
1539
1540
674
100
962
    if( @handlers > 1)
1541
138
686
100
100
100
100
100
100
100
75
50
50
100
100
100
283
6797
      { @handlers= sort { (($b->{score}->{type} || 0) <=> ($a->{score}->{type} || 0))
1542                           || (($b->{score}->{anchored} || 0) <=> ($a->{score}->{anchored} || 0))
1543                           || (($b->{score}->{steps} || 0) <=> ($a->{score}->{steps} || 0))
1544                           || (($b->{score}->{predicates} || 0) <=> ($a->{score}->{predicates} || 0))
1545                           || (($b->{score}->{tests} || 0) <=> ($a->{score}->{tests} || 0))
1546                           || ($a->{path} cmp $b->{path})
1547                         } @handlers;
1548       }
1549
1550
674
1359
    $handlers->{xpath_handler}->{$tag}= \@handlers;
1551   }
1552
1553 sub _set_pi_handler
1554
723
723
704
  { my( $handlers, $path, $handler, $prev_handler)= @_;
1555     # PI conditions ( '?target' => \&handler or '?' => \&handler
1556     # or '#PItarget' => \&handler or '#PI' => \&handler)
1557
723
100
3330
    if( $path=~ /^\s*(?:\?|$PI)\s*(?:([^\s]*)\s*)$/)
1558
43
100
116
      { my $target= $1 || '';
1559         # update the path_handlers count, knowing that
1560         # either the previous or the new handler can be undef
1561
43
84
        $handlers->{pi_handlers}->{$1}= $handler;
1562
43
131
        return 1;
1563       }
1564     else
1565
680
1960
      { return 0;
1566       }
1567   }
1568
1569 sub _set_level_handler
1570
680
680
619
  { my( $handlers, $path, $handler, $prev_handler)= @_;
1571
680
100
831
    if( $path =~ m{^ \s* level \s* \( \s* ([0-9]+) \s* \) \s* $}ox )
1572
7
12
      { my $level= $1;
1573
7
31
31
67
31
28
25
171
        my $sub= sub { my( $stack)= @_; return( ($stack->[-1]->{$ST_TAG} !~ m{^#}) && (scalar @$stack == $level + 1) ) };
1574
7
26
        my $handler_data= { tag=> '*', score => { type => $LEVEL_TRIGGER}, trigger => $sub,
1575                              path => $path, handler => $handler, test_on_text => 0
1576                            };
1577
7
12
        _add_handler( $handlers, $handler_data, $path, $prev_handler);
1578
7
19
        return 1;
1579       }
1580     else
1581
673
1692
      { return 0; }
1582   }
1583
1584 sub _set_regexp_handler
1585
673
673
637
  { my( $handlers, $path, $handler, $prev_handler)= @_;
1586     # if the expression was a regexp it is now a string (it was stringified when it became a hash key)
1587
673
100
758
    if( $path=~ m{^\(\?([\^xism]*)(?:-[\^xism]*)?:(.*)\)$})
1588
8
98
      { my $regexp= qr/(?$1:$2)/; # convert it back into a regexp
1589
8
36
36
36
23
23
161
        my $sub= sub { my( $stack)= @_; return( $stack->[-1]->{$ST_TAG} =~ $regexp ) };
1590
8
32
        my $handler_data= { tag=> '*', score => { type => $REGEXP_TRIGGER} , trigger => $sub,
1591                              path => $path, handler => $handler, test_on_text => 0
1592                            };
1593
8
11
        _add_handler( $handlers, $handler_data, $path, $prev_handler);
1594
8
21
        return 1;
1595       }
1596     else
1597
665
1572
      { return 0; }
1598   }
1599
1600 my $DEBUG_HANDLER= 0; # 0 or 1 (output the handler checking code) or 2 (super verbose)
1601 my $handler_string; # store the handler itself
1602
2
2
47
sub _set_debug_handler { $DEBUG_HANDLER= shift; }
1603
6
0
6
50
6
8
0
10
sub _warn_debug_handler { if( $DEBUG_HANDLER < 3) { warn @_; } else { $handler_string .= join( '', @_); } }
1604
1
1
1
1
5
1
2
sub _return_debug_handler { my $string= $handler_string; $handler_string=''; return $string; }
1605
1606 sub _parse_xpath_handler
1607
665
665
561
  { my( $xpath, $handler)= @_;
1608
665
517
    my $xpath_original= $xpath;
1609
1610
1611
665
1
100
856
2
    if( $DEBUG_HANDLER >=1) { _warn_debug_handler( "\n\nparsing path '$xpath'\n"); }
1612
1613
665
586
    my $path_to_check= $xpath;
1614
665
15189
    $path_to_check=~ s{/?/?$REG_TAG_PART?\s*(?:$REG_PREDICATE\s*)?}{}g;
1615
665
0
50
67
1240
0
    if( $DEBUG_HANDLER && $path_to_check=~ /\S/) { _warn_debug_handler( "left: $path_to_check\n"); }
1616
665
50
893
    return if( $path_to_check=~ /\S/);
1617
1618
665
2091
    (my $xpath_to_display= $xpath)=~ s{(["{}'\[\]\@\$])}{\\$1}g;
1619
1620
665
448
    my @xpath_steps;
1621
665
441
    my $last_token_is_sep;
1622
1623
665
19116
    while( $xpath=~ s{^\s*
1624                        ( (//?) # separator
1625                         | (?:$REG_TAG_PART\s*(?:$REG_PREDICATE\s*)?) # tag name and optional predicate
1626                         | (?:$REG_PREDICATE) # just a predicate
1627                        )
1628                      }
1629                      {}x
1630          )
1631       { # check that we have alternating separators and steps
1632
857
100
1689
        if( $2) # found a separator
1633
114
1
100
132
3
          { if( $last_token_is_sep) { return 0; } # 2 separators in a row
1634
113
85
            $last_token_is_sep= 1;
1635           }
1636         else
1637
743
2
100
100
1285
5
          { if( defined( $last_token_is_sep) && !$last_token_is_sep) { return 0; } # 2 steps in a row
1638
741
589
            $last_token_is_sep= 0;
1639           }
1640
1641
854
4522
        push @xpath_steps, $1;
1642       }
1643
662
1
100
817
2
    if( $last_token_is_sep) { return 0; } # expression cannot end with a separator
1644
1645
661
453
    my $i=-1;
1646
1647
661
67
1464
    my $perlfunc= _join_n( $NO_WARNINGS . ';',
1648                            q|my( $stack)= @_; |,
1649                            q|my @current_elts= (scalar @$stack); |,
1650                            q|my @new_current_elts; |,
1651                            q|my $elt; |,
1652                            ($DEBUG_HANDLER >= 1) && (qq#warn q{checking path '$xpath_to_display'\n};#),
1653                          );
1654
1655
1656
661
535
    my $last_tag='';
1657
661
100
1068
    my $anchored= $xpath_original=~ m{^\s*/(?!/)} ? 1 : 0;
1658
661
1236
    my $score={ type => $XPATH_TRIGGER, anchored => $anchored };
1659
661
821
    my $flag= { test_on_text => 0 };
1660
661
546
    my $sep='/'; # '/' or '//'
1661
661
1173
    while( my $xpath_step= pop @xpath_steps)
1662
738
8016
      { my( $tag, $predicate)= $xpath_step =~ m{^($REG_TAG_PART)?(?:\[(.*)\])?\s*$};
1663
738
1039
        $score->{steps}++;
1664
738
100
907
        $tag||='*';
1665
1666
738
100
893
        my $warn_empty_stack= $DEBUG_HANDLER >= 2 ? qq{warn "return with empty stack\\n";} : '';
1667
1668
738
100
865
        if( $predicate)
1669
222
1
100
285
2
          { if( $DEBUG_HANDLER >= 2) { _warn_debug_handler( "predicate is: '$predicate'\n"); }
1670             # changes $predicate (from an XPath expression to a Perl one)
1671
222
1
100
1039
125
            if( $predicate=~ m{^\s*$REG_NUMBER\s*$}) { croak "position selector [$predicate] not supported on twig_handlers"; }
1672
221
281
            _parse_predicate_in_handler( $predicate, $flag, $score);
1673
221
1
100
409
3
            if( $DEBUG_HANDLER >= 2) { _warn_debug_handler( "predicate becomes: '$predicate'\n"); }
1674           }
1675
1676
737
820
       my $tag_cond= _tag_cond( $tag);
1677
737
100
838
       my $cond= join( " && ", grep { $_ } $tag_cond, $predicate) || 1;
1678
1679
737
8
8
100
100
67
1084
19
12
       if( $css_sel && $tag=~ m{\.}) { $tag=~s{\.[^.]*$}{}; $tag ||='*'; }
1680
737
679
       $tag=~ s{(.)#.+$}{$1};
1681
1682
737
67
1377
       $last_tag ||= $tag;
1683
1684
737
100
50
842
       if( $sep eq '/')
1685          {
1686
736
840
           $perlfunc .= sprintf( _join_n( q#foreach my $current_elt (@current_elts) #,
1687                                            q# { next if( !$current_elt); #,
1688                                            q# $current_elt--; #,
1689                                            q# $elt= $stack->[$current_elt]; #,
1690                                            q# if( %s) { push @new_current_elts, $current_elt;} #,
1691                                            q# } #,
1692                                         ),
1693                                  $cond
1694                                );
1695          }
1696        elsif( $sep eq '//')
1697          {
1698
1
2
           $perlfunc .= sprintf( _join_n( q#foreach my $current_elt (@current_elts) #,
1699                                            q# { next if( !$current_elt); #,
1700                                            q# $current_elt--; #,
1701                                            q# my $candidate= $current_elt; #,
1702                                            q# while( $candidate >=0) #,
1703                                            q# { $elt= $stack->[$candidate]; #,
1704                                            q# if( %s) { push @new_current_elts, $candidate;} #,
1705                                            q# $candidate--; #,
1706                                            q# } #,
1707                                            q# } #,
1708                                         ),
1709                                  $cond
1710                                );
1711          }
1712
737
100
1042
       my $warn= $DEBUG_HANDLER >= 2 ? _join_n( qq#warn qq%fail at cond '$cond'%;#) : '';
1713
737
758
       $perlfunc .= sprintf( _join_n( q#unless( @new_current_elts) { %s return 0; } #,
1714                                       q#@current_elts= @new_current_elts; #,
1715                                       q#@new_current_elts=(); #,
1716                                     ),
1717                              $warn
1718                            );
1719
1720
737
1746
        $sep= pop @xpath_steps;
1721      }
1722
1723
660
100
787
    if( $anchored) # there should be a better way, but this works
1724       {
1725
31
50
44
       my $warn= $DEBUG_HANDLER >= 2 ? _join_n( qq#warn qq{fail, stack not empty};#) : '';
1726
31
31
       $perlfunc .= sprintf( _join_n( q#if( ! grep { $_ == 0 } @current_elts) { %s return 0;}#), $warn);
1727       }
1728
1729
660
100
743
    $perlfunc.= qq{warn "handler for '$xpath_to_display' triggered\\n";\n} if( $DEBUG_HANDLER >=2);
1730
660
725
    $perlfunc.= qq{return q{$xpath_original};\n};
1731
660
100
724
    _warn_debug_handler( "\nperlfunc:\n$perlfunc\n") if( $DEBUG_HANDLER>=1);
1732
660
18
18
18
16
16
16
21
21
21
19
19
19
16
16
16
14
14
14
16
16
16
17
15
15
13
13
13
12
12
12
5
5
5
6
6
6
7
7
7
7
7
7
5
5
5
5
5
5
4
4
4
4
4
4
7
7
7
3
3
3
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
1
1
1
18
16
21
19
16
14
16
17
13
12
5
6
7
7
5
5
5
5
7
3
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
4
4
4
4
4
4
4
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
34049
61
18
1780
48
18
1510
65
22
1976
58
19
1802
52
15
1741
48
15
1540
51
15
1545
346
1863
1611
88
55
1507
35
51
1343
16
47
456
17
76
542
25
6
775
26
8
764
17
5
535
15
5
467
12
6
476
13
6
495
22
9
515
10
4
383
15
3
361
10
3
330
9
3
322
8
3
442
10
2
322
11
4
350
10
3
419
8
3
404
9
4
324
10
3
436
10
2
337
7
3
272
7
2
277
9
2
261
11
2
382
8
2
293
9
2
323
9
3
363
9
3
369
8
2
236
13
4
341
12
4
322
11
4
288
12
4
314
12
5
330
13
5
263
12
5
239
4
1
155
3
1
106
3
1
161
4
1
120
4
1
108
3
2
107
3
1
113
3
1
117
4
1
116
3
2
113
3
1
111
3
1
116
3
5
115
3
1
122
3
1
124
3
1
119
3
2
118
3
1
110
3
1
109
4
1
128
4
1
122
4
1
102
3
2
124
3
1
130
3
1
125
3
1
127
4
1
126
3
1
125
4
1
179
3
1
109
3
1
117
3
1
102
3
2
133
3
1
124
3
1
173
3
1
107
4
1
134
8
2
121
7
2
77
8
2
104
8
2
161
6
2
139
3
1
105
    my $s= eval "sub { $perlfunc }";
1733
660
100
1302
      if( $@)
1734
1
134
        { croak "wrong handler condition '$xpath' ($@);" }
1735
1736
659
100
948
      _warn_debug_handler( "last tag: '$last_tag', test_on_text: '$flag->{test_on_text}'\n") if( $DEBUG_HANDLER >=1);
1737
659
4
100
815
8
      _warn_debug_handler( "score: ", join( ' ', map { "$_: $score->{$_}" } sort keys %$score), "\n") if( $DEBUG_HANDLER >=1);
1738
659
4392
      return { tag=> $last_tag, score => $score, trigger => $s, path => $xpath_original, handler => $handler, test_on_text => $flag->{test_on_text} };
1739     }
1740
1741
2167
7018
sub _join_n { return join( "\n", @_, ''); }
1742
1743 # the "tag" part can be <tag>, <tag>.<class> or <tag>#<id> (where tag can be *, or start with # for hidden tags)
1744 sub _tag_cond
1745
737
737
667
  { my( $full_tag)= @_;
1746
1747
737
418
    my( $tag, $class, $id);
1748
737
100
894
    if( $full_tag=~ m{^(.+)#(.+)$})
1749
4
8
      { ($tag, $id)= ($1, $2); } # <tag>#<id>
1750     else
1751
733
100
1351
      { ( $tag, $class)= $css_sel ? $full_tag=~ m{^(.*?)(?:\.([^.]*))?$} : ($full_tag, undef); }
1752
1753
737
100
100
2405
    my $tag_cond = $tag && $tag ne '*' ? qq#(\$elt->{'$ST_TAG'} eq "$tag")# : '';
1754
737
100
772
    my $id_cond = defined $id ? qq#(\$elt->{id} eq "$id")# : '';
1755
737
100
721
    my $class_cond = defined $class ? qq#(\$elt->{class}=~ m{(^| )$class( |\$)})# : '';
1756
1757
737
2211
713
2540
    my $full_cond= join( ' && ', grep { $_ } ( $tag_cond, $class_cond, $id_cond));
1758
1759
737
1153
    return $full_cond;
1760   }
1761
1762 # input: the predicate ($_[0]) which will be changed in place
1763 # flags, a hashref with various flags (like test_on_text)
1764 # the score
1765 sub _parse_predicate_in_handler
1766
221
221
332
  { my( $flag, $score)= @_[1..2];
1767
221
585
5291
1991
    $_[0]=~ s{( ($REG_STRING) # strings
1768                  |\@($REG_TAG_NAME)(?=\s*(?:[><=!]|!~|=~)) # @att (followed by a comparison operator)
1769                  |\@($REG_TAG_NAME) # @att (not followed by a comparison operator)
1770
585
527
                 |=~|!~ # matching operators
1771                  |([><]=?|=|!=)(?=\s*[\d+-]) # test before a number
1772                  |([><]=?|=|!=) # test, other cases
1773
585
44
100
80
2895
55
                 |($REG_FUNCTION) # no arg functions
1774                  # this bit is a mess, but it is the only solution with this half-baked parser
1775
585
139
100
100
100
100
100
100
100
100
100
50
100
33
100
67
67
67
1833
346
                 |(string\(\s*$REG_NAME\s*\)\s*$REG_MATCH\s*$REG_REGEXP) # string( child)=~ /regexp/
1776
1
4
                 |(string\(\s*$REG_NAME\s*\)\s*$REG_COMP\s*$REG_STRING) # string( child) = "value" (or other test)
1777
161
100
709
                 |(string\(\s*$REG_NAME\s*\)\s*$REG_COMP\s*$REG_NUMBER) # string( child) = nb (or other test)
1778                  |(and|or)
1779                 # |($REG_NAME(?=\s*(and|or|$))) # nested tag name (needs to be after all other unquoted strings)
1780                  |($REG_TAG_IN_PREDICATE) # nested tag name (needs to be after all other unquoted strings)
1781
1782               )}
1783              { my( $token, $str, $att, $bare_att, $num_test, $alpha_test, $func, $str_regexp, $str_test_alpha, $str_test_num, $and_or, $tag)
1784
2
10
               = ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12);
1785
1786                $score->{predicates}++;
1787
1788                # store tests on text (they are not always allowed)
1789
2
11
               if( $func || $str_regexp || $str_test_num || $str_test_alpha ) { $flag->{test_on_text}= 1; }
1790
1791
2
5
               if( defined $str) { $token }
1792
2
2
               elsif( $tag) { qq{(\$elt->{'$ST_ELT'} && \$elt->{'$ST_ELT'}->has_child( '$tag'))} }
1793
2
4
               elsif( $att) { $att=~ m{^#} ? qq{ (\$elt->{'$ST_ELT'} && \$elt->{'$ST_ELT'}->{att}->{'$att'})}
1794
2
2
                                                     : qq{\$elt->{'$att'}}
1795
2
10
                                      }
1796                                         # for some reason Devel::Cover flags the following lines as not tested. They are though.
1797
0
0
0
               elsif( $bare_att) { $bare_att=~ m{^#} ? qq{(\$elt->{'$ST_ELT'} && defined(\$elt->{'$ST_ELT'}->{att}->{'$bare_att'}))}
1798
0
0
                                                          : qq{defined( \$elt->{'$bare_att'})}
1799                                       }
1800
12
12
100
14
54
               elsif( $num_test && ($num_test eq '=') ) { "==" } # others tests are unchanged
1801
60
911
               elsif( $alpha_test) { $PERL_ALPHA_TEST{$alpha_test} }
1802                elsif( $func && $func=~ m{^string})
1803                                       { "\$elt->{'$ST_ELT'}->text"; }
1804                elsif( $str_regexp && $str_regexp =~ m{string\(\s*($REG_TAG_NAME)\s*\)\s*($REG_MATCH)\s*($REG_REGEXP)})
1805                                       { "defined( _first_n { \$_->text $2 $3 } 1, \$elt->{'$ST_ELT'}->_children( '$1'))"; }
1806                elsif( $str_test_alpha && $str_test_alpha =~ m{string\(\s*($REG_TAG_NAME)\s*\)\s*($REG_COMP)\s*($REG_STRING)})
1807                                       { my( $tag, $op, $str)= ($1, $2, $3);
1808                                         $str=~ s{(?<=.)'(?=.)}{\\'}g; # escape a quote within the string
1809                                         $str=~ s{^"}{'};
1810                                         $str=~ s{"$}{'};
1811                                         "defined( _first_n { \$_->text $PERL_ALPHA_TEST{$op} $str } 1, \$elt->{'$ST_ELT'}->children( '$tag'))"; }
1812                elsif( $str_test_num && $str_test_num =~ m{string\(\s*($REG_TAG_NAME)\s*\)\s*($REG_COMP)\s*($REG_NUMBER)})
1813                                       { my $test= ($2 eq '=') ? '==' : $2;
1814                                         "defined( _first_n { \$_->text $test $3 } 1, \$elt->{'$ST_ELT'}->children( '$1'))";
1815                                       }
1816                elsif( $and_or) { $score->{tests}++; $and_or eq 'and' ? '&&' : '||' ; }
1817                else { $token; }
1818              }gexs;
1819   }
1820
1821
1822 sub setCharHandler
1823
1
1
1
1
  { my( $t, $handler)= @_;
1824
1
2
    $t->{twig_char_handler}= $handler;
1825   }
1826
1827
1828 sub _reset_handlers
1829
423
423
681
  { my $handlers= shift;
1830
423
508
    delete $handlers->{handlers};
1831
423
329
    delete $handlers->{path_handlers};
1832
423
351
    delete $handlers->{subpath_handlers};
1833
423
50
595
    $handlers->{attcond_handlers_exp}=[] if( $handlers->{attcond_handlers});
1834
423
539
    delete $handlers->{attcond_handlers};
1835   }
1836
1837 sub _set_handlers
1838
423
100
423
651
  { my $handlers= shift || return;
1839
422
394
    my $set_handlers= {};
1840
422
422
363
892
    foreach my $path (keys %{$handlers})
1841
613
926
      { _set_handler( $set_handlers, $path, $handlers->{$path}); }
1842
1843
415
2077
    return $set_handlers;
1844   }
1845
1846
1847 sub setTwigHandler
1848
64
64
1
2918
  { my( $t, $path, $handler)= @_;
1849
64
100
251
    $t->{twig_handlers} ||={};
1850
64
112
    return _set_handler( $t->{twig_handlers}, $path, $handler);
1851   }
1852
1853 sub setTwigHandlers
1854
263
263
1
507
  { my( $t, $handlers)= @_;
1855
263
100
660
    my $previous_handlers= $t->{twig_handlers} || undef;
1856
263
668
    _reset_handlers( $t->{twig_handlers});
1857
263
433
    $t->{twig_handlers}= _set_handlers( $handlers);
1858
256
471
    return $previous_handlers;
1859   }
1860
1861 sub setStartTagHandler
1862
3
3
1
8
  { my( $t, $path, $handler)= @_;
1863
3
50
5
    $t->{twig_starttag_handlers}||={};
1864
3
4
    return _set_handler( $t->{twig_starttag_handlers}, $path, $handler);
1865   }
1866
1867 sub setStartTagHandlers
1868
34
34
1
293
  { my( $t, $handlers)= @_;
1869
34
100
101
    my $previous_handlers= $t->{twig_starttag_handlers} || undef;
1870
34
77
    _reset_handlers( $t->{twig_starttag_handlers});
1871
34
56
    $t->{twig_starttag_handlers}= _set_handlers( $handlers);
1872
34
60
    return $previous_handlers;
1873    }
1874
1875 sub setIgnoreEltsHandler
1876
2
2
1
499
  { my( $t, $path, $action)= @_;
1877
2
100
12
    $t->{twig_ignore_elts_handlers}||={};
1878
2
5
    return _set_handler( $t->{twig_ignore_elts_handlers}, $path, $action );
1879   }
1880
1881 sub setIgnoreEltsHandlers
1882
11
11
1
14
  { my( $t, $handlers)= @_;
1883
11
9
    my $previous_handlers= $t->{twig_ignore_elts_handlers};
1884
11
35
    _reset_handlers( $t->{twig_ignore_elts_handlers});
1885
11
20
    $t->{twig_ignore_elts_handlers}= _set_handlers( $handlers);
1886
11
12
    return $previous_handlers;
1887    }
1888
1889 sub setEndTagHandler
1890
2
2
1
48
  { my( $t, $path, $handler)= @_;
1891
2
100
10
    $t->{twig_endtag_handlers}||={};
1892
2
5
    return _set_handler( $t->{twig_endtag_handlers}, $path,$handler);
1893   }
1894
1895 sub setEndTagHandlers
1896
14
14
1
12
  { my( $t, $handlers)= @_;
1897
14
16
    my $previous_handlers= $t->{twig_endtag_handlers};
1898
14
30
    _reset_handlers( $t->{twig_endtag_handlers});
1899
14
21
    $t->{twig_endtag_handlers}= _set_handlers( $handlers);
1900
14
13
    return $previous_handlers;
1901    }
1902
1903 # a little more complex: set the twig_handlers only if a code ref is given
1904 sub setTwigRoots
1905
101
101
1
96
  { my( $t, $handlers)= @_;
1906
101
100
    my $previous_roots= $t->{twig_roots};
1907
101
231
    _reset_handlers($t->{twig_roots});
1908
101
151
    $t->{twig_roots}= _set_handlers( $handlers);
1909
1910
101
178
    _check_illegal_twig_roots_handlers( $t->{twig_roots});
1911
1912
99
99
71
145
    foreach my $path (keys %{$handlers})
1913
98
100
315
      { $t->{twig_handlers}||= {};
1914
98
100
67
450
        _set_handler( $t->{twig_handlers}, $path, $handlers->{$path})
1915           if( ref($handlers->{$path}) && isa( $handlers->{$path}, 'CODE'));
1916       }
1917
99
115
    return $previous_roots;
1918   }
1919
1920 sub _check_illegal_twig_roots_handlers
1921
101
101
91
  { my( $handlers)= @_;
1922
101
101
84
229
    foreach my $tag_handlers (values %{$handlers->{xpath_handler}})
1923
82
101
      { foreach my $handler_data (@$tag_handlers)
1924
82
100
231
          { if( my $type= $handler_data->{test_on_text})
1925
2
204
              { croak "string() condition not supported on twig_roots option"; }
1926           }
1927       }
1928
99
91
    return;
1929   }
1930
1931
1932 # just store the reference to the expat object in the twig
1933 sub _twig_init
1934    { # warn " in _twig_init...\n"; # DEBUG handler
1935
1936
3102
423328
    my $p= shift;
1937
3102
3357
    my $t=$p->{twig};
1938
1939
3102
1
100
4244
104
    if( $t->{twig_parsing} ) { croak "cannot reuse a twig that is already parsing"; }
1940
3101
2725
    $t->{twig_parsing}=1;
1941
1942
3101
3048
    $t->{twig_parser}= $p;
1943
3101
3078
100
3498
5174
    if( $weakrefs) { weaken( $t->{twig_parser}); }
1944
1945     # in case they had been created by a previous parse
1946
3101
2504
    delete $t->{twig_dtd};
1947
3101
2339
    delete $t->{twig_doctype};
1948
3101
2059
    delete $t->{twig_xmldecl};
1949
3101
2539
    delete $t->{twig_root};
1950
1951     # if needed set the output filehandle
1952
3101
4135
    $t->_set_fh_to_twig_output_fh();
1953
3101
3708
    return;
1954   }
1955
1956 # uses eval to catch the parser's death
1957 sub safe_parse
1958
13
13
1
92
  { my $t= shift;
1959
13
13
84
138
    eval { $t->parse( @_); } ;
1960
13
100
77
    return $@ ? $t->_reset_twig_after_error : $t;
1961   }
1962
1963 sub safe_parsefile
1964
25
25
1
33
  { my $t= shift;
1965
25
25
27
55
    eval { $t->parsefile( @_); } ;
1966
25
100
73
    return $@ ? $t->_reset_twig_after_error : $t;
1967   }
1968
1969 # restore a twig in a proper state so it can be reused for a new parse
1970 sub _reset_twig
1971
10
10
7
  { my $t= shift;
1972
10
12
    $t->{twig_parsing}= 0;
1973
10
16
    delete $t->{twig_current};
1974
10
13
    delete $t->{extra_data};
1975
10
12
    delete $t->{twig_dtd};
1976
10
9
    delete $t->{twig_in_pcdata};
1977
10
10
    delete $t->{twig_in_cdata};
1978
10
14
    delete $t->{twig_stored_space};
1979
10
31
    delete $t->{twig_entity_list};
1980
10
100
41
    $t->root->delete if( $t->root);
1981
10
28
    delete $t->{twig_root};
1982
10
11
    return $t;
1983   }
1984
1985 sub _reset_twig_after_error
1986
10
10
12
  { my $t= shift;
1987
10
24
    $t->_reset_twig;
1988
10
18
    return undef;
1989   }
1990
1991
1992 sub _add_or_discard_stored_spaces
1993
87121
60829
  { my $t= shift;
1994
1995
87121
62892
    $t->{twig_right_after_root}=0; #XX
1996
1997
87121
100
106393
    my $current= $t->{twig_current} or return; # ugly hack, with ignore on, twig_current can disappear
1998
84026
100
116208
    return unless length $t->{twig_stored_spaces};
1999
8375
8362
    my $current_gi= $XML::Twig::index2gi[$current->{'gi'}];
2000
2001
8375
100
10365
    if( ! $t->{twig_discard_all_spaces})
2002
8374
100
13634
      { if( ! defined( $t->{twig_space_policy}->{$current_gi}))
2003
594
828
          { $t->{twig_space_policy}->{$current_gi}= _space_policy( $t, $current_gi); }
2004
8374
100
100
40966
        if( $t->{twig_space_policy}->{$current_gi} || ($t->{twig_stored_spaces}!~ m{\n}) || $t->{twig_preserve_space})
2005
113
131
          { _insert_pcdata( $t, $t->{twig_stored_spaces} ); }
2006       }
2007
2008
8375
7236
    $t->{twig_stored_spaces}='';
2009
2010
8375
6714
    return;
2011   }
2012
2013 # the default twig handlers, which build the tree
2014 sub _twig_start
2015    { # warn " in _twig_start...\n"; # DEBUG handler
2016
2017     #foreach my $s (@_) { next if ref $s; warn "$s: ", is_utf8( $s) ? "has flag" : "FLAG NOT SET"; } # YYY
2018
2019
43600
1688675
    my ($p, $gi, @att)= @_;
2020
43600
33292
    my $t=$p->{twig};
2021
2022     # empty the stored pcdata (space stored in case they are really part of
2023     # a pcdata element) or stored it if the space policy dictates so
2024     # create a pcdata element with the spaces if need be
2025
43600
41682
    _add_or_discard_stored_spaces( $t);
2026
43600
27768
    my $parent= $t->{twig_current};
2027
2028     # if we were parsing PCDATA then we exit the pcdata
2029
43600
100
46316
    if( $t->{twig_in_pcdata})
2030
18139
11352
      { $t->{twig_in_pcdata}= 0;
2031
18139
14295
        delete $parent->{'twig_current'};
2032
18139
15077
        $parent= $parent->{parent};
2033       }
2034
2035     # if we choose to keep the encoding then we need to parse the tag
2036
43600
100
50
67428
    if( my $func = $t->{parse_start_tag})
2037
1654
2668
      { ($gi, @att)= &$func($p->original_string); }
2038     elsif( $t->{twig_entities_in_attribute})
2039       {
2040
0
0
       ($gi,@att)= _parse_start_tag( $p->recognized_string);
2041
0
0
         $t->{twig_entities_in_attribute}=0;
2042       }
2043
2044     # if we are using an external DTD, we need to fill the default attributes
2045
43600
49
100
51037
59
    if( $t->{twig_read_external_dtd}) { _fill_default_atts( $t, $gi, \@att); }
2046
2047     # filter the input data if need be
2048
43600
100
48990
    if( my $filter= $t->{twig_input_filter})
2049
5
10
      { $gi= $filter->( $gi);
2050
5
4
18
8
        foreach my $att (@att) { $att= $filter->($att); }
2051       }
2052
2053
43600
22410
    my $ns_decl;
2054
43600
100
42909
    if( $t->{twig_map_xmlns})
2055
69
83
      { $ns_decl= _replace_ns( $t, \$gi, \@att); }
2056
2057
43600
63829
    my $elt= $t->{twig_elt_class}->new( $gi);
2058
43600
54639
    $elt->set_atts( @att);
2059
2060     # now we can store the tag and atts
2061
43600
80260
    my $context= { $ST_TAG => $gi, $ST_ELT => $elt, @att};
2062
43600
100
47374
    $context->{$ST_NS}= $ns_decl if $ns_decl;
2063
43600
25523
100
43401
36742
    if( $weakrefs) { weaken( $context->{$ST_ELT}); }
2064
43600
43600
26442
49970
    push @{$t->{_twig_context_stack}}, $context;
2065
2066
43600
100
61504
    delete $parent->{'twig_current'} if( $parent);
2067
43600
30548
    $t->{twig_current}= $elt;
2068
43600
33434
    $elt->{'twig_current'}=1;
2069
2070
43600
100
35955
    if( $parent)
2071
40505
29693
      { my $prev_sibling= $parent->{last_child};
2072
40505
100
40097
        if( $prev_sibling)
2073
37932
32161
          { $prev_sibling->{next_sibling}= $elt;
2074
37932
37932
19924
100
29610
39780
24006
            $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
2075           }
2076
2077
40505
40505
22451
100
33195
37737
21625
        $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
2078
40505
2573
100
48028
2271
        unless( $parent->{first_child}) { $parent->{first_child}= $elt; }
2079
40505
40505
40505
22451
100
27549
26892
42096
25789
         $parent->{empty}=0; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ;
2080       }
2081     else
2082       { # processing root
2083
3095
3881
        $t->set_root( $elt);
2084         # call dtd handler if need be
2085
3095
50
3918
        $t->{twig_dtd_handler}->($t, $t->{twig_dtd})
2086           if( defined $t->{twig_dtd_handler});
2087
2088         # set this so we can catch external entities
2089         # (the handler was modified during DTD processing)
2090
3095
100
100
4352
        if( $t->{twig_default_print})
2091
62
167
          { $p->setHandlers( Default => \&_twig_print); }
2092         elsif( $t->{twig_roots})
2093
42
127
127
159
221
          { $p->setHandlers( Default => sub { return }); }
2094         else
2095
2991
6364
          { $p->setHandlers( Default => \&_twig_default); }
2096       }
2097
2098
43600
100
113512
    $elt->{empty}= $p->recognized_string=~ m{/\s*>$}s ? 1 : 0;
2099
2100
43600
100
252830
    $elt->{extra_data}= $t->{extra_data} if( $t->{extra_data});
2101
43600
32700
    $t->{extra_data}='';
2102
2103     # if the element is ID-ed then store that info
2104
43600
36540
    my $id= $elt->{'att'}->{$ID};
2105
43600
100
47062
    if( defined $id)
2106
18684
21739
      { $t->{twig_id_list}->{$id}= $elt;
2107
18684
9681
100
19634
11298
        if( $weakrefs) { weaken( $t->{twig_id_list}->{$id}); }
2108       }
2109
2110     # call user handler if need be
2111
43600
100
50639
    if( $t->{twig_starttag_handlers})
2112       { # call all appropriate handlers
2113
156
188
        my @handlers= _handler( $t, $t->{twig_starttag_handlers}, $gi);
2114
2115
156
117
        local $_= $elt;
2116
2117
156
143
        foreach my $handler ( @handlers)
2118
66
100
107
          { $handler->($t, $elt) || last; }
2119         # call _all_ handler if needed
2120
156
100
340
        if( my $all= $t->{twig_starttag_handlers}->{handlers}->{$ALL})
2121
3
6
          { $all->($t, $elt); }
2122       }
2123
2124     # check if the tag is in the list of tags to be ignored
2125
43600
100
47518
    if( $t->{twig_ignore_elts_handlers})
2126
81
105
      { my @handlers= _handler( $t, $t->{twig_ignore_elts_handlers}, $gi);
2127         # only the first handler counts, it contains the action (discard/print/string)
2128
81
24
24
100
98
26
37
        if( @handlers) { my $action= shift @handlers; $t->ignore( $elt, $action); }
2129       }
2130
2131
43600
2
100
100
59475
4
    if( $elt->{'att'}->{'xml:space'} && ( $elt->{'att'}->{'xml:space'} eq 'preserve')) { $t->{twig_preserve_space}++; }
2132
2133
2134
43600
103148
    return;
2135   }
2136
2137 sub _replace_ns
2138
83
83
72
  { my( $t, $gi, $atts)= @_;
2139
83
39
    my $decls;
2140
83
110
    foreach my $new_prefix ( $t->parser->new_ns_prefixes)
2141
37
116
      { my $uri= $t->parser->expand_ns_prefix( $new_prefix);
2142         # replace the prefix if it is mapped
2143
37
189
        $decls->{$new_prefix}= $uri;
2144
37
100
100
105
        if( !$t->{twig_keep_original_prefix} && (my $mapped_prefix= $t->{twig_map_xmlns}->{$uri}))
2145
25
20
          { $new_prefix= $mapped_prefix; }
2146         # now put the namespace declaration back in the element
2147
37
100
40
        if( $new_prefix eq '#default')
2148
3
21
          { push @$atts, "xmlns" => $uri; }
2149         else
2150
34
65
          { push @$atts, "xmlns:$new_prefix" => $uri; }
2151       }
2152
2153
83
100
352
    if( $t->{twig_keep_original_prefix})
2154       { # things become more complex: we need to find the original prefix
2155         # and store both prefixes
2156
13
18
        my $ns_info= $t->_ns_info( $$gi);
2157
13
9
        my $map_att;
2158
13
100
19
        if( $ns_info->{mapped_prefix})
2159
11
15
          { $$gi= "$ns_info->{mapped_prefix}:$$gi";
2160
11
19
            $map_att->{$ns_info->{mapped_prefix}}= $ns_info->{prefix};
2161           }
2162
13
8
        my $att_name=1;
2163
13
14
        foreach( @$atts)
2164
30
100
29
          { if( $att_name)
2165               {
2166
15
14
                my $ns_info= $t->_ns_info( $_);
2167
15
100
18
                if( $ns_info->{mapped_prefix})
2168
4
6
                  { $_= "$ns_info->{mapped_prefix}:$_";
2169
4
6
                    $map_att->{$ns_info->{mapped_prefix}}= $ns_info->{prefix};
2170                   }
2171
15
17
                $att_name=0;
2172               }
2173             else
2174
15
12
              { $att_name=1; }
2175           }
2176
13
100
28
        push @$atts, '#original_gi', $map_att if( $map_att);
2177       }
2178     else
2179
70
80
      { $$gi= $t->_replace_prefix( $$gi);
2180
70
52
        my $att_name=1;
2181
70
71
        foreach( @$atts)
2182
182
91
91
100
152
84
73
          { if( $att_name) { $_= $t->_replace_prefix( $_); $att_name=0; }
2183
91
75
            else { $att_name=1; }
2184           }
2185       }
2186
83
85
    return $decls;
2187   }
2188
2189
2190 # extract prefix, local_name, uri, mapped_prefix from a name
2191 # will only work if called from a start or end tag handler
2192 sub _ns_info
2193
28
28
21
  { my( $t, $name)= @_;
2194
28
23
    my $ns_info={};
2195
28
23
    my $p= $t->parser;
2196
28
43
    $ns_info->{uri}= $p->namespace( $name);
2197
28
100
150
    return $ns_info unless( $ns_info->{uri});
2198
2199
15
16
    $ns_info->{prefix}= _a_proper_ns_prefix( $p, $ns_info->{uri});
2200
15
67
37
    $ns_info->{mapped_prefix}= $t->{twig_map_xmlns}->{$ns_info->{uri}} || $ns_info->{prefix};
2201
2202
15
11
    return $ns_info;
2203   }
2204
2205 sub _a_proper_ns_prefix
2206
22
22
20
  { my( $p, $uri)= @_;
2207
22
32
    foreach my $prefix ($p->current_ns_prefixes)
2208
38
100
280
      { if( $p->expand_ns_prefix( $prefix) eq $uri)
2209
22
112
          { return $prefix; }
2210       }
2211
0
0
    return;
2212   }
2213
2214 # returns the uri bound to a prefix in the original document
2215 # only works in a handler
2216 # can be used to deal with xsi:type attributes
2217 sub original_uri
2218
2
2
1
16
  { my( $t, $prefix)= @_;
2219
2
2
    my $ST_NS = '##ns' ;
2220
2
6
2
100
1
12
3
    foreach my $ns (map { $_->{$ST_NS} if $_->{$ST_NS} } reverse @{$t->{_twig_context_stack}})
2221
6
100
17
      { return $ns->{$prefix} || next; }
2222
0
0
    return;
2223   }
2224
2225
2226 sub _fill_default_atts
2227
49
49
46
  { my( $t, $gi, $atts)= @_;
2228
49
37
    my $dtd= $t->{twig_dtd};
2229
49
51
    my $attlist= $dtd->{att}->{$gi};
2230
49
68
    my %value= @$atts;
2231
49
92
    foreach my $att (keys %$attlist)
2232
66
100
75
170
      { if( !exists( $value{$att})
2233             && exists( $attlist->{$att}->{default})
2234             && ( $attlist->{$att}->{default} ne '#IMPLIED')
2235           )
2236           { # the quotes are included in the default, so we need to remove them
2237
13
22
            my $default_value= substr( $attlist->{$att}->{default}, 1, -1);
2238
13
16
            push @$atts, $att, $default_value;
2239           }
2240       }
2241
49
73
    return;
2242   }
2243
2244
2245 # the default function to parse a start tag (in keep_encoding mode)
2246 # can be overridden with the parse_start_tag method
2247 # only works for 1-byte character sets
2248 sub _parse_start_tag
2249
1682
5826
  { my $string= shift;
2250
1682
921
    my( $gi, @atts);
2251
2252     # get the gi (between < and the first space, / or > character)
2253     #if( $string=~ s{^<\s*([^\s>/]*)[\s>/]*}{}s)
2254
1682
100
14189
    if( $string=~ s{^<\s*($REG_TAG_NAME)\s*[\s>/]}{}s)
2255
1664
2665
      { $gi= $1; }
2256     else
2257
18
1223
      { croak "error parsing tag '$string'"; }
2258
1664
2455
    while( $string=~ s{^([^\s=]*)\s*=\s*(["'])(.*?)\2\s*}{}s)
2259
200
531
      { push @atts, $1, $3; }
2260
1664
3767
    return $gi, @atts;
2261   }
2262
2263 sub set_root
2264
3137
1
2615
  { my( $t, $elt)= @_;
2265
3137
2923
    $t->{twig_root}= $elt;
2266
3137
50
3658
    if( $elt)
2267
3137
2708
      { $elt->{twig}= $t;
2268
3137
3111
100
3404
3973
        if( $weakrefs) { weaken( $elt->{twig}); }
2269       }
2270
3137
2329
    return $t;
2271   }
2272
2273 sub _twig_end
2274    { # warn " in _twig_end...\n"; # DEBUG handler
2275
43486
33715
    my ($p, $gi) = @_;
2276
2277
43486
31401
    my $t=$p->{twig};
2278
2279
43486
50
67
80840
    if( $t->{twig_in_pcdata} && (my $text_handler= $t->{TwigHandlers}->{$TEXT}) )
2280
0
0
0
0
0
      { local $_= $t->{twig_current}; $text_handler->( $t, $_) if $_;
2281       }
2282
2283
43486
69
100
44753
83
    if( $t->{twig_map_xmlns}) { $gi= $t->_replace_prefix( $gi); }
2284
2285
43486
34325
    _add_or_discard_stored_spaces( $t);
2286
2287     # the new twig_current is the parent
2288
43486
27268
    my $elt= $t->{twig_current};
2289
43486
34855
    delete $elt->{'twig_current'};
2290
2291     # if we were parsing PCDATA then we exit the pcdata too
2292
43486
100
45867
    if( $t->{twig_in_pcdata})
2293       {
2294
20602
12304
        $t->{twig_in_pcdata}= 0;
2295
20602
50
28992
        $elt= $elt->{parent} if($elt->{parent});
2296
20602
14882
        delete $elt->{'twig_current'};
2297       }
2298
2299     # parent is the new current element
2300
43486
27200
    my $parent= $elt->{parent};
2301
43486
30450
    $t->{twig_current}= $parent;
2302
2303
43486
100
41196
    if( $parent)
2304
40415
29684
      { $parent->{'twig_current'}=1;
2305         # twig_to_be_normalized
2306
40415
5
5
100
44164
9
6
        if( $parent->{twig_to_be_normalized}) { $parent->normalize; $parent->{twig_to_be_normalized}=0; }
2307       }
2308
2309
43486
100
47085
    if( $t->{extra_data})
2310
324
454
      { $elt->_set_extra_data_before_end_tag( $t->{extra_data});
2311
324
242
        $t->{extra_data}='';
2312       }
2313
2314
43486
100
44700
    if( $t->{twig_handlers})
2315       { # look for handlers
2316
3773
5123
        my @handlers= _handler( $t, $t->{twig_handlers}, $gi);
2317
2318
3773
100
4091
        if( $t->{twig_tdh})
2319
48
29
29
100
44
17
54
          { if( @handlers) { push @{$t->{twig_handlers_to_trigger}}, [ $elt, \@handlers ]; }
2320
48
100
71
            if( my $all= $t->{twig_handlers}->{handlers}->{$ALL})
2321
12
12
9
20
              { push @{$t->{twig_handlers_to_trigger}}, [ $elt, [$all] ]; }
2322           }
2323         else
2324           {
2325
3725
2691
            local $_= $elt; # so we can use $_ in the handlers
2326
2327
3725
3301
            foreach my $handler ( @handlers)
2328
711
100
1867
              { $handler->($t, $elt) || last; }
2329             # call _all_ handler if needed
2330
3713
4950
            my $all= $t->{twig_handlers}->{handlers}->{$ALL};
2331
3713
100
3720
            if( $all)
2332
283
394
              { $all->($t, $elt); }
2333
3713
883
100
100
8363
1204
            if( @handlers || $all) { $t->{twig_right_after_root}=0; }
2334           }
2335       }
2336
2337     # if twig_roots is set for the element then set appropriate handler
2338
43474
100
100
53155
    if( $t->{twig_root_depth} and ($p->depth == $t->{twig_root_depth}) )
2339
142
100
652
      { if( $t->{twig_default_print})
2340           { # select the proper fh (and store the currently selected one)
2341
79
99
            $t->_set_fh_to_twig_output_fh();
2342
79
0
50
110
0
            if( !$p->depth==1) { $t->{twig_right_after_root}=1; } #XX
2343
79
100
282
            if( $t->{twig_keep_encoding})
2344
60
194
              { $p->setHandlers( %twig_handlers_roots_print_original); }
2345             else
2346
19
64
              { $p->setHandlers( %twig_handlers_roots_print); }
2347           }
2348         else
2349
63
210
          { $p->setHandlers( %twig_handlers_roots); }
2350       }
2351
2352
43474
2
100
100
63319
2
    if( $elt->{'att'}->{'xml:space'} && ( $elt->{'att'}->{'xml:space'} eq 'preserve')) { $t->{twig_preserve_space}--; }
2353
2354
43474
43474
23024
35381
    pop @{$t->{_twig_context_stack}};
2355
43474
101240
    return;
2356   }
2357
2358 sub _trigger_tdh
2359
4
4
4
  { my( $t)= @_;
2360
2361
4
4
50
0
10
    if( @{$t->{twig_handlers_to_trigger}})
2362
4
78
4
1
78
9
      { my @handlers_to_trigger_now= sort { $a->[0]->cmp( $b->[0]) } @{$t->{twig_handlers_to_trigger}};
2363
4
5
        foreach my $elt_handlers (@handlers_to_trigger_now)
2364
41
25
          { my( $handled_elt, $handlers_to_trigger)= @$elt_handlers;
2365
41
26
            foreach my $handler ( @$handlers_to_trigger)
2366
41
41
50
24
38
              { local $_= $handled_elt; $handler->($t, $handled_elt) || last; }
2367           }
2368       }
2369
4
4
    return;
2370   }
2371
2372 # return the list of handler that can be activated for an element
2373 # (either of CODE ref's or 1's for twig_roots)
2374
2375 sub _handler
2376
4645
3760
  { my( $t, $handlers, $gi)= @_;
2377
2378
4645
3275
    my @found_handlers=();
2379
4645
2444
    my $found_handler;
2380
2381
4645
1140
9290
8330
1769
10486
    foreach my $handler ( map { @$_ } grep { $_ } $handlers->{xpath_handler}->{$gi}, $handlers->{xpath_handler}->{'*'})
2382
1493
1094
      { my $trigger= $handler->{trigger};
2383
1493
100
26337
         if( my $found_path= $trigger->( $t->{_twig_context_stack}))
2384
992
815
          { my $found_handler= $handler->{handler};
2385
992
1436
            push @found_handlers, $found_handler;
2386           }
2387       }
2388
2389     # if no handler found call default handler if defined
2390
4645
100
100
11566
    if( !@found_handlers && defined $handlers->{handlers}->{$DEFAULT})
2391
41
49
      { push @found_handlers, $handlers->{handlers}->{$DEFAULT}; }
2392
2393
4645
100
100
7341
    if( @found_handlers and $t->{twig_do_not_chain_handlers})
2394
1
1
      { @found_handlers= ($found_handlers[0]); }
2395
2396
4645
5666
    return @found_handlers; # empty if no handler found
2397
2398   }
2399
2400
2401 sub _replace_prefix
2402
234
234
169
  { my( $t, $name)= @_;
2403
234
201
    my $p= $t->parser;
2404
234
359
    my $uri= $p->namespace( $name);
2405     # try to get the namespace from default if none is found (for attributes)
2406     # this should probably be an option
2407
234
98
100
67
1312
135
    if( !$uri and( $name!~/^xml/)) { $uri= $p->expand_ns_prefix( '#default'); }
2408
234
100
508
    if( $uri)
2409
112
100
67
217
      { if (my $mapped_prefix= $t->{twig_map_xmlns}->{$uri} || $DEFAULT_URI2NS{$uri})
2410
105
166
          { return "$mapped_prefix:$name"; }
2411         else
2412
7
10
          { my $prefix= _a_proper_ns_prefix( $p, $uri);
2413
7
1
100
13
1
            if( $prefix eq '#default') { $prefix=''; }
2414
7
100
16
            return $prefix ? "$prefix:$name" : $name;
2415           }
2416       }
2417     else
2418
122
231
      { return $name; }
2419   }
2420
2421
2422 sub _twig_char
2423    { # warn " in _twig_char...\n"; # DEBUG handler
2424
2425
55618
44898
    my ($p, $string)= @_;
2426
55618
41272
    my $t=$p->{twig};
2427
2428
55618
100
62518
    if( $t->{twig_keep_encoding})
2429
7407
100
5683
      { if( !$t->{twig_in_cdata})
2430
1309
1913
          { $string= $p->original_string(); }
2431         else
2432           {
2433
187
187
187
191
749
203
739
            use bytes; # > perl 5.5
2434
6098
100
4486
            if( length( $string) < 1024)
2435
6058
6846
              { $string= $p->original_string(); }
2436             else
2437               { #warn "dodgy case";
2438                 # TODO original_string does not hold the entire string, but $string is wrong
2439                 # I believe due to a bug in XML::Parser
2440                 # for now, we use the original string, even if it means that it's been converted to utf8
2441               }
2442           }
2443       }
2444
2445
55618
2
100
72011
4
    if( $t->{twig_input_filter}) { $string= $t->{twig_input_filter}->( $string); }
2446
55618
2
100
55632
3
    if( $t->{twig_char_handler}) { $string= $t->{twig_char_handler}->( $string); }
2447
2448
55618
36156
    my $elt= $t->{twig_current};
2449
2450
55618
100
100
70396
    if( $t->{twig_in_cdata})
2451       { # text is the continuation of a previously created cdata
2452
6156
5153
        $elt->{cdata}.= $t->{twig_stored_spaces} . $string;
2453       }
2454     elsif( $t->{twig_in_pcdata})
2455       { # text is the continuation of a previously created pcdata
2456
314
100
370
        if( $t->{extra_data})
2457
75
139
          { $elt->_push_extra_data_in_pcdata( $t->{extra_data}, length( $elt->{pcdata}));
2458
75
70
            $t->{extra_data}='';
2459           }
2460
314
266
        $elt->{pcdata}.= $string;
2461       }
2462     else
2463       {
2464         # text is just space, which might be discarded later
2465
49148
100
80653
        if( $string=~/\A\s*\Z/s)
2466           {
2467
10362
100
9799
            if( $t->{extra_data})
2468               { # we got extra data (comment, pi), lets add the spaces to it
2469
34
35
                $t->{extra_data} .= $string;
2470               }
2471             else
2472               { # no extra data, just store the spaces
2473
10328
11534
                $t->{twig_stored_spaces}.= $string;
2474               }
2475           }
2476         else
2477
38786
52616
          { my $new_elt= _insert_pcdata( $t, $t->{twig_stored_spaces}.$string);
2478
38786
36402
            delete $elt->{'twig_current'};
2479
38786
27994
            $new_elt->{'twig_current'}=1;
2480
38786
24620
            $t->{twig_current}= $new_elt;
2481
38786
25802
            $t->{twig_in_pcdata}=1;
2482
38786
100
51802
            if( $t->{extra_data})
2483
23
36
              { $new_elt->_push_extra_data_in_pcdata( $t->{extra_data}, 0);
2484
23
24
                $t->{extra_data}='';
2485               }
2486           }
2487       }
2488
55618
126368
    return;
2489   }
2490
2491 sub _twig_cdatastart
2492    { # warn " in _twig_cdatastart...\n"; # DEBUG handler
2493
2494
103
103
74
    my $p= shift;
2495
103
88
    my $t=$p->{twig};
2496
2497
103
90
    $t->{twig_in_cdata}=1;
2498
103
147
    my $cdata= $t->{twig_elt_class}->new( $CDATA);
2499
103
86
    my $twig_current= $t->{twig_current};
2500
2501
103
100
113
    if( $t->{twig_in_pcdata})
2502       { # create the node as a sibling of the PCDATA
2503
20
20
19
100
20
28
31
        $cdata->{prev_sibling}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $cdata->{prev_sibling});} ;
2504
20
18
        $twig_current->{next_sibling}= $cdata;
2505
20
24
        my $parent= $twig_current->{parent};
2506
20
20
19
100
13
25
27
        $cdata->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $cdata->{parent});} ;
2507
20
20
20
19
100
21
17
24
24
         $parent->{empty}=0; $parent->{last_child}=$cdata; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ;
2508
20
21
        $t->{twig_in_pcdata}=0;
2509       }
2510     else
2511       { # we have to create a PCDATA element if we need to store spaces
2512
83
100
67
132
        if( $t->_space_policy($XML::Twig::index2gi[$twig_current->{'gi'}]) && $t->{twig_stored_spaces})
2513
5
6
          { _insert_pcdata( $t, $t->{twig_stored_spaces}); }
2514
83
76
        $t->{twig_stored_spaces}='';
2515
2516         # create the node as a child of the current element
2517
83
83
83
50
69
94
121
        $cdata->{parent}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $cdata->{parent});} ;
2518
83
100
103
        if( my $prev_sibling= $twig_current->{last_child})
2519
8
8
8
50
9
12
9
          { $cdata->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $cdata->{prev_sibling});} ;
2520
8
8
            $prev_sibling->{next_sibling}= $cdata;
2521           }
2522         else
2523
75
73
          { $twig_current->{first_child}= $cdata; }
2524
83
83
83
83
50
56
74
96
114
         $twig_current->{empty}=0; $twig_current->{last_child}=$cdata; if( $XML::Twig::weakrefs) { weaken( $twig_current->{last_child});} ;
2525
2526       }
2527
2528
103
118
    delete $twig_current->{'twig_current'};
2529
103
68
    $t->{twig_current}= $cdata;
2530
103
74
    $cdata->{'twig_current'}=1;
2531
103
4
4
100
131
6
3
    if( $t->{extra_data}) { $cdata->set_extra_data( $t->{extra_data}); $t->{extra_data}='' };
2532
103
429
    return;
2533   }
2534
2535 sub _twig_cdataend
2536    { # warn " in _twig_cdataend...\n"; # DEBUG handler
2537
2538
103
103
75
    my $p= shift;
2539
103
74
    my $t=$p->{twig};
2540
2541
103
83
    $t->{twig_in_cdata}=0;
2542
2543
103
71
    my $elt= $t->{twig_current};
2544
103
92
    delete $elt->{'twig_current'};
2545
103
94
    my $cdata= $elt->{cdata};
2546
103
113
    $elt->_set_cdata( $cdata);
2547
2548
103
103
65
199
    push @{$t->{_twig_context_stack}}, { $ST_TAG => $CDATA };
2549
2550
103
100
151
    if( $t->{twig_handlers})
2551       { # look for handlers
2552
19
28
        my @handlers= _handler( $t, $t->{twig_handlers}, $CDATA);
2553
19
15
        local $_= $elt; # so we can use $_ in the handlers
2554
19
4
100
33
6
        foreach my $handler ( @handlers) { $handler->($t, $elt) || last; }
2555       }
2556
2557
103
103
52
96
    pop @{$t->{_twig_context_stack}};
2558
2559
103
130
    $elt= $elt->{parent};
2560
103
82
    $t->{twig_current}= $elt;
2561
103
82
    $elt->{'twig_current'}=1;
2562
2563
103
98
    $t->{twig_long_cdata}=0;
2564
103
197
    return;
2565   }
2566
2567 sub _pi_elt_handlers
2568
1525
1072
  { my( $t, $pi)= @_;
2569
1525
100
3177
    my $pi_handlers= $t->{twig_handlers}->{pi_handlers} || return;
2570
66
101
    foreach my $handler ( $pi_handlers->{$pi->{target}}, $pi_handlers->{''})
2571
132
66
66
50
100
239
47
101
      { if( $handler) { local $_= $pi; $handler->( $t, $pi) || last; } }
2572   }
2573
2574 sub _pi_text_handler
2575
1506
1123
  { my( $t, $target, $data)= @_;
2576
1506
100
2290
    if( my $handler= $t->{twig_handlers}->{pi_handlers}->{$target})
2577
1
3
      { return $handler->( $t, $target, $data); }
2578
1505
100
1620
    if( my $handler= $t->{twig_handlers}->{pi_handlers}->{''})
2579
1
3
      { return $handler->( $t, $target, $data); }
2580
1504
100
100
4942
    return defined( $data) && $data ne '' ? "<?$target $data?>" : "<?$target?>" ;
2581   }
2582
2583 sub _comment_elt_handler
2584
1755
1125
  { my( $t, $comment)= @_;
2585
1755
100
4068
    if( my $handler= $t->{twig_handlers}->{handlers}->{$COMMENT})
2586
6
6
5
10
      { local $_= $comment; $handler->($t, $comment); }
2587   }
2588
2589 sub _comment_text_handler
2590
1892
1269
  { my( $t, $comment)= @_;
2591
1892
100
3046
    if( my $handler= $t->{twig_handlers}->{handlers}->{$COMMENT})
2592
4
10
      { $comment= $handler->($t, $comment);
2593
4
2
100
100
22
3
        if( !defined $comment || $comment eq '') { return ''; }
2594       }
2595
1890
2999
    return "<!--$comment-->";
2596   }
2597
2598
2599
2600 sub _twig_comment
2601    { # warn " in _twig_comment...\n"; # DEBUG handler
2602
2603
3653
16883
    my( $p, $comment_text)= @_;
2604
3653
2788
    my $t=$p->{twig};
2605
2606
3653
1751
100
4334
2595
    if( $t->{twig_keep_encoding}) { $comment_text= substr( $p->original_string(), 4, -3); }
2607
2608
3653
12427
    $t->_twig_pi_comment( $p, $COMMENT, $t->{twig_keep_comments}, $t->{twig_process_comments},
2609                           '_set_comment', '_comment_elt_handler', '_comment_text_handler', $comment_text
2610                         );
2611
3653
9919
    return;
2612   }
2613
2614 sub _twig_pi
2615    { # warn " in _twig_pi...\n"; # DEBUG handler
2616
2617
3038
12526
    my( $p, $target, $data)= @_;
2618
3038
2430
    my $t=$p->{twig};
2619
2620
3038
100
3539
    if( $t->{twig_keep_encoding})
2621
1518
2351
      { my $pi_text= substr( $p->original_string(), 2, -2);
2622
1518
6841
        ($target, $data)= split( /\s+/, $pi_text, 2);
2623       }
2624
2625
3038
5763
    $t->_twig_pi_comment( $p, $PI, $t->{twig_keep_pi}, $t->{twig_process_pi},
2626                           '_set_pi', '_pi_elt_handlers', '_pi_text_handler', $target, $data
2627                         );
2628
3038
8531
    return;
2629   }
2630
2631 sub _twig_pi_comment
2632
6691
12309
  { my( $t, $p, $type, $keep, $process, $set, $elt_handler, $text_handler, @parser_args)= @_;
2633
2634
6691
100
7295
    if( $t->{twig_input_filter})
2635
2
3
2
7
          { foreach my $arg (@parser_args) { $arg= $t->{twig_input_filter}->( $arg); } }
2636
2637     # if pi/comments are to be kept then we piggyback them to the current element
2638
6691
100
100
7645
    if( $keep)
2639       { # first add spaces
2640
3398
100
3854
        if( $t->{twig_stored_spaces})
2641
22
26
              { $t->{extra_data}.= $t->{twig_stored_spaces};
2642
22
23
                $t->{twig_stored_spaces}= '';
2643               }
2644
2645
3398
4810
        my $extra_data= $t->$text_handler( @parser_args);
2646
3398
4849
        $t->{extra_data}.= $extra_data;
2647
2648       }
2649     elsif( $process)
2650       {
2651
3280
2267
        my $twig_current= $t->{twig_current}; # defined unless we are outside of the root
2652
2653
3280
4861
        my $elt= $t->{twig_elt_class}->new( $type);
2654
3280
4396
        $elt->$set( @parser_args);
2655
3280
100
3887
        if( $t->{extra_data})
2656
676
755
          { $elt->set_extra_data( $t->{extra_data});
2657
676
458
            $t->{extra_data}='';
2658           }
2659
2660
3280
100
100
100
2900
        unless( $t->root)
2661
1451
1278
          { $t->_add_cpi_outside_of_root( leading_cpi => $elt);
2662           }
2663         elsif( $t->{twig_in_pcdata})
2664           { # create the node as a sibling of the PCDATA
2665
7
13
            $elt->paste_after( $twig_current);
2666
7
4
            $t->{twig_in_pcdata}=0;
2667           }
2668         elsif( $twig_current)
2669           { # we have to create a PCDATA element if we need to store spaces
2670
376
100
100
572
            if( $t->_space_policy($XML::Twig::index2gi[$twig_current->{'gi'}]) && $t->{twig_stored_spaces})
2671
1
2
              { _insert_pcdata( $t, $t->{twig_stored_spaces}); }
2672
376
363
            $t->{twig_stored_spaces}='';
2673             # create the node as a child of the current element
2674
376
419
            $elt->paste_last_child( $twig_current);
2675           }
2676         else
2677
1446
1437
          { $t->_add_cpi_outside_of_root( trailing_cpi => $elt); }
2678
2679
3280
100
3052
        if( $twig_current)
2680
383
396
          { delete $twig_current->{'twig_current'};
2681
383
256
            my $parent= $elt->{parent};
2682
383
271
            $t->{twig_current}= $parent;
2683
383
330
            $parent->{'twig_current'}=1;
2684           }
2685
2686
3280
4082
        $t->$elt_handler( $elt);
2687       }
2688
2689   }
2690
2691
2692 # add a comment or pi before the first element
2693 sub _add_cpi_outside_of_root
2694
2902
2106
  { my($t, $type, $elt)= @_; # $type is 'leading_cpi' or 'trailing_cpi'
2695
2902
67
4782
    $t->{$type} ||= $t->{twig_elt_class}->new( '#CPI');
2696     # create the node as a child of the current element
2697
2902
3299
    $elt->paste_last_child( $t->{$type});
2698
2902
1743
    return $t;
2699   }
2700
2701 sub _twig_final
2702    { # warn " in _twig_final...\n"; # DEBUG handler
2703
2704
3084
21097
    my $p= shift;
2705
3084
100
9197
    my $t= $p->isa( 'XML::Twig') ? $p : $p->{twig};
2706
2707     # store trailing data
2708
3084
579
579
100
3966
983
464
    if( $t->{extra_data}) { $t->{trailing_cpi_text} = $t->{extra_data}; $t->{extra_data}=''; }
2709
3084
100
8195
    $t->{trailing_spaces}= $t->{twig_stored_spaces} || '';
2710
3084
3084
2190
3749
    my $s= $t->{twig_stored_spaces}; $s=~s{\n}{\\n}g;
2711
3084
287
100
3539
346
    if( $t->{twig_stored_spaces}) { my $s= $t->{twig_stored_spaces}; }
2712
2713     # restore the selected filehandle if needed
2714
3084
3535
    $t->_set_fh_to_selected_fh();
2715
2716
3084
100
3741
    $t->_trigger_tdh if( $t->{twig_tdh});
2717
2718
3084
100
3262
    select $t->{twig_original_selected_fh} if($t->{twig_original_selected_fh}); # probably dodgy
2719
2720
3084
100
3869
    if( exists $t->{twig_autoflush_data})
2721
36
39
      { my @args;
2722
36
100
78
        push @args, $t->{twig_autoflush_data}->{fh} if( $t->{twig_autoflush_data}->{fh});
2723
36
1
100
73
2
        push @args, @{$t->{twig_autoflush_data}->{args}} if( $t->{twig_autoflush_data}->{args});
2724
36
59
        $t->flush( @args);
2725
36
59
        delete $t->{twig_autoflush_data};
2726
36
50
76
        $t->root->delete if $t->root;
2727       }
2728
2729     # tries to clean-up (probably not very well at the moment)
2730     #undef $p->{twig};
2731
3084
3046
    undef $t->{twig_parser};
2732
3084
2972
    delete $t->{twig_parsing};
2733
3084
3084
2263
5072
    @{$t}{ qw( twig_parser twig_parsing _twig_context_stack twig_current) }=();
2734
2735
3084
5042
    return $t;
2736   }
2737
2738 sub _insert_pcdata
2739
38905
28236
  { my( $t, $string)= @_;
2740     # create a new PCDATA element
2741
38905
25050
    my $parent= $t->{twig_current}; # always defined
2742
38905
19729
    my $elt;
2743
38905
100
38709
    if( exists $t->{twig_alt_elt_class})
2744
162
235
      { $elt= $t->{twig_elt_class}->new( $PCDATA);
2745
162
157
        $elt->_set_pcdata( $string);
2746       }
2747     else
2748
38743
109928
      { $elt= bless( { gi => $XML::Twig::gi2index{$PCDATA}, pcdata => $string }, 'XML::Twig::Elt'); }
2749
2750
38905
29744
    my $prev_sibling= $parent->{last_child};
2751
38905
100
32449
    if( $prev_sibling)
2752
191
182
      { $prev_sibling->{next_sibling}= $elt;
2753
191
191
185
100
154
244
283
        $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
2754       }
2755     else
2756
38714
45673
      { $parent->{first_child}= $elt; }
2757
2758
38905
38905
20880
100
29084
39477
22940
    $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
2759
38905
38905
38905
20880
100
29307
42881
43015
21995
     $parent->{empty}=0; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ;
2760
38905
32758
    $t->{twig_stored_spaces}='';
2761
38905
40251
    return $elt;
2762   }
2763
2764 sub _space_policy
2765
1053
939
  { my( $t, $gi)= @_;
2766
1053
654
    my $policy;
2767
1053
100
1408
    $policy=0 if( $t->{twig_discard_spaces});
2768
1053
100
1280
    $policy=1 if( $t->{twig_keep_spaces});
2769
1053
100
100
1655
    $policy=1 if( $t->{twig_keep_spaces_in}
2770                && $t->{twig_keep_spaces_in}->{$gi});
2771
1053
100
100
1528
    $policy=0 if( $t->{twig_discard_spaces_in}
2772                && $t->{twig_discard_spaces_in}->{$gi});
2773
1053
1849
    return $policy;
2774   }
2775
2776
2777 sub _twig_entity
2778    { # warn " in _twig_entity...\n"; # DEBUG handler
2779
76
76
94
    my( $p, $name, $val, $sysid, $pubid, $ndata, $param)= @_;
2780
76
79
    my $t=$p->{twig};
2781
2782     #{ no warnings; my $base= $p->base; warn "_twig_entity called: expand: '$t->{twig_expand_external_ents}', base: '$base', name: '$name', val: '$val', sysid: '$sysid', pubid: '$pubid', ndata: '$ndata', param: '$param'\n";}
2783
2784
76
55
    my $missing_entity=0;
2785
2786
76
100
125
    if( $sysid)
2787
38
100
49
      { if($ndata)
2788
15
14
100
35
16
          { if( ! -f _based_filename( $sysid, $p->base)) { $missing_entity= 1; }
2789           }
2790         else
2791
23
100
41
          { if( $t->{twig_expand_external_ents})
2792
5
5
9
16
              { $val= eval { _slurp_uri( $sysid, $p->base) };
2793
5
100
22
                if( ! defined $val)
2794
3
100
10
                  { if( $t->{twig_extern_ent_nofail})
2795
2
3
                      { $missing_entity= 1; }
2796                     else
2797
1
6
                      { _croak( "cannot load SYSTEM entity '$name' from '$sysid': $@", 3); }
2798                   }
2799               }
2800           }
2801       }
2802
2803
75
200
    my $ent=XML::Twig::Entity->new( $name, $val, $sysid, $pubid, $ndata, $param);
2804
75
16
100
103
34
    if( $missing_entity) { $t->{twig_missing_system_entities}->{$name}= $ent; }
2805
2806
75
107
    my $entity_list= $t->entity_list;
2807
75
75
50
131
107
    if( $entity_list) { $entity_list->add( $ent); }
2808
2809
75
50
110
    if( $parser_version > 2.27)
2810       { # this is really ugly, but with some versions of XML::Parser the value
2811         # of the entity is not properly returned by the default handler
2812
75