XML, the Perl Way

File Coverage

File: tools/xml_split/xml_split
Coverage: 79.2%

line stmt bran cond sub pod time code
1 #!/usr/bin/perl -w
2 # $Id: /xmltwig/trunk/tools/xml_split/xml_split 17 2007-06-04T11:57:10.366292Z mrodrigu $
3
29
29
29
29
76395
29
802
use strict;
4
5
29
29
29
29
38864
41
194
use XML::Twig;
6
29
29
29
29
7583
16536
2283
use FindBin qw( $RealBin $RealScript);
7
29
29
29
29
30433
696
1877
use Getopt::Std;
8
9
29
2148783
import xml_split::state::parser;
10
29
165
import xml_split::state::twig;
11
12
29
58
undef $Getopt::Std::STANDARD_HELP_VERSION;
13
29
33
$Getopt::Std::STANDARD_HELP_VERSION=1; # to stop processing after --help or --version
14
15
29
29
29
29
94
22
61705
use vars qw( $VERSION $USAGE);
16
17
29
38
$VERSION= "0.06";
18
29
37
$USAGE= "xml_split [-l <level> [-s <size> | -g <nb_grouped>] | -c <cond>] [-b <base>] [-n <nb>] [-e <ext>] [-p <plugin>] [-I <plugin_dir>] [-i] [-d] [-v] [-h] [-m] [-V] <files>\n";
19
20 { # main block
21
22
29
29
125
39
my $opt={};
23
29
72
getopts('l:c:b:g:n:e:p:is:dvhmV', $opt);
24
25 # defaults
26
29
100
1463
$opt->{n} ||= 2; # number of digits used for creating parts
27
29
33
50
161
$opt->{I} ||= ($ENV{HOME} || '') . "/.xml_split";
28
29
29
1
100
77
0
if( $opt->{h}) { die $USAGE, "\n"; }
30
28
1
100
51
0
if( $opt->{m}) { exec "pod2text $RealBin/$RealScript"; }
31
27
1
1
100
51
25
0
if( $opt->{V}) { print "xml_split version $VERSION\n"; exit; }
32
33
26
75
my %factor=( ' ' => 1, K => 1000, M => 1_000_000, G => 1_000_000_000);
34
26
6
1
100
100
51
12
0
if( $opt->{s}) { if( $opt->{c}) { die "cannot use -c and -s at the same time\n"; }
35
36
5
100
28
                 if( $opt->{s}=~ m{^\s*(\d+)\s*(G[bo]?|M[bo]?|K[bo]?\s*)?$}i)
37
4
50
21
                   { my( $size, $unit)= ($1, uc substr( $2 || ' ', 0, 1));
38
4
12
                     $opt->{s}= $size * $factor{$unit};
39                    }
40                  else
41
1
0
                   { die "invalid size (should be in Kb, Mb or Gb): '$opt->{s}'\n"; }
42                }
43
44
24
8
100
100
100
76
14
if( $opt->{g}) { die "cannot use -g and -s at the same time\n" if( $opt->{s});
45
7
100
9
                 die "cannot use -g and -c at the same time\n" if( $opt->{c});
46
6
100
17
                 $opt->{l} ||= 1;
47                }
48
5
50
9
elsif( $opt->{c}) { die "cannot use -l and -c at the same time\n" if( $opt->{l}); }
49
11
11
100
30
23
else { $opt->{l} ||= 1; $opt->{c}= "level( $opt->{l})"; }
50
51
52
22
100
127
my $options= { cond => $opt->{c},
53                base => $opt->{b}, nb_digits => $opt->{n}, ext => $opt->{e},
54                plugin => $opt->{p},
55                no_pi => $opt->{d},
56                verbose => $opt->{v},
57                xinclude => $opt->{i} ? 1 : 0,
58              };
59
60
61
22
22
my $state;
62
22
50
48
if( my $plugin= $opt->{p})
63
0
0
0
0
0
  { if( $plugin!~ m{^[\w:.-]+$}) { die "wrong plugin name '$plugin' (only word characters are allowed in plugin names)\n"; }
64
0
0
    push @INC, $opt->{I};
65
0
0
0
0
    eval { require $plugin };
66
0
0
0
0
0
    if( $@) { die "cannot find plugin '$plugin': $!"; }
67
0
0
    import $plugin;
68
0
0
    $state= $plugin->new( $options);
69   }
70
71
72
22
100
100
100
65
if( $opt->{s})
73
3
33
22
  { $state||= xml_split::state::parser->new( $options);
74
3
7
    $state->{level} = $opt->{l};
75
3
5
    $state->{size} = $opt->{s};
76
3
5
    $state->{current_size}=0;
77
3
10
    $state->{handlers}= { Start => \&parser_start_tag_size, End => \&parser_end_tag_size , Default => \&parser_default_size};
78
3
50
9
    warn "using XML::Parser\n" if( $opt->{v});
79
3
6
    split_with_parser( $state, @ARGV);
80   }
81 elsif( $opt->{g})
82
6
33
31
  { $state||= xml_split::state::parser->new( $options);
83
6
7
    $state->{level}= $opt->{l};
84
6
9
    $state->{group}= $opt->{g};
85
6
17
    $state->{handlers}= { Start => \&parser_start_tag_grouped, End => \&parser_end_tag_grouped , Default => \&parser_default_grouped};
86
6
50
11
    warn "using XML::Parser\n" if( $opt->{v});
87
6
10
    split_with_parser( $state, @ARGV);
88   }
89 elsif( $opt->{l})
90
8
33
39
  { $state||= xml_split::state::parser->new( $options);
91
8
12
    $state->{level}= $opt->{l};
92
8
23
    $state->{handlers}= { Start => \&parser_start_tag_level, End => \&parser_end_tag_level , Default => \&parser_default_level};
93
8
50
17
    warn "using XML::Parser\n" if( $opt->{v});
94
8
14
    split_with_parser( $state, @ARGV);
95   }
96 else
97
5
33
25
  { $state||= xml_split::state::twig->new( $options);
98
5
8
    split_with_twig( $state, @ARGV);
99   }
100
101
22
0
exit;
102 }
103
104 sub split_with_twig
105
5
5
8
  { my( $state, @files)= @_;
106
5
50
10
    if( !@files)
107
0
0
0
      { $state->{base} ||= 'out';
108
0
0
0
        $state->{ext} ||= '.xml';
109
0
0
        my $twig_options= twig_options( $state);
110
0
0
        my $t= XML::Twig->new( %$twig_options, $state);
111
0
0
        $state->{twig}= $t;
112
0
0
        $t->parse( \*STDIN);
113
0
0
        end_file( $t, $state);
114       }
115     else
116
5
7
      { foreach my $file (@files)
117           {
118
5
0
50
10
0
            unless( $state->{base}) { $state->{seq_nb}=0; }
119
5
36
            my( $base, $ext)= ($file=~ m{^(.*?)(\.\w+)?$});
120
5
33
15
            $state->{base} ||= $base;
121
5
33
19
            $state->{ext} ||= $ext || '.xml';
122
5
8
            my $twig_options= twig_options( $state);
123
5
28
            my $t= XML::Twig->new( %$twig_options);
124
5
7
            $state->{twig}= $t;
125
5
12
            $t->parsefile( $file);
126
5
9
            end_file( $t, $state);
127           }
128       }
129   }
130
131 sub split_with_parser
132
17
17
23
  { my( $state, @files)= @_;
133
17
50
34
    if( !@files)
134
0
0
0
      { $state->{base} ||= 'out';
135
0
0
0
        $state->{ext} ||= '.xml';
136
0
0
        my $parser_options= parser_options( $state);
137
0
0
        my $p= XML::Parser->new( %$parser_options);
138
0
0
        $state->{parser}= $p;
139
0
0
        $p->parse( \*STDIN);
140       }
141     else
142
17
23
      { foreach my $file (@files)
143           {
144
17
1
100
43
1
            unless( $state->{base}) { $state->{seq_nb}=0; }
145
17
128
            my( $base, $ext)= ($file=~ m{^(.*?)(\.\w+)?$});
146
17
67
39
            $state->{base} ||= $base;
147
17
33
68
            $state->{ext} ||= $ext || '.xml';
148
17
25
            my $parser_options= parser_options( $state);
149
17
90
            my $p= XML::Parser->new( %$parser_options);
150
17
434
            $state->{parser}= $p;
151
17
44
            $p->parsefile( $file);
152           }
153       }
154   }
155
156 sub parser_options
157
17
17
18
  { my( $state)= @_;
158     # prepare output to the main document
159
17
50
33
    unless( $state->{no_pi})
160
17
62
      { my $file_name= $state->main_file_name(); # main file name
161
17
50
36
        warn "generating main file $file_name\n" if( $state->{verbose});
162
17
50
715
        open( my $out, '>', $file_name) or die "cannot create main file '$file_name': $!";
163
17
26
        $state->{main_fh}= $out;
164
17
31
        $state->{current_fh}= $out;
165       }
166
189
189
98030
   my $handlers= { Start => sub { $state->{handlers}->{Start}->( $state, shift( @_)); },
167
189
189
765
                   End => sub { $state->{handlers}->{End}->( $state, shift( @_)); },
168
618
618
2882
                   Default => sub { $state->{handlers}->{Default}->( $state, shift( @_)); },
169
5
5
224248
                   XMLDecl => sub { parser_declaration( $state, @_); },
170
17
117
                 };
171
172
17
38
    return { Handlers => $handlers };
173   }
174
175 ###################################################################################
176 # #
177 # handlers for the -l option #
178 # #
179 ###################################################################################
180
181 sub parser_start_tag_level
182
84
84
66
  { my( $state, $p)= @_;
183
184
84
100
202
    if( $p->depth == $state->{level})
185
34
181
      { $state->{seq_nb}++;
186
34
79
        my $file_name= $state->file_name;
187         # prepare chunk file
188
34
50
53
        warn "generating $file_name\n" if( $state->{verbose});
189
34
50
841
        open( my $out, '>', $file_name) or die "cannot create output file '$file_name': $!";
190
34
43
        $state->{current_fh}= $out;
191
34
10
10
100
49
7
46
        if( $state->{xml_declaration}) { print {$state->{current_fh}} $state->{xml_declaration}, "\n"; }
192         # output pi
193
34
50
51
        unless( $state->{no_pi})
194
34
34
24
65
          { print {$state->{main_fh}} $state->include( $file_name) ; }
195       }
196
84
84
50
277
163
    print {$state->{current_fh}} $p->original_string if( $state->{current_fh});
197   }
198
199 sub parser_end_tag_level
200
84
84
59
  { my( $state, $p)= @_;
201
84
84
50
101
138
    print {$state->{current_fh}} $p->original_string if( $state->{current_fh});
202
84
100
312
    if( $p->depth == $state->{level})
203
34
50
162
      { unless( $state->{current_fh} == $state->{main_fh})
204
34
485
          { close $state->{current_fh};
205
34
197
            $state->{current_fh}= $state->{main_fh};
206           }
207       }
208   }
209
210 sub parser_default_level
211
282
282
208
  { my( $state, $p)= @_;
212
282
282
50
350
530
    print {$state->{current_fh}} $p->original_string if( $state->{current_fh});
213   }
214
215
216 ###################################################################################
217 # #
218 # handlers for the -s option #
219 # #
220 ###################################################################################
221
222 sub parser_start_tag_size
223
39
39
28
  { my( $state, $p)= @_;
224
39
100
100
50
    if( $p->depth == $state->{level} && !$state->{current_size})
225       {
226
3
18
        $state->{seq_nb}++;
227
3
12
        my $file_name= $state->file_name;
228         # prepare chunk file
229
3
50
7
        warn "generating $file_name\n" if( $state->{verbose});
230
3
50
96
        open( my $out, '>', $file_name) or die "cannot create output file '$file_name': $!";
231
3
6
        $state->{current_fh}= $out;
232
3
1
100
6
7
        print {$state->{current_fh}} qq{$state->{xml_declaration}\n} if $state->{xml_declaration};
233
3
3
11
11
        print {$state->{current_fh}} qq{<xml_split:root xmlns:xml_split="http://xmltwig.com/xml_split">\n};
234         # output pi
235
3
50
7
        unless( $state->{no_pi})
236
3
3
2
8
          { print {$state->{main_fh}} $state->include( $file_name) ; }
237
3
5
        $state->{store_size}=1;
238       }
239
39
142
    my $original_string= $p->original_string;
240
39
100
147
    $state->{current_size} += length( $original_string) if( $state->{store_size});
241
39
39
50
47
117
    print {$state->{current_fh}} $original_string if( $state->{current_fh});
242   }
243
244 sub parser_end_tag_size
245
39
39
28
  { my( $state, $p)= @_;
246
39
47
    my $original_string= $p->original_string;
247
39
50
131
    $state->{current_size} += length( $original_string) if( $state->{store_size});
248
39
50
67
45
    if( $p->depth == $state->{level} && $state->{current_size} > $state->{size})
249
0
0
0
0
0
      { print {$state->{current_fh}} $original_string if( $state->{current_fh});
250
0
0
        end_file_with_size( $state);
251       }
252     else
253
39
3
100
151
11
      { if($p->depth < $state->{level}) { end_file_with_size( $state); }
254
39
39
50
133
61
        print {$state->{current_fh}} $p->original_string if( $state->{current_fh});
255       }
256   }
257
258 sub end_file_with_size
259
3
3
3
  { my( $state)= @_;
260
3
50
8
    unless( $state->{current_fh} == $state->{main_fh})
261
3
3
1
5
      { print {$state->{current_fh}} qq{\n</xml_split:root>\n};
262
3
2012
        close $state->{current_fh};
263
3
10
        $state->{current_size}=0;
264
3
3
        $state->{store_size}=0;
265
3
7
        $state->{current_fh}= $state->{main_fh};
266       }
267   }
268
269 sub parser_default_size
270
124
124
81
  { my( $state, $p)= @_;
271
124
152
    my $string= $p->original_string;
272
124
100
380
    if( $state->{store_size})
273
114
75
      { $state->{current_size} += length( $string);
274
114
0
50
132
0
        if( $p->depth < $state->{level}) { end_file_with_size( $state); }
275       }
276
124
124
50
368
363
    print {$state->{current_fh}} $string if( $state->{current_fh});
277   }
278
279 ###################################################################################
280 # #
281 # handlers for the -g option #
282 # #
283 ###################################################################################
284
285 sub parser_start_tag_grouped
286
66
66
46
  { my( $state, $p)= @_;
287
66
100
89
    if( $p->depth == $state->{level})
288
32
100
109
      { if( !$state->{current_nb})
289
13
11
          { $state->{seq_nb}++;
290
13
35
            my $file_name= $state->file_name;
291             # prepare chunk file
292
13
50
19
            warn "generating $file_name\n" if( $state->{verbose});
293
13
50
301
            open( my $out, '>', $file_name) or die "cannot create output file '$file_name': $!";
294
13
17
            $state->{current_fh}= $out;
295
13
13
26
9
26
97
            print {$state->{current_fh}} join( "\n", grep { $_ } ( $state->{xml_declaration},
296                                                                    qq{<xml_split:root xmlns:xml_split="http://xmltwig.com/xml_split">\n }
297                                                                  )
298                                              );
299             # output pi
300
13
50
23
            unless( $state->{no_pi})
301
13
13
9
24
              { print {$state->{main_fh}} $state->include( $file_name) ; }
302           }
303       }
304
66
66
50
175
111
    print {$state->{current_fh}} $p->original_string if( $state->{current_fh});
305   }
306
307 sub parser_end_tag_grouped
308
66
66
46
  { my( $state, $p)= @_;
309
66
100
79
    if( $p->depth == $state->{level})
310
32
32
50
112
48
      { print {$state->{current_fh}} $p->original_string if( $state->{current_fh});
311
32
90
        $state->{current_nb}++;
312
32
10
100
73
16
        if( $state->{current_nb} == $state->{group}) { end_file_grouped( $state); }
313       }
314     else
315
34
11
100
109
43
      { if($p->depth < $state->{level}) { end_file_grouped( $state, { no_nl => 1 }); }
316
34
34
50
90
58
        print {$state->{current_fh}} $p->original_string if( $state->{current_fh});
317       }
318   }
319
320 sub end_file_grouped
321
21
21
19
  { my( $state, $options)= @_;
322
21
10
100
29
10
    print {$state->{current_fh}} qq{\n} unless( $options->{no_nl});
323
21
100
41
    unless( $state->{current_fh} == $state->{main_fh})
324
13
13
11
10
      { print {$state->{current_fh}} qq{</xml_split:root>\n};
325
13
332
        close $state->{current_fh};
326
13
13
        $state->{current_nb}=0;
327
13
72
        $state->{current_fh}= $state->{main_fh};
328       }
329   }
330
331 sub parser_default_grouped
332
212
212
143
  { my( $state, $p)= @_;
333
212
212
50
243
289
    print {$state->{current_fh}} $p->original_string if( $state->{current_fh});
334   }
335
336 sub char_parser
337
0
0
0
  { my( $state, $p)=( shift, shift);
338
0
0
0
0
0
    print {$state->{current_fh}} $_[0] if( $state->{current_fh});
339   }
340
341 sub parser_declaration
342
5
5
10
  { my( $state, $p, $version, $encoding, $standalone)= @_;
343
5
50
17
    $state->{xml_declaration}= $p->recognized_string || '';
344
5
5
49
56
    print {$state->{main_fh}} $state->{xml_declaration};
345     # avoid calling original_string if not needed
346     #if( !$state->{xml_declaration} || $state->{xml_declaration}=~ m{encoding\s*=\s*["']utf-?8["']}i)
347     # { $state->{utf8_encoded}=1;
348     # $p->setHandlers( Char => \&char_parser);
349     # }
350   }
351
352
353 sub twig_options
354
5
5
5
  { my( $state)= @_;
355
356     # base options, ensures maximun fidelity to the original document
357
5
12
    my $twig_options= { keep_encoding => 1, keep_spaces => 1 };
358
359     # prepare output to the main document
360
5
50
12
    unless( $state->{no_pi})
361
5
15
      { my $file_name= $state->main_file_name(); # main file name
362
5
50
10
        warn "generating main file $file_name\n" if( $state->{verbose});
363
5
50
222
        open( my $out, '>', $file_name) or die "cannot create main file '$file_name': $!";
364
5
8
        $state->{out}= $out;
365
5
7
        $twig_options->{twig_print_outside_roots}= $out;
366
5
33
33
20
58
        $twig_options->{start_tag_handlers}= { $state->{cond} => sub { $_->set_att( '#in_fragment' => 1); } };
367       }
368
369
5
33
33
15
43
    $twig_options->{twig_roots}= { $state->{cond} => sub { dump_elt( @_, $state); } };
370
5
6
    return $twig_options;
371   }
372
373 sub dump_elt
374
33
33
24
  { my( $t, $elt, $state)= @_;
375
33
28
    $state->{seq_nb}++;
376
33
23
    $state->{elt}= $elt;
377
378
33
139
    my $file_name= $state->file_name;
379
33
50
42
    warn "generating $file_name\n" if( $state->{verbose});
380
381
33
90
    my $fragment= XML::Twig->new();
382
33
34
    $fragment->{twig_xmldecl} = $t->{twig_xmldecl};
383
33
29
    $fragment->{twig_doctype} = $t->{twig_doctype};
384
33
34
    $fragment->{twig_dtd} = $t->{twig_dtd};
385
386
33
50
36
    if( !$state->{no_pis})
387       { # if we are still within a fragment, just replace the element by the PI
388         # otherwise print it to the main document
389
33
412
        my $include= $state->include( $file_name);
390
391
33
46
        $elt->del_att( '#in_fragment');
392
393
33
100
50
        if( $elt->inherited_att( '#in_fragment'))
394
2
6
          { $elt->parent( '*[@#in_fragment="1"]')->set_att( '#has_subdocs' => 1);
395
2
5
            $include->replace( $elt);
396           }
397         else
398
31
44
          { $elt->cut;
399
31
45
            $include->print( $state->{out});
400           }
401       }
402     else
403
0
0
      { $elt->cut; }
404
405
33
54
    $fragment->set_root( $elt);
406
33
50
885
    open( my $out, '>', $file_name) or die "cannot create output file '$file_name': $!";
407     #if( $state->{xml_declaration}) { warn "c1"; print {$out} $state->{xml_declaration}, "\n"; }
408     #if( $fragment->{xml_decl}) { warn "c2"; print {$out} $fragment->xml_decl, "\n"; }
409
33
68
    $fragment->set_keep_encoding( 1);
410
33
48
    $fragment->print( $out);
411
33
49043
    close $out;
412   }
413
414 sub end_file
415
5
5
7
  { my( $t, $state)= @_;
416
5
50
12
    unless( $state->{no_pi})
417
5
31006
      { close $state->{out}; }
418   }
419
420
421 # for Getop::Std
422
0
0
0
sub HELP_MESSAGE { return $USAGE; }
423
0
0
0
sub VERSION_MESSAGE { return $VERSION; }
424
425 package xml_split::state;
426
427 sub new
428
22
22
31
  { my( $ref, $options)= @_;
429
22
37
    my $state= bless $options, $ref;
430
22
72
    $state->{seq_nb}=0;
431
22
53
    return $state;
432   }
433
434 sub file_name
435
83
83
59
  { my( $state)= @_;
436
83
232
    my $nb= sprintf( "%0$state->{nb_digits}d", $state->{seq_nb});
437
83
109
    my $file_name= "$state->{base}-$nb$state->{ext}";
438
83
97
    $file_name =~ s{\\}{/}g;
439
83
105
    return $file_name;
440   }
441
442 sub main_file_name
443
22
22
24
  { my( $state)= @_;
444
22
70
    my $nb= sprintf( "%0$state->{nb_digits}d", 0);
445
22
49
    my $file_name= "$state->{base}-$nb$state->{ext}";
446
22
29
    return $file_name;
447   }
448
0
0
1;
449
450 ###################################################################################
451 # #
452 # state when using XML::Parser #
453 # #
454 ###################################################################################
455
456 package xml_split::state::parser;
457
0
0
import xml_split::state;
458
29
29
29
29
103
23
1714
use base 'xml_split::state';
459
460 sub include
461
50
50
56
  { my( $state, $file_name)= @_;
462
50
100
75
    if( $state->{xinclude})
463
12
35
      { return qq{<xi:include href="$file_name" />}; }
464     else
465
38
97
      { return qq{<?merge subdocs = 0 :$file_name?>}; }
466   }
467
0
0
1;
468
469 ###################################################################################
470 # #
471 # state when using XML::Twig #
472 # #
473 ###################################################################################
474
475 package xml_split::state::twig;
476
0
0
import xml_split::state;
477
29
29
29
29
4073
31
1453
use base 'xml_split::state';
478
479 sub include
480
33
33
30
  { my( $state, $file_name)= @_;
481
33
14
    my $include;
482
33
100
58
    my $subdocs= $state->{elt}->att( '#has_subdocs') || 0;
483
33
100
39
    if( $state->{xinclude})
484
12
33
      { $include= XML::Twig::Elt->new( 'xi:include', { href => $file_name });
485
12
1
100
23
2
        if( $subdocs) { $include->set_att( subdocs => 1); }
486       }
487     else
488       {
489
21
33
        $include= XML::Twig::Elt->new( '#PI')
490                                  ->set_pi( merge => " subdocs = $subdocs :$file_name");
491       }
492
33
34
    return $include;
493   }
494
495
0
0
1;
496
497 package main;
498