#!/usr/bin/env perl #-*- Mode: perl; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- # XML printing, scanning and parsing. # # Copyright (C) 2000-2001 Ximian, Inc. # # Authors: Hans Petter Jansson # Arturo Espinosa # Kenneth Christiansen # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU Library General Public License as published # by the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Library General Public License for more details. # # You should have received a copy of the GNU Library General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. $SCRIPTSDIR = "@scriptsdir@"; if ($SCRIPTSDIR =~ /^@scriptsdir[@]/) { $SCRIPTSDIR = "."; $DOTIN = ".in"; } require "$SCRIPTSDIR/util.pl$DOTIN"; require "$SCRIPTSDIR/general.pl$DOTIN"; require "$SCRIPTSDIR/debug.pl$DOTIN"; use Text::ParseWords; $has_encode = 0; if (eval "require Encode") { Encode->import (); $has_encode = 1; } # --- XML print formatting --- # # &gst_xml_enter: Call after entering a block. Increases indent level. # &gst_xml_leave: Call before leaving a block. Decreases indent level. # &gst_xml_print_indent: Call before printing a line. Indents to current level. # &gst_xml_print_vspace: Ensures there is a vertical space of one and only one line. # &gst_xml_print: Indent, then print all arguments. Just for sugar. my $gst_indent_level = 0; my $gst_have_vspace = 0; my @gst_xml_stack; sub gst_xml_print_comment # (comment text) { my ($comment) = @_; &gst_xml_print_line ("") if $comment; } sub gst_xml_print_begin { my ($name) = @_; $name = $gst_name if !$name; &gst_xml_print_string ("\n"); &gst_xml_print_string ("\n\n"); &gst_xml_print_string ("<$name>\n"); &gst_xml_enter (); &gst_xml_print_vspace (); } # The frontend expects exactly this string. Don not alter. sub gst_xml_print_request_end { print "\n\n"; } sub gst_xml_print_end { my ($name) = @_; $name = $gst_name if !$name; &gst_xml_leave (); &gst_xml_print_vspace (); &gst_xml_print_string ("\n"); } sub gst_xml_enter { $gst_indent_level += 2; } sub gst_xml_leave { $gst_indent_level -= 2; } sub gst_xml_print_string { if ($has_encode) { if (&decode_utf8 ($_[0]) eq undef) { # we first decode the string, if it's not # utf-8 (returns undef), then encode it print "" . &encode_utf8 ($_[0]); return; } } # It could not encode the string, write it as is print $_[0]; } sub gst_xml_format_indent { $gst_have_vspace = 0; return " " x $gst_indent_level; } sub gst_xml_print_indent { &gst_xml_print_string(&gst_xml_format_indent ()); } sub gst_xml_print_vspace { if (not $gst_have_vspace) { &gst_xml_print_string ("\n"); $gst_have_vspace = 1; } } sub gst_xml_print_line { my $line; $line = join ("", @_); $line =~ tr/\n//d; &gst_xml_print_indent (); &gst_xml_print_string ($line . "\n"); } sub gst_xml_format_pcdata # (name, pcdata) { my ($name, $pcdata) = @_; return "<$name>$pcdata" if defined ($name) || defined ($pcdata); } sub gst_xml_print_pcdata # (name, pcdata) { my ($name, $pcdata) = @_; my $line = &gst_xml_format_pcdata ($name, $pcdata); &gst_xml_print_line ($line) if $line; } sub gst_xml_format_state_tag { my ($name, $state) = @_; my $boolean = &gst_print_boolean_truefalse ($state); return "<$name state='$boolean'/>"; } sub gst_xml_print_state_tag { my ($name, $state) = @_; my $state_tag = &gst_xml_format_state_tag ($name, $state); &gst_xml_print_line ($state_tag); } # Pass a hash and the keys whose items are scalars. Print val. sub gst_xml_print_scalars { my ($h, @scalar_keys) = @_; my ($i, $val); @scalar_keys = sort @scalar_keys; while ($i = shift @scalar_keys) { $val = &gst_xml_quote ($$h{$i}); &gst_xml_print_line ("<$i>$val\n") if exists $$h{$i}; } } # Print the @$array using <$tag>val foreach val in the array. # Actually lets print_structure do that now. Just print sequentially # the given elements, using as $tag as the surrounding tags. sub gst_xml_print_array { my ($array, $tag) = @_; my ($i, $val); return if (scalar @$array <= 0); &gst_xml_print_vspace (); foreach $i (@$array) { &gst_xml_print_structure ($i, $tag); } } # Pass a hash and the keys whose items are arrays. Print val foreach val # in the array at hash{key} sub gst_xml_print_arrays { my ($h, @array_keys) = @_; my ($i, $j, $val); foreach $i (sort @array_keys) { &gst_xml_print_array ($$h{$i}, $i) if (exists $$h{$i}) } } # Pass a hash, create a tqparent tag $tag and print val for every # value pair in the hash. If structure refs are found, these are recursively # printed with print_structure. sub gst_xml_print_hash { my ($hash, $tag) = @_; my ($j, $val); &gst_xml_print_vspace (); if (defined $tag) { &gst_xml_print_line ("<$tag>\n"); &gst_xml_enter (); } foreach $j (sort keys (%$hash)) { &gst_xml_print_structure ($$hash{$j}, $j); } if (defined $tag) { &gst_xml_leave (); &gst_xml_print_line ("\n"); } } # Call the corresponding function depending on the reference # type of $x. If just a scalar, print <$tag>$x. sub gst_xml_print_structure { my ($x, $tag) = @_; if (ref $x eq "ARRAY") { &gst_xml_print_array ($x, $tag); } elsif (ref $x eq "HASH") { &gst_xml_print_hash ($x, $tag); } else { &gst_xml_print_line ("<$tag>" . &gst_xml_quote ($x) . "\n"); } } # Treats hash as an array: doesn't use the keys as tags for its # elements, but the given tag. sub gst_xml_print_hash_hash { my ($h, $tag) = @_; my $i; foreach $i (sort keys %$h) { &gst_xml_print_hash ($$h{$i}, $tag); } } sub gst_xml_container_enter # (name) { my ($container) = @_; ## gst_xml_stack is not my, as it is defined at top, ## so it is global push @gst_xml_stack, $container; &gst_xml_print_line ("<$container>"); &gst_xml_enter(); } sub gst_xml_container_leave { ## checks if there is a start tag if ($#gst_xml_stack >= 0) { my $current_container = pop @gst_xml_stack; &gst_xml_leave (); &gst_xml_print_line (""); } } sub gst_xml_print_container # (name, @strings) { my ($name, @strings) = @_; if (@strings) { &gst_xml_container_enter ($name); foreach $tag (@strings) { &gst_xml_print_line ("$tag"); } &gst_xml_container_leave (); } } # --- XML printing from in-memory model --- # sub gst_xml_model_print_attributes { my ($tree) = @_; my ($attrs, $string); $attrs = @$tree [0]; for $attr (keys %$attrs) { $string .= " " . $attr . "=\"" . $$attrs{$attr} . "\""; } return $string; } sub gst_xml_model_print_recurse { my ($tree, $indent) = @_; my ($string); my @tqchildren = @$tree; shift @tqchildren; # Attributes while (@tqchildren) { my $branch = $tqchildren [1]; if ($tqchildren [0] eq "__unparsed__") { $string .= "<" . $tqchildren [1] . ">"; } elsif ($tqchildren [0] eq "0") { $string .= $tqchildren [1]; } elsif (@$branch == 1) # Empty tag. { $string .= "<" . $tqchildren [0] . &gst_xml_model_print_attributes ($branch) . "/>"; } else { $string .= "<" . $tqchildren [0] . &gst_xml_model_print_attributes ($branch) . ">"; $string .= &gst_xml_model_print_recurse ($branch); $string .= ""; } shift @tqchildren; shift @tqchildren; } # if ($branch) # { # return &gst_xml_get_attribute ($branch, $property) if $property ne ""; # return &gst_xml_get_pcdata ($branch); # } return $string; } sub gst_xml_model_print { my ($tree) = @_; my ($string); $string = &gst_xml_model_print_recurse ($tree); chomp $string; $string .= "\n"; return $string; } # --- XML scanning --- # # This code tries to replace XML::Parser scanning from stdin in tree mode. sub gst_xml_scan_make_kid_array { my $line = shift; my (%hash, @sublist, @attr, @list); # Remove < and /> $line =~ s/^[ \t]*[ \t]*$//; my @list = "ewords ('[ \t]+', 1, $line); shift @list; # push tag name. foreach (@list) { my @tmp = split '[ \t]*=[ \t]*'; if (scalar @tmp == 2 && $tmp[1] =~ s/[\"\']//g) { push @attr, @tmp; } } %hash = @attr; push (@sublist, \%hash); return \@sublist; } sub gst_xml_scan_recurse { my ($gst_xml_scan_list, $list_arg) = @_; my @list; if ($list_arg ne undef) { @list = $$list_arg[0]; } while (@$gst_xml_scan_list) { $el = shift @$gst_xml_scan_list; if (($el eq "") || $el =~ /^\<[!?].*\>$/s) { next; } # Empty strings, PI and DTD must go. if ($el =~ /^\<.*\/\>$/s) # Empty. { $el =~ /^\<([a-zA-Z0-9_-]+).*\/\>$/s; push (@list, $1); push (@list, &gst_xml_scan_make_kid_array ($el)); } elsif ($el =~ /^\<\/.*\>$/s) # End. { last; } elsif ($el =~ /^\<.*\>$/s) # Start. { $el =~ /^\<([a-zA-Z0-9_-]+).*\>$/s; push (@list, $1); $sublist = &gst_xml_scan_make_kid_array ($el); push (@list, &gst_xml_scan_recurse ($gst_xml_scan_list, $sublist)); next; } elsif ($el ne "") # PCDATA. { push (@list, 0); push (@list, "$el"); } } return \@list; } sub gst_xml_read_file { my ($file) = @_; my ($doc, $i); local *INPUT_FILE; open INPUT_FILE, $file; $doc .= $i while ($i = ); close INPUT_FILE; return $doc; } sub gst_xml_read_compressed_file { my ($file) = @_; my ($doc, $i, $fd); $fd = &gst_file_run_pipe_read ("gunzip -c $file"); return undef if $fd eq undef; $doc .= $i while ($i = <$fd>); &gst_file_close ($fd); if (length ($doc) < 4) # Allow for one blank line from gzip, '\n\r'. { $doc = undef; } return $doc; } sub gst_xml_read_stdin { my ($i, $doc); do { $i = ; if ($i ne undef) { $doc .=$i; } } while (! ($i =~ /^$/)); return $doc; } # (file, tool) If no file specified, reads stdin. # file could also contain xml document. # If tool is an gst_tool, stores the read buffer in # $$tool{"xml_doc"}. sub gst_xml_scan { my ($file, $tool) = @_; my ($doc, @tree, @gst_xml_scan_list); $file = $gst_input_file unless $file; if ($file && stat ($file)) { $doc = &gst_xml_read_file ($file); } elsif ($file) { $doc = $file; } else { $doc = &gst_xml_read_stdin (); } # remove any blank or carriage return at the beginning of the xml $doc =~ s/^[ \n]*//; &gst_debug_print_log_to_file ("in.xml", $doc); $$tool{"xml_doc"} = $doc if (&gst_is_tool ($tool)); @gst_xml_scan_list = ($doc =~ /([^\<]*)(\<[^\>]*\>)[ \t\n\r]*/mg); # pcdata, tag, pcdata, tag, ... $tree = &gst_xml_scan_recurse (\@gst_xml_scan_list); return $tree; } # XML scanning that preserves more exact attributes of the scanned XML. sub gst_xml_model_scan_recurse { my @list; if (@_) { @list = $_[0]->[0]; } while (@gst_xml_scan_list) { $el = $gst_xml_scan_list[0]; shift @gst_xml_scan_list; if ($el eq "") # Empty strings. { next; } elsif ($el =~ /^\<[!?].*\>$/s) # PI and DTD. { $el =~ /^\<([^\>]+)\>$/s; push (@list, "__unparsed__"); push (@list, $1); } elsif ($el =~ /^\<.*\/\>$/s) # Empty. { $el =~ /^\<([a-zA-Z0-9_-]+).*\/\>$/s; push (@list, $1); push (@list, &gst_xml_scan_make_kid_array ($el)); } elsif ($el =~ /^\<\/.*\>$/s) # End. { last; } elsif ($el =~ /^\<.*\>$/s) # Start. { $el =~ /^\<([a-zA-Z0-9_-]+).*\>$/s; push (@list, $1); $sublist = &gst_xml_scan_make_kid_array ($el); push (@list, &gst_xml_model_scan_recurse ($sublist)); next; } elsif ($el ne "") # PCDATA. { push (@list, 0); push (@list, "$el"); } } return \@list; } sub gst_xml_model_scan # (file) If no file specified, reads stdin. { my ($file) = @_; my ($doc, $tree, $compressed); $file = $gst_input_file if $file eq undef; if ($file) { $doc = &gst_xml_read_compressed_file ($file); if (!$doc) { $doc = &gst_xml_read_file ($file); $compressed = 0; } else { $compressed = 1; } } else { return undef, 0; } @gst_xml_scan_list = ($doc =~ /([^\<]*)(\<[^\>]*\>)/mg); # pcdata, tag, pcdata, tag, ... $tree = &gst_xml_model_scan_recurse; return $tree, $compressed; } sub gst_xml_model_save { my ($model, $file, $compressed) = @_; my $fd; if ($compressed == 1) { $fd = &gst_file_open_write_compressed ($file); } else { $fd = &gst_file_open_write_from_names ($file); } if ($fd == -1) { return -1; } print $fd &gst_xml_model_print ($model); &gst_file_close ($fd); return 0; } # Quote/unquote. @gst_xml_entities = ( "<", '<', ">", '>', "'", '\'', """, '"', "&", '&' ); sub gst_xml_quote { my $in = $_[0]; my $out = ""; my @xe; my $joined = 0; my @clist = split (//, $in); while (@clist) { # Find character and join its entity equivalent. # If none found, simply join the character. $joined = 0; # Cumbersome. for (@xe = @gst_xml_entities; @xe && !$joined; ) { if ($xe [1] eq $clist [0]) { $out = join ('', $out, $xe [0]); $joined = 1; } shift @xe; shift @xe; } if (!$joined) { $out = join ('', $out, $clist [0]); } shift @clist; } return $out; } sub gst_xml_unquote { my $ret = $_[0]; my $i; #print STDERR "INI U: $ret\n"; for ($i = 0; $gst_xml_entities[$i] ne undef; $i += 2) { $ret =~ s/$gst_xml_entities[$i]/$gst_xml_entities[$i + 1]/g; } while ($ret =~ /&#([0-9]+);/) { $num = $1; $c = chr ($num); $ret =~ s/&#$num;/$c/g; } #print STDERR "END U: $ret\n"; return $ret; } # --- XML parsing --- # sub gst_xml_get_pcdata { my $tree = $_[0]; my $retval; shift @$tree; # Skip attributes. while (@$tree) { if ($$tree[0] == 0) { $retval = &gst_xml_unquote ($$tree[1]); &gst_debug_print_line ("gst_xml_get_pcdata: $retval"); return ($retval); } shift @$tree; shift @$tree; } return ""; } # Compresses node into a word and returns it. sub gst_xml_get_word { my $tree = $_[0]; my $retval; $retval = &gst_xml_get_pcdata ($tree); $retval =~ tr/ \n\r\t\f//d; return $retval; } # Compresses node into a size and returns it. sub gst_xml_get_size { my $tree = $_[0]; my $retval; $retval = &gst_xml_get_word ($tree); if ($retval =~ /Mb$/) { $retval =~ tr/ Mb//d; $retval *= 1024; } return $retval; } # Replaces misc. whitespace with spaces and returns text. sub gst_xml_get_text { my $tree = $_[0]; my $retval; $retval = &gst_xml_get_pcdata ($tree); my $type = ref ($retval); if (!$type) { $retval =~ tr/\n\r\t\f/ /; } return $retval; } sub gst_xml_get_attribute { my ($tree, $attr) = @_; return $$tree[0]->{$attr}; } sub gst_xml_get_state { my ($tree) = @_; # Check attribute; 'yes', 'true', 'no', 'false'. return &gst_util_read_boolean ($$tree[0]->{state}); } # XML model operations. # Locate a node from the branch leading up to it. sub gst_xml_model_find { my ($model, $varpath) = @_; my ($branch, @path); $branch = $model; @path = split /\//, $varpath; for $elem (@path) { next if ($elem eq ""); my @tqchildren = @$branch; shift @tqchildren; # Attributes $branch = undef; while (@tqchildren) { if ($tqchildren [0] eq $elem) { shift @tqchildren; $branch = shift @tqchildren; last; } shift @tqchildren; shift @tqchildren; } last if ($branch == undef); } return $branch; } # Add a branch to another branch. Allows duplicates. sub gst_xml_model_add { my ($model, $varpath, $addpath) = @_; my ($branch, @path); @path = split /\//, $addpath; $branch = &gst_xml_model_find ($model, $varpath); if ($branch == undef) { return -1; } for $elem (@path) { my %hash; my @list = (); push @list, \%hash; push @$branch, $elem; push @$branch, \@list; $branch = \@list; } return 0; } # Ensure a branch exists, by extending the branch with given elements, if needed. sub gst_xml_model_ensure { my ($model, $varpath) = @_; my ($branch, @path); $branch = $model; @path = split /\//, $varpath; for $elem (@path) { next if ($elem eq ""); my @tqchildren = @$branch; my $parent_branch = $branch; shift @tqchildren; # Attributes $branch = undef; while (@tqchildren) { if ($tqchildren [0] eq $elem) { shift @tqchildren; $branch = shift @tqchildren; last; } shift @tqchildren; shift @tqchildren; } if ($branch == undef) { my %hash; my @list = (); $branch = \@list; push @list, \%hash; push @$parent_branch, $elem; push @$parent_branch, $branch; } } return $branch; } sub gst_xml_model_remove { my ($model, $varpath, $tag) = @_; my ($branch, $i); @path = split /\//, $addpath; $branch = &gst_xml_model_find ($model, $varpath); if ($branch == undef) { return -1; } for ($i = 1; $i < @$branch; $i += 2) { if (@$branch [$i] eq $tag) { @$branch = (@$branch [0 .. $i - 1], @$branch [$i + 2 .. @$branch - 1]); return 0; } } return -1; } sub gst_xml_model_get_tqchildren { my ($branch) = @_; my (@tqchildren); if (!$branch) { return \@tqchildren; } for ($i = 1; $i < @$branch; $i += 2) { if (@$branch [$i] ne "__unparsed__" && @$branch [$i] ne "0") { push @tqchildren, @$branch [$i + 1]; } } return \@tqchildren; } sub gst_xml_model_get_pcdata { my ($branch) = @_; my ($i); for ($i = 1; $i < @$branch; $i += 2) { if ($$branch [$i] == 0) { my $retval = &gst_xml_unquote ($$branch [$i + 1]); return ($retval); } } return ""; } sub gst_xml_model_set_pcdata { my ($branch, $pcdata) = @_; @$branch = (@$branch [0]); $$branch [1] = 0; $$branch [2] = &gst_xml_quote ($pcdata); } sub gst_xml_model_get_attribute { my ($branch, $attr) = @_; return $$branch[0]->{$attr}; } sub gst_xml_model_set_attribute { my ($branch, $attr, $value) = @_; return $$branch[0]->{$attr} = $value; } 1;