#!/usr/local/bin/perl5 # # Bert Bos # Created: 17 Mar 1998 # $Id: dtd2bnf,v 1.3 1998/03/17 18:48:44 bbos Exp $ # my $linelen = 72; my $PROG = substr($0, rindex($0, "/") + 1); my $USAGE = "Usage: $PROG file\n"; my $string = "(?:\"([^\"]*)\"|\'([^\']*)\')"; my %pent; # Parameter entities my %attributes; # Attribute lists my @element; # Elements in source order my %model; # Content models # Parse a string into an array of "words". # Words are whitespace-separated sequences of non-whitespace characters, # or quoted strings ("" or ''), with the quotes removed. sub parsewords { my $line = $_[0]; my @words = (); while ($line ne '') { if ($line =~ /^\s+/) { # Skip whitespace } elsif ($line =~ /^\"((?:[^\"]|\\\")*)\"/) { push(@words, $1); } elsif ($line =~ /^\'((?:[^\']|\\\')*)\'/) { push(@words, $1); } elsif ($line =~ /^\S+/) { push(@words, $&); } else { die "Cannot happen\n"; } $line = $'; } return @words; } # break lines at or before $linelen, indent continuation lines $indent sub break { my ($linelen, $indent, $line) = @_; my $result = ''; $line =~ s/\s+$//o; # Remove trailing whitespace while (length($line) > $linelen) { my $i = $linelen; BREAK: while (1) { if (substr($line, $i, 1) =~ /\s/so) { # found a space last BREAK; } if ($i <= $linelen) {$i--;} else {$i++;} if ($i == $indent) { # no space found to the left, try to the right $i = $linelen + 1; } if ($i == length($line)) { # no space found anywhere last BREAK; } } my $part = substr($line, 0, $i); $part =~ s/\s+$//o; # Remove trailing spaces $result .= $part; # Add to result $line = substr($line, $i + 1); $line =~ s/^\s+//o; # Remove leading spaces if (length($line) != 0) { $result .= "\n"; $line = (' ' x $indent) . $line; } } if (length($line) != 0) {$result .= $line;} return $result; } # Store content model, return empty string sub store_elt { my ($name, $model) = @_; $model =~ s/\#PCDATA/TEXT*/gio; $model =~ s/\s+/ /gso; push(@element, $name); $element{$name} = $model; return ''; } # Store attribute list, return empty string sub store_att { my ($element, $atts) = @_; $atts =~ s/\#FIXED//gio; # Remove #FIXED my @words = parsewords($atts); $attributes{$element} = [ @words ]; return ''; } # Return maximum value of an array of numbers sub max { my $max = $_[0]; foreach my $i (@_) { if ($i > $max) {$max = $i;} } return $max; } # Main $/ = undef; my $buf = <>; # remove comments $buf =~ s/\s+//gso; # remove processing instructions $buf =~ s/<\?.*?>\s+//gso; # loop until parameter entities fully expanded my $i; do { # store parameter entities $buf =~ s/\s+//gsio; # store attribute lists $buf =~ s//store_att($1, $2)/gsioe; # store content models $buf =~ s//store_elt($1, $2)/gsioe; # find maximum length of non-terminals my $maxlen = max(map(length, @element)) + 4; # loop over elements, writing EBNF foreach $e (@element) { $model = $element{$e}; # print rule for element $e my $s = sprintf("%-${maxlen}s = ", $e); $s .= "\"<$e\" ${e}_att* ( \"/>\" | \">\" ${e}_cnt \"\" )"; print break($linelen, $maxlen + 3, $s) . "\n"; # print rule for $e's attributes printf("%-${maxlen}s = ", "${e}_att"); my $h = $attributes{$e}; my @atts = @$h; if ($#atts == -1) { print ";empty\n"; } else { for (my $i = 0; $i <= $#atts; $i += 3) { if ($i != 0) {print ' ' x $maxlen . " | ";} # use only name, ignore type and default print "\"$atts[$i]\" \"=\" STRING\n"; } } # print rule for $e's content model $s = sprintf("%-${maxlen}s = ", "${e}_cnt"); if ($model =~ /^EMPTY$/io) { $s .= ";empty"; } elsif ($model =~ /^ANY$/io) { $s .= "( " . join(" | ", sort @element) . " )*"; } else { $model =~ s/\s+$//o; # remove trailing spaces if ($model =~ /\)$/o) { chop($model); # remove final ')' $model = substr($model, 1); # remove initial '(' } $model =~ s/\(/ \( /go; # one space around '(' $model =~ s/\)/ \) /go; # one space around ')' $model =~ s/\|/ \| /go; # one space around '|' $model =~ s/,/ /go; # one space instead of ',' $model =~ s/\s+\?/\?/go; # no spaces before '?' $model =~ s/\s+\*/\*/go; # no spaces before '*' $model =~ s/\s+\+/\+/go; # no spaces before '+' $model =~ s/\s+/ /go; # no multiple spaces $model =~ s/^ //o; # no initial space $s .= "$model"; } print break($linelen, $maxlen + 3, $s) . "\n\n"; } # print auxiliary tokens printf "%-${maxlen}s = \"\'\" TEXT* \"\'\" | \'\"\' TEXT* \'\"\'\n", "STRING"; printf "%-${maxlen}s = (any legal XML character)\n", "TEXT";