# ThMLutil -- utilities for ThML file processing
#
use strict;
#
#given a div start tag, this subroutine returns the id, title,
#and filename for the division. Dies if the division had no ID.
#
#Title chosen is the type and n if both are present (e.g. Chapter 3),
#followed by the title attribute if present. If neither of those were
#present, the ID is used.
#
sub getName
{
my $div = shift;
my $title = "";
# print STDERR "Getting title from division $div\n";
die "A division had no ID: $div\n" unless $div =~ m|id="([^"]*)"|;
my $id = $1;
my ($n) = $div =~ m|n="([^"]*?)"|;
my ($type) = $div =~ m|type="([^"]*?)"|s;
$title = "$type $n. " if $type and $n;
my ($titleatt) = $div =~ m|title="([^"]*?)"|s;
if ($titleatt) {
$title .= $titleatt if $title;
$title = $titleatt unless $title;
}
$title ||= $id;
my $filename = "$id.htm";
# print STDERR " returning title $title\n";
return ($id, $title, $filename);
}
#
# delete all tags in parameter
#
sub detag
{
my $in = shift @_;
$in =~ s|<[^>]*>||gs;
return $in;
}
# change the tag for a character style to something else:
# _delete --> delete start tag, end tag, and text between
# _unescape_delete --> unescape and delete tags
# _detag --> remove tags
sub chartag
{
my $styname = shift;
my $tagname = shift;
my $tagend = shift;
if ($tagname eq "_unescape_detag") {
s|
(.*?)
|&unescape($1)|gsie; } elsif ($tagname eq "_delete") { s|.*?
||gsi; } elsif ($tagname eq "_detag") { s|(.*?)
|$1|gsi; } elsif ($tagend eq "") { s|(.*?)
|<$tagname$1>$2|gsi; } else { s|(.*?)
|<$tagname$1>$2$tagend>|gsi; } } # change one tag to another sub retag { my $tagname = shift; my $tagrep = shift; my $tagend = shift; if ($tagrep eq "_detag") { s|?$tagname\b.*?>||gsi; } elsif ($tagrep eq "_delete") { s|<$tagname\b.*?>.*?$tagname>\s*||gsi; s|<$tagname\b.*?>\s*||gsi; } elsif ($tagrep eq "_delete_attributes") { s|(<$tagname\b).*?(/?>)|$1$2|gi; } else { s|<$tagname\b(.*?)>|<$tagrep$1>|gsi; s|$tagname\b(.*?)>|$tagend$1>|gsi; } } #escape notes so they don't get processed later sub escape { my $stuff = shift; $stuff =~ s/\&less-than;/g; return $stuff; } #change < to <, > to >, & to & sub unescape { my $stuff = shift; $stuff =~ s/(\<)(.*?)(\>)/$1.&dumbquo($2).$3/gse; $stuff =~ s/\<//g; $stuff =~ s/\&/\&/g; return $stuff; } # #unsmarten (dumbify?) quotes and apostrophes because #netscape is unsmart enough not to understand them. # sub dumbquo { my $stuff = shift; $stuff =~ s/\&[rl]dquo;/"/g; $stuff =~ s/\&[rl]squo;/'/g; $stuff =~ s/\'/'/g; $stuff =~ s/\—/--/g; $stuff =~ s/\–/-/g; $stuff =~ s/\&\#8209;/-/g; $stuff =~ s/\&\#8212;/--/g; $stuff =~ s/\&\#8211;/-/g; $stuff =~ s/\&\#821[67];/'/g; $stuff =~ s/\&\#822[01];/"/g; return $stuff; } # convert a hex unicode code to a decimal escape sub escapehex { my $code = shift; my $dec = hex $code; return "$dec;"; } #convert greek (charset=161) to decimal unicode escapes sub unigreek { my $gk=shift; # print STDERR "Greek passage: $gk\n"; $gk =~ s|(\d+)|$1+720|ge; # print STDERR "...converted to $gk\n"; return $gk; } #---------------------------------------------------------- # # Identify subroutines -- add ids to many tags. # This code can be run more than once on a file without # ill effect -- and it has been greatly speeded up, so # it can be called once in index and once in thm2htm. # # ncrease code is also here, to add n= attributes to #