#
# SmartyPants - A Plug-In for Movable Type, Blosxom, and BBEdit
# by John Gruber
# http://daringfireball.net
#
# See the readme or POD for details, installation instructions, and
# license information.
#
# Copyright (c) 2003-2004 John Gruber
#

package SmartyPants;
use strict;
use vars qw($VERSION);
$VERSION = "1.5.1";

# Fri 12 Mar 2004

# Configurable variables:
my $smartypants_attr = "1";  # Blosxom and BBEdit users: change this to configure.
                             #  1 =>  "--" for em-dashes; no en-dash support
                             #  2 =>  "---" for em-dashes; "--" for en-dashes
                             #  3 =>  "--" for em-dashes; "---" for en-dashes
                             #  See docs for more configuration options.

# Globals:
# Matches the opening or closing form of any tag whose contents must be left
# untouched. $1 is "/" for a closing tag and empty for an opening tag.
my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;

# Movable Type plug-in interface:
require MT;
MT->add_plugin(
    {   name        => "SmartyPants",
        description => q(),
        version     => $VERSION,
        author_name => "John Gruber",
        author_link => "http://daringfireball.net/",
        plugin_link => "http://daringfireball.net/projects/smartypants/",
        registry    => {
            tags => {
                modifier => {
                    smarty_pants   => \&SmartyPants,
                    smart_quotes   => \&SmartQuotes,
                    smart_dashes   => \&SmartDashes,
                    smart_ellipses => \&SmartEllipses,
                },
                function => { SmartyPantsVersion => \&SmartyPantsVersion, },
            },
        },
    }
);

sub SmartyPants {

    #
    #   Applies the transformations selected by $attr to every text token
    #   of $text, leaving tags and the contents of skipped tags untouched.
    #
    #   Parameters:
    #       $text   text to be parsed
    #       $attr   value of the smarty_pants="" attribute (see below)
    #       $ctx    MT context object (unused)
    #
    #   Returns:    The processed text.
    #
    my $text = shift;
    my $attr = shift;
    my $ctx  = shift;

    # Options to specify which transformations to make:
    my ( $do_quotes, $do_backticks, $do_dashes, $do_ellipses, $do_stupefy );
    my $convert_quot = 0;  # should we translate &quot; entities into normal quotes?

    # Parse attributes:
    # 0 : do nothing
    # 1 : set all
    # 2 : set all, using old school en- and em- dash shortcuts
    # 3 : set all, using inverted old school en and em- dash shortcuts
    #
    # q : quotes
    # b : backtick quotes (``double'' only)
    # B : backtick quotes (``double'' and `single')
    # d : dashes
    # D : old school dashes
    # i : inverted old school dashes
    # e : ellipses
    # w : convert &quot; entities to " for Dreamweaver users

    if ( $attr eq "0" ) {

        # Do nothing.
        return $text;
    }
    elsif ( $attr eq "1" ) {

        # Do everything, turn all options on.
        $do_quotes    = 1;
        $do_backticks = 1;
        $do_dashes    = 1;
        $do_ellipses  = 1;
    }
    elsif ( $attr eq "2" ) {

        # Do everything, turn all options on, use old school dash shorthand.
        $do_quotes    = 1;
        $do_backticks = 1;
        $do_dashes    = 2;
        $do_ellipses  = 1;
    }
    elsif ( $attr eq "3" ) {

        # Do everything, turn all options on, use inverted old school dash
        # shorthand.
        $do_quotes    = 1;
        $do_backticks = 1;
        $do_dashes    = 3;
        $do_ellipses  = 1;
    }
    elsif ( $attr eq "-1" ) {

        # Special "stupefy" mode.
        $do_stupefy = 1;
    }
    else {
        my @chars = split( //, $attr );
        foreach my $c (@chars) {
            if    ( $c eq "q" ) { $do_quotes    = 1; }
            elsif ( $c eq "b" ) { $do_backticks = 1; }
            elsif ( $c eq "B" ) { $do_backticks = 2; }
            elsif ( $c eq "d" ) { $do_dashes    = 1; }
            elsif ( $c eq "D" ) { $do_dashes    = 2; }
            elsif ( $c eq "i" ) { $do_dashes    = 3; }
            elsif ( $c eq "e" ) { $do_ellipses  = 1; }
            elsif ( $c eq "w" ) { $convert_quot = 1; }
            else {

                # Unknown attribute option, ignore.
            }
        }
    }

    my $tokens = _tokenize($text);
    my $result = '';
    my $in_pre = 0;    # Keep track of when we're inside <pre> or <code> tags
                       # (or any other tag matched by $tags_to_skip).

    my $prev_token_last_char = ""; # This is a cheat, used to get some context
                                   # for one-character tokens that consist of
                                   # just a quote char. What we do is remember
                                   # the last character of the previous text
                                   # token, to use as context to curl single-
                                   # character quote tokens correctly.

    foreach my $cur_token (@$tokens) {
        if ( $cur_token->[0] eq "tag" ) {

            # Don't mess with quotes inside tags.
            $result .= $cur_token->[1];
            if ( $cur_token->[1] =~ m/$tags_to_skip/ ) {

                # A closing tag ("/" captured) ends the skipped region;
                # an opening tag starts it.
                $in_pre = defined $1 && $1 eq '/' ? 0 : 1;
            }
        }
        else {
            my $t = $cur_token->[1];

            # Remember last char of this token before processing.
            my $last_char = substr( $t, -1 );
            if ( !$in_pre ) {
                $t = ProcessEscapes($t);

                if ($convert_quot) {
                    $t =~ s/&quot;/"/g;
                }

                if ($do_dashes) {
                    $t = EducateDashes($t)          if ( $do_dashes == 1 );
                    $t = EducateDashesOldSchool($t) if ( $do_dashes == 2 );
                    $t = EducateDashesOldSchoolInverted($t)
                        if ( $do_dashes == 3 );
                }

                $t = EducateEllipses($t) if $do_ellipses;

                # Note: backticks need to be processed before quotes.
                if ($do_backticks) {
                    $t = EducateBackticks($t);
                    $t = EducateSingleBackticks($t) if ( $do_backticks == 2 );
                }

                if ($do_quotes) {
                    if ( $t eq q/'/ ) {

                        # Special case: single-character ' token. It closes
                        # if the previous text ended in a non-space char.
                        if ( $prev_token_last_char =~ m/\S/ ) {
                            $t = "&#8217;";
                        }
                        else {
                            $t = "&#8216;";
                        }
                    }
                    elsif ( $t eq q/"/ ) {

                        # Special case: single-character " token
                        if ( $prev_token_last_char =~ m/\S/ ) {
                            $t = "&#8221;";
                        }
                        else {
                            $t = "&#8220;";
                        }
                    }
                    else {

                        # Normal case:
                        $t = EducateQuotes($t);
                    }
                }

                $t = StupefyEntities($t) if $do_stupefy;
            }
            $prev_token_last_char = $last_char;
            $result .= $t;
        }
    }

    return $result;
}

sub SmartQuotes {

    #
    #   Educates straight quotes (and optionally ``backticks''-style quotes)
    #   in the text tokens of $text, leaving tags and the contents of
    #   skipped tags untouched.
    #
    #   Parameters:
    #       $text   text to be parsed
    #       $attr   value of the smart_quotes="" attribute:
    #               0 => do nothing, 2 => also educate ``backticks'',
    #               anything else => straight quotes only
    #       $ctx    MT context object (unused)
    #
    #   Returns:    The processed text.
    #
    my $text = shift;
    my $attr = shift;
    my $ctx  = shift;

    my $do_backticks;    # should we educate ``backticks'' -style quotes?

    if ( $attr == 0 ) {

        # do nothing;
        return $text;
    }
    elsif ( $attr == 2 ) {

        # smarten ``backticks'' -style quotes
        $do_backticks = 1;
    }
    else {
        $do_backticks = 0;
    }

    # Special case to handle quotes at the very end of $text when preceded by
    # an HTML tag. Add a space to give the quote education algorithm a bit of
    # context, so that it can guess correctly that it's a closing quote:
    my $add_extra_space = 0;
    if ( $text =~ m/>['"]\z/ ) {
        $add_extra_space = 1;   # Remember, so we can trim the extra space later.
        $text .= " ";
    }

    my $tokens = _tokenize($text);
    my $result = '';
    my $in_pre = 0;    # Keep track of when we're inside <pre> or <code> tags
                       # (or any other tag matched by $tags_to_skip).

    my $prev_token_last_char = ""; # This is a cheat, used to get some context
                                   # for one-character tokens that consist of
                                   # just a quote char. What we do is remember
                                   # the last character of the previous text
                                   # token, to use as context to curl single-
                                   # character quote tokens correctly.

    foreach my $cur_token (@$tokens) {
        if ( $cur_token->[0] eq "tag" ) {

            # Don't mess with quotes inside tags
            $result .= $cur_token->[1];
            if ( $cur_token->[1] =~ m/$tags_to_skip/ ) {

                # A closing tag ("/" captured) ends the skipped region;
                # an opening tag starts it.
                $in_pre = defined $1 && $1 eq '/' ? 0 : 1;
            }
        }
        else {
            my $t = $cur_token->[1];

            # Remember last char of this token before processing.
            my $last_char = substr( $t, -1 );
            if ( !$in_pre ) {
                $t = ProcessEscapes($t);
                if ($do_backticks) {
                    $t = EducateBackticks($t);
                }

                if ( $t eq q/'/ ) {

                    # Special case: single-character ' token. It closes if
                    # the previous text ended in a non-space char.
                    if ( $prev_token_last_char =~ m/\S/ ) {
                        $t = "&#8217;";
                    }
                    else {
                        $t = "&#8216;";
                    }
                }
                elsif ( $t eq q/"/ ) {

                    # Special case: single-character " token
                    if ( $prev_token_last_char =~ m/\S/ ) {
                        $t = "&#8221;";
                    }
                    else {
                        $t = "&#8220;";
                    }
                }
                else {

                    # Normal case:
                    $t = EducateQuotes($t);
                }

            }
            $prev_token_last_char = $last_char;
            $result .= $t;
        }
    }

    if ($add_extra_space) {
        $result =~ s/ \z//;    # Trim trailing space if we added one earlier.
    }
    return $result;
}

sub SmartDashes {

    #
    #   Educates dashes in the text tokens of $text, leaving tags and the
    #   contents of skipped tags untouched.
    #
    #   Parameters:
    #       $text   text to be parsed
    #       $attr   value of the smart_dashes="" attribute:
    #               0 => do nothing
    #               2 => "--" for en, "---" for em
    #               3 => "--" for em, "---" for en
    #               anything else => "--" for em only
    #       $ctx    MT context object (unused)
    #
    #   Returns:    The processed text.
    #
    my $text = shift;
    my $attr = shift;
    my $ctx  = shift;

    # Reference to the subroutine to use for dash education, default to
    # EducateDashes:
    my $dash_sub_ref = \&EducateDashes;

    if ( $attr == 0 ) {

        # do nothing;
        return $text;
    }
    elsif ( $attr == 2 ) {

        # use old smart dash shortcuts, "--" for en, "---" for em
        $dash_sub_ref = \&EducateDashesOldSchool;
    }
    elsif ( $attr == 3 ) {

        # inverse of 2, "--" for em, "---" for en
        $dash_sub_ref = \&EducateDashesOldSchoolInverted;
    }

    my $tokens = _tokenize($text);

    my $result = '';
    my $in_pre = 0;    # Keep track of when we're inside <pre> or <code> tags
                       # (or any other tag matched by $tags_to_skip).
    foreach my $cur_token (@$tokens) {
        if ( $cur_token->[0] eq "tag" ) {

            # Don't mess with dashes inside tags
            $result .= $cur_token->[1];
            if ( $cur_token->[1] =~ m/$tags_to_skip/ ) {

                # A closing tag ("/" captured) ends the skipped region;
                # an opening tag starts it.
                $in_pre = defined $1 && $1 eq '/' ? 0 : 1;
            }
        }
        else {
            my $t = $cur_token->[1];
            if ( !$in_pre ) {
                $t = ProcessEscapes($t);
                $t = $dash_sub_ref->($t);
            }
            $result .= $t;
        }
    }
    return $result;
}

sub SmartEllipses {

    #
    #   Educates ellipses in the text tokens of $text, leaving tags and the
    #   contents of skipped tags untouched.
    #
    #   Parameters:
    #       $text   text to be parsed
    #       $attr   value of the smart_ellipses="" attribute
    #               (0 => do nothing)
    #       $ctx    MT context object (unused)
    #
    #   Returns:    The processed text.
    #
    my $text = shift;
    my $attr = shift;
    my $ctx  = shift;

    if ( $attr == 0 ) {

        # do nothing;
        return $text;
    }

    my $tokens = _tokenize($text);

    my $result = '';
    my $in_pre = 0;    # Keep track of when we're inside <pre> or <code> tags
                       # (or any other tag matched by $tags_to_skip).
    foreach my $cur_token (@$tokens) {
        if ( $cur_token->[0] eq "tag" ) {

            # Don't mess with dots inside tags
            $result .= $cur_token->[1];
            if ( $cur_token->[1] =~ m/$tags_to_skip/ ) {

                # A closing tag ("/" captured) ends the skipped region;
                # an opening tag starts it.
                $in_pre = defined $1 && $1 eq '/' ? 0 : 1;
            }
        }
        else {
            my $t = $cur_token->[1];
            if ( !$in_pre ) {
                $t = ProcessEscapes($t);
                $t = EducateEllipses($t);
            }
            $result .= $t;
        }
    }
    return $result;
}

sub EducateQuotes {

    #
    #   Parameter:  String.
    #
    #   Returns:    The string, with "educated" curly quote HTML entities.
    #
    #   Example input:  "Isn't this fun?"
    #   Example output: &#8220;Isn&#8217;t this fun?&#8221;
    #

    local $_ = shift;

    # Tell perl not to gripe when we use $1 in substitutions,
    # even when it's undefined. Use $^W instead of "no warnings"
    # for compatibility with Perl 5.005:
    local $^W = 0;

    # Make our own "punctuation" character class, because the POSIX-style
    # [:PUNCT:] is only available in Perl 5.6 or later:
    my $punct_class = qr/[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]/;

    # Special case if the very first character is a quote
    # followed by punctuation at a non-word-break. Close the quotes by
    # brute force:
    s/^'(?=$punct_class\B)/&#8217;/;
    s/^"(?=$punct_class\B)/&#8221;/;

    # Special case for double sets of quotes, e.g.:
    #   <p>He said, "'Quoted' words in a larger quote."</p>
    s/"'(?=\w)/&#8220;&#8216;/g;
    s/'"(?=\w)/&#8216;&#8220;/g;

    # Special case for decade abbreviations (the '80s):
    s/'(?=\d{2}s)/&#8217;/g;

    my $close_class = qr![^\ \t\r\n\[\{\(\-]!;
    my $dec_dashes  = qr/&#8211;|&#8212;/;

    # Get most opening single quotes:
    s {
        (
            \s          |   # a whitespace char, or
            &nbsp;      |   # a non-breaking space entity, or
            --          |   # dashes, or
            &[mn]dash;  |   # named dash entities
            $dec_dashes |   # or decimal entities
            &\#x201[34];    # or hex
        )
        '                   # the quote
        (?=\w)              # followed by a word character
    } {$1&#8216;}xg;

    # Single closing quotes:
    s {
        ($close_class)?
        '
        (?(1)|              # If $1 captured, then do nothing;
          (?=\s | s\b)      # otherwise, positive lookahead for a whitespace
        )                   # char or an 's' at a word ending position. This
                            # is a special case to handle something like:
                            # "Custer's Last Stand."
    } {$1&#8217;}xgi;

    # Any remaining single quotes should be opening ones:
    s/'/&#8216;/g;

    # Get most opening double quotes:
    s {
        (
            \s          |   # a whitespace char, or
            &nbsp;      |   # a non-breaking space entity, or
            --          |   # dashes, or
            &[mn]dash;  |   # named dash entities
            $dec_dashes |   # or decimal entities
            &\#x201[34];    # or hex
        )
        "                   # the quote
        (?=\w)              # followed by a word character
    } {$1&#8220;}xg;

    # Double closing quotes:
    s {
        ($close_class)?
        "
        (?(1)|(?=\s))   # If $1 captured, then do nothing;
                        # if not, then make sure the next char is whitespace.
    } {$1&#8221;}xg;

    # Any remaining quotes should be opening ones.
    s/"/&#8220;/g;

    return $_;
}

sub EducateBackticks {

    #
    #   Parameter:  String.
    #   Returns:    The string, with ``backticks'' -style double quotes
    #               translated into HTML curly quote entities.
    #
    #   Example input:  ``Isn't this fun?''
    #   Example output: &#8220;Isn't this fun?&#8221;
    #

    local $_ = shift;
    s/``/&#8220;/g;
    s/''/&#8221;/g;
    return $_;
}

sub EducateSingleBackticks {

    #
    #   Parameter:  String.
    #   Returns:    The string, with `backticks' -style single quotes
    #               translated into HTML curly quote entities.
    #
    #   Example input:  `Isn't this fun?'
    #   Example output: &#8216;Isn&#8217;t this fun?&#8217;
    #

    local $_ = shift;
    s/`/&#8216;/g;
    s/'/&#8217;/g;
    return $_;
}

sub EducateDashes {

    #
    #   Parameter:  String.
    #
    #   Returns:    The string, with each instance of "--" translated to
    #               an em-dash HTML entity.
    #

    local $_ = shift;
    s/--/&#8212;/g;
    return $_;
}

sub EducateDashesOldSchool {

    #
    #   Parameter:  String.
    #
    #   Returns:    The string, with each instance of "--" translated to
    #               an en-dash HTML entity, and each "---" translated to
    #               an em-dash HTML entity.
    #

    local $_ = shift;
    s/---/&#8212;/g;    # em (must run first, or "--" would eat part of "---")
    s/--/&#8211;/g;     # en
    return $_;
}

sub EducateDashesOldSchoolInverted {

    #
    #   Parameter:  String.
    #
    #   Returns:    The string, with each instance of "--" translated to
    #               an em-dash HTML entity, and each "---" translated to
    #               an en-dash HTML entity. Two reasons why: First, unlike the
    #               en- and em-dash syntax supported by
    #               EducateDashesOldSchool(), it's compatible with existing
    #               entries written before SmartyPants 1.1, back when "--" was
    #               only used for em-dashes. Second, em-dashes are more
    #               common than en-dashes, and so it sort of makes sense that
    #               the shortcut should be shorter to type. (Thanks to Aaron
    #               Swartz for the idea.)
    #

    local $_ = shift;
    s/---/&#8211;/g;    # en (must run first, or "--" would eat part of "---")
    s/--/&#8212;/g;     # em
    return $_;
}

sub EducateEllipses {

    #
    #   Parameter:  String.
    #   Returns:    The string, with each instance of "..." translated to
    #               an ellipsis HTML entity. Also converts the case where
    #               there are spaces between the dots.
    #
    #   Example input:  Huh...?
    #   Example output: Huh&#8230;?
    #

    local $_ = shift;
    s/\.\.\./&#8230;/g;
    s/\. \. \./&#8230;/g;
    return $_;
}

sub StupefyEntities {

    #
    #   Parameter:  String.
    #   Returns:    The string, with each SmartyPants HTML entity translated to
    #               its ASCII counterpart.
    #
    #   Example input:  &#8220;Hello &#8212; world.&#8221;
    #   Example output: "Hello -- world."
    #

    local $_ = shift;

    s/&#8211;/-/g;      # en-dash
    s/&#8212;/--/g;     # em-dash

    s/&#8216;/'/g;      # open single quote
    s/&#8217;/'/g;      # close single quote

    s/&#8220;/"/g;      # open double quote
    s/&#8221;/"/g;      # close double quote

    s/&#8230;/.../g;    # ellipsis

    return $_;
}

sub SmartyPantsVersion {

    # Returns the plug-in version string.
    return $VERSION;
}

sub ProcessEscapes {

    #
    #   Parameter:  String.
    #   Returns:    The string, after processing the following backslash
    #               escape sequences. This is useful if you want to force a
    #               "dumb" quote or other character to appear.
    #
    #               Escape  Value
    #               ------  -----
    #               \\      &#92;
    #               \"      &#34;
    #               \'      &#39;
    #               \.      &#46;
    #               \-      &#45;
    #               \`      &#96;
    #
    #   The escaped character is emitted as a numeric entity so that the
    #   later education passes do not see (and transform) it.
    #
    local $_ = shift;

    s! \\\\ !&#92;!gx;
    s! \\"  !&#34;!gx;
    s! \\'  !&#39;!gx;
    s! \\\. !&#46;!gx;
    s! \\-  !&#45;!gx;
    s! \\`  !&#96;!gx;

    return $_;
}

sub _tokenize {

    #
    #   Parameter:  String containing HTML markup.
    #   Returns:    Reference to an array of the tokens comprising the input
    #               string. Each token is either a tag (possibly with nested,
    #               tags contained therein, such as <a href="<MTFoo>">, or a
    #               run of text between tags. Each element of the array is a
    #               two-element array; the first is either 'tag' or 'text';
    #               the second is the actual value.
    #
    #
    #   Based on the _tokenize() subroutine from Brad Choate's MTRegex plugin.
    #
    #

    my $str = shift;
    my $pos = 0;
    my $len = length $str;
    my @tokens;

    # Build a pattern matching a tag with up to $depth levels of nested
    # angle-bracket pairs inside it.
    my $depth = 6;
    my $nested_tags
        = join( '|', ('(?:<(?:[^<>]') x $depth ) . ( ')*>)' x $depth );
    my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) |  # comment
                   (?s: <\? .*? \?> ) |              # processing instruction
                   $nested_tags/x;                   # nested tags

    while ( $str =~ m/($match)/g ) {
        my $whole_tag = $1;
        my $sec_start = pos $str;
        my $tag_start = $sec_start - length $whole_tag;

        # Any text between the previous tag and this one is a text token.
        if ( $pos < $tag_start ) {
            push @tokens, [ 'text', substr( $str, $pos, $tag_start - $pos ) ];
        }
        push @tokens, [ 'tag', $whole_tag ];
        $pos = pos $str;
    }

    # Trailing text after the last tag (or the whole string if no tags).
    push @tokens, [ 'text', substr( $str, $pos, $len - $pos ) ]
        if $pos < $len;
    return \@tokens;
}

1;

__END__


=pod

=head1 NAME

B<SmartyPants>


=head1 SYNOPSIS

B<SmartyPants.pl> [ B<-1> ] [ B<-2> ] [ B<-3> ] [ B<--version> ] [ B<--shortversion> ]
    [ I<file> ... ]


=head1 DESCRIPTION

SmartyPants is a web publishing utility that translates plain ASCII
punctuation characters into "smart" typographic punctuation HTML
entities.

SmartyPants can perform the following transformations:

=over 4

=item *

Straight quotes ( " and ' ) into "curly" quote HTML entities

=item *

Backticks-style quotes (``like this'') into "curly" quote HTML entities

=item *

Dashes (C<--> and C<--->) into en- and em-dash entities

=item *

Three consecutive dots (C<...>) into an ellipsis entity

=back

SmartyPants is a combination plug-in -- the same file works with Movable
Type, Blosxom, BBEdit, and as a standalone Perl script. Version
requirements and installation instructions for each of these tools can
be found in the readme file that accompanies this script.

SmartyPants does not modify characters within C<< <pre> >>, C<< <code> >>,
C<< <kbd> >>, C<<