diff --git a/mkindex.pl b/mkindex.pl index 9580d44df3..cc6fa39bfb 100755 --- a/mkindex.pl +++ b/mkindex.pl @@ -1,4 +1,5 @@ -#!/usr/bin/perl +#!/usr/bin/perl -w + # Build an index of words in the file index-words that are found in the text. # Words are compared case insensitively except for those starting with a dash # (i.e. program option names). "Words" may actually be phrases consisting of @@ -7,6 +8,10 @@ use feature "fc"; +# import shared function +use lib '.'; +require "urlify.pl"; + # Return the case-folded keyword UNLESS it appears to be an option string # in which case return it as-is. This makes word lookups case-insensitive # but option name lookups case-sensitive. @@ -31,33 +36,6 @@ sub folded { } close(F); -sub urlify { - my ($fname, $section)=@_; - - # convert letters to lower case - $section =~ tr/[A-Z]/[a-z]/; - - # Convert all '<' to '-less-than' - $section =~ s/\</-less-than-/g; - - # Convert all '>' to '-greater-than' - $section =~ s/\>/-greater-than-/g; - - # remove rubbish - $section =~ s/[*`'":\(\),]+//g; - - # convert anything left that isn't a dash, underscore, number or letter - $section =~ s/[^_a-zA-Z0-9-]/-/g; - - # Remove starting chars that aren't a letter or underscore; - # those aren't legal for the beginning of a section ID. - $section =~ s/^[^_a-zA-Z]+//g; - - # strip trailing dash '-' characters from the section header - $section =~ s/-+$//; - - return "$fname#$section"; -} sub single { my ($fname)=@_; @@ -72,16 +50,16 @@ sub single { my $l=$_; # Track whether we are within a markdown code block that begins/ends with ``` - if ($_ =~ /^\`\`\`.*/) { - if ($in_code_section) { + if($_ =~ /^\`\`\`.*/) { + if($in_code_section) { $in_code_section = 0; } else { $in_code_section = 1; } } - if (!$in_code_section) { - if ($_ =~ /^(#[\#]*) ([^\{]*)(\{\#(.*)\})/) { + if(!$in_code_section) { + if($_ =~ /^(#[\#]*) ([^\{]*)(\{\#(.*)\})/) { # This section header has an explicit ID specified, e.g. 
"#Foo {#foo}" $depth = $1; $section = $2; @@ -90,9 +68,14 @@ sub single { $section =~ s/\s+$//; $url = "$fname#$dest_id"; + + # Use only the anchor text when a section heading has an anchor tag + if($section =~ m/(.*)\[(.*)\]\(.*\).*/) { + $section = "$1$2"; + } $l = $section; } - elsif ($_ =~ /^(#[\#]*) (.*)/) { + elsif($_ =~ /^(#[\#]*) (.*)/) { # This section header has no explicit ID specified, e.g. "#Foo" $depth = $1; $section = $2; @@ -100,7 +83,14 @@ sub single { # trim whitespace off end of section $section =~ s/\s+$//; - $url=urlify($fname, $section); + my $url_section = urlify($section); + $url = "$fname#$url_section"; + + # Use only the anchor text when a section heading has an anchor tag + if($section =~ m/(.*)\[(.*)\]\(.*\).*/) { + $section = "$1$2"; + } + $l = $section; # use this too } } @@ -121,7 +111,6 @@ sub single { foreach my $w (@lwords) { if(folded($l) =~ /$w/) { if(!$word{$w}{$fname}) { - #print " $w ($url)\n"; $word{$w}{$fname}++; $all{$w} .= ($all{$w}?", ":"")."[$section]($url)"; } @@ -158,9 +147,9 @@ sub byname { my $l = substr(sorting($w), 0, 1); if(!$letter{$l}) { $letter{$l}++; - print "## $l\n"; + # Make sure headings have blank lines before and after + print "\n## $l\n\n"; } - printf " - ".$index{$w}.": "; print $all{$w}."\n"; } diff --git a/uni.pl b/uni.pl index fd3fc93eaa..1e31242157 100755 --- a/uni.pl +++ b/uni.pl @@ -1,4 +1,8 @@ -#!/usr/bin/perl +#!/usr/bin/perl -w + +# import shared function +use lib '.'; +require "urlify.pl"; my $sum = shift @ARGV; @@ -20,37 +24,209 @@ sub dirname { return ""; } + +# Converts a file path to something that can be used in a section ID +sub urlify_file_path { + my ($part) = @_; + + # Strip a leading '/' if present + $part =~ s/^\///; + + $part =~ s/[\/\.]+/__/g; + + return $part; +} + + +# Converts a relative path and current directory to a full path relative +# to the "ROOT" directory. 
+sub make_full_file_path { + my ($current_dir, $linked_file_path) = @_; + + # Handle paths with "../" in them + while($linked_file_path =~ m/^\.\..*/) { + # Strip leading ".." + $linked_file_path =~ s/^..//; + # Strip leading "/" if present + $linked_file_path =~ s/^\///; + + # Strip trailing / + $current_dir =~ s/\/$//; + if($current_dir =~ m/.*\/.*/) { + # if there is more than one dir remaining, strip the last dir + $current_dir =~ s/(.*)\/(.*)/$1/; + } else { + # only one dir remaining, use root of "" + $current_dir = ""; + } + } + + my $final_path="$current_dir/$linked_file_path"; + + return $final_path; +} + + +# Rewrites an anchor destination to point to a section ID in the current file. +sub update_anchor { + my ($anchor_text, $anchor_target, $dir, $f) = @_; + my $final_target = $anchor_target; + + if($anchor_target =~ m/^(http\:|https\:).*$/ ) { + # don't rewrite regular http / https urls + } elsif($anchor_target =~ m/(^.*)(\.png|\.jpg)$/ ) { + # + # The anchor_target points to an image, add the image directory to the target + # + $final_target = "$dir$1$2"; + } elsif($anchor_target =~ m/((.+)\.md)\#(.+)/ ) { + # + # The anchor_target points to a SPECIFIC section of a DIFFERENT *.md file + # + my $full_file_path=make_full_file_path($dir, $1); + my $file_target=urlify_file_path($full_file_path); + my $section_target=$3; + + $final_target = "#$file_target" . "-_-_-" . $section_target; + } elsif($anchor_target =~ m/((.+)\.md)$/x) { + # + # The anchor_target points to NO SECTION in a DIFFERENT *.md file + # + + my $full_file_path=make_full_file_path($dir, $1); + my $file_target=urlify_file_path($full_file_path); + + $final_target = "#$file_target"; + } elsif($anchor_target =~ m/\#(.+)$/x) { + # + # The anchor_target points to A SECTION in THIS *.md file + # + + my $full_file_path = $f; + my $file_target=urlify_file_path($full_file_path); + + my $section_target=$1; + + $final_target = "#$file_target" . "-_-_-" . $section_target; + } + + return $anchor_text . 
"($final_target)"; +} + +my $errors = 0; + sub include { my ($f) = @_; my $line; open(M, "<$f") || return; my $dir = dirname($f); + + my $same_dir_url_part = urlify_file_path($dir); + my $same_file_url_part = urlify_file_path($f); + + my $in_code_section = 0; + my $next_line_should_be_blank = 0; + + # Print an empty span at the top of each new file, to give "foo.md" links from other files a + # place to target with a {#foo} style link in the combined .md file print "\n"; + print "[ ]\{#$same_file_url_part\}\n"; + print "\n"; + while(<M>) { $line++; - # strip out links to markdown files - $_ =~ s/\[([^]]*)\]\(.*\.md(|\#(.*))\)/$1/g; - # add path to image links - $_ =~ s/^!\[(.*)\]\(([^)]*)\)/![$1]($dir$2)/g; - - if($_ =~ /\]\(.*\.md/) { - print STDERR "$f:$line:line-split markdown link\n"; - print STDERR "$_"; - $errors++; - } - if($_ =~ /谭/) { - # skip unicode letter pandoc does not like + my $complete_line = $_; + + if($next_line_should_be_blank) { + if(! (($complete_line eq "") || ($complete_line =~ m/^[\s]*$/))) { + print STDERR "WARNING: The line after a '#' header should be blank in $f, but was: $complete_line\n"; + } + + $next_line_should_be_blank = 0; } - else { - print $_; + + # Track whether we are within a markdown code block that begins/ends with ``` + if($complete_line =~ /^\`\`\`.*/) { + if($in_code_section) { + $in_code_section = 0; + } else { + $in_code_section = 1; + } + } elsif(!$in_code_section) { + # Split line, so we can update multiple URLs on one line + my @line_items = split( /( \[ [^]]*\]\([^)]*?\) )/x , $_); + + foreach my $item (@line_items) { + # Update all of the anchor targets in the line + $item =~ s/ (\[ [^]]*\]) \( ([^)]*?) 
\) /update_anchor($1 , $2, $dir, $f) /xge; + } + $complete_line = join('', @line_items); + + # + # Check for section H1, H2, etc, definitions (starting with #, ##, etc) + # + # Add an explicit section ID to those if not present + # Include in that section ID a reference to the full path to the current file + my $full_file_path = $f; + my $file_target=urlify_file_path($full_file_path); + + my $final_section_line = $complete_line; + + if($complete_line =~ /^(#[\#]*) ([^\{]*)(\{\#(.*)\})/) { + $next_line_should_be_blank = 1; + + # This section header has an explicit ID specified, e.g. "#Foo {#foo}" + my $depth = $1; + my $section = $2; + my $section_id=$4; + + # trim whitespace off end of section + $section =~ s/\s+$//; + + $section_id = "$file_target" . "-_-_-" . $section_id; + + $final_section_line = "$depth $section \{\#$section_id\}\n"; + } + elsif($complete_line =~ /^(#[\#]*) (.*)/) { + $next_line_should_be_blank = 1; + + # This section header has no explicit ID specified, e.g. "#Foo" + my $depth = $1; + my $section = $2; + + # trim whitespace off end of section + $section =~ s/\s+$//; + + my $section_id = "$file_target" . "-_-_-" . urlify($section); + + $final_section_line = "$depth $section \{\#$section_id\}\n"; + } + + $complete_line = $final_section_line; + + if($complete_line =~ /谭/) { + # skip unicode letter pandoc does not like + } + else { + print $complete_line; + } + } else { + # in a code section, print lines as-is. 
+ + if($complete_line =~ /谭/) { + # skip unicode letter pandoc does not like + } + else { + print $complete_line; + } + } } close(M); } -for my $f (@files) { - include($f); +for my $file (@files) { + include($file); } exit $errors; diff --git a/urlify.pl b/urlify.pl new file mode 100644 index 0000000000..af2d80ceb2 --- /dev/null +++ b/urlify.pl @@ -0,0 +1,47 @@ +use strict; + +# Code to create section-ids from plain text is used both in mkindex and in uni.pl + +sub urlify { + my ($section)=@_; + + if($section =~ m/(.*)\[(.*)\]\(.*\).*/) { + # The section header contains a link to somewhere else, we'll strip out the "text" portion + # for use in the 'urlify' output + my $old_section = $section; + + $section = "$1$2"; + } + + # convert letters to lower case + $section =~ tr/[A-Z]/[a-z]/; + + # Convert all '<' to '-less-than' + $section =~ s/\</-less-than-/g; + + # Convert all '>' to '-greater-than' + $section =~ s/\>/-greater-than-/g; + + # Convert all '.' to '-dot' + $section =~ s/\./-dot-/g; + + # Convert all '/' to '-slash' + $section =~ s/\//-slash-/g; + + # remove rubbish + $section =~ s/[*`'":\(\),]+//g; + + # convert anything left that isn't a dash, underscore, number or letter + $section =~ s/[^_a-zA-Z0-9-]/-/g; + + # If the starting chars aren't a letter or underscore, prepend "sect-" to them to turn them + # into legal identifiers + $section =~ s/^([^_a-zA-Z]+)/sect-$1/g; + + # strip trailing dash '-' characters from the section header + $section =~ s/-+$//; + + return "$section"; +} + +1;