#!/bin/ksh
######################################################################
# Program:      refold
# Purpose:      A text-file restoration utility.
# Arguments:    Commands and files; see usage.
# Author:       Perette Barella
#
# IMPORTANT
# THIS FILE IS UTF-8 ENCODED!
#
# Notes:        Functions are declared with the "function" keyword on
#               purpose: in ksh93 that is what makes typeset variables
#               local.  create_smart_quotes relies on the BSD regex
#               word-boundary classes [[:<:]] and [[:>:]].
#---------------------------------------------------------------------
arg0=${0##*/}
status=0
action=""
# Work files for interactive mode.  They are created on demand by
# edit_file and removed by cleanup when the script exits.
temp=""
undo=""

######################################################################
# Function:     cleanup
# Purpose:      Removes the interactive-mode work files, if any were
#               created.  Installed as the EXIT trap.
#---------------------------------------------------------------------
function cleanup {
    [[ -n $temp ]] && rm -f "$temp"
    [[ -n $undo ]] && rm -f "$undo"
    return 0
}
trap cleanup EXIT
trap 'exit 1' HUP TERM

######################################################################
# Function:     usage
# Purpose:      Displays the usage of this command.
# Author:       Perette Barella
#---------------------------------------------------------------------
function usage {
    cat << EOF
$arg0 is a tool for restoring text files that have been mangled by
mailers, readnews, dead encodings, etc. The general philosophy is to
end up with a modern format: a UTF-8 encoded text file without line
breaks, paragraphs separated by a blank line. Extra blank lines are
maintained, as these often have significance.

Usage: $arg0 [edit command] file ...

Issues the specified edit command on the file, sending the result to
standard out. Command-file pairs may be repeated, but output will run
together. Multiple edit sequences must be done by successive
invocations.

If no edit command is given, $arg0 enters interactive mode.
If only an edit command is given, $arg0 runs as a filter.

Commands are as follows:

-combine-indented-paragraphs
-combine-paragraphs-separated-by-blanks
    Both "combine" functions assemble broken-line paragraphs, most
    typically 70-ish column format, into unbroken paragraphs.
    Paragraph cues are taken as described.

-remove-double-spacing
    Eliminates double spacing. When there are multiple blank lines
    in succession, only the first is dropped.

-demangle-weird-characters
    Fixes Quoted Printable and Rich Text Format encodings. The most
    obvious fix, typically, is fixing quotes and apostrophes which
    have been encoded as ugly character sequences. For Quoted
    Printable, also fixes extra line breaks introduced by the
    encoding.

-fold
    Once you've got a file back into a modern format, you should
    keep it that way. If, however, you want to export it for posting
    on Usenet, etc., "folding" does all the magic: Inserting line
    breaks for a 70-character terminal, converting asymmetrical
    quotes into their ASCII equivalents, downconverting various
    Unicode diacritical characters into their plain ASCII
    equivalents, etc.

-add-smart-quotes
    Replaces ASCII quotes and apostrophes with the "smart"
    equivalent. It also catches ellipsises. Performance with
    double-quotes is pretty good, but because of ambiguity between
    single-quotes, plural possessives, and contractions, these are
    not touched.
EOF
##### End of function usage #####
}

######################################################################
# Function:     combine_indented_paragraphs
# Purpose:      Combines lines in a source file into unbroken paragraphs.
#               Paragraph determination is done by leading whitespace on
#               a line; thus, a file must be <<'ed prior to processing
#               if it contains a left margin.
# Input:        Text on standard input.
# Output:       One line per paragraph on standard output.  A new
#               indented line ends the previous paragraph with one
#               empty line; the first blank line after text ends it
#               with three empty lines, and further consecutive blank
#               lines add nothing.
# Author:       Perette Barella
#---------------------------------------------------------------------
function combine_indented_paragraphs {
    typeset buffer="" didblanks=true aline

    # read -r / print -r keep backslashes in the text intact.
    while IFS= read -r aline
    do
        if [[ $aline != *[![:blank:]]* ]]
        then
            # Empty, or nothing but spaces and tabs.
            [[ -n $buffer ]] && print -r -- "$buffer"
            buffer=""
            if [[ $didblanks == false ]]
            then
                print
                print
                print
            fi
            didblanks=true
        elif [[ $aline == [[:blank:]]* ]]
        then
            # Leading whitespace starts a new paragraph.
            if [[ -n $buffer ]]
            then
                print -r -- "$buffer"
                print
            fi
            buffer=$aline
            didblanks=false
        else
            # Continuation line.  A leading space produced when the
            # buffer was empty is trimmed later by perform_edit.
            buffer="$buffer $aline"
            didblanks=false
        fi
    done
    if [[ -n $buffer ]]
    then
        print -r -- "$buffer"
    fi
    return 0
}

######################################################################
# Function:     combine_separated_paragraphs
# Purpose:      Create unbroken paragraphs from a text file, separating
#               paragraphs where blank lines are present.  When there
#               is more than one blank line, extras are retained as they
#               are likely significant.
# Input:        Text on standard input.
# Output:       One line per paragraph on standard output, trailing
#               blanks removed.
# Author:       Perette Barella
#---------------------------------------------------------------------
function combine_separated_paragraphs {
    typeset buffer="" aline

    {
        # Default IFS on purpose: read trims leading and trailing
        # whitespace, so whitespace-only lines count as blank.
        while read -r aline
        do
            if [[ -z $aline ]]
            then
                [[ -n $buffer ]] && print -r -- "$buffer"
                print
                buffer=""
            else
                buffer="$buffer$aline "
            fi
        done
        # Emit the final paragraph when the input did not end with a
        # blank line; otherwise it would be silently discarded.
        if [[ -n $buffer ]]
        then
            print -r -- "$buffer"
        fi
    } | sed -e 's/[[:blank:]]*$//'
}

######################################################################
# Function:     remove_double_spacing
# Purpose:      Removes double spacing.  This step is necessary
#               before combining separated paragraphs-- otherwise,
#               lines will be treated as paragraphs.
# Input:        Text on standard input.
# Output:       The text on standard output.  In each run of blank
#               lines the first is dropped, the second through fourth
#               are kept, and any beyond the fourth are dropped.
# Author:       Perette Barella
#---------------------------------------------------------------------
function remove_double_spacing {
    typeset aline
    typeset -i emptycount=0

    while IFS= read -r aline
    do
        if [[ $aline != *[![:blank:]]* ]]
        then
            (( emptycount += 1 ))
            if (( emptycount > 1 && emptycount < 5 ))
            then
                print
            fi
        else
            emptycount=0
            # "--" so that lines starting with a dash are not taken
            # as options to print.
            print -r -- "$aline"
        fi
    done
    return 0
}

######################################################################
# Function:     demangle_qp_and_rtf
# Purpose:      Fixes "junk" introduced by Quoted Printable and
#               RTF encodings, including smart quotes, a few special
#               characters, and QP's spurious line breaks.
# Input:        Text on standard input.
# Output:       Repaired text on standard output.
# Notes:        The QP codes handled are Windows-1252 bytes
#               (=91 ‘  =92 ’  =93 “  =94 ”  =A9 ©).
#               The RTF codes handled are Mac Roman bytes
#               (\'d2 “  \'d3 ”  \'d4 ‘  \'d5 ’  \'c9 ellipsis).
# Author:       Perette Barella
# References:   "Quoted-printable" (preceded by =)
#               http://en.wikipedia.org/wiki/Quoted-printable
#               "Rich Text Format - Character Encoding" (preceded by \')
#               http://en.wikipedia.org/wiki/Rich_Text_Format
#---------------------------------------------------------------------
function demangle_qp_and_rtf {
    typeset aline

    while IFS= read -r aline
    do
        # Fix spurious wrap from quoted printable: a trailing "=" is
        # a soft line break, so drop it and join with the next line.
        if [[ $aline == *'=' ]]
        then
            print -r -n -- "${aline%=}"
        else
            print -r -- "$aline"
        fi
    done | sed \
        -e 's/=91/‘/g' \
        -e 's/=92/’/g' \
        -e 's/=93/“/g' \
        -e 's/=94/”/g' \
        -e 's/=A9/©/g' \
        -e 's/=20$//' \
        -e 's/=09$//' \
        -e "s/\\\\'d5/’/g" \
        -e "s/\\\\'d4/‘/g" \
        -e "s/\\\\'d2/“/g" \
        -e "s/\\\\'d3/”/g" \
        -e "s/\\\\'c9/.../g" \
        -e 's/\\cb[13] //g' \
        -e 's/\\cb[13]//g' \
        -e 's/=3D/=/g'
    # =3D is decoded last so that an encoded literal "=" cannot be
    # mistaken for the start of another escape by the rules above.
}

######################################################################
# Function:     create_smart_quotes
# Purpose:      Modify plain-ascii quotes and apostrophes into
#               their "proper", opposing characters.
#               Elipsis translated to the corresponding character, retaining
#               surrounding whitespace.
#               Double-dashes become em-dashes with optional whitespace removed.
#               Single dashes separate from words by spaces become en-dashes.
# Input:        Text on standard input.
# Output:       Converted text on standard output.
# Limitations:  Single-quoted things and some apostrophes don't get
#               translated because it's impossible to distinguish:
#               'Just 'cuz,' she said.
#               [[:<:]] and [[:>:]] are BSD regex word boundaries.
# Author:       Perette Barella
#---------------------------------------------------------------------
function create_smart_quotes {
    # [[:blank:]] is used for "space or tab": a \t inside a bracket
    # expression is not a tab in POSIX regular expressions.
    sed -E \
        -e 's/^"([A-Za-z])/“\1/g' \
        -e 's/([[:blank:]])"([A-Za-z])/\1“\2/g' \
        -e 's/([A-Za-z0-9][.,?!]+)"([[:blank:]])/\1”\2/g' \
        -e 's/([A-Za-z0-9]\.\.\.)"([[:blank:]])/\1”\2/g' \
        -e 's/([A-Za-z0-9][.,?!])"([[:blank:]]*)$/\1”\2/g' \
        -e 's/([A-Za-z0-9]\.\.\.)"([[:blank:]]*)$/\1”\2/g' \
        -e 's/([A-Za-z0-9])"([.,?![:blank:]]|\.\.\.)/\1”\2/g' \
        -e 's/([A-Za-z0-9])"([.,?![:blank:]]*|\.\.\.)$/\1”\2/g' \
        -e "s/(ldn)'(t[[:blank:]])/\\1’\\2/g" \
        -e "s/[[:<:]]([A-Za-z]+)n't[[:>:]]/\\1n’t/g" \
        -e "s/[[:<:]]([A-Za-z]+)'(s|ve|ll|re)[[:>:]]/\\1’\\2/g" \
        -e "s/[[:<:]]I'm[[:>:]]/I’m/g" \
        -e 's/([A-Za-z0-9])\.\.\.([[:blank:]A-Za-z0-9])/\1…\2/g' \
        -e 's/([A-Za-z0-9])\.\.\.([[:blank:]]*)$/\1…\2/g' \
        -e 's/ ?-- ?/—/g' \
        -e 's/ - /–/g'
}
##### End of function create_smart_quotes #####

######################################################################
# Function:     perform_edit
# Purpose:      Execute an edit function on the specified file.
#               The result is put to standard out, which is presumably
#               redirected by the caller.
# Arguments:    $1 - the action name, without its leading dash
#               $2 - the file to read
# Returns:      The status of the last command in the edit pipeline.
#               Exits the script with status 1 on an unknown action.
#---------------------------------------------------------------------
function perform_edit {
    typeset action="$1" file="$2"

    # Several actions append an empty line to the input so that a
    # final line lacking a newline is still seen by the read loops.
    case "$action" in
    combine-indented-paragraphs)
        { cat "$file"; print; } |
            combine_indented_paragraphs |
            sed -e 's/^[[:blank:]]*//' -e 's/[[:blank:]]*$//'
        ;;
    combine-paragraphs-separated-by-blanks)
        { cat "$file"; print; } |
            combine_separated_paragraphs |
            sed -e 's/^[[:blank:]]*//' -e 's/[[:blank:]]*$//'
        ;;
    remove-double-spacing)
        { cat "$file"; print; } | remove_double_spacing
        ;;
    demangle-weird-characters)
        # Drop a backslash at the end of a line before demangling.
        { sed -e 's/\\$//' "$file"; print; } | demangle_qp_and_rtf
        ;;
    add-smart-quotes)
        create_smart_quotes < "$file"
        ;;
    fold)
        # Convert a file back to pure ASCII and
        # put it in 70-column format for posting
        # on newsgroups, etc.
        sed \
            -e 's/–/ - /g' \
            -e 's/—/--/g' \
            -e 's/©/(c)/g' \
            -e 's/®/(R)/g' \
            -e 's/…/.../g' \
            -e 's/™/(TM)/g' \
            -e 's/•/*/g' \
            -e "s/[‘’]/'/g" \
            -e 's/[“”]/"/g' \
            -e 's/[áàäâã]/a/g' \
            -e 's/[ÀÁÄÂÃ]/A/g' \
            -e 's/[èéëê]/e/g' \
            -e 's/[ÈÉËÊ]/E/g' \
            -e 's/[ìíïî]/i/g' \
            -e 's/[ÌÍÏÎ]/I/g' \
            -e 's/[òóöôõ]/o/g' \
            -e 's/[ÒÓÖÔÕ]/O/g' \
            -e 's/[ùúüû]/u/g' \
            -e 's/[ÙÚÜÛ]/U/g' \
            -e 's/ñ/n/g' \
            -e 's/Ñ/N/g' \
            -e 's/ç/c/g' \
            -e 's/Ç/C/g' \
            -e 's/æ/ae/g' \
            -e 's/Æ/Ae/g' \
            "$file" | fold -w 70 -s
        ;;
    *)
        # Standard output carries the edited text, so the complaint
        # goes to standard error.
        print -u2 -r -- "Unknown action: $action."
        exit 1
        ;;
    esac
}

######################################################################
# Function:     encoding_lab
# Purpose:      Interactively finds the character encoding of a file
#               and converts the file to UTF-8.  Every encoding that
#               iconv lists is tried; those that fail are rejected and
#               those producing output identical to the original or to
#               an earlier candidate are reported as duplicates.
# Arguments:    $1 - the file to convert in place
#               $2 - where the unconverted copy is kept on commit
# Returns:      0 if a conversion was committed; 1 on cancel, end of
#               input, failure, or when no encoding is a candidate.
#---------------------------------------------------------------------
function encoding_lab {
    typeset file="$1" undo="$2"
    typeset temp all_encodings encoding candidate remaining
    typeset key rest original picked
    typeset possible_encodings="" rejected_encodings=""
    typeset selected_encoding="" commit=""
    # Conversions seen so far, as "|<cksum output>=<source>|..." .
    typeset seen

    temp=$(mktemp "${TMPDIR:-/var/tmp}/$arg0.lab.XXXXXX") || return 1
    all_encodings=$(iconv --list | awk '{print $1}')

    key=$(cksum < "$file")
    seen="|$key=the original file|"
    for encoding in $all_encodings
    do
        if iconv -f "$encoding" -t "utf-8" "$file" > "$temp" 2>/dev/null
        then
            key=$(cksum < "$temp")
            case $seen in
            *"|$key="*)
                rest=${seen#*"|$key="}
                original=${rest%%"|"*}
                print -r -- "$encoding duplicates $original"
                ;;
            *)
                possible_encodings="$possible_encodings $encoding"
                seen="$seen$key=$encoding|"
                ;;
            esac
        else
            rejected_encodings="$rejected_encodings $encoding"
        fi
    done
    print -r -- "Rejecting encodings: $rejected_encodings"

    if [[ -z $possible_encodings ]]
    then
        print "Nothing to do in the encoding lab!"
        rm -f "$temp"
        return 1
    fi

    while true
    do
        print -r -- "Current encoding is: ${selected_encoding:-not set}"
        picked=false
        select encoding in $possible_encodings $commit help cancel
        do
            if [[ -z $encoding ]]
            then
                print "Unknown action."
                continue
            fi
            case "$encoding" in
            cancel)
                rm -f "$temp"
                return 1
                ;;
            commit)
                rm -f "$temp"
                mv "$file" "$undo" || return 1
                if iconv -f "$selected_encoding" -t "utf-8" "$undo" > "$file"
                then
                    return 0
                fi
                # Conversion failed: put the unconverted text back.
                cp "$undo" "$file"
                return 1
                ;;
            help)
                cat << EOF
The Encoding Lab allows you to quickly try different character
encodings, select the correct one, and convert the file you're working
with into Unicode (UTF-8 encoded).

Type the number of the encoding you want to try. The file will be
converted to that encoding, then the differences between the two file
shown. If the file is correct (look to smart quotes, umlauts, accent
graves, and other special characters) then commit; otherwise, try a
different encoding.

What are these encodings? In the old days, computers had 256
characters to work with. Different regions (say, Russia) had different
character sets to accommodate the special characters necessary in
their language, all using the same 256 values (0-255, or 0 .. 2^8-1)
for their encodings. China, Japan, and Korea (which have hundreds of
characters, more than the 256 old-days limit) came up with their own
systems, not even compatible with each other. Nothing was compatible,
and it sucked; it was a tower of Babel.

Then Unicode came, and the way was clear. The Unicode character set
accommodates every language in use. Documents written in Unicode thus
eliminate the per-country encoding problem, although there are a
handful of ways to transliterate Unicode characters into byte
sequences for machines-- which is the encoding. UTF-8 is a common way
of encoding Unicode characters using byte sequences that are
compatible with older software, and thus UNIX friendly. Unicode UTF-8
is also the most common Unicode encoding on the World Wide Web.
EOF
                ;;
            delete)
                # Rebuild the list without the selected encoding.  A
                # plain string comparison works for any position in
                # the list and for names containing regex characters.
                remaining=""
                for candidate in $possible_encodings
                do
                    if [[ $candidate != "$selected_encoding" ]]
                    then
                        remaining="$remaining $candidate"
                    fi
                done
                possible_encodings=$remaining
                commit=""
                selected_encoding=""
                ;;
            *)
                commit="commit delete"
                selected_encoding="$encoding"
                iconv -f "$encoding" -t "utf-8" "$file" > "$temp"
                diff "$file" "$temp"
                ;;
            esac
            picked=true
            break
        done
        # select ends without a choice only at end of input; treat
        # that as cancel instead of prompting forever.
        if [[ $picked != true ]]
        then
            rm -f "$temp"
            return 1
        fi
    done
}

######################################################################
# Function:     show_file_head
# Purpose:      Shows the start of a file: up to 20 lines, reduced one
#               line at a time (never below 5) while the excerpt holds
#               more than 150 words.  A reduced excerpt is wrapped at
#               78 columns.
# Arguments:    $1 - the file to preview
#---------------------------------------------------------------------
function show_file_head {
    typeset file="$1"
    typeset -i length=20

    while (( length > 5 )) &&
          (( $(head -n "$length" "$file" | wc -w) > 150 ))
    do
        (( length -= 1 ))
    done
    if (( length < 20 ))
    then
        head -n "$length" "$file" | fold -s -w 78
    else
        head -n "$length" "$file"
    fi
}

######################################################################
# Function:     edit_file
# Purpose:      Interactive mode.  Works on a copy of the file,
#               offering the edit commands plus diff, undo, revert and
#               commit from a menu until the user commits or quits.
# Arguments:    $1 - the file to edit
# Globals:      temp (working copy) and undo (previous state) are
#               created here if they do not exist yet.
# Returns:      0 on quit; the status of the final copy on commit;
#               1 if the work files cannot be set up or input ends.
#---------------------------------------------------------------------
function edit_file {
    typeset file="$1" quit=false action picked

    if [[ -z $temp ]]
    then
        temp=$(mktemp "${TMPDIR:-/tmp}/$arg0.tmp.XXXXXX") || return 1
    fi
    if [[ -z $undo ]]
    then
        undo=$(mktemp "${TMPDIR:-/tmp}/$arg0.last.XXXXXX") || return 1
    fi
    # cp reports its own error; there is nothing to edit without copies.
    cp "$file" "$temp" || return 1
    cp "$file" "$undo" || return 1

    if ! iconv -f utf-8 -t utf-8 "$file" > /dev/null
    then
        print -r -- "$file: This file is not Unicode UTF-8 encoded."
        encoding_lab "$temp" "$undo"
    fi

    while true
    do
        show_file_head "$temp"
        print
        picked=false
        select action in \
            "open-textedit" \
            "edit-vi" \
            "combine-indented-paragraphs" \
            "combine-paragraphs-separated-by-blanks" \
            "remove-double-spacing" \
            "demangle-weird-characters" \
            "add-smart-quotes" \
            "encoding-lab" \
            "fold" \
            "diff" \
            "undo" \
            "commit" \
            "revert" \
            "quit"
        do
            if [[ -z $action ]]
            then
                print "Unknown action."
                continue
            fi
            # A second consecutive "quit" confirms the first.
            if [[ $action == quit && $quit == true ]]
            then
                return 0
            fi
            quit=false
            case "$action" in
            open-textedit)
                open -e "$temp"
                ;;
            edit-vi)
                vi "$temp"
                ;;
            undo)
                cp "$undo" "$temp"
                ;;
            commit)
                cp "$temp" "$file"
                return $?
                ;;
            diff)
                diff "$file" "$temp"
                continue
                ;;
            revert)
                mv "$temp" "$undo" && cp "$file" "$temp"
                ;;
            encoding-lab)
                encoding_lab "$temp" "$undo"
                ;;
            quit)
                quit=true
                print "ARE YOU SURE? Quit again to confirm."
                continue
                ;;
            *)
                # Keep the current state for undo, then edit from it.
                if mv "$temp" "$undo"
                then
                    perform_edit "$action" "$undo" > "$temp"
                fi
                ;;
            esac
            picked=true
            break
        done
        # select ends without a choice only at end of input; leave
        # without committing instead of looping forever.
        if [[ $picked != true ]]
        then
            return 1
        fi
    done
}

######################################################################
# Main: arguments are processed left to right.  A dash argument sets
# the edit command for the files that follow; a file seen before any
# edit command is opened in interactive mode.
#---------------------------------------------------------------------
if (( $# == 0 ))
then
    usage
    exit 1
fi

for file in "$@"
do
    case $file in
    '-?'|-h|--help)
        usage
        exit 0
        ;;
    -*)
        action=${file#-}
        ;;
    *)
        if [[ -z $action ]]
        then
            edit_file "$file" || status=$?
        else
            perform_edit "$action" "$file" || status=$?
        fi
        ;;
    esac
done

# An edit command with no file at all: act as a filter.
if (( $# == 1 )) && [[ -n $action ]]
then
    perform_edit "$action" /dev/stdin || status=$?
fi

exit $status