#!/bin/ksh
######################################################################
# $Id: buildindex,v 1.23 2008/02/20 14:42:51 perette Exp $
# Author: Perette Barella
# Purpose: This script generates index.web or delta.web which can then
# be run through the M4 macros to generate a corresponding
# HTML file.
# Version: @(#) $Id: buildindex,v 1.23 2008/02/20 14:42:51 perette Exp $
# Copyright: Copyright 1997-2003 Perette Barella.
# All rights reserved.
######################################################################
# get_index_entries - print the index entries declared by one .web page
#   $1 - path of the .web source (a leading "./" is dropped)
# Runs ${INCLUDE}/readindex.m4 over the page from inside the page's own
# directory, then rewrites absolute "http:...dir/page.html" references
# (with or without a #fragment) to the relative "dir/page.html" form.
# Returns 1 if the file is unreadable or its name cannot be split;
# otherwise the exit status is that of the final sed.
function get_index_entries
{
    typeset file="$1" dirname basename
    [ "${file:0:2}" = "./" ] && file="${file:2}"
    [ ! -r "$file" ] && return 1
    dirname=$(dirname "$file") || return 1
    basename=$(basename "$file" .web) || return 1
    # ${M4:-m4} stays unquoted so M4 may carry extra arguments; everything
    # derived from the file name is quoted so names with spaces survive.
    (cd "$dirname" &&
        ${M4:-m4} -P "-DFILE_SOURCE=$basename.web" \
            "-DFILE_DEST=$basename.html" \
            "${INCLUDE}/readindex.m4") |
    sed -e "s&http:.*$dirname/$basename\.html &$dirname/$basename.html &" \
        -e "s&http:.*$dirname/$basename\.html#\([^ ]*\) &$dirname/$basename.html#\1 &"
}
# get_page_title - print an HTML-escaped title for a page
#   $1 - path of the file (.web, .txt, .do, .txt.gz, .txt.Z or .pdf)
# Title sources, by file type:
#   *.web          output of ${INCLUDE}/readtitle.m4, run in the file's dir
#   *.txt.gz/.Z    first non-blank, non-PGP line, prefixed "name: "
#   *.txt, *.do    Subject: header from the first 15 lines, else the first
#                  non-blank line, else (or for "#!" scripts) the first two
#                  lines; prefixed "name: "
#   *.pdf          text of the dc:title element, suffixed " (PDF)"
# Returns 1 if the file is unreadable, of unknown type, or has no title.
# Uses the global $arg0 in its diagnostic.
function get_page_title
{
    typeset file="$1" dirname basename name
    [ ! -r "$file" ] && return 1
    dirname=$(dirname "$file") || return 1
    basename=$(basename "$file") || return 1
    case "$basename" in
    *.web)
        # cd inside the substitution so the caller's directory is untouched.
        # FILE_DEST follows the page.web -> page.html naming that
        # get_index_entries uses.
        name=$(cd "$dirname" &&
            ${M4:-m4} -P "-DFILE_SOURCE=$basename" \
                "-DFILE_DEST=${basename%.web}.html" \
                "${INCLUDE}/readtitle.m4") ||
            return 1
        ;;
    *.txt.gz|*.txt.Z)
        name=$(zegrep -v '^[[:space:]]*$|BEGIN PGP' "$file" | head -1)
        [ "$name" != "" ] && name="$basename: $name"
        ;;
    *.txt|*.do)
        name=$(head -15 "$file" | grep '^Subject:' |
            sed -e 's/^Subject: *//' -e 's/[rR]e: *//g')
        [ "$name" = "" ] &&
            name=$(grep -E -v '^[[:space:]]*$|BEGIN PGP' "$file" | head -1)
        if [ "$name" = "" ] || [ "${name:0:2}" = "#!" ]
        then
            # Escaping happens once, at the end of the function; doing it
            # here as well would double-escape.
            name=$(head -2 "$file")
        fi
        [ "$name" != "" ] && name="$basename: $name"
        ;;
    *.pdf)
        # Keep only what lies between <dc:title ...> and </dc:title>, then
        # drop any nested tags and surrounding whitespace.
        name=$(strings "$file" | grep 'dc:title' | sed \
            -e 's/.*<dc:title[^>]*>//' \
            -e 's&</dc:title[^>]*>.*&&' \
            -e 's/<[^>]*>//g' \
            -e 's/^[[:space:]]*//' \
            -e 's/[[:space:]]*$//')
        [ "$name" != "" ] && name="$name (PDF)"
        ;;
    *)
        echo "$arg0: Don't know file format for $basename." 1>&2
        return 1
        ;;
    esac
    [ "$name" = "" ] && return 1
    # Escape HTML metacharacters; "&" must go first so the entities
    # produced for "<" and ">" are not escaped again.
    printf '%s\n' "$name" |
        sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
    return 0
}
# get_page_location - print the published location of a source file
#   $1 - path of the source file
# A "dir/page.web" source is published as "dir/page.html" (a bare
# "page.web" becomes "./page.html"); any other path is printed unchanged.
function get_page_location
{
    typeset file="$1"
    case "$file" in
    *.web)
        # Quoted so paths containing spaces reach dirname/basename intact.
        printf '%s\n' "$(dirname "$file")/$(basename "$file" .web).html"
        ;;
    *)
        printf '%s\n' "$file"
        ;;
    esac
}
# islocked - return lock status of an RCS ",v" file
#   $1 - path of the RCS file to scan
# return 0 for locked, 255 for confused (didn't find lock data)
# other >0 value for unlocked.
# Declared with the "function" keyword, like the rest of this file, so
# that typeset keeps "file" and "aline" local and the caller's variables
# of the same name are not overwritten.
function islocked
{
    typeset file="$1" aline
    while read -r aline
    do
        case "$aline" in
        locks\;*)
            # "locks;" with nothing in between: no locks
            return 1
            ;;
        locks*)
            return 0
            ;;
        text*)
            # at text section, no locks found
            echo "$file: Can not find lock data" 1>&2
            return 255
            ;;
        esac
    done < "$file"
    echo "$file: Can not find lock data: EOF" 1>&2
    return 255
}
# read_rcs_last_change - print the most recent log message of an RCS file
#   $1 - path of the RCS ",v" file
# RCS stores revisions in reverse chronological order, so the first "log"
# entry in the file is the latest change.  The message is printed on one
# line, its lines joined by single spaces.
# Returns 0 on success; returns 1 with a diagnostic on stderr if a "text"
# line or end of file is reached before a complete log entry is found.
function read_rcs_last_change
{
    typeset file="$1" aline change="" collect=false first=true
    while read -r aline
    do
        if [ "$collect" = "true" ]
        then
            # A lone "@" closes the log string.
            if [ "$aline" = "@" ]
            then
                printf '%s\n' "$change"
                return 0
            fi
            # Only the first line carries the opening "@" delimiter;
            # later lines must keep their first character.
            if [ "$first" = "true" ]
            then
                aline="${aline#@}"
                first=false
            fi
            [ "$aline" = "" ] && continue
            change="${change:+$change }$aline"
        elif [ "$aline" = "log" ]
        then
            collect=true
        elif [ "$aline" = "text" ]
        then
            echo "$file: Got to text line." 1>&2
            return 1
        fi
    done < "$file"
    echo "$file: Can't read change log." 1>&2
    return 1
}
# add_update_info - print "last updated" and change-log text for a file
#   $1 - path of the file
# The date is taken from fields 6-8 of "ls -l".  If dir/RCS/name,v
# exists, a change note is added: a fixed notice when islocked reports
# the RCS file locked, otherwise the output of read_rcs_last_change.
# The change line is HTML-escaped before printing.  Always returns 0.
function add_update_info
{
    typeset file="$1" rcsfile updated change=""
    rcsfile="$(dirname "$file")/RCS/$(basename "$file"),v"
    # Quoted so file names containing spaces are listed as one file.
    updated=$(ls -l "$file" | awk '{ print $6 " " $7 " " $8 }')
    if [ -f "$rcsfile" ]
    then
        if islocked "$rcsfile"
        then
            change="Changes are available, but not yet marked complete."
        else
            # updated=$(grep "^date " "$rcsfile" | head -1 |
            # awk '{print $2}' | sed 's/;//g')
            change=$(read_rcs_last_change "$rcsfile")
        fi
    fi
    echo "
Last updated: $updated."
    if [ "$change" != "" ]
    then
        echo "
"
        # "&" is escaped first so the entities for "<" and ">" survive.
        echo "Changes: \`\`$change''" |
            sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
    fi
    return 0
}
# KORN SHELL DEPENDENT FUNCTION
# (Fork order of pipe.)
# format_content_index - print a nested index built from the index
# entries of every file given as an argument.
#   $@ - files passed one at a time to get_index_entries
# The collected entries are cleaned up, sorted, and read back as
# "url title indexline".  Each indexline is split on commas into
# hierarchy levels; list levels are closed and opened as the hierarchy
# changes between consecutive entries.
# lastlevel is modified inside the final "while" of the pipeline and read
# again after the pipeline ends, so that loop presumably has to run in
# the current shell (the Korn shell dependency noted above).
# NOTE(review): $temp is not set anywhere in the visible source; if it is
# empty, sort reads the pipeline on stdin.
# NOTE(review): "+2" is the historical sort key syntax (start at the third
# field); confirm the installed sort still accepts it.
# NOTE(review): several echo strings below contain only newlines; confirm
# that is the intended output.
function format_content_index
{
typeset file url title index lastgroup lastlevel=0 currentlevel
echo '
'
for file in "$@"
do
get_index_entries "$file"
done |
grep -v '^[ ]*$' |
sed -e 's/^[ ]*//' -e 's/ ,/,/g' -e 's/, /,/g' -e 's/<[^>]*>//g' |
sort -f -t" " +2 $temp |
while IFS=" " read url title indexline
do
currentlevel=0
# Break comma-separated index entry into hierarchy levels
while index[$currentlevel]=$(echo "$indexline" |
cut -d, -s -f$(($currentlevel + 1)));
[ "${index[$currentlevel]}" != "" ]
do
let currentlevel++
done
# In case there are no commas in the index entry
# (cut -s prints nothing for a line without the delimiter).
# NOTE(review): the entry is stored in index[1] while the loops below
# start reading at index[0] -- confirm this is intended.
if [ $currentlevel = 0 ]
then
currentlevel=1
index[$currentlevel]="$indexline"
fi
# Find the first hierarchy level that's different
typeset backto=0
while [ $currentlevel -gt $backto -a \
"${lastgroup[$backto]}" = "${index[$backto]}" ]
do
let backto++;
done
# Output end-list tags until we get back to the changed level
while [ $lastlevel -gt $backto ]
do
echo "
"
let lastlevel--
lastgroup[$lastlevel]=""
done
# Output new/different hierarchy levels if necessary
while [ $lastlevel -lt $currentlevel ]
do
echo "${index[$lastlevel]}"
echo ""
lastgroup[$lastlevel]="${index[$lastlevel]}"
let lastlevel++
done
# Output the index entry
# A url field of SEE or ALSO marks a cross-reference rather than a link.
if [ "$url" = "SEE" ]
then
echo "- See: $title"
elif [ "$url" = "ALSO" ]
then
echo "
- See also: $title"
else
echo "
- $title"
fi
done
# Close whatever levels are still open after the last entry.
while [ $lastlevel -ge 0 ]
do
let lastlevel--;
echo "
"
done
}
# is_in_index - test whether a page is mentioned in its directory's index
#   $1 - path of the page
# Returns 0 if the directory has no index.html (everything counts as
# listed) or if the page's base name occurs as a whole word in that
# index.html; otherwise returns grep's non-zero status.
function is_in_index
{
    typeset file="$1" dirname basename
    dirname=$(dirname "$file")
    basename=$(basename "$file")
    # if no index page, everything is listed.
    [ ! -f "$dirname/index.html" ] && return 0
    # -F: the name is a literal string, so "." must not match any
    # character; "--" protects names that begin with "-".
    grep -q -w -F -- "$basename" "$dirname/index.html"
}
# format_list - walk a list of files and decide which ones to list
#   -s  (optional, first argument) sets suppress=true
#   $@  files to consider
# Reads the globals $mode and $destfile.
# NOTE(review): the "for" loop below has no matching "done" before the
# closing brace, and nothing visible prints the entry for a file or sets
# "any" -- part of this function appears to be missing.  Restore it from
# the original before relying on this function.
function format_list
{
typeset class="" pagetitle location any=false suppress=false
[ "$mode" = "deltas" ] && class=" CLASS=padded"
[ "$1" = "-s" ] && suppress=true && shift
for file in "$@"
do
# Skip unreadable files and the output file itself.
[ ! -r "$file" ] && continue
[ "$(dirname "$file")" = "$(dirname "$destfile")" -a \
"$(basename "$file")" = "$(basename "$destfile")" ] && continue
# Fall back to the bare file name when no title can be extracted.
if ! pagetitle=$(get_page_title "$file")
then
echo "$file: can't determine page title." 1>&2
pagetitle=$(basename "$file")
fi
if ! location=$(get_page_location "$file")
then
echo "$file: can't determine location." 1>&2
continue
fi
# In deltas and map modes, only pages found by is_in_index are kept.
if [ "$mode" = "deltas" -o "$mode" = "map" ] && ! is_in_index "$location"
then
echo "$file: Not published, skipping." 1>&2
continue
fi
[ "$any" = "false" ] && ! $suppress && echo ""
}
# recurse_map - print the site map for a directory and its subdirectories
#   $1 - directory to map (defaults to ".")
# Reads the global $allfiles, which is run as a command (true/false).
# "empty" records whether anything has been printed for this directory
# yet; it gates the echo commands that open and close the listing.
# NOTE(review): many echo strings below contain only newlines or are
# empty; confirm that is the intended output.
function recurse_map
{
typeset file here filelist empty=false dirname="${1:-.}" base title
# Collect the page files of this directory; below the top level the
# listing is additionally filtered through egrep.
if [ "$dirname" = "." ]
then
filelist=$(ls -1 "$dirname"/*.web "$dirname"/*.txt "$dirname"/*.do "$dirname"/*.pdf "$dirname"/*.txt.gz 2>/dev/null)
else
filelist=$(ls -1 "$dirname"/*.web "$dirname"/*.txt "$dirname"/*.do "$dirname"/*.pdf "$dirname"/*.txt.gz 2>/dev/null |
egrep -v '/index.web$|/index.html$|/RCS$|/SCCS$|/Thumbnails$')
fi
[ "$filelist" = "" ] && empty=true
if ! $empty
then
echo ""
# $filelist is deliberately unquoted so it splits into one argument per
# file.
$allfiles && format_list -s $filelist
elif $allfiles
then
# No recognised page files: list every non-directory entry by name.
for file in "$dirname"/*
do
[ -d "$file" ] && continue
typeset base=$(basename "$file")
$empty && echo ""
echo "- $base"
empty=false
done
fi
# Descend into subdirectories, skipping RCS, SCCS and Thumbnails.
for file in "$dirname"/*
do
base=$(basename "$file")
[ "$base" = "RCS" ] && continue
[ "$base" = "SCCS" ] && continue
[ "$base" = "Thumbnails" ] && continue
if [ -d "$file" ]
then
# Label the directory with its index.web title when one can be read,
# otherwise with its name; a directory with neither index.web nor
# index.html is skipped unless $allfiles is true.
if [ -f "$file/index.web" ] &&
title="$(get_page_title "$file/index.web")"
then
$empty && echo "
"
echo "- $title"
elif [ -f "$file/index.html" ]
then
$empty && echo "
"
echo "- $base"
else
! $allfiles && continue
$empty && echo "
"
echo "- $base"
fi
empty=false
recurse_map "$file"
fi
done
$empty && $allfiles && echo ""
! $empty && echo "
"
}
# format_sitemap - print one sitemap <url> entry per page under "."
# Reads the site's base URL from the SITE_URL line of
# $WEBBASE/siteinfo.m4 and appends the current directory's offset below
# $WEBBASE.  Every *.web, *.txt, *.do and *.pdf file found is printed
# with its URL (*.web is published as *.html) and modification date,
# both taken from "find -ls" run with TZ set to UTC.
# Exits the script with status 1 if the base URL cannot be determined.
#
# "find -ls" prints HH:MM instead of the year for recent files, so the
# year is inferred: months 1-6 belong to the current year, and months
# 7-12 to the previous year when the current month is June or earlier.
# NOTE(review): the <url>/<loc>/<lastmod> markup follows the sitemaps.org
# protocol; confirm it matches what the rest of the site expects.
function format_sitemap {
    typeset filename filedate junk month day year yeartime urlbase
    typeset diroffset dir base outfile
    typeset thisyear lastyear thismonth
    thisyear=$(date '+%Y')
    lastyear=$thisyear
    thismonth=$(date '+%m')
    if [ ! -r "${WEBBASE}/siteinfo.m4" ]
    then
        echo "Can't determine base URL: No $WEBBASE/siteinfo.m4" 1>&2
        exit 1
    fi
    urlbase=$(grep SITE_URL "$WEBBASE/siteinfo.m4" |
        sed -e 's/.*http/http/' -e "s/'.*$//")
    if [ "$urlbase" = "" ]
    then
        echo "Can't determine base URL: can't decode SITE_URL in $WEBBASE/siteinfo.m4 " 1>&2
        exit 1
    fi
    diroffset=$(echo "$PWD" | sed -e "s&$WEBBASE&&" -e 's&^/&&')
    [ "$diroffset" != "" ] && urlbase="$urlbase/$diroffset"
    # Strip the leading zero so the month compares as a plain number.
    thismonth=${thismonth#0}
    [ "$thismonth" -le 6 ] && lastyear=$((thisyear - 1))
    TZ=UTC0 find . \( -name '*.web' -o -name '*.txt' -o \
        -name '*.do' -o -name '*.pdf' \) -type f -ls |
    while read -r junk junk junk junk junk junk junk month day yeartime filename
    do
        case "$filename" in
        *.web)
            dir=$(dirname "$filename")
            base=$(basename "$filename" .web)
            outfile="$urlbase/$dir/$base.html"
            ;;
        *)
            outfile="$urlbase/$filename"
            ;;
        esac
        [ ${#day} = 1 ] && day="0$day"
        case "$month" in
        1|01|Jan) month="01" ;;
        2|02|Feb) month="02" ;;
        3|03|Mar) month="03" ;;
        4|04|Apr) month="04" ;;
        5|05|May) month="05" ;;
        6|06|Jun) month="06" ;;
        7|07|Jul|Jly) month="07" ;;
        8|08|Aug) month="08" ;;
        9|09|Sep) month="09" ;;
        10|Oct) month="10" ;;
        11|Nov) month="11" ;;
        12|Dec) month="12" ;;
        esac
        case "$month" in
        0[7-9]|1[0-2]) year=$lastyear ;;
        *) year=$thisyear ;;
        esac
        if [ "${yeartime:2:1}" = ":" ]
        then
            filedate="$year-$month-${day}T$yeartime:00+00:00"
        else
            filedate="$yeartime-$month-$day"
        fi
        echo "<url><loc>$outfile</loc><lastmod>$filedate</lastmod></url>"
    # Drop "/./" segments and collapse repeated slashes, but leave the
    # "//" that follows the URL scheme's colon alone.
    done | sed -e 's&/\./&/&g' -e 's&\([^:/]\)//*&\1/&g'
}
# ---- Defaults and command-line options ----
# arg0      program name used in diagnostics
# WEBBASE   root of the web tree (defaults to $HOME/Web)
# INCLUDE   directory holding the m4 helper files
# timeframe number of days that "deltas" mode looks back (-t)
# mode      TOC (-c, default), deltas (-d), index (-i), map (-m),
#           sitemap (-s)
# allfiles  run as a command by recurse_map; -h sets it to false
arg0=$(basename "$0")
: "${WEBBASE:=$HOME/Web}"
INCLUDE=${WEBBASE}/Include
timeframe=30
mode=TOC
allfiles=true
while getopts "cdhimo:t:s" option
do
    case "$option" in
    c)
        mode=TOC
        ;;
    d)
        mode=deltas
        ;;
    h)
        allfiles=false
        ;;
    i)
        mode=index
        ;;
    m)
        mode=map
        ;;
    o)
        destfile="$OPTARG"
        ;;
    t)
        timeframe="$OPTARG"
        ;;
    s)
        mode=sitemap
        ;;
    *)
        # Unknown option or missing argument: report on stderr.
        echo "Usage: $arg0 [-c|-d|-i|-m|-s] [-h] [-o output] [-t time]" 1>&2
        exit 1
        ;;
    esac
done
# ---- Per-mode settings ----
# Each mode chooses a default output file (unless -o supplied one), a
# page title (overridable through $TOCNAME), the list of input files
# where the mode needs one, and an optional introductory message.
# Only the TOC page is offered to robots for indexing and archiving.
robots="FOLLOW,NOINDEX,NOARCHIVE"
case "$mode" in
TOC)
    robots="FOLLOW,INDEX,ARCHIVE"
    : "${destfile:=index.web}"
    title="${TOCNAME:-Contents}"
    message=""
    filelist=$(echo *.web *.txt *.do */index.web)
    ;;
index)
    : "${destfile:=indexpage.web}"
    title="${TOCNAME:-Alphabetical Content Index}"
    message=""
    filelist=$(find . -name '*.web')
    ;;
deltas)
    : "${destfile:=delta.web}"
    title="${TOCNAME:-Recent changes}"
    # Files modified within the last $timeframe days, newest first.
    filelist=$(find . \( -name '*.web' -o \
        -name '*.txt' -o \
        -name '*.do' \) -mtime -$timeframe)
    if [ "$filelist" != "" ]
    then
        filelist=$(ls -1t $filelist)
    fi
    message="Documents listed here changed within $timeframe days
prior to $(date '+%Y-%m-%d'), which is when this list was compiled."
    ;;
map)
    : "${destfile:=sitemap.web}"
    title="${TOCNAME:-Site map}"
    message=""
    ;;
sitemap)
    : "${destfile:=sitemap.xml}"
    title="${TOCNAME:-Site map}"
    message=""
    ;;
esac
# ---- Write the output file ----
# Start $destfile with the m4 page header.  The '$Revision$' keyword is
# kept in single quotes so the shell does not expand it.  "$destfile" is
# quoted throughout so an -o argument containing spaces works.
if ! echo "m4_include(stddefs.m4)
_HEADER($title, \`\`$(basename "$destfile") built $(TZ=UTC0 date '+%Y-%m-%d %H:%M:%S') by $arg0" '$Revision: 1.23 $' "'')
_META(ROBOTS, \`\`$robots'')
$title
$message
" > "$destfile"
then
    echo "$arg0: cannot write $destfile" 1>&2
    exit 1
fi
# The map and deltas pages get extra spacing for padded list items.
if [ "$mode" = "map" ] || [ "$mode" = "deltas" ]
then
    echo "_STYLE(\`\`LI.padded { margin-bottom: 1ex; margin-top: 1ex;}'')" >> "$destfile"
fi
# Sitemap output does not use the m4 header, so the file is restarted.
# NOTE(review): this string and the closing echo after format_sitemap
# contain no text; confirm that is the intended sitemap prolog/epilog.
if [ "$mode" = "sitemap" ]
then
    echo '
' > "$destfile"
fi
# Append the body for the selected mode.  $filelist is deliberately
# unquoted so it splits into one argument per file.
if [ "$mode" = "map" ]
then
    recurse_map
elif [ "$mode" = "sitemap" ]
then
    format_sitemap
    echo ""
elif [ "$mode" = "index" ]
then
    format_content_index $filelist | sed 's/,/, /g'
else
    format_list $filelist
fi >> "$destfile"
exit 0