#!/usr/bin/perl -w
#<site2html: pretty-print LOG files.

use Getopt::Long;
use Pod::Usage;
use File::Basename;
use Date::Parse;
use Date::Format qw(time2str);
use Carp;
use strict;
use diagnostics;

use subs qw/
    cend      getchunk  otherpages  prfooter  startentry  where
    cstart    manpage   pranchor    prheader  usage
    endentry  myuuid    prep        prtime    version
/;

$ENV{'PATH'} = join ":", qw(/bin /usr/bin /usr/local/bin /opt/sfw/bin);

my $myname = basename($0);
$myname =~ s/\.\w*$//;   # strip any extension

my $sdate = '';    # start and end dates.
my $edate = '';
my $stime = 0;     # start and end dates as Unix times.
my $etime = 0;
my $cur;           # current time.
my $maxlines = 0;  # max lines allowed before continuation link.
my $prefix   = ''; # prefix for storing preformatted output.
my $file = '';     # input file.
my $fh;
my $maxpre = 98;   # lines longer than this might push sidebar over.

my $title = 'Current log';
my $kwic  = 'kwic.htm';
my $other = '../index.htm';

my %options;
my @getopt_args = (
    'h|?',    # print usage
    'c=i',    # preformatted sections
    'e=s',    # end output at this date
    'k=s',    # KWIC index
    'm',      # print manpage
    'o=s',    # other logs
    'p=s',    # write preformatted data to files
    's=s',    # start output at this date
    't=s',    # set title
    'u',      # print UUID
    'v',      # print version
    'w',      # print source location
    );

Getopt::Long::config("noignorecase", "bundling");
usage unless GetOptions(\%options, @getopt_args);

if ($options{'v'}) {
    print version, "\n";
    exit(0);
}

$title = $options{'t'} if $options{'t'};

# Max lines in preformatted output.
if ($options{'c'}) {
    $maxlines = $options{'c'};
    usage "max lines must be a number" unless $maxlines =~ m/^\d+$/;
    $maxlines = 0 unless $maxlines > 0;
}

# KWIC index.
if ($options{'k'}) {
    $kwic = $options{'k'};
    usage "no KWIC index supplied" unless length($kwic);
}

# Other logs.
if ($options{'o'}) {
    $other = $options{'o'};
    usage "no other-logs page supplied" unless length($other);
}

# Preformatted output stored in files.
if ($options{'p'}) {
    $prefix = $options{'p'};
    usage "no prefix supplied" unless length($prefix);
}

# Start date for output.
if ($options{'s'}) {
    $sdate = $options{'s'};
    usage "no start date supplied" unless length($sdate);
    $stime = str2time($sdate) || usage "$sdate: not valid";
}

# End date for output.
if ($options{'e'}) {
    $edate = $options{'e'};
    usage "no end date supplied" unless length($edate);
    $etime = str2time($edate) || usage "$edate: not valid";
}

manpage if $options{'m'};
myuuid  if $options{'u'};
where   if $options{'w'};
usage   if $options{'h'};

$file = shift;
usage "need a file" unless $file && length($file);

usage "start date comes after end date"
  if $stime > $etime && $etime > 0;

#
# Read LOG-style file, print page content.
#
# Timestamp for a log entry looks like this:
#    Wed,  10   Mar   2004   20:40:26
#    $day  $dom $mon  $yr    $hms
#
# Unfortunately, we can't print anything until we read at least
# the first chunk, or we could miss the desired title.
#

my ($day, $dom, $mon, $yr, $hms, $tz, $fullname, @x);
my $en    = 0;    # entry number
my $first = 1;    # true if we've yet to print the first entry
my %chunk;

open($fh, "< $file") || die "$file: can't read: $!\n";

while (%chunk = getchunk($fh)) {
    $_ = $chunk{'str'};

    # Title, if any.

    if ($chunk{'type'} eq 'begin') {
        $title = $_;

        if ($first) {
            $first = 0;

            prheader($title);
            pranchor("top");
            otherpages();
            cstart();
        }
    }

    # If we have a timestamp, and entries have been written,
    # close the current entry row.  Then open a new timestamp row.

    elsif ($chunk{'type'} eq 'time') {
        endentry() if $en > 0;
        ($day, $dom, $mon, $yr, $hms, $tz, @x) = split;
        $en++;
        $fullname = join(' ', @x);
        $fullname =~ s/\(.*//;

        if (defined($hms)) {
            prtime($day, $dom, $mon, $yr, $hms, $en, $fullname);
        } else {
            warn "$file, line $.: messed-up time entry?"
        }

        startentry();
    }

    # Non-formatted portion of an entry.

    elsif ($chunk{'type'} eq 'entry') {
        # Handle inline images.
        s!\[Image:\s*(.*)\]!<p><img src="$1" alt="$1" /></p>!g;

        print "     <p>\n";
        print "$_";
        print "     </p>\n";
    }

    # Preformatted portion of an entry: remove last
    # newline to keep odd whitespace from bottom of listings.

    elsif ($chunk{'type'} eq 'pre') {
        chomp;
        print "<pre>\n$_</pre>\n";
    }
}

close($fh);
endentry();
cend();

#
# Print page footer.
#

pranchor("bottom");
otherpages();
prfooter();
exit(0);

#---------------------------------------------------------------------
# Print a usage message from the comments and exit.

sub usage {
    my ($emsg) = @_;

    use Pod::Usage qw(pod2usage);
    warn "$emsg\n" if defined $emsg;
    pod2usage(-verbose => 99, -sections => "NAME|SYNOPSIS|OPTIONS");
}

sub manpage {
    use Pod::Man();

    my $parser = Pod::Man->new();
    open(STDOUT, "| groff -T ascii -man | gcat -s | less") || die "groff\n";
    $parser->parse_from_file($0);

    close STDOUT || die "$myname: can't close stdout: $!\n";
    $? = 1 if $? == 255;    # from die
    exit($?);
}

#---------------------------------------------------------------------
# Print the UUID, current version, or source location.

sub myuuid {
    my $UUID = sprintf("%s",
        q$UUID: 488751a6-0460-38e7-9fc9-619c08b02c9e $ =~ /UUID: (.*) /);
    print "$UUID\n";
    exit(0);
}

sub version {
    my $VERSION = sprintf("%d.%02d", q$Revision: 2.15 $ =~ /(\d+)\.(\d+)/);
    my $DATE =
      sprintf("%s", q$Date: 2009/10/09 17:30:41 $ =~ /Date: (.*) /);
    return "$myname $VERSION $DATE";
}

sub where {
    my $SOURCE = sprintf("%s",
          /Source: (.*) /);
    print "$SOURCE\n";
    exit(0);
}

#---------------------------------------------------------------------
# Print HTML header.

sub prheader {
    my $title = shift || "Current logfile";
    my $vers = version();

    print <<"PageHeader";
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2//EN">
<!-- $vers -->
<html>
  <head>
    <title>$title</title>
    <style type="text/css" media="all">
body {
  line-height:      120%;
  color:            black;
  background:       white;
  font-size:        18pt;
  font-weight:      normal;
  font-family:      sans-serif;
  font-style:       normal;
  text-decoration:  none;
}

a:hover {
  color:            blue;
  background-color: #eee;
}

a:active {
  color:            red;
  background-color: yellow;
}

pre, code, tt {
  font-family:      "Bitstream Vera mono", monospace;
  white-space:      pre;
}

ol li {
  max-width:        75ex;
  padding:          .1em;
}

table.search {
  background-color: #afd5e0;
  width:            100%;
  border:           10px;
}

td.sidebar {
  background-color: #afd5e0;
  vertical-align:   top;
  width:            20%;
  font-size:        12pt;
}

/* form alignment: mixed results */
.alignCenter        { text-align: center; }
.alignLeft          { text-align: left; }
.alignRight         { text-align: right; vertical-align: bottom; }
.alignTopLeft       { text-align: left; vertical-align: top; }
.alignBottomLeft    { text-align: left; vertical-align: bottom; }
.alignTop           { vertical-align: top; }
.alignBottom        { vertical-align: bottom; }

address img {
  float:            right;
  width:            88px;
}

address {
  padding-top:      .5em;
  padding-right:    1em;
  padding-left:     1em;
  padding-bottom:   .5em;
  margin-top:       3em;;
  border-bottom:    1px solid black;
  background-color: #eee;
  clear:            right;
}

h1 {
  font-family:      Tahoma, Verdana, "Myriad Web", Syntax, sans-serif;
  font-size:        2em;
  font-weight:      bold;
  font-style:       normal;
  text-decoration:  none;
  color:            #053188
}

h2 {
  font-family:      Tahoma, Verdana, "Myriad Web", Syntax, sans-serif;
  font-size:        1.7em;
  font-weight:      500;
  font-style:       normal;
  text-decoration:  none;
}

h3 {
  font-family:      Tahoma, Verdana, "Myriad Web", Syntax, sans-serif;
  font-size:        1.5em;
  font-weight:      500;
  font-style:       italic;
  text-decoration:  none;
}

h4 {
  font-family:      Tahoma, Verdana, "Myriad Web", Syntax, sans-serif;
  font-size:        1.3em;
  font-weight:      500;
  font-style:       normal;
  text-decoration:  none;
}

h5, dt {
  font-family:      Tahoma, Verdana, "Myriad Web", Syntax, sans-serif;
  font-size:        1.1em;
  font-weight:      500;
  font-style:       normal;
  text-decoration:  none;
  margin-top:       1em;
}

h6 {
  font-family:      Tahoma, Verdana, "Myriad Web", Syntax, sans-serif;
  font-size:        1em;
  font-weight:      500;
  font-style:       normal;
  text-decoration:  none;
}

pre {
  padding:          5px;
  background:       #f2f2f2;
  border:           1px solid black;
}
    </style>
  </head>

  <body>
    <h4>$title</h4>
PageHeader

    1;
}

#---------------------------------------------------------------------
# Print HTML footer.

sub prfooter {
    print <<"PageFooter";
    <br>
  </body>
</html>
PageFooter

    1;
}

#---------------------------------------------------------------------
# Print internal anchor.

sub pranchor {
    my ($name) = @_;
    die "no anchor name given" unless length($name);
    print "    <a name=\"$name\"></a>\n";
    1;
}

#---------------------------------------------------------------------
# Print pointers to other pages.

sub otherpages {
    print << "OtherPages";
    <table width="100%">
      <tr>
        <td>
          <table class="search">
            <tr align="left">
              <td> Log entries |
              <a href="$kwic">KWIC index</a> |
              <a href="$other">Other Logs</a>
              </td>

              <td align="right">
                <form action="/cgi-bin/search.cgi" method="GET">
                  <div class="alignRight">
                  <input type="text" name="keywords" value=""
                    size="10" maxlength="50">
                  <input type="submit" value="Search" name="keywords">
                  </div>
                </form>
              </td>
            </tr>
          </table>
        </td>
      </tr>
    </table>

OtherPages

    1;
}

#---------------------------------------------------------------------
# Start content.

sub cstart {
    print <<"StartContent";
    <!-- begin page content -->
    <table cellpadding="5" cellspacing="5" border="0" width="100%">
StartContent

    1;
}

#---------------------------------------------------------------------
# End content.

sub cend {
    print <<"EndContent";
    </table>
    <!-- end page content -->

EndContent

    1;
}

#---------------------------------------------------------------------
# Left field is the timestamp plus entry pointers.
# The first field written has the "previous" entry pointing to
# the top, and the last one has the "next" entry pointing to
# the bottom.

sub prtime {
    my ($day, $dom, $mon, $yr, $hms, $en, $fullname) = @_;

    # $kn, $kp = next and previous numbers.
    # $sn, $sp, $sc = next, previous, and current name strings.

    my $fmt = "entry_%d";
    my $kn  = $en + 1;
    my $kp  = $en - 1;
    my ($sn, $sp, $sc);

    if ($kp <= 0) {
        $sp = 'top';
    }
    else {
        $sp = sprintf($fmt, $kp);
    }

    $sn = sprintf($fmt, $kn);
    $sc = sprintf($fmt, $en);
    $hms = substr($hms, 0, 5);

    print <<"EndLeft";

    <tr>
     <td class="sidebar">
     <a name="$sc"></a>
     $day $dom $mon $yr $hms<br>
     $fullname
     <br><br><em>
     <a href="#top">Top</a>&nbsp;
     <a href="#$sn">Next</a>&nbsp;
     <a href="#$sp">Prev</a>&nbsp;
     <a href="#bottom">End</a>
     </em>
     </td>
EndLeft

    1;
}

#---------------------------------------------------------------------
# Right field is the log entry.

sub startentry {
    print <<"StartRight";
     <td valign="top" width="70%">
StartRight

    1;
}

sub endentry {
    print <<"EndRight";
     </td>
    </tr>
EndRight

    1;
}

# -------------------------------------------------------------------------
# Read file in chunks.

sub getchunk {
    my $fh     = shift;
    my %result = ();

    # No more records.
    return (%result) if eof($fh);

    # Skip leading blank lines.
    my $pat = '^\s*$';
    while (<$fh>) {
        last unless /$pat/;
    }

    # We've found a complete timestamp, or at least part of the
    # associated comment.  A chunk can be either a paragraph
    # from the comment, or a complete piece of preformatted text.

    if (/^BEGIN/) {
        $result{'type'} = 'begin';
        chomp;
        s/BEGINNING OF LOG FOR//;
        s/==*$//;
        $result{'str'} = $_;
        $pat = '^\s*$';
    }

    elsif (/^\s+--+S$/) {
        $result{'type'} = 'pre';
        $result{'str'}  = '';            # don't keep this string.
        $pat            = '^\s+--+E$';
    }

    elsif (/^\s/) {
        $result{'type'} = 'entry';
        $result{'str'}  = $_;
        $pat            = '^\s*$';
    }

    else {
        $result{'type'} = 'time';
        $result{'str'}  = $_;
        $pat            = '^\s*$';
    }

    # Store strings until terminating pattern found.
    # Special strings like "root#", "you%",  and "me%" indicate
    # commands where the entire line should be in bold.
    # Preformatted output can drop some leading spaces.

    my $emailhdr = 0;        # Are we in a mail header?

    while (<$fh>) {
        last if /$pat/;

        if (length() > $maxpre) {
            warn "line $. too long:\n";
            warn "  [$_]\n";
        }

        $_ = prep($_);

        # Emphasize typed commands.
        if (/root#|you\%|me\%|me\$|^\s*\$ /) {
            chomp;
            s!^!<strong>!;
            s!$!</strong>\n!;
        }

        # Emphasize email headers.
        if (/^\s*(date|message-id|from|sent|to|subject|reply-to|cc|org.*):/i) {
            $emailhdr = 1;
        }
        if ($_ eq "\n") {
            $emailhdr = 0;
        }
        if ($emailhdr) {
            chomp;
            s!^!<em>!;
            s!$!</em>\n!;
        }

        $result{'str'} .= $_;
    }

    return (%result);
}

# -------------------------------------------------------------------------
# Prepare string by fixing special characters.

sub prep {
    local ($_) = shift;

    # Tabs?
    1 while s/\t+/' ' x (length($&) * 8 - length($`) % 8)/e;

    # Leading spaces?
    s/^\s\s\s\s//;

    # Special characters?
    s!\&!\&amp;!g;
    s!<!\&lt;!g;
    s!>!\&gt;!g;

    # URLs?
    s#(http://\S+)#<a href="$1">$1</a>#g;
    s#(https://\S+)#<a href="$1">$1</a>#g;
    s#(ftp://\S+)#<a href="$1">$1</a>#g;

    return $_;
}

#---------------------------------------------------------------------
# To generate manpage:
#   pod2man -c 'User docs' -r "`date`" -d version site2html

__END__

=head1 NAME

site2html - Read LOG-style file in chunks, write HTML output

=head1 SYNOPSIS

site2html [-hmuvw] [-s start] [-e end] [-c max] [-p pfx] [-t title] file

=head1 OPTIONS

=over 4

=item B<-c> max

Maximum number of lines in preformatted output before
writing a "continued here..." link; default is unlimited.

=item B<-h>

Print a brief help message and exit.

=item B<-k> kwic

Page pointing to KWIC index.  Defaults to "kwic.htm".

=item B<-m>

Print the manual page and exit.

=item B<-o> otherlogs

Page pointing to other sitelogs.  Defaults to "../index.htm".

=item B<-p> pfx

Write preformatted data to files starting with "pfx".

=item B<-s> start

Date at which to start output; default is first entry.

=item B<-e> end

Date at which to end output; default is last entry.

=item B<-t> title

Sets the HTML title.

=item B<-u>

Print the script UUID and exit.

=item B<-v>

Print the version and exit.

=item B<-w>

Print the source location and exit.

=back

=head1 DESCRIPTION

B<site2html> will read a LOG file and print it in HTML with some decent
navigation links and highlighting.

=head1 AUTHOR

 Karl Vogel <vogelke+software@pobox.com>
 Sumaria Systems, Inc.