#!/usr/bin/perl -w
#
# $Revision: 1.7 $ $Date: 2010-09-09 14:25:47-04 $
# $Source: /src/www/sitemap/mkgsm/RCS/mkgsm,v $
# $UUID: 08e1df01-3689-3fe7-88f9-b1392e381f11 $
#
#<mkgsm: create a Google-style sitemap
# usage: mkgsm [website [sitepath [doclist]]]
# where "sitepath" is where your served files live, and "doclist" is
# generated by something like
#       cd $sitepath; find . -type f -name '*.htm'
#
# Based on:
#   http://groups.google.com/group/google-sitemaps/msg/56ba5f933c7bdb70
#   From: pcunix@gmail.com
#   Date: Sat, 04 Jun 2005 18:08:46 -0000
#   Subject: Perl site map generator (Unix)

use strict;

# Main script: read a list of document paths (relative to $sitepath, one
# per line, as produced by "find . -type f ..."), write an XML sitemap
# named $mapfile into $sitepath, then replace this process with gzip to
# compress it.

my $mapfile = "sitemap";           # XML file.
my $idxfile = "index.htm";         # Default server indexfile.

my $website  = shift(@ARGV) || 'http://' . fqdn();  # Website.
my $sitepath = shift(@ARGV) || '/var/www/htdocs';   # where files reside.
my $doclist  = shift(@ARGV) || "-";                 # Document list.

-d "$sitepath" or die "$sitepath: not a directory\n";
chdir("$sitepath") or die "$sitepath: cannot cd: $!\n";

# Open the document list.  Note that we have already changed directory,
# so a relative $doclist is resolved against $sitepath.  Three-argument
# open keeps odd characters in the filename from being read as an open
# mode; it does not treat "-" specially, so map that to STDIN by hand.
my $ifh;
if ($doclist eq '-') {
    $ifh = \*STDIN;
}
else {
    open($ifh, '<', $doclist) or die "$doclist: $!\n";
}
open(my $ofh, '>', $mapfile) or die "$mapfile: $!\n";

# The 0.9 schema lives in the sitemaps.org namespace.
print $ofh <<EndHeader;
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
EndHeader

# Generate an entry for each file.

# Match the index filename literally (its "." must not act as a regex
# wildcard).  $in_subdir_re additionally requires a "/" right before it.
my $index_re     = qr/\Q$idxfile\E/;
my $in_subdir_re = qr/\/\Q$idxfile\E/;

while (my $path = <$ifh>) {
    chomp $path;

    # Reject any path containing characters outside a conservative safe
    # set; this also guarantees nothing in the path needs XML escaping.
    (my $clean = $path) =~ tr!-_./a-zA-Z0-9!!cd;
    if ($clean ne $path) {
        warn "bad path: [$path]";
        next;
    }

    # Drop the "./" that find(1) prepends; leave other paths untouched.
    $path =~ s{^\./}{};

    my $mtime = (stat $path)[9];
    die "$path: stat failed: $!\n" unless defined($mtime);

    my ($sec, $min, $hour, $mday, $mon, $year) = (gmtime($mtime))[0 .. 5];
    die "$path: gmtime failed\n" unless defined($sec);

    # W3C Datetime in UTC.  The zone designator must be "Z" or "+hh:mm";
    # a bare "+0000" is not valid.
    my $mod = sprintf(
        "%04d-%02d-%02dT%02d:%02d:%02dZ",
        $year + 1900,
        $mon + 1, $mday, $hour, $min, $sec
        );

    # Index pages are assumed to change more often and rank higher than
    # ordinary documents; an index preceded by "/" ranks highest.
    my $freq     = "monthly";
    my $priority = "0.5";
    if ($path =~ $index_re) {
        $freq     = "daily";
        $priority = ($path =~ $in_subdir_re) ? "0.9" : "0.7";
    }

    print $ofh <<EndEntry;
  <url>
    <loc>$website/$path</loc>
    <lastmod>$mod</lastmod>
    <changefreq>$freq</changefreq>
    <priority>$priority</priority>
  </url>
EndEntry
}

print $ofh "</urlset>\n";

# Buffered write errors (e.g. disk full) only surface at close time.
close($ofh) or die "$mapfile: close failed: $!\n";
close($ifh) unless $doclist eq '-';

# Gzip the results.

# Remove any previous archive first (best effort; it may not exist).
unlink("$mapfile.gz");
my @args = ("gzip", "$mapfile");
exec {$args[0]} @args;    # safe even with one-arg list
die "exec $args[0] failed: $!\n";

#---------------------------------------------------------------------
# Return the fully-qualified hostname: the first name containing a dot
# among the host's canonical name and aliases, else the name as given.

sub fqdn {
    # Usage: fqdn([$host])
    # Looks up $host (default: this machine's hostname) and returns the
    # first of its canonical name / aliases that contains a dot.  If the
    # lookup fails or no dotted name is found, returns $host unchanged.
    use Sys::Hostname;
    my $h = shift || hostname();

    # gethostbyname returns an empty list on failure, so both values may
    # be undefined; collect only what we actually got.
    my ($name, $aliases) = gethostbyname($h);
    my @names;
    push @names, $name if defined $name;
    push @names, split(' ', $aliases) if defined $aliases;

    # Use a lexical loop variable so the caller's $_ is left alone.
    foreach my $candidate (@names) {
        return $candidate if $candidate =~ /\./;
    }
    return ($h);
}
