#!/usr/bin/perl
#
# @(#) Perl -- Simple text2html converter. Uses Technical text format (TF)
# @(#) $Id: t2html.pls,v 1.55 1998/04/08 16:26:36 jaalto Exp $
#
# {{{ Documentation
#
# File id
#
# .Copyright (C) 1996-98 Jari Aalto
# .Created: 1996-11
# .$Contactid: $
# .$URL: ftp://cs.uta.fi/pub/ssjaaa/ssjaaa.html $
# .$Keywords: Perl txt html conversion $
# .$PerlVer: 5.001 $
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of
# the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation,
# Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# Perl versions supported
#
# This program should work with any version of perl 5, but the recommended
# version is 5.004 or newer. You also need the LWP modules if you're
# going to use the 'link check' option. This program was originally
# written for Perl4, so don't wonder why there are local() definitions.
#
# This program was originally written for perl4, so excuse some
# oldish perl keyword usage, like `local'.
#
# About program layout
#
# The {{{ }}} marks you see in this file are part of the file "fold"
# control package called folding.el (Unix Emacs lisp package).
# See ftp://ftp.csd.uu.se/pub/users/andersl/beta/ to get the latest.
#
# There is also lines that look like # ....... &tag ... and they
# are generated by Emacs Lisp package tinybm.el, which is also
# document structure tool. You can jump between the blocks with
# Ctrl-up and Ctrl-down keys and create those "bookmarks" with
# Emacs M-x tibm-insert. See mentioned URL cs.uta.fi.
#
# Funny identifiers at the top of file
#
# The GNU RCS ident(1) program can print useful information out
# of all variables that are in format $ IDENTIFIER: text $
# See also Unix man pages for command what(1) which outputs all lines
# matching @( # ). Try commands:
#
# % what PRGNAME
# % ident PRGNAME
#
# INTRODUCTION
#
# Please start this perl script with options
#
# -help to get the help page
# -help-sh To print the sample shell script to steer
# this perl file (front-end)
#
# WWW CONTACT SITE
#
# See http://www.netforward.com/poboxes/?jari.aalto and navigate
# to html pages in the site to get more information about me
# and my tools (Emacs, Perl, procmail mostly)
#
# DESCRIPTION
#
# This perl program converts text files that are written in rigid
# (T)echnical layout (f)ormat (which is explained when you run -help)
# to html pages very easily and effectively.
#
# If you plan to make any text files available in HTML format you will
# find this program very useful. If you want to have fancy
# graphics or more personal page layout, then this program is not for
# you.
#
# I have also made package that helps you to write and format text
# files to Technical format. Please see following Emacs package at
# the previously mentioned URL site.
#
# tinytfo.el
#
# End of documentation
#
# Change Log:
# .......................................................... &RcsLog ...
#
# $Log: t2html.pls,v $
# Revision 1.55 1998/04/08 16:26:36 jaalto
# - commented out
# code adding; because it seemed to
# create double empty line. Netscape display bug?
#
# Revision 1.54 1998/03/26 10:41:27 jaalto
# - small changes
#
# Revision 1.53 1998/03/23 08:10:42 jaalto
# - There was sometimes missing
# code between
# paragraphs. Fixed.
#
# Revision 1.52 1998/03/15 09:35:39 jaalto
# - The ** and __ were not extrapolated inside bullets.
# This was due to previous version's change: no expasion
# beyond column 12. Now checks !$AsIs, which fixes the problem.
#
# Revision 1.51 1998/03/04 11:49:10 jaalto
# - Special marking is now disabled after column 12.
# Eg perl end of script marker __END__ is now correct.
#
# Revision 1.50 1998/02/23 08:25:55 jaalto
# - Fixed the email and url A-HREF handling. There was an error
# when deciding which url was clickable and which not.
#
# Revision 1.49 1998/02/10 15:02:13 jaalto
# - turning option -lchk mistakenly removed TOC. The reported line numbers
# didn't match the original text any more. Fixed.
#
# Revision 1.48 1998/02/06 09:34:08 jaalto
# - Switch -md didn't work. Fixed.
# - Bug in url handling fixed. Ate text after url.
# - LINK tag is not handled okay by any other than Lynx.
#
# Revision 1.47 1997/10/16 19:22:47 jaalto
# - Turned off The LINK yags, because Netscape couldn't handle them
# and with lynx you would see double prev-top-next buttons.
#
# Revision 1.46 1997/10/13 23:08:59 jaalto
# - Documentations changes only
#
# Revision 1.45 1997/10/13 19:06:38 jaalto
# - The tar|gz link check adjusted
# - Mysteriously added newlines to the beg of html doc: fixed.
# - Killing invalid characters from header NAME tag ref is now better.
# - KillToc is now always called.
#
# Revision 1.44 1997/10/11 14:40:01 jaalto
# - Changed UpdateHeaderArray. Now deletes unnecessary characters
# before constructing NAME reference. Better word recognition
# - Corrected error in "URL and MAIL made clickable". The words after
# last url vanished. Now ok.
#
# Revision 1.43 1997/10/06 18:03:26 jaalto
# - Added link Check feature
#
# Revision 1.42 1997/10/05 19:24:52 jaalto
# *** empty log message ***
#
# Revision 1.41 1997/10/03 10:31:41 jaalto
# - Corrected the (-) control code fro -http:// references.
# - Added the hr tag before each heading 0.
# - added more email suppression rules, like < noClick@this.i>. There must
# be no surrounding whitespace.
# - Common words used in URL now automatically skips the url:
# foo,bar,quu
#
# Revision 1.40 1997/09/25 21:14:33 jaalto
# - added option -simple and -quiet. Changed ## to !!
#
# Revision 1.39 1997/09/19 13:32:07 jaalto
# - Added GNU Gen. Pub. licence
#
# Revision 1.38 1997/09/16 20:04:40 jaalto
# - Small text changes
#
# Revision 1.37 1997/09/16 15:45:39 jaalto
# - Only documentation changes
#
# Revision 1.36 1997/08/28 06:56:12 jaalto
# - Added -lchk option for Perl5; but it isnot yet functional.
#
# Revision 1.35 1997/08/14 22:06:06 jaalto
# - New feature: if there is heading "Table of contents" it is automatically
# filtered out. Program itself creates TOC; so having file's toc
# make's no sense. The Program's toc items are clickable.
#
# Revision 1.34 1997/08/14 17:33:03 jaalto
# - only Help() function text adjusted.
#
# Revision 1.33 1997/08/12 14:43:20 jaalto
# - Now it can read Heading that have plus(+) character; like in
# remailer options +signsend/-signsend
#
# Revision 1.32 1997/08/12 14:29:11 jaalto
# - Now program skips these invalid email addresses and does not make
# them clickable.
# References: <5dfqlm$m50@basement.replay.com>
# Message-ID: <5dko56$1lv$1@news02.deltanet.com>
#
# Revision 1.31 1997/08/12 10:50:02 jaalto
# - Corrected "mailto" reference at the end of html doc; where Contact
# address were placed.
#
# Revision 1.30 1997/06/08 19:50:13 jaalto
# - Mailto: if there was the rest of the line after email was
# deleted. Bug, corrected
# - Very serious error corrected: if there were multiple links
# in the same line, only the last line was found. I used the
# .* operator which missed the links before text. Corrected.
#
# Revision 1.29 1997/05/29 19:46:09 jaalto
# - Added initialize function, CONTACT and URL; Reformatted beginning
# comments in the file
# - Corrected mysterious "duplication of words" after http or ftp link.
# - Added optio -v to print version number.
#
# Revision 1.28 1997/04/01 08:54:48 jaalto
# - Small fixes. The 1.1 header name eating didn}t work right. It ate
# any first number. now corrected.
# - Added new option -e, where you can specify EMAIL address of the
# document. Environmental variable can now be overruled.
#
# Revision 1.27 1997/03/28 23:20:04 jaalto
# - Added new option -name-nbr. Also now handles numbered heading so that
# the NAME reference doesn't contain the number unless this new option
# is used.
# - Renamed option -uniq to -name-uniq
# - Added support for Emacs style American english quotes `'. In emacs
# this refer to used symbols, like variables or functions.
# - Fixed header reading, now included ['] charcters too.
#
# Revision 1.26 1997/03/25 11:28:39 jaalto
# - Added -help-sh switch and included the t2h csh script to the end
# of perl file.
# - Wrote introduction + reformatted the comments at top of file
# - Corrected '-' suppression TAG in front of email and http
# references. Previsouly those refs were still highlighted, while
# they should have been unclickable.
#
# Revision 1.25 1997/02/18 11:55:59 jaalto
# - If there was multiple email addresses on theline; only last one
# was marked. Added while loop to handle every email in the line.
# - Correctd *_= markup handling. The line looped 3 time ONLY
# if it has markup.
# - Now requires 2 lines of input instead of 5. Otherwise program dies.
#
# Revision 1.24 1997/02/15 22:33:24 jaalto
# - missing { cause fatal perl exec error
#
# Revision 1.23 1997/02/15 22:08:21 jaalto
# - Corrected nested *=_ markup handling.
# - corrected #REF handling. Doesn't require spaces any more.
#
# Revision 1.22 1997/02/14 19:12:24 jaalto
# - small text change
#
# Revision 1.21 1997/02/03 16:20:14 jaalto
# - small changes, the '==' mistakenly got replaced with ,
# now
# - The 'Document author' line now has code.
#
# Revision 1.20 1997/02/03 07:40:48 jaalto
# - Added '==' sample code marking for individual words
#
# Revision 1.19 1997/02/02 13:59:45 jaalto
# - Added -uniq option to handle non-resolvable NAME references
# that are constructed from header names. Now also dies
# if not all NAME refs are unique and suggests turning on
# -uniq option.
#
# Revision 1.18 1996/12/17 18:06:48 jaalto
# - bullet P-code continuing corrected.
#
# Revision 1.17 1996/12/12 17:20:07 jaalto
# - The endind ADDRESS statement was not separated by HR code.
# Now corrected.
#
# Revision 1.16 1996/12/12 09:14:39 jaalto
# - Fixed couple of small bugs, Eg. " code handling at column 7.
#
# Revision 1.15 1996/12/02 14:46:37 jaalto
# - Changed $RCS_ID's double quotes to single quotes to prevent problems
# in Perl 5 (interpolation of $)
# - Added die to the main if file couldn't be opened.
#
# Revision 1.14 1996/12/02 14:22:44 jaalto
# - Added expand tabs function
#
# Revision 1.13 1996/11/24 15:47:42 jaalto
# - Added new #REF command to reference inside document
# easily.
#
# Revision 1.12 1996/11/23 23:16:11 jaalto
# - The whole structure of program has been reorganized into categories.
# - The code changed so that no function prints nothing, but returns
# array of html strings.
# - Added options -del -delfld -o and added internal -db debugging
# - Started writing and designing #SPLIT to several html pages.
# This feature is not yet implemented.
# - Clarified the usage function better.
#
# Revision 1.11 1996/11/19 22:24:02 jaalto
# - Added LINK tags, Rewrote Button code.
# - Now there is P code for continuing the bullet in next paragraphs.
# - added: IsEmptyText, makeLinkHtml
# - Now doesn't use FONT SIZE command any more, but SMALL which
# is suggested by standard.
#
# Revision 1.10 1996/11/18 13:47:04 jaalto
# - The index structure violated SGML, it printed
# text. Corrected.
# - The #URL-BASE was not expanded at bullet start line (o). Now fixed.
#
# Revision 1.9 1996/11/18 09:16:41 jaalto
# - Added "-" switch to inhibit http or mail address to be made
# clickable. This is usefull in examples and in references that are not
# real or which shouldn't be used.
# - Some Perl 5 errors, like escaping \@ corrected.
# - New function XlatHtml which can take care of some common special
# characters and turn them into html codes (like > )
# - EM ** and STRONG __ words now require leading space.
#
# Revision 1.8 1996/11/17 11:16:09 jaalto
# - Added numbered list support vit "." bullet
# - Added META tags for search engined to the start of html page,
# switches: -mk -md
# - Added -pref tag to print out built NAME references
# - The html creation date is now in ISO format and not just `date`
#
# Revision 1.7 1996/11/16 20:30:19 jaalto
# - Added "," code for _not_ inserting P line is above line is empty
# - Corrected minor bugs and clarified the -help functin to show
# all text manipulation.
#
# Revision 1.6 1996/11/15 23:00:40 jaalto
# - Still small bugs, eg it doubled the
# lines. now the P is not
# added if there is PRE defined, because PRE does already P.
# - Some more minor fixes.
#
# Revision 1.5 1996/11/15 21:05:42 jaalto
# - This program has changed totally. You don't even want to see version
# 1.3... The previous one shoved text in plain PRE, but this really can
# format it into html.
# - Bulletins are also now supported and the help page should document
# all trick to write tight standard .txt file where you can generate html
#
# Revision 1.4 1996/11/14 15:40:12 jaalto
# - fixed url parsing: it href'd the surrounding tabs
# - added BASE and BUT command line args
#
# Revision 1.3 1996/11/13 23:36:55 jaalto
# - Minor corrections
#
# Revision 1.2 1996/11/13 23:32:58 jaalto
# - Perl 5 corrections
#
# Revision 1.1 1996/11/13 23:25:43 jaalto
# Initial revision
# }}}
# {{{ Initial setup
# ----------------------------------------------------------- &setup ---
sub Initialize
{
    # DESCRIPTION
    #
    #   Set up the program-wide globals: extra library path, program
    #   name, RCS id and the version parsed from it, contact and URL
    #   strings, and an empty @HEADERS list. Also turns on autoflush
    #   for the currently selected output handle.
    #
    # INPUT
    #
    #   none; reads $ENV{MYPERLLIB} and $ENV{USER}
    #
    # RETURN
    #
    #   nothing useful

    # Optional extra library directories, whitespace separated
    #
    if ( defined $ENV{'MYPERLLIB'} )
    {
        unshift( @INC, split(' ', $ENV{'MYPERLLIB'}) );
    }

    # My private library, not needed unless debugging
    #
    if ( $ENV{'USER'} eq "jaalto" )
    {
        require 'libmisc.pl';
    }

    $lib = $prgname = "t2html.pls";
    $|   = 1;

    # The version number is the third whitespace separated word of the id
    #
    $RCS_ID  = '$Id: t2html.pls,v 1.55 1998/04/08 16:26:36 jaalto Exp $';
    my @idWords = split(' ', $RCS_ID);
    $VERSION = $idWords[2];

    if ($RCS_ID && $VERSION) {}         # perl -w silencer, No-op

    ( $CONTACT, $URL ) = ( "", "ftp://cs.uta.fi/pub/ssjaaa/ssjaaa.html" );

    if ($CONTACT && $URL) {}            # perl -w silencer, No-op

    @HEADERS = ();
}
# }}}
# {{{ usage/help
# ----------------------------------------------------------- &usage ---
sub usage
{
    # DESCRIPTION
    #
    #   Print the help text to STDERR, followed by the optional reason
    #   message, and terminate the program with exit status 1.
    #
    # INPUT
    #
    #   $msg    optional string, why the function was called
    #
    # RETURN
    #
    #   does not return

    local( $msg ) = @_; # reason why are we here...
    # We must print to stderr because program is usually used
    # as pipe 'PRG txt.file > out.html' and the error would not
    # be seen otherwise.
    #
    # NOTE(review): the here-document opener on the print line below
    # looks truncated in this copy of the file; confirm against the
    # original source before relying on it.
    #
    PRINT: {
print STDERR <
Do not make this email address clickable bar\@site.com,
because it is only an example and not a real address. Notice that
it was not surrounded by <>. Common login names like foo, bar, quux
are also ignored automatically.
Also do not make < this\@site.com> because there is extra white
spaces. This may be more convenient way to disable email for
mouse click.
Heading level 1 again at colum 0
Subheading, colum 4
And regular text, column 8
txt txt txt txt txt txt txt txt txt txt txt txt
txt txt txt txt txt txt txt txt txt txt txt txt
txt txt txt txt txt txt txt txt txt txt txt txt
--//-- decription end
That's it, there is the whole layout described above.
More formally the the rules of text formatting are secribed below-
About headings
o There are only _two_ heading levels in this style.
Heading columns are 0 and 4 and the heading must start with
big letter or number
o In column 4, if the text starts with small letter, that line
is interpreted as
o The heading level 1 uses code to mark big sections.
o The headings are gathered and the index jump block is built. The
NAME reference consists of first 4 sequential words from the
heading name. Make sure your heading are uniquely named,
otherwise there will be same NAME references in the generated
html. Spaces are converted into underscore when joining the
words.
About text placement
General
o Text at column 0 is undefined if it doesn't start with
big letter or number to indicate Heading level 1.
o Text between colums 1-3 is marked with
o Column 4 is reserved for heading level 2
o Text between colums 5-7 is marked with
o Text in column 7 is if the first character is double quote.
o Column 10 is reserved for text. If yuu want to quote
someone's words or reference text, place the text in this column.
o Text in colums 9,11 are marked with
Column 8 for text and special codes
o Column 8 is reserved for normal text
Column 12 is special
o Column 12 is treated specially: block is started with
and
lines are marked as . Wwhen the last text at _column_
12 is found, the block is closed with
Note follwing example
txt txt txt ;evenly started block, fine, do it like this
txt txt
txt txt txt txt ;No! can't terminate the /pre, because last
txt txt txt txt ;column is not at 12
txt txt txt txt
Other text markings
o If there is dot-code, '.', and immediately non-whitespace, then
code is added to the end of line.
.This text contains BR code
While these two line are joined together
in normal html document.
o If there is ',' then the
code is not inserted if
the previous line is empty. If you use both '.' and ',' they must
be in order '.,'
The P-comma-code works differently if it is used in bullet
o Special text markings:
_this_ is intepreted as this
*this* is intepreted as this
=this= is intepreted as this
`this' is intepreted as this
Special #-commands
o #REF command is used for refering to NAME inside current document.
The whole command must be placed on one single line, you can't
break the line.
Example:
#REF how_to_profile ;(Note: profiling);
(1) (2)
1. The NAME reference in current document, a single word.
This can also be full http url link.
You can get NAME list by enabling -pref option.
2. The clickable text delimited by ; characters.
o #URL-BASE is substituted with the command line -base URL
reference: only directory part is used from the -base.
It allows you to refer to documents local to current site.
base = http://this.com/dir1/dir2/text.html
#URL-BASE/next.html
-->
http://this.com/dir1/dir2/next.html
o A !! in text column adds one HR code. Any text after tag
in the same line is written with STROM EM and inserted just
after hr code. Therefore the text word commands have no
effect, unless you use CODE markup.
Http and email highlighting control
o All http and ftp references as well as email
addeses are marked clickable. Email must have surrounding <>
characters to be recognized.
o If you don't want to make some reference clickable, preceed it
o If url contains character $, it is not made clickable
is clickable
< me\@here.com> is not clickable
me\@here.com is not clickable
<5dko56\$1\@news02.deltanet.com> is Message id, not clickable
http://this.com is clickable
http://foo.com is not clickable
-http://this.com is not clickable
http://\$EXAMPLE is not clickable
Lists and bullets
o The bulletin table is contructed if there is 'o' at
column 8 and 3 spaces after it, so that text starts at column 12.
Bulleted lined must be kept together, no spaces between bullet
blocks.
o The ordered list is started with '.' and contructed like bullet.
About line breaks
o All line breaks are visible in your document, do not use more
tan one line break to separate text from ewach other.
o Very important is that there is only _one_ line break
after headers 1 and 2
SPECIAL NOTE
If there is heaading 1, whose name is "Table of contents", then that
heading and all text up to next heading are discarded from the
generated html file. This is done because program itself generates
the TOC and jump blocks and NAME references.
WHEN DOCUMENT DOESN'T FORMAT RIGHT
The second most common error is wrong spacing. Remember, Keeep
_one_ empty line between headers and text, between body text and
bullet. Check those first.
Third common error is that you have put text on wrong column.
Remember that text column is at 8.
Also, _check_ that the block ends evenly, expecially when it's inside
columns 1-7 or 12.
Headings start with _big_ letter.
USAGE
$prgname [opt] textfile
-a 'M. Foo' Author of document
-del REGEXP Delete lines matching perl regexp.
Eg. if you use Unix/Emacs folding.el you can
put text into folders # {{{ and # }}}. You
don't want to show those marks in the generated
html document.
-delfld This is special option for deleting Emacs pacakge
folding.el's marks as described in the beginning
of the perl file. All lines that have {{{ or }}}
are discarded.
-doc file.txt Document file name, the original text file.
-e EMAIL The contact address eg. foo\@site.com
-butt URL Button to go to top level document
if URL is 'none', then no button is not inserted
-butp URL Button to go to previous document, or 'none'
-butn URL Button to go to next document, or 'none'
-base URL Url location of the html file in destination site
where the html will be put. IT IS VERY IMPORTANT
THAT YOU SPECIFY THIS. ALL #TAG REFERENCES WITHOUT
QUALIFIER URL REFER TO THE URL WHERE base points.
-lchk Activate 'link check'. All http and ftp links
are checked for validity. Problematic links
are outputted to stderr. This is available only in
if you have the Perl5 LWP web library.
Also turn on -quiet, if you are only interested
in failed links.
Links that are big, eg. tar.gz or that run
programs(has ? character) are ignored because
the GET request used in checking would return
whole link content.
-lchkt Only usefull if -lchk is selected. This concatenates
the url request test to single line, so that you
can view the URL: ERROR_CODE ERROR_TEXT in one
line. (Usefull in Emacs compile buffer and truncate
mode on)
-mk 'A B C' Meta keywords. Include keywords AA BB CC here
This is used by search engines. Separate
kwywords AA BB CC with spaces and do not
use comma anywhere.
-md STR Meta Description. Include description string,
max 1000 chars. This is used by search engines.
-name-uniq When the NAME references are collected, first 1-5
first words are picked from the header.
However, it is possible that you have two
headers that use exactly the same words in the
beginning. In those cases you have to turn on
this option. It will use running counter
00 - 99.. instead of words from headers to
construct NAME reference.
Please use this option only in emergencies, because
referring to NAME via this#header_name is
more convenient than using reference this#11, where
the number may change in the next run.
Make sure that the headers don't have same subjects
and you don't need this option at all.
-name-nbr When constructing NAME refrences, the header numbering
is tossed away by default so that heading's
"1.0 Intro" NAME reference is "#Intro".
This is derirable, because the numbering may change
but the heading text usually remains the same.
Uf you really want to have NAME reference like
"#1.1_Intro" turn this option on. NOT RECOMMENDED.
-pref Print references (contructed from header names) that
build up the local NAME jump points in the
document. The list is printed in stderr.
This way you can do
% $prgname tmp.txt > file.html
and the reference names printed don't go to
html file.
-ref URL Refer-to Url location of file in destination site
where the document will be put.
-simple Print minimum footer. Only Contact and date.
-quiet Print no footer at all. This option has different
meaning if -lchk option is turned on.
-t STR The title text that appears in Browser's top frame.
-v Print program's version information.
-db NBR Debug with LEVEL. Not supported in distributed
version. Can only be run by maintainer.
-h(elp)|-usage Print this help screen
-help-sh Print help for sample shell script.
EOF
    }
    # The reason goes to STDERR too; on STDOUT it would end up inside
    # the redirected html file and the user would never see it.
    #
    print STDERR " >> $msg" if defined $msg; # why function was called
    exit 1;
}
# }}}
# {{{ URL Link
# ............................................................ &link ...
#----------------------------------------------------------------------
sub StudyLinkExternal
{
    # DESCRIPTION
    #
    #   Check if link is valid by sending a GET request to it.
    #   Nothing is checked unless global $P5 is true.
    #
    # INPUT
    #
    #   $url                URL to check
    #   *LINK_HASH          glob of hash, updated with 'url' -- 'response code'
    #                       for every failed url
    #   *LINK_HASH_CODE     glob of hash, updated with 'response code' --
    #                       'response message' the first time a code is seen
    #
    # RETURN
    #
    #   ($ret, $txt)        $ret is 0 on success (or when nothing was
    #                       checked) and 1 on failure; $txt is the
    #                       response message of a failed request,
    #                       otherwise undefined.
    #
    local( $f ) = "$lib.StudyLinkExternal";
    local( $url , *LINK_HASH , *LINK_HASH_CODE) = @_;
    local( $ret , $txt ) = 0;

    if ( $P5 )
    {
        # Because this is perl 5 lib thing, we can't do this unless
        # the WWW lib is present. The modules are loaded at run time:
        # a 'use' statement here would be executed at compile time
        # regardless of $P5 and make the whole program depend on LWP.
        #
        require LWP::UserAgent;
        require HTTP::Request;

        my $ua      = LWP::UserAgent->new;
        my $request = HTTP::Request->new('GET', $url);
        my $obj     = $ua->request($request);

        # check the outcome
        #
        if ( ! $obj->is_success )
        {
            $ret = 1;

            # Always hand the message back to the caller, also when
            # the same error code has been seen before.
            #
            $txt = $obj->message;

            $LINK_HASH{ $url } = $obj->code;

            # There is new error code, record it.
            #
            if ( ! defined $LINK_HASH_CODE{ $obj->code } )
            {
                $LINK_HASH_CODE{ $obj->code } = $txt;
            }
        }
    }

    ($ret , $txt);
}
#----------------------------------------------------------------------
sub html2txt
{
    # DESCRIPTION
    #
    #   Convert html lines into plain text by stripping everything
    #   between < and >. A tag left open at the end of a line is carried
    #   over: following lines are dropped until the closing > is seen.
    #   Each surviving line is passed through XlatText.
    #   Written 4/21/96 by Michael Smith for WebGlimpse
    #
    # INPUT
    #
    #   *input      glob of array holding the text from the file
    #
    # RETURN
    #
    #   @           the stripped lines
    #
    local( $f )     = "$lib.html2txt";
    local( *input ) = @_;
    local( $carry, $line, @ret, $_, $comment);

    ( $carry, $comment ) = ( 0, 0 );

    foreach ( @input )
    {
        $line = $_;                 # work on a copy, @input stays intact

        if (0) # enable comment stripping
        {
            $comment = 1 if //;
            $comment = 0 if /--->/;
            next if $comment;
        }

        if ( $carry == 1 )
        {
            # Still inside a tag opened on an earlier line: drop the
            # whole line unless it contains the closing >
            #
            unless ( $line =~ s/[^>]*>// )
            {
                next;
            }

            $carry = 0;
        }

        # Complete tags on this line
        #
        1 while $line =~ s/<[^>]*>//g;

        # A tag that opens here but is closed on some later line
        #
        $carry = 1  if  $line =~ s/<.*$//;

        $line = &XlatText( $line);
        push( @ret, $line);
    }

    @ret;
}
#----------------------------------------------------------------------
sub ReadLinks
{
    # DESCRIPTION
    #
    #   Read external http links from the text. At most one link,
    #   the first one, is picked from each line.
    #
    #   Links to compressed archives (.gz .tgz .Z) are reduced to
    #   their directory part, and links that run programs (contain ?)
    #   are skipped; a warning is printed for both.
    #
    # INPUT
    #
    #   *txt        glob of array holding the text from the file
    #
    # RETURN
    #
    #   %           all found links 'line nbr' -- 'lnk', where the first
    #               line is number 1
    #
    local( $f )     = "$lib.ReadLinks";
    local( *txt )   = @_;
    local( $_ , $url, %ret, $i );

    for ( @txt )
    {
        $i++;
        $url = "";

        # This used to read (ftp|http), but the ftp check does not
        # know GET request.
        #
        $url = $1  if  m"((http)://[^\s\)\'\",;]+)"i;

        # Do not check the tar.gz links. or program calls perl?args
        #
        if ( $url =~ m"\.(gz|tgz|Z)$|\?" )
        {
            warn " link will not be checked: $url";

            # forget programs. Test the url, not the whole line: a
            # question mark elsewhere in the text must not drop the link.
            #
            next if $url =~ m"\?";

            # but try to verify at least directory: strip the file
            # name after the last slash of the path. A bare
            # "http://host" without any path is left alone.
            #
            $url =~ s"^(\w+://[^/]+/(?:.*/)?)[^/]*$"$1";
        }

        if ( $url )
        {
            $D && print "$f: $i $url\n";
            $ret{$i} = $url ;
        }
    }

    %ret;
}
#----------------------------------------------------------------------
sub Studylinks
{
    # DESCRIPTION
    #
    #   Check every link that ReadLinks finds in the text and print
    #   the results to stdout in format FILE:LINE:...
    #
    #   Without global $QUIET every link is printed together with
    #   its status; with $QUIET only the failed links are printed.
    #   If global $ERR_TEXT_ONE_LINE is set, the error text is added
    #   with its newlines converted to dots.
    #
    # INPUT
    #
    #   $FILE       file name, used only in the printed messages
    #   *content    glob of array holding the text from the file
    #
    # RETURN
    #
    #   nothing useful
    #
    local( $f ) = "$lib.Studylinks";
    local( $FILE, *content ) = @_;
    local( %link, %errDesc, %linkErr );
    local( $i, $_, $lnk, $text, $status, $err );

    %link = &ReadLinks( *content );
    $D && &pAsc( "$f", *link);

    $i = 0;

    for ( sort {$a<=> $b} keys %link )
    {
        $i   = $_;
        $lnk = $link{$_};

        if ( ! $QUIET )
        {
            print "$FILE:$i:$lnk ";
        }

        ( $status, $err ) = &StudyLinkExternal( $lnk , *linkErr, *errDesc );

        $text = "";

        if ( $ERR_TEXT_ONE_LINE )
        {
            # Every newline must go, otherwise the message would
            # still span several lines.
            #
            $text = defined $err ? $err : "";
            $text =~ s/\n/./g;
        }

        if ( ! $QUIET )
        {
            print " $status $text\n";
        }
        elsif ( $status != 0 )
        {
            # File name and url are passed as arguments: a % character
            # in them must not be read as a format directive.
            #
            printf "%s:%s:%-4d %s %s\n", $FILE, $i, $status, $lnk, $text;
        }
    }
}
# }}}
# {{{ Is, testing
# ............................................................... &is ...
#----------------------------------------------------------------------
sub IsEmptyText
{
    # DESCRIPTION
    #
    #   Tell whether TEXT carries no data. The empty string, a string
    #   of whitespace only, and a string ending with the word "none"
    #   (any case) count as empty.
    #
    # INPUT
    #
    #   $text       string
    #
    # RETURN
    #
    #   0,1         1 if the text is empty
    #
    local( $f ) = "$lib.IsEmptyText";
    my( $text ) = @_;

    return 1  if  $text eq "";
    return 1  if  $text =~ /^\s+$|none$/i;

    0;
}
#----------------------------------------------------------------------
sub IsHeader
{
    # DESCRIPTION
    #
    #   Return the heading level of a line. Level 1 starts at column 0
    #   with a capital letter or digit; level 2 starts after exactly
    #   four spaces with a capital letter, digit or dot.
    #
    # INPUT
    #
    #   $line       line
    #
    # RETURN
    #
    #   0,x         level of header, 0 if the line is not a header
    #
    local( $f ) = "$lib.IsHeader";
    my( $line ) = @_;

    return 1  if  $line =~ /^[A-Z0-9]/;
    return 2  if  $line =~ /^ {4}[A-Z0-9.]/;

    0;
}
#----------------------------------------------------------------------
sub IsBullet
{
    # DESCRIPTION
    #
    #   Recognize a bullet line: eight spaces, the bullet character
    #   'o' or '.', three spaces and then the text.
    #
    # INPUT
    #
    #   $line       ,line
    #   *text       ,glob of scalar; on match $text is set to the line
    #                text that follows the bullet. Untouched otherwise.
    #
    # RETURN
    #
    #   0,x         ,level: 1 for 'o' bullet, 2 for '.' bullet, 0 if
    #                the line is not a bullet
    #
    local( $f ) = "$lib.IsBullet";
    local( $_ , *text) = @_;

    my %levelOf = ( "o" => 1, "." => 2 );

    return 0  unless  /^ {8}([o.]) {3}(.+)/;

    $text = $2;
    $levelOf{ $1 };
}
# }}}
# {{{ start, end
#----------------------------------------------------------------------
sub PrintStart
{
    # DESCRIPTION
    #
    #   Build the start of the html document: the leading block with
    #   the title, the meta lines and the navigation buttons.
    #
    # INPUT
    #
    #   $doc $author $title $base
    #   $butt $butp $butn           urls of the top, previous and next
    #                               buttons; a button is left out if
    #                               IsEmptyText accepts the url
    #   $metaDesc $metaKeywords     a meta line is added for each one
    #                               that is defined
    #
    # RETURN
    #
    #   @           html lines
    #
    local( $f ) = "$lib.PrintStart";
    local( $doc, $author, $title,
           $base, $butt, $butp, $butn ,
           $metaDesc, $metaKeywords
         ) = @_;
    local( @ret, $str , $tab , $tmp );

    $tab = " ";

    # ................................................ start of html ...

    $str = join( "",
                 "\n",
                 "\n\n",
                 "\n\n",
                 "$title\n\n",
                 "\n\n\n"
               );

    push ( @ret, $str );

    # ............................................. meta information ...
    # META tags provide "meta information" about the document.
    # http://www.htmlhelp.com/reference/wilbur/head/meta.html
    #
    # "keywords"
    # Provides keywords for search engines such as Infoseek or Alta
    # Vista. These are added to the keywords found in the document
    # itself. If you insert a keyword more than seven times here,
    # the whole tag will be ignored!
    #
    push( @ret, " \n" )  if  defined $metaKeywords;
    push( @ret, " \n" )  if  defined $metaDesc;

    # ................................................. general meta ...
    #
    push( @ret, qq/ \n/, qq/ \n/, qq/ \n\n/ );

    push( @ret, &makeComment("BUTTON DEFINITION START") );

    # Buttons in output order: url, visible label, description
    #
    my @buttons =
    (
        [ $butp, "Previous", "Previous document"    ],
        [ $butt, "home",     "The homepage of site" ],
        [ $butn, "Next",     "Next document"        ],
    );

    my $button;

    foreach $button ( @buttons )
    {
        my( $where, $label, $desc ) = @$button;

        next if &IsEmptyText( $where );

        $tmp = $desc;
        push( @ret, $tab . &makeUrlRef( $where, $label, "but") );
        push( @ret, "\n" );
    }

    push( @ret, "\n\n\n\n" );

    # &pArr("$f", *ret);

    @ret;
}
#----------------------------------------------------------------------
sub PrintEndQuiet
{
# DESCRIPTION
#
#
# INPUT
#
#
# RETURN
#
local( $f ) = "$lib.PrintEndQuiet";
local( $doc , $author, $ref , $file ) = @_;
local( @ret, $str );
push(@ret, &makeComment( "DOCUMENT END BLOCK") );
$str =
"\n\n" .
"\n" .
"\n" .
"