#!/usr/bin/perl -w # # Id: mb2md.pl,v 1.26 2004/03/28 00:09:46 juri Exp # mb2md-3.20.pl Converts Mbox mailboxes to Maildir format. # # Public domain. # # currently maintained by: # Juri Haberland # initially wrote by: # Robin Whittle # # This script's web abode is http://batleth.sapienti-sat.org/projects/mb2md/ # For a changelog see # http://batleth.sapienti-sat.org/projects/mb2md/changelog.txt # # The Mbox -> Maildir inner loop is based on qmail's script # mbox2maildir, which was kludged by Ivan Kohler in 1997 from # convertandcreate (public domain) by Russel Nelson. Both these convert # a single mailspool file. # # The qmail distribution has a maildir2mbox.c program. # # What it does: # ============= # # Reads a directory full of Mbox format mailboxes and creates a set of # Maildir format mailboxes. Some details of this are to suit Courier # IMAP's naming conventions for Maildir mailboxes. # # http://www.inter7.com/courierimap/ # # This is intended to automate the conversion of the old # /var/spool/mail/blah file - with one call of this script - and to # convert one or more mailboxes in a specifed directory with separate # calls with other command line arguments. # # Run this as the user - in these examples "blah". # This version supports conversion of: # # Date The date-time in the "From " line of the message in the # Mbox format is the date when the message was *received*. # This is transformed into the date-time of the file which # contains the message in the Maildir mailbox. # # This relies on the Date::Parse perl module and the utime # perl function. # # The script tries to cope with errant forms of the # Mbox "From " line which it may encounter, but if # there is something really screwy in a From line, # then perhaps the script will fail when "touch" # is given an invalid date. Please report the # exact nature of any such "From " line! # # # Flagged # Replied # Read = Seen # Tagged for Deletion # # In the Mbox message, flags for these are found in the # "Status: N" or "X-Status: N" headers, where "N" is 0 # or more of the following characters in the left column. # # They are converted to characters in the right column, # which become the last characters of the file name, # following the ":2," which indicates IMAP message status. # # # F -> F Flagged # A -> R Replied # R -> S Read = Seen # D -> T Tagged for Deletion (Trash) # # This is based on the work of Philip Mak who wrote a # completely separate Mbox -> Maildir converter called # perfect_maildir and posted it to the Mutt-users mailing # list on 25 December 2001: # # http://www.mail-archive.com/mutt-users@mutt.org/msg21872.html # # Michael Best originally integrated those changes into mb2md. # # # In addition, the names of the message files in the Maildir are of a # regular length and are of the form: # # 7654321.000123.mbox:2,xxx # # Where "7654321" is the Unix time in seconds when the script was # run and "000123" is the six zeroes padded message number as # messages are converted from the Mbox file. "xxx" represents zero or # more of the above flags F, R, S or T. # # # --------------------------------------------------------------------- # # # USAGE # ===== # # Run this as the user of the mailboxes, not as root. # # # mb2md -h # mb2md [-c] -m [-d destdir] # mb2md [-c] -s sourcefile [-d destdir] # mb2md [-c] -s sourcedir [-l wu-mailboxlist] [-R|-f somefolder] # [-d destdir] [-r strip_extension] # # -c use the Content-Length: headers (if present) to find the # beginning of the next message # Use with caution! Results may be unreliable. I recommend to do # a run without "-c" first and only use it if you are certain, # that the mbox in question really needs the "-c" option # # -m If this is used then the source will # be the single mailbox at /var/spool/mail/blah for # user blah and the destination mailbox will be the # "destdir" mailbox itself. # # # -s source Directory or file relative to the user's home directory, # which is where the the "somefolders" directories are located. # Or if starting with a "/" it is taken as a # absolute path, e.g. /mnt/oldmail/user # # or # # A single mbox file which will be converted to # the destdir. # # -R If defined, do not skip directories found in a mailbox # directory, but runs recursively into each of them, # creating all wanted folders in Maildir. # Incompatible with '-f' # # -f somefolder Directories, relative to "sourcedir" where the Mbox files # are. All mailboxes in the "sourcedir" # directory will be converted and placed in the # "destdir" directory. (Typically the Inbox directory # which in this instance is also functioning as a # folder for other mailboxes.) # # The "somefolder" directory # name will be encoded into the new mailboxes' names. # See the examples below. # # This does not save an UW IMAP dummy message file # at the start of the Mbox file. Small changes # in the code could adapt it for looking for # other distinctive patterns of dummy messages too. # # Don't let the source directory you give as "somefolders" # contain any "."s in its name, unless you want to # create subfolders from the IMAP user's point of # view. See the example below. # # Incompatible with '-f' # # # -d destdir Directory where the Maildir format directories will be created. # If not given, then the destination will be ~/Maildir . # Typically, this is what the IMAP server sees as the # Inbox and the folder for all user mailboxes. # If this begins with a '/' the path is considered to be # absolute, otherwise it is relative to the users # home directory. # # -r strip_ext If defined this extension will be stripped from # the original mailbox file name before creating # the corresponding maildir. The extension must be # given without the leading dot ("."). See the example below. # # -l WU-file File containing the list of subscribed folders. If # migrating from WU-IMAP the list of subscribed folders will # be found in the file called .mailboxlist in the users # home directory. This will convert all subscribed folders # for a single user: # # /bin/mb2md -s mail -l .mailboxlist -R -d Maildir # and for all users in a directory as root you can do the # following: # # for i in *; do # echo $i # su - $i -c "/bin/mb2md -s mail -l .mailboxlist -R -d Maildir" # done # # # Example # ======= # # We have a bunch of directories of Mbox mailboxes located at # /home/blah/oldmail/ # # /home/blah/oldmail/fffff # /home/blah/oldmail/ggggg # /home/blah/oldmail/xxx/aaaa # /home/blah/oldmail/xxx/bbbb # /home/blah/oldmail/xxx/cccc # /home/blah/oldmail/xxx/dddd # /home/blah/oldmail/yyyy/huey # /home/blah/oldmail/yyyy/duey # /home/blah/oldmail/yyyy/louie # # With the UW IMAP server, fffff and ggggg would have appeared in the root # of this mail server, along with the Inbox. aaaa, bbbb etc, would have # appeared in a folder called xxx from that root, and xxx was just a folder # not a mailbox for storing messages. # # We also have the mailspool Inbox at: # # /var/spool/mail/blah # # # To convert these, as user blah, we give the first command: # # mb2md -m # # The main Maildir directory will be created if it does not exist. # (This is true of any argument options, not just "-m".) # # /home/blah/Maildir/ # # It has the following subdirectories: # # /home/blah/Maildir/tmp/ # /home/blah/Maildir/new/ # /home/blah/Maildir/cur/ # # Then /var/spool/blah file is read, split into individual files and # written into /home/blah/Maildir/cur/ . # # Now we give the second command: # # mb2md -s oldmail -R # # This reads recursively all Mbox mailboxes and creates: # # /home/blah/Maildir/.fffff/ # /home/blah/Maildir/.ggggg/ # /home/blah/Maildir/.xxx/ # /home/blah/Maildir/.xxx.aaaa/ # /home/blah/Maildir/.xxx.bbbb/ # /home/blah/Maildir/.xxx.cccc/ # /home/blah/Maildir/.xxx.aaaa/ # /home/blah/Maildir/.yyyy/ # /home/blah/Maildir/.yyyy.huey/ # /home/blah/Maildir/.yyyy.duey/ # /home/blah/Maildir/.yyyy.louie/ # # The result, from the IMAP client's point of view is: # # Inbox ----------------- # | # | fffff ----------- # | ggggg ----------- # | # - xxx ------------- # | | aaaa -------- # | | bbbb -------- # | | cccc -------- # | | dddd -------- # | # - yyyy ------------ # | huey ------- # | duey ------- # | louie ------ # # Note that although ~/Maildir/.xxx/ and ~/Maildir/.yyyy may appear # as folders to the IMAP client the above commands to not generate # any Maildir folders of these names. These are simply elements # of the names of other Maildir directories. (if you used '-R', they # whill be able to act as normal folders, containing messages AND folders) # # With a separate run of this script, using just the "-s" option # without "-f" nor "-R", it would be possible to create mailboxes which # appear at the same location as far as the IMAP client is # concerned. By having Mbox mailboxes in some directory: # ~/oldmail/nnn/ of the form: # # /home/blah/oldmail/nn/xxxx # /home/blah/oldmail/nn/yyyyy # # then the command: # # mb2md -s oldmail/nn # # will create two new Maildirs: # # /home/blah/Maildir/.xxx/ # /home/blah/Maildir/.yyyy/ # # Then what used to be the xxx and yyyy folders now function as # mailboxes too. Netscape 4.77 needed to be put to sleep and given ECT # to recognise this - deleting the contents of (Win2k example): # # C:\Program Files\Netscape\Users\uu\ImapMail\aaa.bbb.ccc\ # # where "uu" is the user and "aaa.bbb.ccc" is the IMAP server # # I often find that deleting all this directory's contents, except # "rules.dat", forces Netscape back to reality after its IMAP innards # have become twisted. Then maybe use File > Subscribe - but this # seems incapable of subscribing to folders. # # For Outlook Express, select the mail server, then click the # "IMAP Folders" button and use "Reset list". In the "All" # window, select the mailboxes you want to see in normal # usage. # # # This script did not recurse subdirectories or delete old mailboxes, # before addition of the '-R' parameter :) # # Be sure not to be accessing the Mbox mailboxes while running this # script. It does not attempt to lock them. Likewise, don't run two # copies of this script either. # # # Trickier usage . . . # ==================== # # If you have a bunch of mailboxes in a directory ~/oldmail/doors/ # and you want them to appear in folders such as: # # ~/Maildir/.music.bands.doors.Jim # ~/Maildir/.music.bands.doors.John # # etc. so they appear in an IMAP folder: # # Inbox ----------------- # | music # | bands # | doors # | Jim # | John # | Robbie # | Ray # # Then you could rename the source directory to: # # ~/oldmail/music.bands.doors/ # # then use: # # mb2md -s oldmail -f music.bands.doors # # # Or simply use '-R' switch with: # mb2md -s oldmail -R # # # Stripping mailbox extensions: # ============================= # # If you want to convert mailboxes that came for example from # a Windows box than you might want to strip the extension of # the mailbox name so that it won't create a subfolder in your # mail clients view. # # Example: # You have several mailboxes named Trash.mbx, Sent.mbx, Drafts.mbx # If you don't strip the extension "mbx" you will get the following # hierarchy: # # Inbox # | # - Trash # | | mbx # | # - Sent # | | mbx # | # - Drafts # | mbx # # This is more than ugly! # Just use: # mb2md -s oldmail -r mbx # # Note: don't specify the dot! It will be stripped off # automagically ;) # #------------------------------------------------------------------------------ use strict; use Getopt::Std; use Date::Parse; use IO::Handle; use Fcntl; use File::Basename; # regular expression for a proper "From " line. my $mbbase; my $fregexp = '^From .* (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) .* [0-9][0-9]:'; # print the usage message sub usage() { print "Usage:\n"; print " mb2md -h\n"; print " mb2md [-c] -m [-d destdir]\n"; print " mb2md [-c] -s sourcefile [-d destdir]\n"; die " mb2md [-c] -s sourcedir [-l wu-mailboxlist] [-R|-f somefolder] [-d destdir] [-r strip_extension]\n"; } # get options my %opts; getopts('d:f:chms:r:l:R', \%opts) || usage(); usage() if (defined($opts{h}) || (!defined($opts{m}) && !defined($opts{s}))); # Get uid, username and home dir my ($name, $passwd, $uid, $gid, $quota, $comment, $gcos, $homedir, $shell) = getpwuid($<); # Get arguments and determine source # and target directories. my $mbroot = undef; # this is the base directory for the mboxes my $mbdir = undef; # this is an mbox dir relative to the $mbroot my $mbfile = undef; # this is an mbox file my $dest = undef; my $strip_ext = undef; my $use_cl = undef; # whether we use the Content-Length: header if present # if option "-c" is given, we use the Content-Length: header if present # dangerous! may be unreliable, as the whole CL stuff is a bad idea if (defined($opts{c})) { $use_cl = 1; } else { $use_cl = 0; } # first, if the user has gone the -m option # we simply convert their mailfile if (defined($opts{m})) { if (defined($ENV{'MAIL'})) { $mbfile = $ENV{'MAIL'}; } elsif (-f "/var/spool/mail/$name") { $mbfile = "/var/spool/mail/$name"; } elsif (-f "/var/mail/$name") { $mbfile = "/var/mail/$name"; } else { die("searched \$MAIL, /var/spool/mail/$name and /var/mail/$name, " . "but I couldn't find your mail spool file - "); } } # see if the user has specified a source directory elsif (defined($opts{s})) { # if opts{s} doesn't start with a "/" then # it is a subdir of the users $home # if it does start with a "/" then # let's take $mbroot as a absolut path $opts{s} = "$homedir/$opts{s}" if ($opts{s} !~ /^\//); # check if the given source is a mbox file if (-f $opts{s}) { $mbfile = $opts{s}; } # otherwise check if it is a directory elsif (-d $opts{s}) { $mbroot = $opts{s}; # get rid of trailing /'s $mbroot =~ s/\/$//; # check if we have a specified sub directory, # otherwise the sub directory is '.' if (defined($opts{f})) { $mbdir = $opts{f}; # get rid of trailing /'s $mbdir =~ s/\/$//; } } # otherwise we have an error else { die("Fatal: Source is not an mbox file or a directory!\n"); } } # get the dest defined($opts{d}) && ($dest = $opts{d}) || ($dest = "Maildir"); # see if we have anything to strip defined($opts{r}) && ($strip_ext = $opts{r}); # No '-f' with '-R' if ((defined($opts{R})) && (defined($opts{f}))) { die "No recursion with \"-f\""; } # Get list of folders my @flist; if (defined($opts{l})) { open(LIST, $opts{l}) or die "Could not open mailbox list $opts{l}: $!"; @flist = ; close LIST; } # if the destination is relative to the home dir, # check that the home dir exists die("Fatal: home dir $homedir doesn't exist.\n") if ($dest !~ /^\// && !-e $homedir); # # form the destination value # slap the home dir on the front of the dest if the dest does not begin # with a '/' $dest = "$homedir/$dest" if ($dest !~ /^\//); # get rid of trailing /'s $dest =~ s/\/$//; # Count the number of mailboxes, or # at least files, we found. my $mailboxcount = 0; # Since we'll be making sub directories of the main # Maildir, we need to make sure that the main maildir # exists &maildirmake($dest); # Now we do different things depending on whether we convert one mbox # file or a directory of mbox files if (defined($mbfile)) { if (!isamailboxfile($mbfile)) { print "Skipping $mbfile: not a mbox file\n"; } else { print "Converting $mbfile to maildir: $dest\n"; # this is easy, we just run the convert function &convert($mbfile, $dest); } } # if '-f' was used ... elsif (defined($mbdir)) { print "Converting mboxdir/mbdir: $mbroot/$mbdir to maildir: $dest/\n"; # Now set our source directory my $sourcedir = "$mbroot/$mbdir"; # check that the directory we are supposed to be finding mbox # files in, exists and is a directory -e $sourcedir or die("Fatal: MBDIR directory $sourcedir/ does not exist.\n"); -d $sourcedir or die("Fatal: MBDIR $sourcedir is not a directory.\n"); &convertit($mbdir, ""); } # Else, let's work in $mbroot else { opendir(SDIR, $mbroot) or die("Fatal: Cannot open source directory $mbroot/ \n"); while (my $sourcefile = readdir(SDIR)) { if (-d "$mbroot/$sourcefile") { # Recurse only if requested (to be changed ?) if (defined($opts{R})) { print "convertit($sourcefile,\"\")\n"; &convertit($sourcefile, ""); } else { print( "$sourcefile is a directory, but '-R' was not used... skipping\n" ); } } elsif (!-f "$mbroot/$sourcefile") { print "Skipping $mbroot/$sourcefile : not a file nor a dir\n"; next; } elsif (!isamailboxfile("$mbroot/$sourcefile")) { print "Skipping $mbroot/$sourcefile : not a mbox file\n"; next; } else { &convertit($sourcefile, ""); } } # end of "while ($sfile = readdir(SDIR))" loop. closedir(SDIR); printf("$mailboxcount files processed.\n"); } # exit 0; # My debbugging placeholder I can put somewhere to show how far the script ran. # die("So far so good.\n\n"); # The isamailboxfile function # ---------------------- # # Here we check if the file is a mailbox file, not an address-book or # something else. # If file is empty, we say it is a mbox, to create it empty. # # Returns 1 if file is said mbox, 0 else. sub isamailboxfile { my ($mbxfile) = @_; return 1 if (-z $mbxfile); sysopen(MBXFILE, "$mbxfile", O_RDONLY) or die "Could not open $mbxfile ! \n"; while () { if (/^From/) { close(MBXFILE); return 1; } else { close(MBXFILE); return 0; } } } # The convertit function # ----------------------- # # This function creates all subdirs in maildir, and calls convert() # for each mbox file. # Yes, it becomes the 'main loop' :) sub convertit { # Get subdir as argument my ($dir, $oldpath) = @_; $oldpath =~ s/\/\///; # Skip files beginning with '.' since they are # not normally mbox files nor dirs (includes '.' and '..') if ($dir =~ /^\./) { print "Skipping $dir : name begins with a '.'\n"; return; } my $destinationdir = $dir; my $temppath = $oldpath; # We don't want to have .'s in the $targetfile file # name because they will become directories in the # Maildir. Therefore we convert them to _'s $temppath =~ s/\./\_/g; $destinationdir =~ s/\./\_/g; # Appending $oldpath => path is only missing $dest $destinationdir = "$temppath.$destinationdir"; # Converting '/' to '.' in $destinationdir $destinationdir =~ s/\/+/\./g; # source dir my $srcdir = "$mbroot/$oldpath/$dir"; printf( "convertit(): Converting $dir in $mbroot/$oldpath to $dest/$destinationdir\n" ); &maildirmake("$dest/$destinationdir"); print("destination = $destinationdir\n"); if (-d $srcdir) { opendir(SUBDIR, "$srcdir") or die "can't open $srcdir !\n"; my @subdirlist = readdir(SUBDIR); closedir(SUBDIR); foreach (@subdirlist) { next if (/^\.+$/); print("Sub: $_\n"); print("convertit($_,\"$oldpath/$dir\")\n"); &convertit($_, "$oldpath/$dir"); } } else { # Source file verifs .... # return if (defined($opts{l}) && !inlist("$oldpath/$dir", @flist)); if (!isamailboxfile("$mbroot/$oldpath/$dir")) { print "Skipping $dir (is not mbox)\n"; next; } # target file verifs... # # if $strip_extension is defined, # strip it off the $targetfile defined($strip_ext) && ($destinationdir =~ s/\.$strip_ext$//); &convert("$mbroot/$oldpath/$dir", "$dest/$destinationdir"); $mailboxcount++; } } # The maildirmake function # ------------------------ # # It does the same thing that the maildirmake binary that # comes with courier-imap distribution # sub maildirmake { foreach (@_) { -d $_ or mkdir $_, 0700 or die("Fatal: Directory $_ doesn't exist and can't be created.\n"); -d "$_/tmp" or mkdir("$_/tmp", 0700) or die("Fatal: Unable to make $_/tmp/ subdirectory.\n"); -d "$_/new" or mkdir("$_/new", 0700) or die("Fatal: Unable to make $_/new/ subdirectory.\n"); -d "$_/cur" or mkdir("$_/cur", 0700) or die("Fatal: Unable to make $_/cur/ subdirectory.\n"); } } # The inlist function # ------------------------ # # It checks that the folder to be converted is in the list of subscribed # folders in WU-IMAP # sub inlist { my ($file, @flist) = @_; my $valid = 0; # Get rid of the first / if any $file =~ s/^\///; foreach my $folder (@flist) { chomp $folder; if ($file eq $folder) { $valid = 1; last; } } if (!$valid) { print "$file is not in list\n"; } else { print "$file is in list\n"; } return $valid; } # # The convert function # --------------------- # # This function does the down and dirty work of # actually converting the mbox to a maildir # sub convert { # get the source and destination as arguments my ($mbox, $maildir) = @_; printf("Source Mbox is $mbox\n"); printf("Target Maildir is $maildir \n"); # create the directories for the new maildir # # if it is the root maildir (ie. converting the inbox) # these already exist but thats not a big issue &maildirmake($maildir); # Change to the target mailbox directory. chdir "$maildir"; # Converts a Mbox to multiple files # in a Maildir. # This is adapted from mbox2maildir. # # Open the Mbox mailbox file. if (sysopen(MBOX, "$mbox", O_RDONLY)) { #printf("Converting Mbox $mbox . . . \n"); } else { die("Fatal: unable to open input mailbox file: $mbox ! \n"); } # This loop scans the input mailbox for # a line starting with "From ". The # "^" before it is pattern-matching # lingo for it being at the start of a # line. # # Each email in Mbox mailbox starts # with such a line, which is why any # such line in the body of the email # has to have a ">" put in front of it. # # This is not required in a Maildir # mailbox, and some majik below # finds any such quoted "> From"s and # gets rid of the "> " quote. # # Each email is put in a file # in the cur/ subdirectory with a # name of the form: # # nnnnnnnnn.cccc.mbox:2,XXXX # # where: # "nnnnnnnnn" is the Unix time since # 1970 when this script started # running, incremented by 1 for # every email. This is to ensure # unique names for each message # file. # # ".cccc" is the message count of # messages from this mbox. # # ".mbox" is just to indicate that # this message was converted from # an Mbox mailbox. # # ":2," is the start of potentially # multiple IMAP flag characters # "XXXX", but may be followed by # nothing. # # This is sort-of compliant with # the Maildir naming conventions # specified at: # # http://www.qmail.org/man/man5/maildir.html # # This approach does not involve the # process ID or the hostname, but it is # probably good enough. # # When the IMAP server looks at this # mailbox, it will move the files to # the cur/ directory and change their # names as it pleases. In the case # of Courier IMAP, the names will # become like: # # 995096541.25351.mbox:2,S # # with 25351 being Courier IMAP's # process ID. The :2, is the start # of the flags, and the "S" means # that this one has been seen by # the user. (But is this the same # meaning as the user actually # having opened the message to see # its contents, rather than just the # IMAP server having been asked to # list the message's Subject etc. # so the client could list it in the # visible Inbox?) # # This contrasts with a message # created by Courier IMAP, say with # a message copy, which is like: # # 995096541.25351.zair,S=14285:2,S # # where ",S=14285" is the size of the # message in bytes. # # Courier Maildrop's names are similar # but lack the ":2,XXXX" flags . . . # except for my modified Maildrop # which can deliver them with a # ":2,T" - flagged for deletion. # # I have extended the logic of the # per-message inner loop to stop # saving a file for a message with: # # Subject: DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA # # This is the dummy message, always # at the start of an Mbox format # mailbox file - and is put there # by UW IMAPD. Since quite a few # people will use this for # converting from a UW system, # I figure it is worth it. # # I will not save any such message # file for the dummy message. # # Plan # ---- # # We want to read the entire Mbox file, whilst # going through a loop for each message we find. # # We want to read all the headers of the message, # starting with the "From " line. For that "From " # line we want to get a date. # # For all other header lines, we want to store them # in $headers whilst parsing them to find: # # 1 - Any flags in the "Status: " or "X-Status: " or # "X-Mozilla-Status: " lines. # # 2 - A subject line indicating this is the dummy message # at the start (typically, but not necessarily) of # the Mbox. # # Once we reach the end of the headers, we will crunch any # flags we found to create a file name. Then, unless this is # the dummy message we create that file and write all the # headers to it. # # Then we continue reading the Mbox, converting ">From " to # "From " and writing it to the file, until we reach one of: # # 1 - Another "From " line (indicating the start of another # message). # # or # # 2 - The end of the Mbox. # # In the former case, which we detect at the start of the loop # we need to close the file and touch it to alter its date-time. # # In the later case, we also need to close the file and touch # it to alter its date-time - but this is beyond the end of the # loop. # Variables # --------- my $messagecount = 0; # For generating unique filenames for # each message. Initialise it here with # numeric time in seconds since 1970. my $unique = time; # Name of message file to delete if we found that # it was created by reading the Mbox dummy message. my $deletedummy = ''; # To store the complete "From (address) (date-time) # which delineates the start of each message # in the Mbox my $fromline = ''; # Set to 1 when we are reading the header lines, # including the "From " line. # # 0 means we are reading the message body and looking # for another "From " line. my $inheaders = 0; # Variable to hold all headers (apart from # the first line "From ...." which is not # part of the message itself. my $headers = ''; # Variable to hold the accumulated characters # we find in header lines of the type: # # Status: # X-Status: # X-Mozilla-Status: # X-Evolution: my $flags = ''; # To build the file name for the message in. my $messagefn = ''; # The date string from the "From " line of each # message will be written here - and used by # touch to alter the date-time of each message # file. Put non-date text here to make it # spit the dummy if my code fails to find a # date to write into this. my $receivedate = 'Bogus'; # The subject of the message my $subject = ''; my $previous_line_was_empty = 1; # We record the message start line here, for error # reporting. my $startline; # If defined, we use this as the number of bytes in the # message body rather than looking for a /^From / line. my $contentlength; # A From lines can either occur as the first # line of a file, or after an empty line. # Most mail systems will quote all From lines # appearing in the message, but some will only # do it when necessary. # Since we initialise the variable to true, # we don't need to check for beginning of file. while () { # exchange possible Windows EOL (CRLF) with Unix EOL (LF) $_ =~ s/\r\n$/\n/; if ( /$fregexp/ && $previous_line_was_empty && (!defined $contentlength)) { # We are reading the "From " line which has an # email address followed by a receive date. # Turn on the $inheaders flag until we reach # the end of the headers. $inheaders = 1; # record the message start line $startline = $.; # If this is not the first run through the loop # then this means we have already been working # on a message. if ($messagecount > 0) { # If so, then close that message file and then # use utime to change its date-time. # # Note this code should be duplicated to do # the same thing at the end of the while loop # since we must close and touch the final message # file we were writing when we hit the end of the # Mbox file. close(OUT); if ($messagefn ne '') { my $t = str2time($receivedate); utime $t, $t, $messagefn; } } # Because we opened the Mbox file without any # variable, I think this means that we have its # current line in Perl's default variable "$_". # So all sorts of pattern matching magic works # directly on it. # We are currently reading the first line starting with # "From " which contains the date we want. # # This will be of the form: # # From dduck@test.org Wed Nov 24 11:05:35 1999 # # at least with UW-IMAP. # # However, I did find a nasty exception to this in my # tests, of the form: # # "bounce-MusicNewsletter 5-rw=test.org"@announce2.mp3.com # # This makes it trickier to get rid of the email address, # but I did find a way. I can't rule out that there would # be some address like this with an "@" in the quoted # portion too. # # Unfortunately, testing with an old Inbox Mbox file, # I also found an instance where the email address # had no @ sign at all. It was just an email # account name, with no host. # # I could search for the day of the week. If I skipped # at least one word of non-whitespace (1 or more contiguous # non-whitespace characters) then searched for a day of # the week, then I should be able to avoid almost # every instance of a day of the week appearing in # the email address. # # Do I need a failsafe arrangement to provide some # other date to touch if I don't get what seems like # a date in my resulting string? For now, no. # # I will take one approach if there is an @ in the # "From " line and another (just skip the first word # after "From ") if there is no @ in the line. # # If I knew more about Perl I would probably do it in # a more elegant way. # Copy the current line into $fromline. $fromline = $_; # Now get rid of the "From ". " =~ s" means substitute. # Find the word "From " at the start of the line and # replace it with nothing. The nothing is what is # between the second and third slash. $fromline =~ s/^From //; # Likewise get rid of the email address. # This first section is if we determine there is one # (or more . . . ) "@" characters in the line, which # would normally be the case. if ($fromline =~ m/@/) { # The line has at least one "@" in it, so we assume # this is in the middle of an email address. # # If the email address had no spaces, then we could # get rid of the whole thing by searching for any number # of non-whitespace characters (\S) contiguously, and # then I think a space. Subsitute nothing for this. # # $fromline =~ s/(\S)+ // ; # # But we need something to match any number of non-@ # characters, then the "@" and then all the non-whitespace # characters from there (which takes us to the end of # "test.org") and then the space following that. # # A tutorial on regular expressions is: # # http://www.perldoc.com/perl5.6.1/pod/perlretut.html # # Get rid of all non-@ characters up to the first "@": $fromline =~ s/[^@]+//; # Get rid of the "@". $fromline =~ s/@//; } # If there was an "@" in the line, then we have now # removed the first one (lets hope there aren't more!) # and everything which preceded it. # # we now remove either something like # '(foo bar)'. eg. '(no mail address)', # or everything after the '@' up to the trailing # timezone # # FIXME: all those regexp should be combined to just one single one $fromline =~ s/(\((\S*| )+\)|\S+) *//; chomp $fromline; # Stash the date-time for later use. We will use it # to touch the file after we have closed it. $receivedate = $fromline; # Debugging lines: # # print "$receivedate is the receivedate of message $messagecount.\n"; # $receivedate = "Wed Nov 24 11:05:35 1999"; # # To look at the exact date-time of files: # # ls -lFa --full-time # # End of handling the "From " line. } # Now process header lines which are not the "From " line. if ( ($inheaders eq 1) && (!/^From /)) { # Now we are reading the header lines after the "From " line. # Keep looking for the blank line which indicates the end of the # headers. # ".=" means append the current line to the $headers # variable. # # For some reason, I was getting two blank lines # at the end of the headers, rather than one, # so I decided not to read in the blank line # which terminates the headers. # # Delete the "unless ($_ eq "\n")" to get rid # of this kludge. $headers .= $_ unless ($_ eq "\n"); # Now scan the line for various status flags # and to fine the Subject line. $flags .= $1 if /^Status: ([A-Z]+)/; $flags .= $1 if /^X-Status: ([A-Z]+)/; if (/^X-Mozilla-Status: ([0-9a-f]{4})/i) { $flags .= 'R' if (hex($1) & 0x0001); $flags .= 'A' if (hex($1) & 0x0002); $flags .= 'D' if (hex($1) & 0x0008); } if (/^X\-Evolution:\s+\w{8}\-(\w{4})/oi) { $b = pack("H4", $1); #pack it as 4 digit hex (0x0000) $b = unpack("B32", $b); #unpack into bit string # "usually" only the right most six bits are used # however, I have come across a seventh bit in # about 15 (out of 10,000) messages with this bit # activated. # I have not found any documentation in the source. # If you find out what it does, please let me know. # Notes: # Evolution 1.4 does mark forwarded messages. # The sixth bit is to denote an attachment $flags .= 'A' if ($b =~ /[01]{15}1/); #replied $flags .= 'D' if ($b =~ /[01]{14}1[01]{1}/); #deleted $flags .= 'T' if ($b =~ /[01]{13}1[01]{2}/); #draft $flags .= 'F' if ($b =~ /[01]{12}1[01]{3}/); #flagged $flags .= 'R' if ($b =~ /[01]{11}1[01]{4}/); #seen/read } $subject = $1 if /^Subject: (.*)$/; if ($use_cl eq 1) { $contentlength = $1 if /^Content-Length: (\d+)$/; } # Now look out for the end of the headers - a blank # line. When we find it, create the file name and # analyse the Subject line. if ($_ eq "\n") { # We are at the end of the headers. Set the # $inheaders flag back to 0. $inheaders = 0; # Include the current newline in the content length ++$contentlength if defined $contentlength; # Create the file name for the current message. # # A simple version of this would be: # # $messagefn = "cur/$unique.$messagecount.mbox:2,"; # # This would create names with $messagecount values of # 1, 2, etc. But for neatness when looking at a # directory of such messages, sorted by filename, # I want to have leading zeroes on message count, so # that they would be 000001 etc. This makes them # appear in message order rather than 1 being after # 19 etc. So this is good for up to 999,999 messages # in a mailbox. It is a cosmetic matter for a person # looking into the Maildir directory manually. # To do this, use sprintf instead with "%06d" for # 6 characters of zero-padding: $mbbase = basename($mbfile); $mbbase =~ s/\./-/g; $messagefn = sprintf("cur/%d.%06d.%s:2,", $unique, $messagecount, $mbbase); # Append flag characters to the end of the # filename, according to flag characters # collected from the message headers $messagefn .= 'F' if $flags =~ /F/; # Flagged. $messagefn .= 'R' if $flags =~ /A/; # Replied to. $messagefn .= 'S' if $flags =~ /R/; # Seen or Read. $messagefn .= 'T' if $flags =~ /D/; # Tagged for deletion. # Opens filename $messagefn for output (>) with filehandle OUT. open(OUT, ">$messagefn") or die("Fatal: unable to create new message $messagefn"); # Count the messages. $messagecount++; # Only for the first message, # check to see if it is a dummy. # Delete the message file we # just created if it was for the # dummy message at the start # of the Mbox. # # Add search terms as required. # The last 2 lines are for rent. # # "m" means match the regular expression, # but we can do without it. # # Do I need to escape the ' in "DON'T"? # I didn't in the original version. if ( (($messagecount == 1) && defined($subject)) && ($subject =~ m/^DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA/ ) ) { # Stash the file name of the dummy message so we # can delete it later. $deletedummy = "$messagefn"; } # Print the collected headers to the message file. print OUT "$headers"; # Clear $headers and $flags ready for the next message. $headers = ''; $flags = ''; # End of processing the headers once we found the # blank line which terminated them } # End of dealing with the headers. } if ($inheaders eq 0) { # We are now processing the message body. # # Now we have passed the headers to the # output file, we scan until the while # loop finds another "From " line. # Decrement our content length if we're # using it to find the end of the message # body if (defined $contentlength) { # Decrement our $contentlength variable $contentlength -= length($_); # The proper end for a message with Content-Length # specified is the $contentlength variable should # be exactly -1 and we should be on a bare # newline. Note that the bare newline is not # printed to the end of the current message as # it's actually a message separator in the mbox # format rather than part of the message. The # next line _should_ be a From_ line, but just in # case the Content-Length header is incorrect # (e.g. a corrupt mailbox), we just continue # putting lines into the current message until we # see the next From_ line. if ($contentlength < 0) { if ($contentlength == -1 && $_ eq "\n") { $contentlength = undef; next; } $contentlength = undef; } } # # We want to copy every part of the message # body to the output file, except for the # quoted ">From " lines, which was the # way the IMAP server encoded body lines # starting with "From ". # # Pattern matching Perl majik to # get rid of an Mbox quoted From. # # This works on the default variable "$_" which # contains the text from the Mbox mailbox - I # guess this is the case because of our # (open(MBOX ....) line above, which did not # assign this to anything else, so it would go # to the default variable. This enables # inscrutably terse Perlisms to follow. # # "s" means "Subsitute" and it looks for any # occurrence of ">From" starting at the start # of the line. When it finds this, it replaces # it with "From". # # So this finds all instances in the Mbox message # where the original line started with the word # "From" but was converted to ">From" in order to # not be mistaken for the "From ..." line which # is used to demark each message in the Mbox. # This was was a destructive conversion because # any message which originally had ">From" at the # start of the line, before being put into the # Mbox, will now have that line without the ">". s/^>From /From /; # Glorious tersness here. Thanks Simon for # explaining this. # # "print OUT" means print the default variable to # the file of file handle OUT. This is where # the bulk of the message text is written to # the output file. print OUT or die("Fatal: unable to write to new message to $messagefn"); # End of the if statement dealing with message body. } $previous_line_was_empty = ($_ eq "\n"); # End of while (MBOX) loop. } # Close the input file. close(MBOX); # Close the output file, and duplicate the code # from the start of the while loop which touches # the date-time of the most recent message file. close(OUT); if ($messagefn ne '') { my $t = str2time($receivedate); utime $t, $t, $messagefn; } # After all the messages have been # converted, check to see if the # first one was a dummy. # if so, delete it and make # the message count one less. if ($deletedummy ne "") { printf("Dummy mail system first message detected and not saved.\n"); unlink $deletedummy; $messagecount--; } printf("$messagecount messages.\n\n"); }