
Well, every once in a while we're forced to do something that isn't particularly interesting or pleasant. Last week it happened again: I had to import a few pretty big PSTs (most 2Gig, one 10Gig, with about 100.000 emails) into our dovecot IMAP.
Doing this with Outlook itself was out of the question: it took
way too long, even on the local network (many hours for a 1G file) and was prone to hanging and crashes, which were obviously a pain to debug and start over.
Thunderbird was unfortunately not much better, since at least in our tests it didn't import the read status of the emails (they were all marked as unread) and also wasn't particularly good at handling folders with strange names, containing '.', '/' or some more obscure characters. We had used it before for smaller files, where manually dealing with the problems was acceptable, but this time it required something a bit more elaborate, if we were to keep our sanity.
Enter
libpst. It includes the handy
readpst utility, which dumps all emails in usable formats in a directory tree, one directory per folder. Unfortunately the Debian version is somewhat outdated and doesn't support the newer Outlook formats, so I did
some packaging and even a little bit of
patching. It seems Thunderbird also uses this library, which would explain why it didn't handle the read status (I haven't confirmed this though; I just read it somewhere).
The last step was this not-so-little script, which uses the dumped directories from readpst and imports them into IMAP. It would probably have been a bit more elegant to use libpst directly, but I unfortunately didn't have the time to mess around with that. I did have to mess around a lot with encodings though, ergo the unholy chaos with
unicode()s and
str.encode()s thrown around like rice at a wedding (I could never really wrap my head around charset problems; the subject boggles my mind to this very day).
#!/usr/bin/env python2.7
#-*- encoding: utf-8 -*-
import os, sys, time
import re
from argparse import ArgumentParser
from getpass import getpass
import imaplib
from email.header import decode_header
from mailbox import mbox
from twisted.mail import imap4 # for their imap4-utf7 implementation
# Command-line interface. Parsing happens at module level because the
# functions below read the resulting ``args`` and ``folderToMailbox`` globals.
parser = ArgumentParser(description="""Recursively import mbox files in a directory
structure to an IMAP server.\n
The expected structure is that generated by
'readpst -r'.""")
parser.add_argument('-s', dest='imap_server', default='localhost', help='IMAP server to import emails to')
parser.add_argument('-u', dest='imap_user', required=True, help='user for logging in to IMAP')
parser.add_argument('-p', dest='imap_passwd', help="will be prompted for if not provided")
parser.add_argument('-c', dest='charset', default='utf8', help='charset in which the folders are stored (for versions older than 2003)')
parser.add_argument('-f', dest='force', action='store_true', help='import mail even if we think it might be a duplicate')
parser.add_argument('-m', dest='mappings', help='a JSON file with mappings between folder names and mailbox names (no slashes or dots)')
parser.add_argument('folder', nargs='+', help="the base folders to import")
args = parser.parse_args()

# Ask interactively so the password does not have to appear on the command line.
if not args.imap_passwd:
    args.imap_passwd = getpass()

# Optional renaming table: folder name on disk -> mailbox name on the server.
if args.mappings:
    import json
    # Close the file deterministically instead of leaking the handle.
    with open(args.mappings, 'r') as mappings_file:
        folderToMailbox = json.load(mappings_file)
else:
    # No mappings given: every folder keeps its own name.
    folderToMailbox = {}
def mailboxFromPath(path):
    """Translate a relative folder path into a '.'-separated mailbox name.

    Each path component is looked up in the module-level ``folderToMailbox``
    mapping (components without an entry are kept as they are), stripped of
    any '.' characters, and the results are joined with '.'.

    path -- folder path relative to the import base, split on os.path.sep
    Returns the mailbox name as a string.
    """
    paths = []
    for p in path.split(os.path.sep):
        p = folderToMailbox.get(p, p)
        # '.' must go because it is the separator used in the join below;
        # only other invalid char besides '/', which can't be created by readpst anyway
        paths.append(p.replace('.', ''))
    return '.'.join(paths)
def imapFlagsFromMbox(flags):
    r"""Convert mbox status flag letters into a parenthesized IMAP flag list.

    flags -- string of mbox flag letters (e.g. 'RO'), as returned by
             mboxMessage.get_flags()
    Returns a string such as '(\Seen \Flagged)', or '()' when no known
    letter is present. The output order is fixed: \Seen, \Deleted,
    \Answered, \Flagged.
    """
    # libpst only sets R and O; both are treated as "seen" here.
    mapping = (
        ('RO', r'\Seen'),
        ('D', r'\Deleted'),
        ('A', r'\Answered'),
        ('F', r'\Flagged'),
    )
    imap_flags = [
        name for letters, name in mapping
        if any(letter in flags for letter in letters)
    ]
    return '(' + ' '.join(imap_flags) + ')'
def utf7encode(s):
    """Encode ``s`` with twisted's imap4-utf7 codec and return the encoded string.

    ``imap4.encoder`` returns a tuple; only its first element (the encoded
    data) is used.
    """
    return imap4.encoder(s)[0]
def headerToUnicode(s):
    """Decode the first chunk of a raw (possibly RFC 2047 encoded) header.

    Only the first (data, charset) pair returned by decode_header() is used;
    any further chunks of the header are ignored.

    Decoding order:
      1. the declared charset (if the charset name is unknown, UTF-8 with
         replacement characters is used instead);
      2. UTF-8 when no charset is declared;
      3. if the bytes are not valid in that charset: cp1252, then latin1.
         latin1 maps every byte, so this step always succeeds.

    s -- the raw header value
    Returns the decoded text.
    """
    raw, charset = decode_header(s)[0]
    if not isinstance(raw, bytes):
        # Already text (decode_header can return str on Python 3); on
        # Python 2 ``bytes`` is ``str`` and this branch is never taken.
        return raw
    try:
        if charset:  # charset != None
            try:
                return raw.decode(charset)
            except LookupError:
                # unknown charset name
                return raw.decode('utf8', 'replace')
        return raw.decode('utf8')
    except UnicodeDecodeError:
        pass
    # the usual culprits for malformed headers
    try:
        return raw.decode('cp1252')
    except UnicodeDecodeError:
        # cp1252 leaves a few byte values undefined; latin1 accepts them all
        return raw.decode('latin1')
def _selectOrCreate(imap, mailbox, mailbox_encoded):
    """Select the mailbox, creating and subscribing to it first if needed.

    Returns True when the mailbox could be selected or created, False when
    creation failed (the failure is reported on stderr).
    """
    if imap.select(mailbox_encoded)[0] == 'OK':
        return True
    print("creating mailbox "+mailbox)
    r = imap.create(mailbox_encoded)
    if r[0] != 'OK':
        sys.stderr.write("Could not create mailbox: "+str(r)+"\n")
        return False
    imap.subscribe(mailbox_encoded)
    imap.select(mailbox_encoded)
    return True


def _duplicateQuery(msg):
    """Build the IMAP SEARCH criteria used to look for an already imported copy of msg."""
    query = 'FROM "{0}" SUBJECT "{1}"'.format(
        # quotes are dropped from the sender and backslash-escaped in the subject
        utf7encode(headerToUnicode(msg['from']).replace('"', '')),
        utf7encode(headerToUnicode(msg['subject']).replace('"', r'\"'))
    )
    if 'date' in msg:
        query += ' HEADER DATE "{0}"'.format(utf7encode(msg['date']))
    if msg['message-id']:  # None when the header is missing
        query += ' HEADER MESSAGE-ID "{0}"'.format(utf7encode(msg['message-id']))
    return '({0})'.format(query)


def _isDuplicate(imap, msg):
    """Return True if the selected mailbox already holds a message matching msg.

    A search that does not come back 'OK' is reported on stderr and counted
    as "not a duplicate", so a failed search never causes mail to be dropped.
    """
    status, data = imap.search(None, _duplicateQuery(msg))
    if status != 'OK':
        sys.stderr.write("duplicate check failed for {0}: {1}\n".format(msg['message-id'], data))
        return False
    return bool(data and data[0])


def _importMessage(imap, msg, mailbox, mailbox_encoded):
    """Append one mbox message to the mailbox and copy its flags over."""
    if not _selectOrCreate(imap, mailbox, mailbox_encoded):
        return
    # skip possibly duplicated msgs (the search is not needed when forcing)
    if not args.force and _isDuplicate(imap, msg):
        print("skipping "+mailbox+": '"+headerToUnicode(msg['subject'])[:20]+"' (mid: "+str(msg['message-id'])+")")
        return
    # The message is appended without flags and with the current time as
    # its internal date; the flags are set afterwards through UID STORE.
    r = imap.append(mailbox_encoded, '', imaplib.Time2Internaldate(time.time()), str(msg))
    if r[0] != 'OK':
        sys.stderr.write("failed to import {0} ({1}): {2}\n".format(msg['message-id'], msg['date'], r[1]))
        return
    # The UID of the new message is taken from the APPENDUID response code.
    # Without it there is nothing valid to pass to UID STORE.
    found = re.search(r'APPENDUID \d+ (\d+)', r[1][0] or '')
    if found is None:
        sys.stderr.write("no APPENDUID in response for {0}, flags not set: {1}\n".format(msg['message-id'], r[1]))
        return
    num = found.group(1)
    r = imap.uid('STORE', num, "FLAGS", imapFlagsFromMbox(msg.get_flags()))
    if r[0] != 'OK':
        sys.stderr.write("failed to set flags for msg {0} in {1}\n".format(num, mailbox))


def main():
    """Import every 'mbox' file found below the given base folders into IMAP.

    Connects over SSL and logs in with the credentials from ``args``, then
    walks each base folder. Every directory containing a file named 'mbox'
    is mapped to a mailbox name (see mailboxFromPath) and its messages are
    appended one by one. The connection is logged out even if the import
    is interrupted by an exception.
    """
    imap = imaplib.IMAP4_SSL(args.imap_server)
    try:
        imap.login(args.imap_user, args.imap_passwd)
        imap.select()
        for base in args.folder:
            print("importing folder "+base)
            for root, dirs, files in os.walk(base):
                if 'mbox' not in files:
                    continue
                # folder names on disk are byte strings in the -c charset
                folder = unicode(os.path.relpath(root, base), args.charset)
                mailbox = mailboxFromPath(folder)
                print(u'importing mbox in {0} to {1}'.format(folder, mailbox))
                mailbox_encoded = utf7encode(mailbox)
                m = mbox(os.path.join(root, 'mbox'), create=False)
                try:
                    for msg in m:
                        _importMessage(imap, msg, mailbox, mailbox_encoded)
                finally:
                    m.close()
    finally:
        imap.logout()
# Run the import only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Another useful automation is using the mappings file, which looks something like
{
"Deleted items": "Trash",
"Sent items": "Sent"
}
to automatically import into the right destinations, in case you have a standardized structure.