#!/usr/bin/python

# TODO: host aliases are not handled yet

import logging
import os
import sys
import email
import copy
import string
from optparse import OptionParser
import email.header
import gdbm
import mailbox
import ConfigParser
import re
import hashlib

# Name of this filter; used to derive file names and the X- header names.
filter_name = 'deja-vu'
# Prefix of the per-user dot files (config and database).
file_prefix = '.%s' % filter_name
# Header fields whose values take part in the matching.
wanted_header_fields = set([
    'Content-Type',
    'MIME-Version',
    'Content-Transfer-Encoding',
    'Subject',
    'From',
    'X-Mailer',
])
# Header fields written by this filter itself; stripped before processing.
unwanted_header_fields = set(
    'X-%s%s' % (filter_name, suffix) for suffix in ('-digest', '', '-line')
)
# Regular expressions whose matches are replaced by 'x' before hashing;
# filled in by config_read().
unify_patterns = []

# Program description; handed to OptionParser as its ``description`` argument.
desc="""%prog is a filter for cron generated mail.

it expects mails which only differ in small amounts.
Matching is done, by matching line by line against already learned mails.

If a mail is matched, the header
X-deja-vu is set to yes.
Otherwise the header is set to, no and if specified the header X-deja-vu-line will 
contain lines of the mail which did not match.

"""

def config_read(config):
    """Load the unify patterns from the configuration file.

    Reads the file named by the global ``options.config_filename`` and appends
    every value of its ``[unify]`` section to the module-level
    ``unify_patterns`` list.  If the file is missing, cannot be parsed or has
    no ``[unify]`` section, the single default pattern ``[0123456789]`` is
    appended instead.

    The ``config`` argument is accepted for backward compatibility only; it
    is ignored and a fresh parser is always created.
    """
    cfg = ConfigParser.RawConfigParser()
    try:
        # read() silently skips files it cannot open; the missing section is
        # then reported by options() below.
        cfg.read(options.config_filename)
        # Collect first so that a failure half-way through cannot leave a mix
        # of configured patterns and the default pattern behind.
        patterns = [cfg.get('unify', key) for key in cfg.options('unify')]
    except ConfigParser.Error:
        # Only configuration problems trigger the fallback; anything else
        # (KeyboardInterrupt, programming errors) is allowed to propagate.
        logging.info("no config file found, use default config")
        patterns = ['[0123456789]']
    unify_patterns.extend(patterns)

def clean_mail_header(msg):
    """Strip the header fields written by this filter from *msg*.

    Every field named in ``unwanted_header_fields`` is deleted (all of its
    occurrences).  The message is modified in place and also returned.
    """
    for key in unwanted_header_fields:
        # Header names are case-insensitive.  ``key in msg`` honours that,
        # whereas a lookup in ``msg.keys()`` would miss e.g. ``x-deja-vu``.
        if key in msg:
            logging.debug("remove header field from mail: " + key)
            del msg[key]
    return msg

def clean_mail(msg):
    """Return a cleaned deep copy of *msg*; the original message is left untouched."""
    duplicate = copy.deepcopy(msg)
    cleaned = clean_mail_header(duplicate)
    return cleaned

def unify_string(s):
    """Replace every match of each pattern in ``unify_patterns`` with ``'x'``.

    The patterns are applied one after another, in list order, each on the
    result of the previous substitution.
    """
    unified = s
    for pattern in unify_patterns:
        unified = re.compile(pattern).sub('x', unified)
    return unified

def unify_header_line(header_line):
    """Return *header_line* with all spaces, tabs, CRs and LFs removed.

    This makes a header value independent of how it was folded across lines.
    Only these four characters are dropped; other whitespace (e.g. vertical
    tab) is kept.  Works for both byte strings and unicode strings.
    """
    # A single regex pass replaces the former "translate to spaces, then
    # strip spaces" sequence, which relied on string.maketrans/translate and
    # therefore failed on unicode input.
    return re.sub(r'[\n\r\t ]', '', header_line)
    

def flatten_mail_header(msg):
    """Concatenate the unified values of the wanted header fields of *msg*.

    The fields are visited in the iteration order of the
    ``wanted_header_fields`` set.  A field only contributes if its name occurs
    in ``msg.keys()`` with exactly this spelling; its value is passed through
    unify_header_line() before being appended.
    """
    present = msg.keys()
    pieces = []
    for field in wanted_header_fields:
        if field not in present:
            continue
        pieces.append(unify_header_line(msg[field]))
    return "".join(pieces)

def flatten_mail_body(msg,recurse=0):
    """Return the payload of *msg*, with the parts of a multipart mail concatenated.

    Multipart messages are walked recursively and the payloads of their parts
    are joined in order without any separator.  *recurse* is the current
    nesting depth; once it exceeds 10 an empty string is returned.
    """
    # TODO error handling
    # Failsafe against excessively deep nesting.
    if recurse > 10:
        return ""
    if not msg.is_multipart():
        return msg.get_payload()
    return "".join(
        flatten_mail_body(part, recurse + 1) for part in msg.get_payload()
    )

def flatten_mail(msg):
    """Return the unified text of *msg*: flattened body followed by flattened header."""
    body_text = flatten_mail_body(msg)
    header_text = flatten_mail_header(msg)
    return unify_string(body_text + header_text)


# Command line interface.  Parsing happens at import time; the result is kept
# in the module-level ``options`` / ``args``.
parser = OptionParser(usage="%prog <options>", version="%prog 0.1",description=desc)
parser.add_option("-m", "--mail", dest="mail_filename",metavar="<filename of mail>",
                  help="test against mail from filename")
# The default config lives in the user's home directory.  expanduser() is used
# instead of os.getenv('HOME') so that an unset HOME does not raise TypeError.
parser.add_option("-c", "--config", dest="config_filename",metavar="<config>",
                  help="config",
                  default=os.path.join(os.path.expanduser('~'), file_prefix+".cfg"))
parser.add_option("-a", "--add", dest="add",metavar="<Maildir>",
                  help="build database from maildir")
parser.add_option("-s", "--show", dest="show_header_lines",metavar="<number of lines>",type="int",
                  help="show first <num> of not matched lines in header")
parser.add_option("-d", "--debug", dest="debug",action="store_true",
                  help="show debug output")
(options, args) = parser.parse_args()

# Configure logging before anything else can log: the first logging call on
# an unconfigured root logger installs a default handler, after which
# basicConfig() is a no-op and --debug would silently have no effect.
if options.debug:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.WARNING)

config=""
config_read(config)
# Database of learned line digests, kept in the user's home directory.
# expanduser() also works when HOME is not set in the environment.
db_filename = os.path.join(os.path.expanduser('~'), file_prefix+'.dbm')
    
if (options.add):
    db = gdbm.open(db_filename,'n')
    for message in mailbox.Maildir(options.add,factory=None):
    	message_out = clean_mail_header(message)
        logging.debug("process mail msgid: " + message_out['Message-Id'])
        for line in flatten_mail(clean_mail(message_out)).splitlines():
            db[hashlib.sha256(line).hexdigest()] = '1'
    db.close
else:
    if options.mail_filename:
        f = open(options.mail_filename, "r")
        try:
            message = email.message_from_file(f)
        except:
            logging.error("file could not be read")
            sys.exit()
            
        f.close()
    else:
        try:
            message = email.message_from_file(sys.stdin)
        except:
            logging.error("pipe canceled")
            sys.exit()

    message_out = clean_mail_header(message)
    db = gdbm.open(db_filename,'r')

    match = True
    for line in flatten_mail(clean_mail(message_out)).splitlines():
        if not hashlib.sha256(line).hexdigest() in db:
            logging.debug("not match:" + line)
            if options.show_header_lines:
                message_out['X-'+filter_name+'-line'] = line
                options.show_header_lines = options.show_header_lines - 1
            match = False
        else:
            logging.debug("    match:" + line)

    if match:
        message_out['X-'+filter_name] = 'yes'
    else:
        message_out['X-'+filter_name] = 'no'

    logging.debug("------------ OUTPUT MAIL START ------------------")
    print message_out.as_string(False),
    logging.debug("------------ OUTPUT MAIL END ------------------")
    db.close

# vim:set et:
# vim:set ts=4:
# vim:set shiftwidth=4:
