scripts/email/eml_to_txt.py

   1 #! /usr/bin/env python3
   2
   3 """
   4 started from free license script posted on stackexchange:
   5 https://softwarerecs.stackexchange.com/questions/28138/converter-that-converts-eml-files-to-plain-text?newreg=48fb884924fd4d909777e5fccc8af2eb
   6
   7 Chris Koeritz: added processing to fix headers and to turn html parts into text 2023-11-08.
   8 """
   9
  10 #hmmm: would be nice to add the ability to specify a directory; script assumes current dir.
  11
  12 import os
  13 from bs4 import BeautifulSoup
  14 from email import message_from_file, header
  15
  16 def file_exists (f):
  17     return os.path.exists(os.path.join(path, f).replace("\\","/"))
  18
  19 def save_file (fn, cont):
  20     file = open(os.path.join(path, fn).replace("\\","/"), "wb")
  21     file.write(cont)
  22     file.close()
  23
  24 def construct_name (id, fn):
  25     id = id.split(".")
  26     id = id[0]+id[1]
  27     return id+"."+fn
  28
  29 def disqo (s):
  30     s = s.strip()
  31     if s.startswith("'") and s.endswith("'"): return s[1:-1]
  32     if s.startswith('"') and s.endswith('"'): return s[1:-1]
  33     return s
  34
  35 def disgra (s):
  36     s = s.strip()
  37     if s.startswith("<") and s.endswith(">"): return s[1:-1]
  38     return s
  39
  40 def pullout (m, key):
  41     Html = ""
  42     Text = ""
  43     Files = {}
  44     Parts = 0
  45     if not m.is_multipart():
  46         if m.get_filename():
  47             fn = m.get_filename()
  48             cfn = construct_name(key, fn)
  49             Files[fn] = (cfn, None)
  50             if file_exists(cfn): return Text, Html, Files, 1
  51             save_file(cfn, m.get_payload(decode=True))
  52             return Text, Html, Files, 1
  53         cp = m.get_content_type()
  54         if cp=="text/plain":
  55             Text += m.get_payload(decode=True).decode("utf-8")
  56         elif cp=="text/html":
  57             soup = BeautifulSoup(m.get_payload(decode=True).decode("utf-8"), features="html.parser")
  58             Html += soup.get_text('\n', strip=True)
  59         else:
  60             cp = m.get("content-type")
  61             try: id = disgra(m.get("content-id"))
  62             except: id = None
  63             o = cp.find("name=")
  64             if o==-1: return Text, Html, Files, 1
  65             ox = cp.find(";", o)
  66             if ox==-1: ox = None
  67             o += 5; fn = cp[o:ox]
  68             fn = disqo(fn)
  69             cfn = construct_name(key, fn)
  70             Files[fn] = (cfn, id)
  71             if file_exists(cfn): return Text, Html, Files, 1
  72             save_file(cfn, m.get_payload(decode=True))
  73         return Text, Html, Files, 1
  74     y = 0
  75     while 1:
  76         try:
  77             pl = m.get_payload(y)
  78         except: break
  79         t, h, f, p = pullout(pl, key)
  80         Text += t; Html += h; Files.update(f); Parts += p
  81         y += 1
  82     return Text, Html, Files, Parts
  83
  84 def extract (msgfile, key):
  85     m = message_from_file(msgfile)
  86     From, To, Subject, Date = caption(m)
  87     Text, Html, Files, Parts = pullout(m, key)
  88     Text = Text.strip(); Html = Html.strip()
  89     msg = {"subject": Subject, "from": From, "to": To, "date": Date,
  90         "text": Text, "html": Html, "parts": Parts}
  91     if Files: msg["files"] = Files
  92     return msg
  93
  94 def clean_header(h):
  95     return str(header.make_header(header.decode_header(h)))
  96
  97 def caption (origin):
  98     Date = ""
  99     if "date" in origin: Date = clean_header(origin["date"]).strip()
 100     From = ""
 101     if "from" in origin: From = clean_header(origin["from"]).strip()
 102     To = ""
 103     if "to" in origin: To = clean_header(origin["to"]).strip()
 104     Subject = ""
 105     if "subject" in origin: Subject = clean_header(origin["subject"]).strip()
 106     return From, To, Subject, Date
 107
 108 if __name__ == "__main__":
 109     global path
 110
 111     startdirname = "Email"
 112     num = 1
 113     for i in range(10000000):
 114         if os.path.exists(startdirname + str(num)) == False:
 115             os.makedirs("Email" + str(num))
 116             break
 117         else:
 118             num += 1
 119
 120
 121     for i in os.listdir("."):
 122         if i.endswith(".eml") == True:
 123             nam = i[:-4]
 124             path = "./" + startdirname + str(num) + "/" + nam
 125
 126             os.makedirs("./" + startdirname + str(num) + "/" + nam)
 127
 128             f = open(i, "r")
 129             emailDict = extract(f, f.name)
 130             f.close()
 131
 132             textFile = ""
 133
 134             froms = emailDict["from"]
 135             tos = emailDict["to"]
 136             subject = emailDict["subject"]
 137             parts = emailDict["parts"]
 138             date = emailDict["date"]
 139             txt = emailDict["text"]
 140             html = emailDict["html"]
 141
 142             files = []
 143             if "files" in emailDict:
 144                 for i in emailDict["files"]:
 145                     files.append(i)
 146
 147             textFile += "From: " + froms + "\n"
 148             textFile += "To: " + tos + "\n"
 149             textFile += "Subject: " + subject + "\n"
 150             textFile += "Date: " + date + "\n\n"
 151             textFile += "Files: " + ", ".join(files) + "\n"
 152             textFile += "Parts: " + str(parts) + "\n\n"
 153             textFile += "Text:\n\n" + txt + "\n\n"
 154             textFile += "HTML:\n\n" + html
 155
 156
 157             wf = open("./" + startdirname + str(num) + "/" + nam + "/" + "txt_" + nam + ".txt", "w")
 158             wf.write(textFile)
 159             wf.close()
 160