#!/usr/bin/env python3

"""
started from free license script posted on stackexchange:
https://softwarerecs.stackexchange.com/questions/28138/converter-that-converts-eml-files-to-plain-text?newreg=48fb884924fd4d909777e5fccc8af2eb

Chris Koeritz: added processing to fix headers and to turn html parts into text 2023-11-08.
"""

#hmmm: would be nice to add the ability to specify a directory; script assumes current dir.
import os
from email import message_from_file, header

from bs4 import BeautifulSoup
def file_exists(f):
    """Return True if file *f* already exists inside the output directory.

    NOTE(review): relies on the module-global `path` assigned in the
    __main__ block (the per-email output directory) — confirm callers
    always set it first.  Backslashes are normalized so Windows-style
    names joined by os.path work uniformly.
    """
    return os.path.exists(os.path.join(path, f).replace("\\", "/"))
def save_file(fn, cont):
    """Write binary content *cont* to file *fn* under the global `path` dir.

    Uses a context manager so the handle is closed even if the write
    raises (the pasted original opened the file without a guaranteed
    close, and shadowed the builtin name `file`).
    """
    with open(os.path.join(path, fn).replace("\\", "/"), "wb") as out:
        out.write(cont)
def construct_name(id, fn):
    """Build a unique on-disk name for an attachment *fn* of message *id*.

    The first two dot-separated pieces of the message key (typically the
    .eml file name, e.g. "mail.eml" -> "maileml") are fused and prefixed
    to the attachment name: construct_name("mail.eml", "a.pdf") ->
    "maileml.a.pdf".

    Generalized: a key without a dot no longer raises IndexError — the
    whole key is used as the prefix.  (Parameter name `id` kept for
    caller compatibility even though it shadows the builtin.)
    """
    pieces = id.split(".")
    stem = pieces[0] + pieces[1] if len(pieces) > 1 else pieces[0]
    return stem + "." + fn
def disqo(s):
    """Strip one matching pair of surrounding single or double quotes.

    The input is whitespace-trimmed first; an unmatched or lone quote is
    returned unchanged (guards the 1-character edge where startswith and
    endswith both match the same quote).
    """
    s = s.strip()
    if len(s) > 1 and s.startswith("'") and s.endswith("'"):
        return s[1:-1]
    if len(s) > 1 and s.startswith('"') and s.endswith('"'):
        return s[1:-1]
    return s
def disgra(s):
    """Strip surrounding angle brackets from a Content-ID-style token.

    Whitespace is trimmed first; input without a matching <...> pair is
    returned as-is.
    """
    s = s.strip()
    if len(s) > 1 and s.startswith("<") and s.endswith(">"):
        return s[1:-1]
    return s
def pullout(m, key):
    """Recursively extract the content of email.message.Message *m*.

    *key* is the caller-supplied message key (the .eml file name) used to
    build unique attachment names.

    Returns a 4-tuple (Text, Html, Files, Parts):
      Text  -- concatenated text/plain bodies
      Html  -- concatenated text/html bodies reduced to plain text
      Files -- dict: original attachment name -> (saved name, content-id)
      Parts -- count of leaf parts processed

    Attachments are saved to disk via save_file (into the global `path`
    directory) unless a file of that name already exists.
    NOTE(review): reconstructed from a partially garbled listing —
    verify branch details against the original script.
    """
    Text = ""
    Html = ""
    Files = {}
    Parts = 0
    if not m.is_multipart():
        if m.get_filename():
            # Leaf part with an explicit file name: a plain attachment.
            fn = m.get_filename()
            cfn = construct_name(key, fn)
            Files[fn] = (cfn, None)
            if file_exists(cfn):
                return Text, Html, Files, 1
            save_file(cfn, m.get_payload(decode=True))
            return Text, Html, Files, 1
        cp = m.get_content_type()
        if cp == "text/plain":
            # Try utf-8 first; cp437 is a last-ditch fallback for legacy mail.
            try:
                Text += m.get_payload(decode=True).decode("utf-8")
            except UnicodeDecodeError:
                try:
                    Text += m.get_payload(decode=True).decode("cp437")
                except UnicodeDecodeError:
                    print("failed to process text attachment with either utf-8 or cp437 code pages.")
        elif cp == "text/html":
            try:
                soup = BeautifulSoup(m.get_payload(decode=True).decode("utf-8"), features="html.parser")
                Html += soup.get_text('\n', strip=True)
            except UnicodeDecodeError:
                try:
                    soup = BeautifulSoup(m.get_payload(decode=True).decode("cp437"), features="html.parser")
                    Html += soup.get_text('\n', strip=True)
                except UnicodeDecodeError:
                    print("failed to process html attachment with either utf-8 or cp437 code pages.")
        else:
            # Some other leaf part: try to treat it as a named attachment
            # by parsing name=... out of the raw Content-Type header.
            cp = m.get("content-type")
            try:
                cid = disgra(m.get("content-id"))
            except Exception:
                cid = None
            o = cp.find("name=")
            if o == -1:
                return Text, Html, Files, 1
            ox = cp.find(";", o)
            if ox == -1:
                ox = None
            fn = disqo(cp[o + 5:ox])
            cfn = construct_name(key, fn)
            Files[fn] = (cfn, cid)
            if file_exists(cfn):
                return Text, Html, Files, 1
            save_file(cfn, m.get_payload(decode=True))
            return Text, Html, Files, 1
        return Text, Html, Files, 1
    # Multipart container: recurse into each sub-part and accumulate.
    for part in m.get_payload():
        t, h, f, p = pullout(part, key)
        Text += t; Html += h; Files.update(f); Parts += p
    return Text, Html, Files, Parts
def extract(msgfile, key):
    """Parse open .eml file object *msgfile* and return its contents.

    *key* (the file's name) seeds attachment naming in pullout.

    Returns a dict with keys "subject", "from", "to", "date", "text",
    "html", "parts", plus "files" only when attachments were found.
    """
    m = message_from_file(msgfile)
    From, To, Subject, Date = caption(m)
    Text, Html, Files, Parts = pullout(m, key)
    Text = Text.strip()
    Html = Html.strip()
    msg = {"subject": Subject, "from": From, "to": To, "date": Date,
           "text": Text, "html": Html, "parts": Parts}
    if Files:
        msg["files"] = Files
    return msg
def clean_header(h):
    """Decode RFC 2047 encoded-words in header value *h* to readable text.

    decode_header splits the raw value into (bytes, charset) chunks;
    make_header reassembles them, and str() yields the final unicode.
    """
    return str(header.make_header(header.decode_header(h)))
def caption(origin):
    """Return (From, To, Subject, Date) of message *origin*, decoded.

    Each value is RFC-2047-decoded via clean_header and stripped; a
    header missing from the message yields "" (without the defaults a
    missing header would leave the local unbound at the return).
    """
    From = ""
    To = ""
    Subject = ""
    Date = ""
    if "date" in origin:
        Date = clean_header(origin["date"]).strip()
    if "from" in origin:
        From = clean_header(origin["from"]).strip()
    if "to" in origin:
        To = clean_header(origin["to"]).strip()
    if "subject" in origin:
        Subject = clean_header(origin["subject"]).strip()
    return From, To, Subject, Date
if __name__ == "__main__":
    # Pick a fresh output directory: Email0, Email1, ... first unused name.
    # (The pasted original hard-coded "Email" in makedirs instead of
    # reusing startdirname — kept consistent here.)
    startdirname = "Email"
    num = 0
    for _ in range(10000000):
        if not os.path.exists(startdirname + str(num)):
            os.makedirs(startdirname + str(num))
            break
        num += 1
    # Convert every .eml file in the current directory.
    for eml in os.listdir("."):
        if not eml.endswith(".eml"):
            continue
        nam = eml[:-4]
        # `path` is the module-global that file_exists/save_file use to
        # place this email's attachments.
        path = "./" + startdirname + str(num) + "/" + nam
        os.makedirs(path)
        with open(eml, "r") as f:
            emailDict = extract(f, f.name)
        files = []
        if "files" in emailDict:
            # Distinct loop variable: the original reused `i` here,
            # shadowing the outer file-listing loop's variable.
            for attachment in emailDict["files"]:
                files.append(attachment)
        froms = emailDict["from"]
        tos = emailDict["to"]
        subject = emailDict["subject"]
        parts = emailDict["parts"]
        date = emailDict["date"]
        txt = emailDict["text"]
        html = emailDict["html"]
        # Assemble the plain-text rendering of the message.
        textFile = ""
        textFile += "From: " + froms + "\n"
        textFile += "To: " + tos + "\n"
        textFile += "Subject: " + subject + "\n"
        textFile += "Date: " + date + "\n\n"
        textFile += "Files: " + ", ".join(files) + "\n"
        textFile += "Parts: " + str(parts) + "\n\n"
        textFile += "Text:\n\n" + txt + "\n\n"
        textFile += "HTML:\n\n" + html
        with open(path + "/" + "txt_" + nam + ".txt", "w") as wf:
            wf.write(textFile)