converter script, actually works, for EML format to text format
author Fred T. Hamster <fred@gruntose.com>
Wed, 8 Nov 2023 21:22:23 +0000 (16:22 -0500)
committer Fred T. Hamster <fred@gruntose.com>
Wed, 8 Nov 2023 21:22:23 +0000 (16:22 -0500)
not perfect yet, but will pull the text out of EML format, plus will now pull out the html content as text also.

scripts/email/eml_to_txt.py [new file with mode: 0644]

diff --git a/scripts/email/eml_to_txt.py b/scripts/email/eml_to_txt.py
new file mode 100644 (file)
index 0000000..6c4eb1a
--- /dev/null
@@ -0,0 +1,160 @@
+#! /usr/bin/env python3
+
+"""
+started from free license script posted on stackexchange:
+https://softwarerecs.stackexchange.com/questions/28138/converter-that-converts-eml-files-to-plain-text?newreg=48fb884924fd4d909777e5fccc8af2eb
+
+Chris Koeritz: added processing to fix headers and to turn html parts into text 2023-11-08.
+"""
+
+#hmmm: would be nice to add the ability to specify a directory; script assumes current dir.
+
+import os
+from bs4 import BeautifulSoup
+from email import message_from_file, header
+
+def file_exists (f):
+    """Report whether the name `f` already exists under the output directory.
+
+    The directory is taken from the module-level global `path`, which the
+    __main__ section assigns before any message is processed.
+    """
+    candidate = os.path.join(path, f)
+    # backslashes are rewritten as forward slashes before the lookup.
+    candidate = candidate.replace("\\", "/")
+    return os.path.exists(candidate)
+
+def save_file (fn, cont):
+    """Write the bytes `cont` to the file named `fn` under the output directory.
+
+    The directory is taken from the module-level global `path`.  The file is
+    opened in binary mode, so `cont` must be a bytes-like object.
+    """
+    target = os.path.join(path, fn).replace("\\", "/")
+    # the with-block closes the handle even when write() raises (for example
+    # when `cont` is None), which the open/write/close sequence did not.
+    with open(target, "wb") as out:
+        out.write(cont)
+
+def construct_name (id, fn):
+    """Build the on-disk name for an attachment.
+
+    `id` is the message key (the .eml file name in this script); its first two
+    dot-separated pieces are glued together and used as a prefix, followed by
+    a dot and the attachment name `fn`.
+
+    A key without any dot is used as the prefix unchanged instead of raising
+    IndexError.  Because `fn` comes from the mail itself, only its final path
+    component is kept so that a name such as "../../x" or "dir\\x" cannot point
+    outside (or into a missing subdirectory of) the output directory.
+    """
+    pieces = id.split(".")
+    prefix = "".join(pieces[:2])
+    # treat both slash styles as separators before taking the last component.
+    safe_fn = os.path.basename(fn.replace("\\", "/"))
+    return prefix + "." + safe_fn
+
+def disqo (s):
+    """Trim surrounding whitespace from `s`, then drop one pair of enclosing quotes.
+
+    Single quotes are checked before double quotes; a string that is not
+    wrapped in a matching pair is returned trimmed but otherwise unchanged.
+    """
+    trimmed = s.strip()
+    for quote in ("'", '"'):
+        if trimmed.startswith(quote) and trimmed.endswith(quote):
+            return trimmed[1:-1]
+    return trimmed
+
+def disgra (s):
+    """Trim surrounding whitespace from `s`, then drop enclosing angle brackets.
+
+    Only a string that both starts with "<" and ends with ">" loses its first
+    and last character; anything else is returned trimmed.
+    """
+    trimmed = s.strip()
+    bracketed = trimmed.startswith("<") and trimmed.endswith(">")
+    return trimmed[1:-1] if bracketed else trimmed
+
+def _decode_text (m):
+    """Return the decoded payload of leaf part `m` as str, honoring its charset."""
+    raw = m.get_payload(decode=True) or b""
+    charset = m.get_content_charset() or "utf-8"
+    try:
+        return raw.decode(charset, errors="replace")
+    except LookupError:
+        # the mail declared a charset python does not know; fall back to utf-8.
+        return raw.decode("utf-8", errors="replace")
+
+
+def pullout (m, key):
+    """Recursively collect text, html-as-text and attachments from message `m`.
+
+    `key` is handed to construct_name() to build attachment file names.
+
+    Returns a tuple (Text, Html, Files, Parts):
+      Text  - concatenated text/plain bodies.
+      Html  - concatenated text extracted from text/html bodies.
+      Files - dict mapping the attachment name found in the mail to a tuple
+              (saved file name, content-id or None).
+      Parts - number of leaf (non-multipart) parts visited.
+
+    Attachments are written with save_file() unless file_exists() reports the
+    target name is already present.
+    """
+    Html = ""
+    Text = ""
+    Files = {}
+
+    if m.is_multipart():
+        Parts = 0
+        # a multipart payload is a list of sub-messages; walk it directly
+        # rather than indexing until an exception is raised.
+        for part in m.get_payload():
+            t, h, f, p = pullout(part, key)
+            Text += t; Html += h; Files.update(f); Parts += p
+        return Text, Html, Files, Parts
+
+    fn = m.get_filename()
+    if fn:
+        cfn = construct_name(key, fn)
+        Files[fn] = (cfn, None)
+        if not file_exists(cfn):
+            save_file(cfn, m.get_payload(decode=True))
+        return Text, Html, Files, 1
+
+    cp = m.get_content_type()
+    if cp == "text/plain":
+        # decode with the part's declared charset; bytes that are invalid in
+        # that charset are replaced instead of aborting the whole conversion.
+        Text += _decode_text(m)
+    elif cp == "text/html":
+        soup = BeautifulSoup(_decode_text(m), features="html.parser")
+        Html += soup.get_text('\n', strip=True)
+    else:
+        # look for a name= parameter in the raw content-type header; the
+        # header can be absent, in which case there is nothing to save.
+        cp = m.get("content-type") or ""
+        cid = m.get("content-id")
+        cid = disgra(str(cid)) if cid is not None else None
+        o = cp.find("name=")
+        if o == -1:
+            return Text, Html, Files, 1
+        ox = cp.find(";", o)
+        if ox == -1:
+            ox = None
+        fn = disqo(cp[o + 5:ox])
+        cfn = construct_name(key, fn)
+        Files[fn] = (cfn, cid)
+        if not file_exists(cfn):
+            save_file(cfn, m.get_payload(decode=True))
+    return Text, Html, Files, 1
+
+def extract (msgfile, key):
+    """Parse the open message file `msgfile` and return its contents as a dict.
+
+    The dict always carries "subject", "from", "to", "date", "text", "html"
+    and "parts"; a "files" entry is added only when pullout() reported
+    attachments.  `key` is forwarded to pullout().
+    """
+    parsed = message_from_file(msgfile)
+    sender, recipient, subject_line, sent_on = caption(parsed)
+    plain, rich, attachments, part_count = pullout(parsed, key)
+    msg = {
+        "subject": subject_line,
+        "from": sender,
+        "to": recipient,
+        "date": sent_on,
+        "text": plain.strip(),
+        "html": rich.strip(),
+        "parts": part_count,
+    }
+    if attachments:
+        msg["files"] = attachments
+    return msg
+
+def clean_header(h):
+    """Return header value `h` as a plain string with encoded words decoded."""
+    decoded_chunks = header.decode_header(h)
+    rebuilt = header.make_header(decoded_chunks)
+    return str(rebuilt)
+
+def caption (origin):
+    """Return the (From, To, Subject, Date) header values of message `origin`.
+
+    Each value is passed through clean_header() and stripped; a header that is
+    not present yields an empty string.
+    """
+    def field (name):
+        if name in origin:
+            return clean_header(origin[name]).strip()
+        return ""
+
+    # headers are looked up in the order date, from, to, subject.
+    Date, From, To, Subject = (field(n) for n in ("date", "from", "to", "subject"))
+    return From, To, Subject, Date
+
+if __name__ == "__main__":
+    # Converts every .eml file in the current directory into a text summary.
+    # Output goes to a fresh directory Email<N>/<message name>/, where <N> is
+    # the first number for which Email<N> does not exist yet.
+    #
+    # `path` is assigned at module level below; file_exists() and save_file()
+    # read it as a global to know where attachments for the current message go.
+
+    startdirname = "Email"
+    num = 1
+    while os.path.exists(startdirname + str(num)):
+        num += 1
+    outroot = startdirname + str(num)
+    os.makedirs(outroot)
+
+    for emlname in os.listdir("."):
+        if not emlname.endswith(".eml"):
+            continue
+        nam = emlname[:-4]
+        path = "./" + outroot + "/" + nam
+        os.makedirs(path)
+
+        # ascii + surrogateescape keeps any 8-bit bytes in the message intact
+        # (the same way the email package wraps binary input), instead of
+        # failing or mis-decoding them with the locale's encoding.
+        with open(emlname, "r", encoding="ascii", errors="surrogateescape") as f:
+            emailDict = extract(f, f.name)
+
+        # attachment names as found in the mail; empty when there were none.
+        files = list(emailDict.get("files", ()))
+
+        textFile = "".join([
+            "From: " + emailDict["from"] + "\n",
+            "To: " + emailDict["to"] + "\n",
+            "Subject: " + emailDict["subject"] + "\n",
+            "Date: " + emailDict["date"] + "\n\n",
+            "Files: " + ", ".join(files) + "\n",
+            "Parts: " + str(emailDict["parts"]) + "\n\n",
+            "Text:\n\n" + emailDict["text"] + "\n\n",
+            "HTML:\n\n" + emailDict["html"],
+        ])
+
+        # write utf-8 regardless of locale; errors="replace" keeps a stray
+        # undecodable character from aborting the whole run.
+        with open(path + "/" + "txt_" + nam + ".txt", "w", encoding="utf-8", errors="replace") as wf:
+            wf.write(textFile)
+