From: Fred T. Hamster <fred@gruntose.com>
Date: Wed, 8 Nov 2023 21:22:23 +0000 (-0500)
Subject: converter script, actually works, for EML format to text format
X-Git-Url: https://feistymeow.org/gitweb/?p=feisty_meow.git;a=commitdiff_plain;h=7536c669e045ca3d432d146603cb531badac767f

converter script, actually works, for EML format to text format

Not perfect yet, but it pulls the plain text out of EML files and now also extracts the HTML content as text.
---

diff --git a/scripts/email/eml_to_txt.py b/scripts/email/eml_to_txt.py
new file mode 100644
index 00000000..6c4eb1a3
--- /dev/null
+++ b/scripts/email/eml_to_txt.py
@@ -0,0 +1,191 @@
+#! /usr/bin/env python3
+
+"""
+started from free license script posted on stackexchange:
+https://softwarerecs.stackexchange.com/questions/28138/converter-that-converts-eml-files-to-plain-text?newreg=48fb884924fd4d909777e5fccc8af2eb
+
+Chris Koeritz: added processing to fix headers and to turn html parts into text 2023-11-08.
+"""
+
+#hmmm: would be nice to add the ability to specify a directory; script assumes current dir.
+
+import os
+from bs4 import BeautifulSoup
+from email import message_from_file, header, errors
+
+# folder that attachments are written into; main() points this at a
+# per-message folder before each message is processed.
+path = "."
+
+def file_exists (f):
+    """Returns True if the name 'f' already exists inside the output folder 'path'."""
+    return os.path.exists(os.path.join(path, f).replace("\\","/"))
+
+def save_file (fn, cont):
+    """Writes the bytes 'cont' into the file named 'fn' inside the output folder 'path'."""
+    # the with-block closes the file even when the write fails.
+    with open(os.path.join(path, fn).replace("\\","/"), "wb") as out:
+        out.write(cont)
+
+def construct_name (id, fn):
+    """
+    Builds the saved file name for the attachment 'fn' of the message keyed by 'id'.
+
+    The first two dot-separated pieces of 'id' are glued together as a prefix,
+    so ("note.eml", "a.png") gives "noteeml.a.png".
+    """
+    # joining a slice (rather than indexing [0] and [1]) tolerates a key with no dot.
+    prefix = "".join(id.split(".")[:2])
+    # the attachment name comes from the message itself and cannot be trusted:
+    # keep only its last path component so "../x" cannot escape the output folder.
+    safe = os.path.basename(fn.replace("\\", "/")) or "attachment"
+    return prefix + "." + safe
+
+def disqo (s):
+    """Strips whitespace and then one pair of matching single or double quotes from 's'."""
+    s = s.strip()
+    if s.startswith("'") and s.endswith("'"): return s[1:-1]
+    if s.startswith('"') and s.endswith('"'): return s[1:-1]
+    return s
+
+def disgra (s):
+    """Strips whitespace and then one pair of surrounding angle brackets from 's'."""
+    s = s.strip()
+    if s.startswith("<") and s.endswith(">"): return s[1:-1]
+    return s
+
+def decode_text (m):
+    """Returns the body of the text part 'm' as a string, using the charset the part declares."""
+    raw = m.get_payload(decode=True) or b""
+    try:
+        return raw.decode(m.get_content_charset() or "utf-8", errors="replace")
+    except LookupError:
+        # the part names a charset python does not know; utf-8 is the best guess left.
+        return raw.decode("utf-8", errors="replace")
+
+def pullout (m, key):
+    """
+    Walks the message 'm' recursively, collecting its text and saving its attachments.
+
+    Returns the tuple (Text, Html, Files, Parts): the plain text bodies, the html
+    bodies converted to text, a dictionary mapping each attachment's original name
+    to (saved name, content-id or None), and the count of non-multipart parts seen.
+    'key' is the message key that construct_name() turns into a file name prefix.
+    """
+    Html = ""
+    Text = ""
+    Files = {}
+    Parts = 0
+    if m.is_multipart():
+        # the payload of a multipart message is a list of sub-messages.
+        for pl in m.get_payload():
+            t, h, f, p = pullout(pl, key)
+            Text += t; Html += h; Files.update(f); Parts += p
+        return Text, Html, Files, Parts
+    fn = m.get_filename()
+    if fn:
+        # any part that carries a file name is treated as an attachment.
+        cfn = construct_name(key, fn)
+        Files[fn] = (cfn, None)
+        # do not overwrite a file that is already in the output folder.
+        if not file_exists(cfn): save_file(cfn, m.get_payload(decode=True))
+        return Text, Html, Files, 1
+    cp = m.get_content_type()
+    if cp=="text/plain":
+        Text += decode_text(m)
+    elif cp=="text/html":
+        soup = BeautifulSoup(decode_text(m), features="html.parser")
+        Html += soup.get_text('\n', strip=True)
+    else:
+        # unnamed part of some other type: look for a name= parameter by hand.
+        cp = str(m.get("content-type", ""))
+        cid = m.get("content-id")
+        if cid is not None: cid = disgra(str(cid))
+        o = cp.find("name=")
+        if o==-1: return Text, Html, Files, 1
+        ox = cp.find(";", o)
+        if ox==-1: ox = None
+        o += 5; fn = cp[o:ox]
+        fn = disqo(fn)
+        cfn = construct_name(key, fn)
+        Files[fn] = (cfn, cid)
+        if not file_exists(cfn): save_file(cfn, m.get_payload(decode=True))
+    return Text, Html, Files, 1
+
+def extract (msgfile, key):
+    """
+    Parses the open text file 'msgfile' as an email and returns its pieces in a dictionary.
+
+    The dictionary has the keys "subject", "from", "to", "date", "text", "html" and
+    "parts", plus "files" when the message had attachments.  'key' is handed on to
+    pullout() for naming the saved attachments.
+    """
+    m = message_from_file(msgfile)
+    From, To, Subject, Date = caption(m)
+    Text, Html, Files, Parts = pullout(m, key)
+    Text = Text.strip(); Html = Html.strip()
+    msg = {"subject": Subject, "from": From, "to": To, "date": Date,
+        "text": Text, "html": Html, "parts": Parts}
+    if Files: msg["files"] = Files
+    return msg
+
+def clean_header(h):
+    """Decodes any RFC 2047 encoded words in the header 'h' and returns plain text."""
+    try:
+        # raw 8-bit header text carries no charset and is reported as
+        # "unknown-8bit"; utf-8 is the likeliest encoding, so try that.
+        chunks = [(text, "utf-8" if cs == "unknown-8bit" else cs)
+            for text, cs in header.decode_header(h)]
+        return str(header.make_header(chunks))
+    except (LookupError, UnicodeError, errors.HeaderParseError):
+        # a malformed header should not sink the whole message; show it undecoded.
+        return str(h)
+
+def caption (origin):
+    """Returns (From, To, Subject, Date) of the message 'origin'; a missing header gives an empty string."""
+    def field (name):
+        return clean_header(origin[name]).strip() if name in origin else ""
+    return field("from"), field("to"), field("subject"), field("date")
+
+def main ():
+    """Converts each .eml file in the current directory to text inside a new EmailN folder."""
+    global path
+
+    startdirname = "Email"
+    # claim the first EmailN folder name that is not in use yet.
+    num = 1
+    while os.path.exists(startdirname + str(num)):
+        num += 1
+    outdir = "./" + startdirname + str(num)
+    os.makedirs(outdir)
+
+    for emlname in os.listdir("."):
+        if not emlname.endswith(".eml") or not os.path.isfile(emlname): continue
+        nam = emlname[:-4]
+        path = outdir + "/" + nam
+        os.makedirs(path)
+
+        # read the way email.parser.BytesParser does: ascii with surrogateescape
+        # carries 8-bit bytes through intact, so each part can be decoded with its
+        # own declared charset instead of failing on the locale's encoding.
+        with open(emlname, "r", encoding="ascii", errors="surrogateescape") as f:
+            emailDict = extract(f, f.name)
+
+        files = list(emailDict.get("files", {}))
+
+        textFile = "From: " + emailDict["from"] + "\n"
+        textFile += "To: " + emailDict["to"] + "\n"
+        textFile += "Subject: " + emailDict["subject"] + "\n"
+        textFile += "Date: " + emailDict["date"] + "\n\n"
+        textFile += "Files: " + ", ".join(files) + "\n"
+        textFile += "Parts: " + str(emailDict["parts"]) + "\n\n"
+        textFile += "Text:\n\n" + emailDict["text"] + "\n\n"
+        textFile += "HTML:\n\n" + emailDict["html"]
+
+        # utf-8 output can hold any character the message decoded to, whatever the locale.
+        with open(path + "/" + "txt_" + nam + ".txt", "w", encoding="utf-8", errors="replace") as wf:
+            wf.write(textFile)
+
+if __name__ == "__main__":
+    main()
+