1 #! /usr/bin/env python3
Started from a freely licensed script posted on Stack Exchange:
5 https://softwarerecs.stackexchange.com/questions/28138/converter-that-converts-eml-files-to-plain-text?newreg=48fb884924fd4d909777e5fccc8af2eb
Chris Koeritz: added processing to fix headers and to turn HTML parts into text, 2023-11-08.
10 #hmmm: would be nice to add the ability to specify a directory; script assumes current dir.
13 from bs4 import BeautifulSoup
14 from email import message_from_file, header
17 return os.path.exists(os.path.join(path, f).replace("\\","/"))
def save_file(fn, cont):
    """Write the bytes in cont to the file named fn inside the output folder.

    fn is joined onto the module-level ``path`` value (assigned by the main
    script for each message), and any backslashes in the joined result are
    replaced with forward slashes before the file is opened in binary write
    mode.
    """
    target = os.path.join(path, fn).replace("\\", "/")
    # The context manager closes the handle even when the write raises,
    # so a failed write cannot leak an open file.
    with open(target, "wb") as out:
        out.write(cont)
24 def construct_name (id, fn):
31 if s.startswith("'") and s.endswith("'"): return s[1:-1]
32 if s.startswith('"') and s.endswith('"'): return s[1:-1]
37 if s.startswith("<") and s.endswith(">"): return s[1:-1]
45 if not m.is_multipart():
48 cfn = construct_name(key, fn)
49 Files[fn] = (cfn, None)
50 if file_exists(cfn): return Text, Html, Files, 1
51 save_file(cfn, m.get_payload(decode=True))
52 return Text, Html, Files, 1
53 cp = m.get_content_type()
55 Text += m.get_payload(decode=True).decode("utf-8")
57 soup = BeautifulSoup(m.get_payload(decode=True).decode("utf-8"), features="html.parser")
58 Html += soup.get_text('\n', strip=True)
60 cp = m.get("content-type")
61 try: id = disgra(m.get("content-id"))
64 if o==-1: return Text, Html, Files, 1
69 cfn = construct_name(key, fn)
71 if file_exists(cfn): return Text, Html, Files, 1
72 save_file(cfn, m.get_payload(decode=True))
73 return Text, Html, Files, 1
79 t, h, f, p = pullout(pl, key)
80 Text += t; Html += h; Files.update(f); Parts += p
82 return Text, Html, Files, Parts
def extract(msgfile, key):
    """Parse an open message file and return its contents as a dictionary.

    msgfile is an open file object handed to message_from_file; key is
    passed through to pullout unchanged.

    The returned dictionary always holds the keys "subject", "from", "to",
    "date", "text", "html" and "parts".  A "files" entry is added only when
    pullout reported at least one file.  The text and html values have
    leading and trailing whitespace stripped.
    """
    message = message_from_file(msgfile)
    sender, recipient, subject, date = caption(message)
    text, html, files, parts = pullout(message, key)

    msg = {
        "subject": subject,
        "from": sender,
        "to": recipient,
        "date": date,
        "text": text.strip(),
        "html": html.strip(),
        "parts": parts,
    }
    # Messages without any saved files get no "files" key at all; the main
    # script checks for the key's presence before listing files.
    if files:
        msg["files"] = files
    # The caller subscripts the result, so the dictionary must be returned.
    return msg
95 return str(header.make_header(header.decode_header(h)))
99 if "date" in origin: Date = clean_header(origin["date"]).strip()
101 if "from" in origin: From = clean_header(origin["from"]).strip()
103 if "to" in origin: To = clean_header(origin["to"]).strip()
105 if "subject" in origin: Subject = clean_header(origin["subject"]).strip()
106 return From, To, Subject, Date
108 if __name__ == "__main__":
111 startdirname = "Email"
113 for i in range(10000000):
114 if os.path.exists(startdirname + str(num)) == False:
115 os.makedirs("Email" + str(num))
121 for i in os.listdir("."):
122 if i.endswith(".eml") == True:
124 path = "./" + startdirname + str(num) + "/" + nam
126 os.makedirs("./" + startdirname + str(num) + "/" + nam)
129 emailDict = extract(f, f.name)
134 froms = emailDict["from"]
135 tos = emailDict["to"]
136 subject = emailDict["subject"]
137 parts = emailDict["parts"]
138 date = emailDict["date"]
139 txt = emailDict["text"]
140 html = emailDict["html"]
143 if "files" in emailDict:
144 for i in emailDict["files"]:
147 textFile += "From: " + froms + "\n"
148 textFile += "To: " + tos + "\n"
149 textFile += "Subject: " + subject + "\n"
150 textFile += "Date: " + date + "\n\n"
151 textFile += "Files: " + ", ".join(files) + "\n"
152 textFile += "Parts: " + str(parts) + "\n\n"
153 textFile += "Text:\n\n" + txt + "\n\n"
154 textFile += "HTML:\n\n" + html
157 wf = open("./" + startdirname + str(num) + "/" + nam + "/" + "txt_" + nam + ".txt", "w")