#!/usr/bin/env python3

"""
started from free license script posted on stackexchange:
https://softwarerecs.stackexchange.com/questions/28138/converter-that-converts-eml-files-to-plain-text?newreg=48fb884924fd4d909777e5fccc8af2eb

Chris Koeritz: added processing to fix headers and to turn html parts into text 2023-11-08.
"""

#hmmm: would be nice to add the ability to specify a directory; script assumes current dir.
import os
from email import message_from_file, header

from bs4 import BeautifulSoup
def file_exists(f):
    """Return True if file *f* already exists inside the output directory.

    NOTE(review): relies on the module-global `path` assigned in the
    __main__ block (the per-email output directory) — confirm callers
    always set it first.  Backslashes are normalized so Windows-style
    names joined by os.path work uniformly.
    """
    return os.path.exists(os.path.join(path, f).replace("\\", "/"))
def save_file(fn, cont):
    """Write binary content *cont* to file *fn* under the global `path` dir.

    Uses a context manager so the handle is closed even if the write
    raises (the pasted original opened the file without a guaranteed
    close, and shadowed the builtin name `file`).
    """
    with open(os.path.join(path, fn).replace("\\", "/"), "wb") as out:
        out.write(cont)
def construct_name(id, fn):
    """Build a unique on-disk name for an attachment *fn* of message *id*.

    The first two dot-separated pieces of the message key (typically the
    .eml file name, e.g. "mail.eml" -> "maileml") are fused and prefixed
    to the attachment name: construct_name("mail.eml", "a.pdf") ->
    "maileml.a.pdf".

    Generalized: a key without a dot no longer raises IndexError — the
    whole key is used as the prefix.  (Parameter name `id` kept for
    caller compatibility even though it shadows the builtin.)
    """
    pieces = id.split(".")
    stem = pieces[0] + pieces[1] if len(pieces) > 1 else pieces[0]
    return stem + "." + fn
def disqo(s):
    """Strip one matching pair of surrounding single or double quotes.

    The input is whitespace-trimmed first; an unmatched or lone quote is
    returned unchanged (guards the 1-character edge where startswith and
    endswith both match the same quote).
    """
    s = s.strip()
    if len(s) > 1 and s.startswith("'") and s.endswith("'"):
        return s[1:-1]
    if len(s) > 1 and s.startswith('"') and s.endswith('"'):
        return s[1:-1]
    return s
def disgra(s):
    """Strip surrounding angle brackets from a Content-ID-style token.

    Whitespace is trimmed first; input without a matching <...> pair is
    returned as-is.
    """
    s = s.strip()
    if len(s) > 1 and s.startswith("<") and s.endswith(">"):
        return s[1:-1]
    return s
def pullout(m, key):
    """Recursively extract the content of email.message.Message *m*.

    *key* is the caller-supplied message key (the .eml file name) used to
    build unique attachment names.

    Returns a 4-tuple (Text, Html, Files, Parts):
      Text  -- concatenated text/plain bodies
      Html  -- concatenated text/html bodies reduced to plain text
      Files -- dict: original attachment name -> (saved name, content-id)
      Parts -- count of leaf parts processed

    Attachments are saved to disk via save_file (into the global `path`
    directory) unless a file of that name already exists.
    NOTE(review): reconstructed from a partially garbled listing —
    verify branch details against the original script.
    """
    Text = ""
    Html = ""
    Files = {}
    Parts = 0
    if not m.is_multipart():
        if m.get_filename():
            # Leaf part with an explicit file name: a plain attachment.
            fn = m.get_filename()
            cfn = construct_name(key, fn)
            Files[fn] = (cfn, None)
            if file_exists(cfn):
                return Text, Html, Files, 1
            save_file(cfn, m.get_payload(decode=True))
            return Text, Html, Files, 1
        cp = m.get_content_type()
        if cp == "text/plain":
            # Try utf-8 first; cp437 is a last-ditch fallback for legacy mail.
            try:
                Text += m.get_payload(decode=True).decode("utf-8")
            except UnicodeDecodeError:
                try:
                    Text += m.get_payload(decode=True).decode("cp437")
                except UnicodeDecodeError:
                    print("failed to process text attachment with either utf-8 or cp437 code pages.")
        elif cp == "text/html":
            try:
                soup = BeautifulSoup(m.get_payload(decode=True).decode("utf-8"), features="html.parser")
                Html += soup.get_text('\n', strip=True)
            except UnicodeDecodeError:
                try:
                    soup = BeautifulSoup(m.get_payload(decode=True).decode("cp437"), features="html.parser")
                    Html += soup.get_text('\n', strip=True)
                except UnicodeDecodeError:
                    print("failed to process html attachment with either utf-8 or cp437 code pages.")
        else:
            # Some other leaf part: try to treat it as a named attachment
            # by parsing name=... out of the raw Content-Type header.
            cp = m.get("content-type")
            try:
                cid = disgra(m.get("content-id"))
            except Exception:
                cid = None
            o = cp.find("name=")
            if o == -1:
                return Text, Html, Files, 1
            ox = cp.find(";", o)
            if ox == -1:
                ox = None
            fn = disqo(cp[o + 5:ox])
            cfn = construct_name(key, fn)
            Files[fn] = (cfn, cid)
            if file_exists(cfn):
                return Text, Html, Files, 1
            save_file(cfn, m.get_payload(decode=True))
            return Text, Html, Files, 1
        return Text, Html, Files, 1
    # Multipart container: recurse into each sub-part and accumulate.
    for part in m.get_payload():
        t, h, f, p = pullout(part, key)
        Text += t; Html += h; Files.update(f); Parts += p
    return Text, Html, Files, Parts
def extract(msgfile, key):
    """Parse open .eml file object *msgfile* and return its contents.

    *key* (the file's name) seeds attachment naming in pullout.

    Returns a dict with keys "subject", "from", "to", "date", "text",
    "html", "parts", plus "files" only when attachments were found.
    """
    m = message_from_file(msgfile)
    From, To, Subject, Date = caption(m)
    Text, Html, Files, Parts = pullout(m, key)
    Text = Text.strip()
    Html = Html.strip()
    msg = {"subject": Subject, "from": From, "to": To, "date": Date,
           "text": Text, "html": Html, "parts": Parts}
    if Files:
        msg["files"] = Files
    return msg
def clean_header(h):
    """Decode RFC 2047 encoded-words in header value *h* to readable text.

    decode_header splits the raw value into (bytes, charset) chunks;
    make_header reassembles them, and str() yields the final unicode.
    """
    return str(header.make_header(header.decode_header(h)))
def caption(origin):
    """Return (From, To, Subject, Date) of message *origin*, decoded.

    Each value is RFC-2047-decoded via clean_header and stripped; a
    header missing from the message yields "" (without the defaults a
    missing header would leave the local unbound at the return).
    """
    From = ""
    To = ""
    Subject = ""
    Date = ""
    if "date" in origin:
        Date = clean_header(origin["date"]).strip()
    if "from" in origin:
        From = clean_header(origin["from"]).strip()
    if "to" in origin:
        To = clean_header(origin["to"]).strip()
    if "subject" in origin:
        Subject = clean_header(origin["subject"]).strip()
    return From, To, Subject, Date
if __name__ == "__main__":
    # Pick a fresh output directory: Email0, Email1, ... first unused name.
    # (The pasted original hard-coded "Email" in makedirs instead of
    # reusing startdirname — kept consistent here.)
    startdirname = "Email"
    num = 0
    for _ in range(10000000):
        if not os.path.exists(startdirname + str(num)):
            os.makedirs(startdirname + str(num))
            break
        num += 1
    # Convert every .eml file in the current directory.
    for eml in os.listdir("."):
        if not eml.endswith(".eml"):
            continue
        nam = eml[:-4]
        # `path` is the module-global that file_exists/save_file use to
        # place this email's attachments.
        path = "./" + startdirname + str(num) + "/" + nam
        os.makedirs(path)
        with open(eml, "r") as f:
            emailDict = extract(f, f.name)
        files = []
        if "files" in emailDict:
            # Distinct loop variable: the original reused `i` here,
            # shadowing the outer file-listing loop's variable.
            for attachment in emailDict["files"]:
                files.append(attachment)
        froms = emailDict["from"]
        tos = emailDict["to"]
        subject = emailDict["subject"]
        parts = emailDict["parts"]
        date = emailDict["date"]
        txt = emailDict["text"]
        html = emailDict["html"]
        # Assemble the plain-text rendering of the message.
        textFile = ""
        textFile += "From: " + froms + "\n"
        textFile += "To: " + tos + "\n"
        textFile += "Subject: " + subject + "\n"
        textFile += "Date: " + date + "\n\n"
        textFile += "Files: " + ", ".join(files) + "\n"
        textFile += "Parts: " + str(parts) + "\n\n"
        textFile += "Text:\n\n" + txt + "\n\n"
        textFile += "HTML:\n\n" + html
        with open(path + "/" + "txt_" + nam + ".txt", "w") as wf:
            wf.write(textFile)