4 started from free license script posted on stackexchange:
5 https://softwarerecs.stackexchange.com/questions/28138/converter-that-converts-eml-files-to-plain-text?newreg=48fb884924fd4d909777e5fccc8af2eb
7 Chris Koeritz: added processing to fix headers and to turn html parts into text 2023-11-08.
13 from bs4
import BeautifulSoup
14 from email
import message_from_file, header
17 return os.path.exists(os.path.join(path, f).replace(
"\\",
"/"))
20 file =
open(os.path.join(path, fn).replace(
"\\",
"/"),
"wb")
31 if s.startswith(
"'")
and s.endswith(
"'"):
return s[1:-1]
32 if s.startswith(
'"')
and s.endswith(
'"'):
return s[1:-1]
37 if s.startswith(
"<")
and s.endswith(
">"):
return s[1:-1]
45 if not m.is_multipart():
49 Files[fn] = (cfn,
None)
51 save_file(cfn, m.get_payload(decode=
True))
52 return Text, Html, Files, 1
53 cp = m.get_content_type()
56 Text += m.get_payload(decode=
True).decode(
"utf-8")
59 Text += m.get_payload(decode=
True).decode(
"cp437")
61 print(
"failed to process text attachment with either utf-8 or cp437 code pages.")
65 soup = BeautifulSoup(m.get_payload(decode=
True).decode(
"utf-8"), features=
"html.parser")
66 Html += soup.get_text(
'\n', strip=
True)
69 soup = BeautifulSoup(m.get_payload(decode=
True).decode(
"cp437"), features=
"html.parser")
70 Html += soup.get_text(
'\n', strip=
True)
72 print(
"failed to process html attachment with either utf-8 or cp437 code pages.")
75 cp = m.get(
"content-type")
76 try: id =
disgra(m.get(
"content-id"))
79 if o==-1:
return Text, Html, Files, 1
87 save_file(cfn, m.get_payload(decode=
True))
88 return Text, Html, Files, 1
95 Text += t; Html += h; Files.update(f); Parts += p
97 return Text, Html, Files, Parts
100 m = message_from_file(msgfile)
101 From, To, Subject, Date =
caption(m)
102 Text, Html, Files, Parts =
pullout(m, key)
103 Text = Text.strip(); Html = Html.strip()
104 msg = {
"subject": Subject,
"from": From,
"to": To,
"date": Date,
105 "text": Text,
"html": Html,
"parts": Parts}
106 if Files: msg[
"files"] = Files
110 return str(header.make_header(header.decode_header(h)))
114 if "date" in origin: Date =
clean_header(origin[
"date"]).strip()
116 if "from" in origin: From =
clean_header(origin[
"from"]).strip()
118 if "to" in origin: To =
clean_header(origin[
"to"]).strip()
120 if "subject" in origin: Subject =
clean_header(origin[
"subject"]).strip()
121 return From, To, Subject, Date
123 if __name__ ==
"__main__":
126 startdirname =
"Email"
128 for i
in range(10000000):
129 if os.path.exists(startdirname + str(num)) ==
False:
130 os.makedirs(
"Email" + str(num))
136 for i
in os.listdir(
"."):
137 if i.endswith(
".eml") ==
True:
139 path =
"./" + startdirname + str(num) +
"/" + nam
141 os.makedirs(
"./" + startdirname + str(num) +
"/" + nam)
149 froms = emailDict[
"from"]
150 tos = emailDict[
"to"]
151 subject = emailDict[
"subject"]
152 parts = emailDict[
"parts"]
153 date = emailDict[
"date"]
154 txt = emailDict[
"text"]
155 html = emailDict[
"html"]
158 if "files" in emailDict:
159 for i
in emailDict[
"files"]:
162 textFile +=
"From: " + froms +
"\n"
163 textFile +=
"To: " + tos +
"\n"
164 textFile +=
"Subject: " + subject +
"\n"
165 textFile +=
"Date: " + date +
"\n\n"
166 textFile +=
"Files: " +
", ".join(files) +
"\n"
167 textFile +=
"Parts: " + str(parts) +
"\n\n"
168 textFile +=
"Text:\n\n" + txt +
"\n\n"
169 textFile +=
"HTML:\n\n" + html
172 wf =
open(
"./" + startdirname + str(num) +
"/" + nam +
"/" +
"txt_" + nam +
".txt",
"w")
def construct_name(id, fn)
def extract(msgfile, key)