Fix to also handle Windows-encoded emails.
[feisty_meow.git] / scripts / email / eml_to_txt.py
1 #! /usr/bin/env python3
2
3 """
4 started from free license script posted on stackexchange:
5 https://softwarerecs.stackexchange.com/questions/28138/converter-that-converts-eml-files-to-plain-text?newreg=48fb884924fd4d909777e5fccc8af2eb
6
7 Chris Koeritz: added processing to fix headers and to turn html parts into text 2023-11-08.
8 """
9
10 #hmmm: would be nice to add the ability to specify a directory; script assumes current dir.
11
12 import os
13 from bs4 import BeautifulSoup
14 from email import message_from_file, header
15
def file_exists (f):
    """Report whether the name f already exists inside the output directory.

    The directory is taken from the module-level global `path`, which the
    script's main section sets before any message is processed.
    """
    candidate = os.path.join(path, f)
    # normalize any backslash separators to forward slashes before the check.
    normalized = candidate.replace("\\", "/")
    return os.path.exists(normalized)
18
def save_file (fn, cont):
    """Write the bytes in cont to the file named fn inside the output directory.

    The directory is taken from the module-level global `path`.  The file is
    opened in binary mode, so cont must be a bytes-like object.
    """
    target = os.path.join(path, fn).replace("\\", "/")
    # the with-block guarantees the handle is closed even if the write fails.
    with open(target, "wb") as out:
        out.write(cont)
23
def construct_name (id, fn):
    """Build the on-disk name for an attachment.

    id is the key of the message (the script passes the .eml file name) and
    fn is the attachment's own name.  The first two dot-separated pieces of
    id are glued together and used as a prefix, e.g. ("msg.eml", "a.txt")
    gives "msgeml.a.txt".  An id without any dot is used whole rather than
    raising IndexError.
    """
    pieces = id.split(".")
    # joining at most two pieces covers both the dotted and the dot-less case.
    prefix = "".join(pieces[:2])
    return prefix + "." + fn
28
def disqo (s):
    """Strip surrounding whitespace and one pair of matching quotes from s.

    Both single and double quotes are recognized; a string that is not
    wrapped in the same quote character on both ends is only trimmed.
    """
    trimmed = s.strip()
    for quote in ("'", '"'):
        if trimmed.startswith(quote) and trimmed.endswith(quote):
            return trimmed[1:-1]
    return trimmed
34
35 def disgra (s):
36     s = s.strip()
37     if s.startswith("<") and s.endswith(">"): return s[1:-1]
38     return s
39
def _decode_body (m, kind):
    """Return the decoded text of the non-multipart message part m.

    The charset declared on the part is tried first, so bodies in e.g.
    windows-1252 or iso-8859-1 come out right; utf-8 and then cp437 remain
    as fallbacks for parts with a missing, unknown or wrong declaration.
    kind ("text" or "html") is only used in the failure message.
    """
    raw = m.get_payload(decode=True)
    if raw is not None:
        declared = m.get_content_charset()
        codecs_to_try = ([declared] if declared else []) + ["utf-8", "cp437"]
        for codec in codecs_to_try:
            try:
                return raw.decode(codec)
            except (LookupError, ValueError):
                # unknown codec name or undecodable bytes: try the next one.
                continue
    print("failed to process " + kind + " attachment with either utf-8 or cp437 code pages.")
    raise SystemExit(1)

def _flatten_name (fn):
    """Return fn with every path separator replaced by an underscore.

    Attachment names come from the email itself, so they must not be able
    to point outside (or into a missing subdirectory of) the output folder.
    """
    return fn.replace("\\", "/").replace("/", "_")

def pullout (m, key):
    """Collect the text, html and attachments found in the message m.

    m is an email message object and key identifies the message; key is
    passed to construct_name() to prefix the names of saved attachments.
    Multipart messages are walked recursively.

    Returns a tuple (Text, Html, Files, Parts):
      Text  - concatenated text/plain bodies.
      Html  - concatenated text/html bodies, reduced to plain text.
      Files - dict mapping attachment name to (saved name, content-id or None).
      Parts - number of leaf (non-multipart) parts visited.

    Attachments are written with save_file() unless a file of the same
    saved name already exists.
    """
    Text = ""
    Html = ""
    Files = {}

    if m.is_multipart():
        Parts = 0
        # a multipart payload is a list of sub-messages; recurse into each.
        for sub in m.get_payload():
            t, h, f, p = pullout(sub, key)
            Text += t
            Html += h
            Files.update(f)
            Parts += p
        return Text, Html, Files, Parts

    # a leaf part carrying a filename is treated as an attachment.
    fn = m.get_filename()
    if fn:
        cfn = construct_name(key, _flatten_name(fn))
        Files[fn] = (cfn, None)
        if not file_exists(cfn):
            save_file(cfn, m.get_payload(decode=True))
        return Text, Html, Files, 1

    ctype = m.get_content_type()
    if ctype == "text/plain":
        Text += _decode_body(m, "text")
    elif ctype == "text/html":
        soup = BeautifulSoup(_decode_body(m, "html"), features="html.parser")
        Html += soup.get_text('\n', strip=True)
    else:
        # other leaf types are only saved when the content-type header
        # carries a name= parameter to use as the attachment name.
        raw_ctype = str(m.get("content-type", ""))
        start = raw_ctype.find("name=")
        if start == -1:
            return Text, Html, Files, 1
        end = raw_ctype.find(";", start)
        if end == -1:
            end = None
        fn = disqo(raw_ctype[start + 5:end])
        cid = m.get("content-id")
        if cid is not None:
            cid = disgra(str(cid))
        cfn = construct_name(key, _flatten_name(fn))
        Files[fn] = (cfn, cid)
        if not file_exists(cfn):
            save_file(cfn, m.get_payload(decode=True))
    return Text, Html, Files, 1
98
def extract (msgfile, key):
    """Parse the open email file msgfile and summarize it as a dict.

    key is handed to pullout(), which uses it to name saved attachments.
    The result has the keys "subject", "from", "to", "date", "text", "html"
    and "parts"; a "files" key is added only when attachments were found.
    """
    message = message_from_file(msgfile)
    sender, recipient, subject, date = caption(message)
    plain, html, attachments, part_count = pullout(message, key)
    summary = {
        "subject": subject,
        "from": sender,
        "to": recipient,
        "date": date,
        "text": plain.strip(),
        "html": html.strip(),
        "parts": part_count,
    }
    if attachments:
        summary["files"] = attachments
    return summary
108
def clean_header(h):
    """Return the header value h as readable text.

    RFC 2047 encoded words (e.g. "=?utf-8?q?caf=C3=A9?=") are decoded.  If
    the header names an unknown charset or holds bytes that do not decode,
    the raw header text is returned instead of aborting the whole run.
    """
    from email.errors import HeaderParseError

    try:
        return str(header.make_header(header.decode_header(h)))
    except (HeaderParseError, LookupError, UnicodeError):
        # a malformed header should not stop the conversion; keep it as-is.
        return str(h)
111
def caption (origin):
    """Pull the main headers out of the message origin.

    Returns the tuple (From, To, Subject, Date).  Each present header is run
    through clean_header() and stripped; a header that is absent yields "".
    """
    # headers are read in the order date, from, to, subject.
    wanted = ("date", "from", "to", "subject")
    found = {}
    for name in wanted:
        if name in origin:
            found[name] = clean_header(origin[name]).strip()
        else:
            found[name] = ""
    return found["from"], found["to"], found["subject"], found["date"]
122
if __name__ == "__main__":
    # converts every .eml file in the current directory into a text summary,
    # stored (with any attachments) under a fresh "Email<N>/<name>/" folder.

    # pick the first "Email<N>" directory name that is not taken yet.
    startdirname = "Email"
    num = 1
    while os.path.exists(startdirname + str(num)):
        num += 1
    outroot = startdirname + str(num)
    os.makedirs(outroot)

    for eml in os.listdir("."):
        if not eml.endswith(".eml"):
            continue
        nam = eml[:-4]

        # 'path' is the module-level global that file_exists() and
        # save_file() read to locate the current message's output folder.
        path = "./" + outroot + "/" + nam
        os.makedirs(path)

        # ascii + surrogateescape keeps every raw byte of the file intact
        # (this is how the email package itself parses binary input), so
        # 8-bit bodies in any code page reach the decoder unharmed instead
        # of failing or being altered by the locale's text decoding.
        with open(eml, "r", encoding="ascii", errors="surrogateescape") as f:
            emailDict = extract(f, f.name)

        files = list(emailDict.get("files", ()))

        textFile = "".join([
            "From: " + emailDict["from"] + "\n",
            "To: " + emailDict["to"] + "\n",
            "Subject: " + emailDict["subject"] + "\n",
            "Date: " + emailDict["date"] + "\n\n",
            "Files: " + ", ".join(files) + "\n",
            "Parts: " + str(emailDict["parts"]) + "\n\n",
            "Text:\n\n" + emailDict["text"] + "\n\n",
            "HTML:\n\n" + emailDict["html"],
        ])

        # write utf-8 explicitly so the output does not depend on the
        # locale; stray unencodable characters are escaped, not fatal.
        outname = path + "/" + "txt_" + nam + ".txt"
        with open(outname, "w", encoding="utf-8", errors="backslashreplace") as wf:
            wf.write(textFile)
175