feisty meow concerns codebase  2.140
eml_to_txt.py
Go to the documentation of this file.
1 #! /usr/bin/env python3
2 
3 """
4 started from free license script posted on stackexchange:
5 https://softwarerecs.stackexchange.com/questions/28138/converter-that-converts-eml-files-to-plain-text?newreg=48fb884924fd4d909777e5fccc8af2eb
6 
7 Chris Koeritz: added processing to fix headers and to turn html parts into text 2023-11-08.
8 """
9 
10 #hmmm: would be nice to add the ability to specify a directory; script assumes current dir.
11 
12 import os
13 from bs4 import BeautifulSoup
14 from email import message_from_file, header
15 
def file_exists(f):
    """Tell whether a file called `f` is already present under the output dir.

    The directory is taken from the module-level global `path`, which the
    main script sets before each message is processed.
    """
    candidate = os.path.join(path, f)
    # backslashes are flipped to forward slashes before the lookup.
    candidate = candidate.replace("\\", "/")
    return os.path.exists(candidate)
18 
def save_file(fn, cont):
    """Write the bytes `cont` to the file `fn` inside the output directory.

    The directory is taken from the module-level global `path`.  The file is
    opened in binary mode, so `cont` must be a bytes-like object.
    """
    target = os.path.join(path, fn).replace("\\", "/")
    # the context manager guarantees the handle is closed even if the
    # write itself raises.
    with open(target, "wb") as out:
        out.write(cont)
23 
def construct_name(id, fn):
    """Build the on-disk name for an attachment.

    `id` is the key of the message the attachment belongs to (the main script
    passes the name of the .eml file) and `fn` is the attachment's own name.
    The first two dot-separated pieces of `id` are glued together and used as
    a prefix, e.g. ("mail.eml", "pic.png") -> "maileml.pic.png".

    A key without any "." used to raise IndexError; it is now used whole as
    the prefix.
    """
    pieces = id.split(".")
    # slicing (instead of indexing [0] and [1]) tolerates a key with no dot.
    prefix = "".join(pieces[:2])
    return prefix + "." + fn
28 
def disqo(s):
    """Strip surrounding whitespace and one pair of enclosing quotes from `s`.

    Single quotes are checked before double quotes; the string must both
    start and end with the same quote character for it to be removed.
    """
    trimmed = s.strip()
    for quote in ("'", '"'):
        if trimmed.startswith(quote) and trimmed.endswith(quote):
            return trimmed[1:-1]
    return trimmed
34 
def disgra(s):
    """Strip surrounding whitespace and enclosing angle brackets from `s`.

    The brackets are removed only when `s` starts with "<" and ends with ">".
    """
    trimmed = s.strip()
    bracketed = trimmed.startswith("<") and trimmed.endswith(">")
    return trimmed[1:-1] if bracketed else trimmed
39 
def _decode_part(m, kind):
    """Decode the payload of the non-multipart part `m` into a str.

    The charset declared on the part (if any) is tried first, then utf-8,
    then cp437.  `kind` ("text" or "html") is only used in the failure
    message.  If nothing works the failure is printed and the script exits
    with status 1.
    """
    raw = m.get_payload(decode=True)
    encodings = ["utf-8", "cp437"]
    declared = m.get_content_charset()
    if declared and declared not in encodings:
        encodings.insert(0, declared)
    if raw is not None:
        for enc in encodings:
            try:
                return raw.decode(enc)
            except (LookupError, ValueError):
                # LookupError: unknown codec name; ValueError covers
                # UnicodeDecodeError.  Either way, try the next candidate.
                continue
    print("failed to process " + kind + " attachment with either utf-8 or cp437 code pages.")
    raise SystemExit(1)


def _store_attachment(m, key, fn, cid, files):
    """Record attachment `fn` in `files` and save it unless already on disk."""
    cfn = construct_name(key, fn)
    files[fn] = (cfn, cid)
    if not file_exists(cfn):
        save_file(cfn, m.get_payload(decode=True))


def pullout(m, key):
    """Recursively collect text, html-as-text and attachments from message `m`.

    `key` is handed to construct_name() to build the stored attachment names.

    Returns a tuple (Text, Html, Files, Parts):
      Text  - concatenated text/plain bodies.
      Html  - concatenated text/html bodies, converted to text by BeautifulSoup.
      Files - dict mapping attachment name to (stored file name, content id);
              the content id is None when the part has none.
      Parts - number of non-multipart parts visited.
    """
    Html = ""
    Text = ""
    Files = {}

    if m.is_multipart():
        # a multipart payload is a list of sub-messages; visit each in order.
        Parts = 0
        for part in m.get_payload():
            t, h, f, p = pullout(part, key)
            Text += t
            Html += h
            Files.update(f)
            Parts += p
        return Text, Html, Files, Parts

    # a part that carries a filename is always treated as an attachment.
    fn = m.get_filename()
    if fn:
        _store_attachment(m, key, fn, None, Files)
        return Text, Html, Files, 1

    ctype = m.get_content_type()
    if ctype == "text/plain":
        Text = _decode_part(m, "text")
    elif ctype == "text/html":
        soup = BeautifulSoup(_decode_part(m, "html"), features="html.parser")
        Html = soup.get_text('\n', strip=True)
    else:
        # any other type is stored only if its content-type header carries
        # a name= parameter; otherwise the part is just counted.
        raw_ctype = str(m.get("content-type", ""))
        raw_cid = m.get("content-id")
        cid = disgra(raw_cid) if isinstance(raw_cid, str) else None
        start = raw_ctype.find("name=")
        if start != -1:
            stop = raw_ctype.find(";", start)
            if stop == -1:
                stop = None
            fn = disqo(raw_ctype[start + 5:stop])
            _store_attachment(m, key, fn, cid, Files)
    return Text, Html, Files, 1
98 
def extract(msgfile, key):
    """Parse the open message file `msgfile` and summarize it as a dict.

    The dict holds the cleaned "subject", "from", "to" and "date" headers,
    the stripped "text" and "html" bodies and the "parts" count; a "files"
    entry is added only when pullout() reported attachments.  `key` is
    forwarded to pullout().
    """
    m = message_from_file(msgfile)
    sender, recipient, subject, when = caption(m)
    body_text, body_html, attachments, part_count = pullout(m, key)
    msg = {
        "subject": subject,
        "from": sender,
        "to": recipient,
        "date": when,
        "text": body_text.strip(),
        "html": body_html.strip(),
        "parts": part_count,
    }
    if attachments:
        msg["files"] = attachments
    return msg
108 
def clean_header(h):
    """Return header value `h` as a plain str with encoded words decoded.

    The value is split by header.decode_header(), reassembled with
    header.make_header() and converted to str.
    """
    return str(header.make_header(header.decode_header(h)))
111 
def caption(origin):
    """Fetch the cleaned From, To, Subject and Date headers of `origin`.

    Each header present in `origin` is passed through clean_header() and
    stripped; a missing header yields "".  Returns the tuple
    (From, To, Subject, Date).
    """
    cleaned = {}
    for name in ("date", "from", "to", "subject"):
        if name in origin:
            cleaned[name] = clean_header(origin[name]).strip()
        else:
            cleaned[name] = ""
    return cleaned["from"], cleaned["to"], cleaned["subject"], cleaned["date"]
122 
def main():
    """Convert every .eml file in the current directory to a text summary.

    A fresh output directory "Email<N>" is created (N is the first number for
    which no such directory exists).  For each "<name>.eml" a sub-directory
    "<name>" is made; attachments are saved there by extract() and a summary
    is written to "txt_<name>.txt".
    """
    # file_exists() and save_file() read this module-level variable.
    global path

    startdirname = "Email"
    num = 1
    while os.path.exists(startdirname + str(num)):
        num += 1
    outroot = startdirname + str(num)
    os.makedirs(outroot)

    for entry in os.listdir("."):
        if not entry.endswith(".eml"):
            continue
        nam = entry[:-4]
        path = "./" + outroot + "/" + nam
        os.makedirs(path)

        # NOTE(review): the message is read in text mode with the default
        # encoding; a file holding bytes invalid in that encoding will raise
        # here - confirm whether binary parsing is wanted.
        with open(entry, "r") as f:
            emailDict = extract(f, f.name)

        files = list(emailDict.get("files", ()))

        textFile = "".join([
            "From: " + emailDict["from"] + "\n",
            "To: " + emailDict["to"] + "\n",
            "Subject: " + emailDict["subject"] + "\n",
            "Date: " + emailDict["date"] + "\n\n",
            "Files: " + ", ".join(files) + "\n",
            "Parts: " + str(emailDict["parts"]) + "\n\n",
            "Text:\n\n" + emailDict["text"] + "\n\n",
            "HTML:\n\n" + emailDict["html"],
        ])

        # utf-8 is forced so decoded non-ASCII text can always be written,
        # whatever the platform's default encoding is.
        with open(path + "/" + "txt_" + nam + ".txt", "w", encoding="utf-8") as wf:
            wf.write(textFile)


if __name__ == "__main__":
    main()
175 
#define open
Definition: Xos2defs.h:36
def save_file(fn, cont)
Definition: eml_to_txt.py:19
def construct_name(id, fn)
Definition: eml_to_txt.py:24
def file_exists(f)
Definition: eml_to_txt.py:16
def disgra(s)
Definition: eml_to_txt.py:35
def pullout(m, key)
Definition: eml_to_txt.py:40
def disqo(s)
Definition: eml_to_txt.py:29
def clean_header(h)
Definition: eml_to_txt.py:109
def caption(origin)
Definition: eml_to_txt.py:112
def extract(msgfile, key)
Definition: eml_to_txt.py:99