|
@@ -12,12 +12,20 @@ from email.utils import parseaddr, parsedate_tz, mktime_tz
|
12
|
12
|
from email import message_from_bytes
|
13
|
13
|
|
14
|
14
|
import requests
|
|
15
|
+from bs4 import BeautifulSoup
|
15
|
16
|
|
16
|
17
|
from tracim.controllers.events import VALID_TOKEN_VALUE
|
17
|
18
|
|
18
|
19
|
|
19
|
20
|
TRACIM_SPECIAL_KEY_HEADER = "X-Tracim-Key"
|
20
|
|
-
|
|
21
|
# Rules used when sanitising the HTML body of an incoming mail:
# - 'tag_blacklist':   tags removed together with everything they contain.
# - 'class_blacklist': CSS classes whose tags are removed the same way.
# - 'tag_whitelist':   tags kept in the output (their attributes are filtered).
# - 'attrs_whitelist': the only attributes kept on whitelisted tags.
# Tags that are neither blacklisted nor whitelisted are unwrapped: the tag
# itself is dropped but its children stay in place.
BS_HTML_BODY_PARSE_CONFIG = {
    'tag_blacklist': ['script', 'style', 'blockquote'],
    'class_blacklist': ['moz-cite-prefix'],
    'tag_whitelist': [
        'a', 'b', 'strong', 'i', 'br', 'ul', 'li', 'ol',
        'em', 'u',
        'thead', 'tr', 'td', 'tbody', 'table', 'p', 'pre',
    ],
    'attrs_whitelist': ['href'],
}
|
21
|
29
|
|
22
|
30
|
class DecodedMail(object):
|
23
|
31
|
|
|
@@ -55,11 +63,35 @@ class DecodedMail(object):
|
55
|
63
|
if ctype == "text/plain":
|
56
|
64
|
body = body_part.get_payload(decode=True).decode(
|
57
|
65
|
charset)
|
|
66
|
+
|
58
|
67
|
elif ctype == "text/html":
|
59
|
|
- body = body_part.get_payload(decode=True).decode(
|
|
68
|
+ html_body = body_part.get_payload(decode=True).decode(
|
60
|
69
|
charset)
|
|
70
|
+ body = DecodedMail._parse_html_body(html_body)
|
|
71
|
+
|
61
|
72
|
return body
|
62
|
73
|
|
|
74
|
@staticmethod
def _parse_html_body(html_body: str) -> str:
    """Return a sanitised version of the HTML body of a mail.

    Every tag of the parsed document is handled according to
    BS_HTML_BODY_PARSE_CONFIG:

    - tags whose name is in 'tag_blacklist', or which carry a class listed
      in 'class_blacklist', are removed together with their content;
    - tags whose name is in 'tag_whitelist' are kept, but every attribute
      not listed in 'attrs_whitelist' is deleted;
    - any other tag is unwrapped (the tag is dropped, its children stay).

    :param html_body: decoded HTML payload of the mail
    :return: the filtered markup serialised back to a string
    """
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed, so the result would vary between environments.
    soup = BeautifulSoup(html_body, 'html.parser')
    config = BS_HTML_BODY_PARSE_CONFIG
    for tag in soup.findAll():
        tag_name = tag.name.lower()
        tag_classes = tag.attrs.get('class', ())
        has_blacklisted_class = any(
            css_class in tag_classes
            for css_class in config['class_blacklist']
        )
        if tag_name in config['tag_blacklist'] or has_blacklisted_class:
            tag.extract()
        # A tag with a non-blacklisted class must still go through the
        # whitelist / unwrap handling below; otherwise it would keep all
        # of its attributes untouched.
        elif tag_name in config['tag_whitelist']:
            # Iterate over a snapshot of the names: attrs is mutated below.
            for attr in list(tag.attrs):
                if attr not in config['attrs_whitelist']:
                    del tag.attrs[attr]
        else:
            tag.unwrap()
    return str(soup)
|
|
94
|
+
|
63
|
95
|
def _get_mime_body_message(self) -> typing.Optional[Message]:
|
64
|
96
|
# FIXME - G.M - 2017-11-16 - Use stdlib msg.get_body feature for py3.6+
|
65
|
97
|
# FIXME - G.M - 2017-11-16 - Check support for non-multipart mail
|