|
@@ -10,22 +10,12 @@ from email.utils import parseaddr
|
10
|
10
|
|
11
|
11
|
import markdown
|
12
|
12
|
import requests
|
13
|
|
-from bs4 import BeautifulSoup, Tag
|
14
|
13
|
from email_reply_parser import EmailReplyParser
|
15
|
14
|
from tracim.lib.base import logger
|
16
|
15
|
from tracim.lib.email_processing.parser import ParsedHTMLMail
|
|
16
|
+from tracim.lib.email_processing.sanitizer import HtmlSanitizer
|
17
|
17
|
|
18
|
18
|
TRACIM_SPECIAL_KEY_HEADER = 'X-Tracim-Key'
|
19
|
|
-# TODO BS 20171124: Think about replace thin dict config by object
|
20
|
|
-BEAUTIFULSOUP_HTML_BODY_SANITIZE_CONFIG = {
|
21
|
|
- 'tag_blacklist': ['script', 'style'],
|
22
|
|
- 'class_blacklist': [],
|
23
|
|
- 'id_blacklist': ['reply-intro'],
|
24
|
|
- 'tag_whitelist': ['a', 'b', 'strong', 'i', 'br', 'ul', 'li', 'ol',
|
25
|
|
- 'em', 'i', 'u', 'blockquote', 'h1','h2','h3','h4',
|
26
|
|
- 'thead', 'tr', 'td', 'tbody', 'table', 'p', 'pre'],
|
27
|
|
- 'attrs_whitelist': ['href'],
|
28
|
|
-}
|
29
|
19
|
CONTENT_TYPE_TEXT_PLAIN = 'text/plain'
|
30
|
20
|
CONTENT_TYPE_TEXT_HTML = 'text/html'
|
31
|
21
|
|
|
@@ -72,48 +62,17 @@ class DecodedMail(object):
|
72
|
62
|
if use_txt_parsing:
|
73
|
63
|
txt_body = EmailReplyParser.parse_reply(txt_body)
|
74
|
64
|
html_body = markdown.markdown(txt_body)
|
75
|
|
- body = DecodedMail._sanitize_html_body(html_body)
|
|
65
|
+ body = HtmlSanitizer.sanitize(html_body)
|
76
|
66
|
|
77
|
67
|
elif content_type == CONTENT_TYPE_TEXT_HTML:
|
78
|
68
|
html_body = body_part.get_payload(decode=True).decode(
|
79
|
69
|
charset)
|
80
|
70
|
if use_html_parsing:
|
81
|
71
|
html_body = str(ParsedHTMLMail(html_body))
|
82
|
|
- body = DecodedMail._sanitize_html_body(html_body)
|
|
72
|
+ body = HtmlSanitizer.sanitize(html_body)
|
83
|
73
|
|
84
|
74
|
return body
|
85
|
75
|
|
86
|
|
- @classmethod
|
87
|
|
- def _sanitize_html_body(cls, html_body: str) -> str:
|
88
|
|
- soup = BeautifulSoup(html_body, 'html.parser')
|
89
|
|
- config = BEAUTIFULSOUP_HTML_BODY_SANITIZE_CONFIG
|
90
|
|
- for tag in soup.findAll():
|
91
|
|
- if DecodedMail._tag_to_extract(tag):
|
92
|
|
- tag.extract()
|
93
|
|
- elif tag.name.lower() in config['tag_whitelist']:
|
94
|
|
- attrs = dict(tag.attrs)
|
95
|
|
- for attr in attrs:
|
96
|
|
- if attr not in config['attrs_whitelist']:
|
97
|
|
- del tag.attrs[attr]
|
98
|
|
- else:
|
99
|
|
- tag.unwrap()
|
100
|
|
- return str(soup)
|
101
|
|
-
|
102
|
|
- @classmethod
|
103
|
|
- def _tag_to_extract(cls, tag: Tag) -> bool:
|
104
|
|
- config = BEAUTIFULSOUP_HTML_BODY_SANITIZE_CONFIG
|
105
|
|
- if tag.name.lower() in config['tag_blacklist']:
|
106
|
|
- return True
|
107
|
|
- if 'class' in tag.attrs:
|
108
|
|
- for elem in config['class_blacklist']:
|
109
|
|
- if elem in tag.attrs['class']:
|
110
|
|
- return True
|
111
|
|
- if 'id' in tag.attrs:
|
112
|
|
- for elem in config['id_blacklist']:
|
113
|
|
- if elem in tag.attrs['id']:
|
114
|
|
- return True
|
115
|
|
- return False
|
116
|
|
-
|
117
|
76
|
def _get_mime_body_message(self) -> typing.Optional[Message]:
|
118
|
77
|
# TODO - G.M - 2017-11-16 - Use stdlib msg.get_body feature for py3.6+
|
119
|
78
|
part = None
|