|
@@ -1,42 +1,19 @@
|
|
1
|
+import typing
|
1
|
2
|
from bs4 import BeautifulSoup, Tag
|
2
|
|
-
|
|
3
|
+from tracim.lib.email_processing.sanitizer_config.attrs_whitelist import ATTRS_WHITELIST # nopep8
|
|
4
|
+from tracim.lib.email_processing.sanitizer_config.class_blacklist import CLASS_BLACKLIST # nopep8
|
|
5
|
+from tracim.lib.email_processing.sanitizer_config.id_blacklist import ID_BLACKLIST # nopep8
|
|
6
|
+from tracim.lib.email_processing.sanitizer_config.tag_blacklist import TAG_BLACKLIST # nopep8
|
|
7
|
+from tracim.lib.email_processing.sanitizer_config.tag_whitelist import TAG_WHITELIST # nopep8
|
3
|
8
|
|
4
|
9
|
class HtmlSanitizerConfig(object):
    """
    Sanitizing rules, each one taken unchanged from the matching
    constant imported from the ``sanitizer_config`` modules.
    """
    # blacklist : remove content
    Tag_blacklist = TAG_BLACKLIST
    Class_blacklist = CLASS_BLACKLIST
    Id_blacklist = ID_BLACKLIST
    # whitelist : keep tag and content
    Tag_whitelist = TAG_WHITELIST
    Attrs_whitelist = ATTRS_WHITELIST
|
40
|
17
|
|
41
|
18
|
class HtmlSanitizer(object):
|
42
|
19
|
"""
|
|
@@ -50,7 +27,7 @@ class HtmlSanitizer(object):
|
50
|
27
|
"""
|
51
|
28
|
|
52
|
29
|
@classmethod
|
53
|
|
- def sanitize(cls, html_body: str) -> str:
|
|
30
|
+ def sanitize(cls, html_body: str) -> typing.Optional[str]:
|
54
|
31
|
soup = BeautifulSoup(html_body, 'html.parser')
|
55
|
32
|
for tag in soup.findAll():
|
56
|
33
|
if cls._tag_to_extract(tag):
|
|
@@ -62,7 +39,17 @@ class HtmlSanitizer(object):
|
62
|
39
|
del tag.attrs[attr]
|
63
|
40
|
else:
|
64
|
41
|
tag.unwrap()
|
65
|
|
- return str(soup)
|
|
42
|
+
|
|
43
|
+ if cls._is_content_empty(soup):
|
|
44
|
+ return None
|
|
45
|
+ else:
|
|
46
|
+ return str(soup)
|
|
47
|
+
|
|
48
|
@classmethod
def _is_content_empty(cls, soup):
    """
    Tell whether the given parsed document holds nothing to keep.

    The document counts as empty when ``soup.find('img')`` yields
    nothing and the text returned by ``soup.get_text()`` is blank once
    newlines and surrounding whitespace have been removed.

    :param soup: parsed document, queried through ``find`` and
        ``get_text``
    :return: True if neither an image nor any text is found,
        False otherwise
    """
    image_tag = soup.find('img')
    visible_text = soup.get_text().replace('\n', '').strip()
    # De Morgan form of "no image and no text".
    if image_tag or visible_text:
        return False
    return True
|
66
|
53
|
|
67
|
54
|
@classmethod
|
68
|
55
|
def _tag_to_extract(cls, tag: Tag) -> bool:
|