|
@@ -1,42 +1,23 @@
|
1
|
1
|
from bs4 import BeautifulSoup, Tag
|
2
|
|
-
|
|
2
|
+from tracim.lib.email_processing.sanitizer_config.attrs_whitelist import \
|
|
3
|
+ ATTRS_WHITELIST
|
|
4
|
+from tracim.lib.email_processing.sanitizer_config.class_blacklist import \
|
|
5
|
+ CLASS_BLACKLIST
|
|
6
|
+from tracim.lib.email_processing.sanitizer_config.id_blacklist import \
|
|
7
|
+ ID_BLACKLIST
|
|
8
|
+from tracim.lib.email_processing.sanitizer_config.tag_blacklist import \
|
|
9
|
+ TAG_BLACKLIST
|
|
10
|
+from tracim.lib.email_processing.sanitizer_config.tag_whitelist import \
|
|
11
|
+ TAG_WHITELIST
|
3
|
12
|
|
4
|
13
|
class HtmlSanitizerConfig(object):
|
5
|
|
- # some Default_html_tags type
|
6
|
|
- HTML_Heading_tag = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
|
7
|
|
- HTML_Text_parts_tag = ['p',
|
8
|
|
- 'br', 'hr',
|
9
|
|
- 'pre', 'code', 'samp', # preformatted content
|
10
|
|
- 'q', 'blockquote', # quotes
|
11
|
|
- ]
|
12
|
|
- HTML_Text_format_tag = ['b', 'i', 'u', 'small', 'sub', 'sup', ]
|
13
|
|
- HTML_Text_semantic_tag = ['strong', 'em',
|
14
|
|
- 'mark', 'cite', 'dfn',
|
15
|
|
- 'del', 'ins', ]
|
16
|
|
- HTML_Table_tag = ['table',
|
17
|
|
- 'thead', 'tfoot', 'tbody',
|
18
|
|
- 'tr', 'td', 'caption', ]
|
19
|
|
-
|
20
|
|
- HTML_List_tag = ['ul', 'li', 'ol', # simple list
|
21
|
|
- 'dl', 'dt', 'dd', ] # definition list
|
22
|
|
-
|
23
|
|
- # Rules
|
24
|
|
- Tag_whitelist = HTML_Heading_tag \
|
25
|
|
- + HTML_Text_parts_tag \
|
26
|
|
- + HTML_Text_format_tag \
|
27
|
|
- + HTML_Text_semantic_tag \
|
28
|
|
- + HTML_Table_tag \
|
29
|
|
- + HTML_List_tag
|
30
|
|
-
|
31
|
|
- Tag_blacklist = ['script', 'style']
|
32
|
|
-
|
33
|
|
- # TODO - G.M - 2017-12-01 - Think about removing class/id Blacklist
|
34
|
|
- # These elements are no longer required.
|
35
|
|
- Class_blacklist = []
|
36
|
|
- Id_blacklist = []
|
37
|
|
-
|
38
|
|
- Attrs_whitelist = ['href']
|
39
|
|
-
|
|
14
|
+ # whitelist : keep tag and content
|
|
15
|
+ Tag_whitelist = TAG_WHITELIST
|
|
16
|
+ Attrs_whitelist = ATTRS_WHITELIST
|
|
17
|
+ # blacklist : remove content
|
|
18
|
+ Tag_blacklist = TAG_BLACKLIST
|
|
19
|
+ Class_blacklist = CLASS_BLACKLIST
|
|
20
|
+ Id_blacklist = ID_BLACKLIST
|
40
|
21
|
|
41
|
22
|
class HtmlSanitizer(object):
|
42
|
23
|
"""
|