Browse Source

Refactoring Sanitizer config

Guénaël Muller 7 years ago
parent
commit
f101178257
1 changed file with 50 additions and 17 deletions
  1. 50 17
      tracim/tracim/lib/email_processing/sanitizer.py

+ 50 - 17
tracim/tracim/lib/email_processing/sanitizer.py View File

@@ -1,30 +1,64 @@
1 1
 from bs4 import BeautifulSoup, Tag
2 2
 
3
-# TODO BS 20171124: Think about replace thin dict config by object
4
-BEAUTIFULSOUP_HTML_BODY_SANITIZE_CONFIG = {
5
-    'tag_blacklist': ['script', 'style'],
6
-    'class_blacklist': [],
7
-    'id_blacklist': ['reply-intro'],
8
-    'tag_whitelist': ['a', 'b', 'strong', 'i', 'br', 'ul', 'li', 'ol',
9
-                      'em', 'i', 'u', 'blockquote', 'h1', 'h2', 'h3', 'h4',
10
-                      'thead', 'tr', 'td', 'tbody', 'table', 'p', 'pre'],
11
-    'attrs_whitelist': ['href'],
12
-}
3
+
4
+class HtmlSanitizerConfig(object):
5
+    # Default groups of HTML tags, organized by type
6
+    HTML_Heading_tag = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
7
+    HTML_Text_parts_tag = ['p',
8
+                           'br', 'hr',
9
+                           'pre', 'code', 'samp',  # preformatted content
10
+                           'q', 'blockquote',  # quotes
11
+                           ]
12
+    HTML_Text_format_tag = ['b', 'i', 'u', 'small', 'sub', 'sup', ]
13
+    HTML_Text_semantic_tag = ['strong', 'em',
14
+                              'mark', 'cite', 'dfn',
15
+                              'del', 'ins', ]
16
+    HTML_Table_tag = ['table',
17
+                      'thead', 'tfoot', 'tbody',
18
+                      'tr', 'td', 'caption', ]
19
+
20
+    HTML_List_tag = ['ul', 'li', 'ol',  # simple list
21
+                     'dl', 'dt', 'dd', ]  # definition list
22
+
23
+    # Rules
24
+    Tag_whitelist = HTML_Heading_tag \
25
+                    + HTML_Text_parts_tag \
26
+                    + HTML_Text_format_tag \
27
+                    + HTML_Text_semantic_tag \
28
+                    + HTML_Table_tag \
29
+                    + HTML_List_tag
30
+
31
+    Tag_blacklist = ['script', 'style']
32
+
33
+    # TODO - G.M - 2017-12-01 - Think about removing class/id Blacklist
34
+    # These elements are no longer required.
35
+    Class_blacklist = []
36
+    Id_blacklist = []
37
+
38
+    Attrs_whitelist = ['href']
13 39
 
14 40
 
15 41
 class HtmlSanitizer(object):
42
+    """
43
+    HTML sanitizing rules:
44
+    - Tag :
45
+      - Remove tags listed in Tag_blacklist
46
+      - Keep tags listed in Tag_whitelist
47
+      - Unwrap all other tags
48
+    - Attrs :
49
+      - Remove non-whitelisted attributes
50
+    """
16 51
 
17 52
     @classmethod
18 53
     def sanitize(cls, html_body: str) -> str:
19 54
         soup = BeautifulSoup(html_body, 'html.parser')
20
-        config = BEAUTIFULSOUP_HTML_BODY_SANITIZE_CONFIG
21 55
         for tag in soup.findAll():
22 56
             if cls._tag_to_extract(tag):
23 57
                 tag.extract()
24
-            elif tag.name.lower() in config['tag_whitelist']:
58
+            elif tag.name.lower() in HtmlSanitizerConfig.Tag_whitelist:
25 59
                 attrs = dict(tag.attrs)
26 60
                 for attr in attrs:
27
-                    if attr not in config['attrs_whitelist']:
61
+                    if attr not in HtmlSanitizerConfig.Attrs_whitelist:
28 62
                         del tag.attrs[attr]
29 63
             else:
30 64
                 tag.unwrap()
@@ -32,15 +66,14 @@ class HtmlSanitizer(object):
32 66
 
33 67
     @classmethod
34 68
     def _tag_to_extract(cls, tag: Tag) -> bool:
35
-        config = BEAUTIFULSOUP_HTML_BODY_SANITIZE_CONFIG
36
-        if tag.name.lower() in config['tag_blacklist']:
69
+        if tag.name.lower() in HtmlSanitizerConfig.Tag_blacklist:
37 70
             return True
38 71
         if 'class' in tag.attrs:
39
-            for elem in config['class_blacklist']:
72
+            for elem in HtmlSanitizerConfig.Class_blacklist:
40 73
                 if elem in tag.attrs['class']:
41 74
                     return True
42 75
         if 'id' in tag.attrs:
43
-            for elem in config['id_blacklist']:
76
+            for elem in HtmlSanitizerConfig.Id_blacklist:
44 77
                 if elem in tag.attrs['id']:
45 78
                     return True
46 79
         return False