瀏覽代碼

Refactoring: Html Sanitizer distinct from email fetcher

Guénaël Muller 7 年之前
父節點
當前提交
50facda3cf
共有 2 個文件被更改,包括 49 次插入 和 44 次删除
  1. 3 44
      tracim/tracim/lib/email_fetcher.py
  2. 46 0
      tracim/tracim/lib/email_processing/sanitizer.py

+ 3 - 44
tracim/tracim/lib/email_fetcher.py 查看文件

@@ -10,22 +10,12 @@ from email.utils import parseaddr
10 10
 
11 11
 import markdown
12 12
 import requests
13
-from bs4 import BeautifulSoup, Tag
14 13
 from email_reply_parser import EmailReplyParser
15 14
 from tracim.lib.base import logger
16 15
 from tracim.lib.email_processing.parser import ParsedHTMLMail
16
+from tracim.lib.email_processing.sanitizer import HtmlSanitizer
17 17
 
18 18
 TRACIM_SPECIAL_KEY_HEADER = 'X-Tracim-Key'
19
-# TODO BS 20171124: Think about replace thin dict config by object
20
-BEAUTIFULSOUP_HTML_BODY_SANITIZE_CONFIG = {
21
-    'tag_blacklist': ['script', 'style'],
22
-    'class_blacklist': [],
23
-    'id_blacklist': ['reply-intro'],
24
-    'tag_whitelist': ['a', 'b', 'strong', 'i', 'br', 'ul', 'li', 'ol',
25
-                      'em', 'i', 'u', 'blockquote', 'h1','h2','h3','h4',
26
-                      'thead', 'tr', 'td', 'tbody', 'table', 'p', 'pre'],
27
-    'attrs_whitelist': ['href'],
28
-}
29 19
 CONTENT_TYPE_TEXT_PLAIN = 'text/plain'
30 20
 CONTENT_TYPE_TEXT_HTML = 'text/html'
31 21
 
@@ -72,48 +62,17 @@ class DecodedMail(object):
72 62
                 if use_txt_parsing:
73 63
                     txt_body = EmailReplyParser.parse_reply(txt_body)
74 64
                 html_body = markdown.markdown(txt_body)
75
-                body = DecodedMail._sanitize_html_body(html_body)
65
+                body = HtmlSanitizer.sanitize(html_body)
76 66
 
77 67
             elif content_type == CONTENT_TYPE_TEXT_HTML:
78 68
                 html_body = body_part.get_payload(decode=True).decode(
79 69
                     charset)
80 70
                 if use_html_parsing:
81 71
                     html_body = str(ParsedHTMLMail(html_body))
82
-                body = DecodedMail._sanitize_html_body(html_body)
72
+                body = HtmlSanitizer.sanitize(html_body)
83 73
 
84 74
         return body
85 75
 
86
-    @classmethod
87
-    def _sanitize_html_body(cls, html_body: str) -> str:
88
-        soup = BeautifulSoup(html_body, 'html.parser')
89
-        config = BEAUTIFULSOUP_HTML_BODY_SANITIZE_CONFIG
90
-        for tag in soup.findAll():
91
-            if DecodedMail._tag_to_extract(tag):
92
-                tag.extract()
93
-            elif tag.name.lower() in config['tag_whitelist']:
94
-                attrs = dict(tag.attrs)
95
-                for attr in attrs:
96
-                    if attr not in config['attrs_whitelist']:
97
-                        del tag.attrs[attr]
98
-            else:
99
-                tag.unwrap()
100
-        return str(soup)
101
-
102
-    @classmethod
103
-    def _tag_to_extract(cls, tag: Tag) -> bool:
104
-        config = BEAUTIFULSOUP_HTML_BODY_SANITIZE_CONFIG
105
-        if tag.name.lower() in config['tag_blacklist']:
106
-            return True
107
-        if 'class' in tag.attrs:
108
-            for elem in config['class_blacklist']:
109
-                if elem in tag.attrs['class']:
110
-                    return True
111
-        if 'id' in tag.attrs:
112
-            for elem in config['id_blacklist']:
113
-                if elem in tag.attrs['id']:
114
-                    return True
115
-        return False
116
-
117 76
     def _get_mime_body_message(self) -> typing.Optional[Message]:
118 77
         # TODO - G.M - 2017-11-16 - Use stdlib msg.get_body feature for py3.6+
119 78
         part = None

+ 46 - 0
tracim/tracim/lib/email_processing/sanitizer.py 查看文件

@@ -0,0 +1,46 @@
1
+from bs4 import BeautifulSoup, Tag
2
+
3
+# TODO BS 20171124: Think about replace thin dict config by object
4
+BEAUTIFULSOUP_HTML_BODY_SANITIZE_CONFIG = {
5
+    'tag_blacklist': ['script', 'style'],
6
+    'class_blacklist': [],
7
+    'id_blacklist': ['reply-intro'],
8
+    'tag_whitelist': ['a', 'b', 'strong', 'i', 'br', 'ul', 'li', 'ol',
9
+                      'em', 'i', 'u', 'blockquote', 'h1', 'h2', 'h3', 'h4',
10
+                      'thead', 'tr', 'td', 'tbody', 'table', 'p', 'pre'],
11
+    'attrs_whitelist': ['href'],
12
+}
13
+
14
+
15
+class HtmlSanitizer(object):
16
+
17
+    @classmethod
18
+    def sanitize(cls, html_body: str) -> str:
19
+        soup = BeautifulSoup(html_body, 'html.parser')
20
+        config = BEAUTIFULSOUP_HTML_BODY_SANITIZE_CONFIG
21
+        for tag in soup.findAll():
22
+            if cls._tag_to_extract(tag):
23
+                tag.extract()
24
+            elif tag.name.lower() in config['tag_whitelist']:
25
+                attrs = dict(tag.attrs)
26
+                for attr in attrs:
27
+                    if attr not in config['attrs_whitelist']:
28
+                        del tag.attrs[attr]
29
+            else:
30
+                tag.unwrap()
31
+        return str(soup)
32
+
33
+    @classmethod
34
+    def _tag_to_extract(cls, tag: Tag) -> bool:
35
+        config = BEAUTIFULSOUP_HTML_BODY_SANITIZE_CONFIG
36
+        if tag.name.lower() in config['tag_blacklist']:
37
+            return True
38
+        if 'class' in tag.attrs:
39
+            for elem in config['class_blacklist']:
40
+                if elem in tag.attrs['class']:
41
+                    return True
42
+        if 'id' in tag.attrs:
43
+            for elem in config['id_blacklist']:
44
+                if elem in tag.attrs['id']:
45
+                    return True
46
+        return False