Browse Source

Homemade HTML mail sanitizer with BeautifulSoup

Guénaël Muller 7 years ago
parent
commit
830624aa71
1 changed file with 34 additions and 2 deletions
  1. 34 2
      tracim/tracim/lib/email_fetcher.py

+ 34 - 2
tracim/tracim/lib/email_fetcher.py View File

@@ -12,12 +12,20 @@ from email.utils import parseaddr, parsedate_tz, mktime_tz
12 12
 from email import message_from_bytes
13 13
 
14 14
 import requests
15
+from bs4 import BeautifulSoup
15 16
 
16 17
 from tracim.controllers.events import VALID_TOKEN_VALUE
17 18
 
18 19
 
19 20
 TRACIM_SPECIAL_KEY_HEADER = "X-Tracim-Key"
20
-
21
# Rules used by DecodedMail._parse_html_body to sanitize HTML mail bodies.
BS_HTML_BODY_PARSE_CONFIG = {
    # Tags removed from the document together with their content.
    'tag_blacklist': ['script', 'style', 'blockquote'],
    # Tags carrying one of these CSS classes are removed with their content.
    'class_blacklist': ['moz-cite-prefix'],
    # Tags allowed to remain in the sanitized output.
    'tag_whitelist': [
        'a', 'b', 'strong', 'i', 'br', 'ul', 'li', 'ol', 'em', 'u',
        'thead', 'tr', 'td', 'tbody', 'table', 'p', 'pre',
    ],
    # Attributes allowed to remain on whitelisted tags.
    'attrs_whitelist': ['href'],
}
21 29
 
22 30
 class DecodedMail(object):
23 31
 
@@ -55,11 +63,35 @@ class DecodedMail(object):
55 63
             if ctype == "text/plain":
56 64
                 body = body_part.get_payload(decode=True).decode(
57 65
                     charset)
66
+
58 67
             elif ctype == "text/html":
59
-                body = body_part.get_payload(decode=True).decode(
68
+                html_body = body_part.get_payload(decode=True).decode(
60 69
                     charset)
70
+                body = DecodedMail._parse_html_body(html_body)
71
+
61 72
         return body
62 73
 
74
+    @staticmethod
75
+    def _parse_html_body(html_body:str):
76
+
77
+        soup = BeautifulSoup(html_body)
78
+        config = BS_HTML_BODY_PARSE_CONFIG
79
+        for tag in soup.findAll():
80
+            if tag.name.lower() in config['tag_blacklist']:
81
+                tag.extract()
82
+            elif 'class' in tag.attrs:
83
+                for elem in config['class_blacklist']:
84
+                    if elem in tag.attrs['class']:
85
+                        tag.extract()
86
+            elif tag.name.lower() in config['tag_whitelist']:
87
+                attrs = dict(tag.attrs)
88
+                for attr in attrs:
89
+                    if attr not in config['attrs_whitelist']:
90
+                        del tag.attrs[attr]
91
+            else:
92
+                tag.unwrap()
93
+        return str(soup)
94
+
63 95
     def _get_mime_body_message(self) -> typing.Optional[Message]:
64 96
         # FIXME - G.M - 2017-11-16 - Use stdlib msg.get_body feature for py3.6+
65 97
         # FIXME - G.M - 2017-11-16 - Check support for non-multipart mail