7 年前 · 4308f87bca
--- a/tracim/tracim/lib/email_fetcher.py
+++ b/tracim/tracim/lib/email_fetcher.py
@@ -331,13 +331,27 @@ class MailFetcher(object):
 
				         #  if no from address for example) and catch it here
			
 
				         while mails:
			
 
				             mail = mails.pop()
			
 
				+            body =  mail.get_body(
			
 
				+                use_html_parsing=self.use_html_parsing,
			
 
				+                use_txt_parsing=self.use_txt_parsing,
			
 
				+            )
			
 
				+            from_address = mail.get_from_address()
			
 
				+
			
 
				+            # don't create element for 'empty' mail
			
 
				+            if not body:
			
 
				+                logger.warning(
			
 
				+                    self,
			
 
				+                    'Mail from {} has not valable content'.format(
			
 
				+                        from_address
			
 
				+                    ),
			
 
				+                )
			
 
				+                continue
			
 
				+
			
 
				             msg = {'token': self.token,
			
 
				-                   'user_mail': mail.get_from_address(),
			
 
				+                   'user_mail': from_address,
			
 
				                    'content_id': mail.get_key(),
			
 
				                    'payload': {
			
 
				-                       'content': mail.get_body(
			
 
				-                           use_html_parsing=self.use_html_parsing,
			
 
				-                           use_txt_parsing=self.use_txt_parsing),
			
 
				+                       'content': body,
			
 
				                    }}
			
 
				             try:
			
 
				                 logger.debug(
			
--- a/tracim/tracim/lib/email_processing/models.py
+++ b/tracim/tracim/lib/email_processing/models.py
@@ -109,7 +109,8 @@ class HtmlBodyMailParts(BodyMailParts):
 
				         if len(self._list) > 0:
			
 
				             txt = BeautifulSoup(value.text, 'html.parser').get_text()
			
 
				             txt = txt.replace('\n', '').strip()
			
 
				-            if not txt:
			
 
				+            img = BeautifulSoup(value.text, 'html.parser').find('img')
			
 
				+            if not txt and not img:
			
 
				                 value.part_type = self._list[-1].part_type
			
 
				         BodyMailParts._check_value(value)
			
 
				         BodyMailParts._append(self, value)
			
--- a/tracim/tracim/lib/email_processing/sanitizer.py
+++ b/tracim/tracim/lib/email_processing/sanitizer.py
@@ -1,42 +1,19 @@
 
				+import typing
			
 
				 from bs4 import BeautifulSoup, Tag
			
 
				-
			
 
				+from tracim.lib.email_processing.sanitizer_config.attrs_whitelist import ATTRS_WHITELIST  # nopep8
			
 
				+from tracim.lib.email_processing.sanitizer_config.class_blacklist import CLASS_BLACKLIST  # nopep8
			
 
				+from tracim.lib.email_processing.sanitizer_config.id_blacklist import ID_BLACKLIST  # nopep8
			
 
				+from tracim.lib.email_processing.sanitizer_config.tag_blacklist import TAG_BLACKLIST  # nopep8
			
 
				+from tracim.lib.email_processing.sanitizer_config.tag_whitelist import TAG_WHITELIST  # nopep8
			
 
				 
			
 
				 class HtmlSanitizerConfig(object):
			
 
				-    # some Default_html_tags type
			
 
				-    HTML_Heading_tag = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
			
 
				-    HTML_Text_parts_tag = ['p',
			
 
				-                           'br', 'hr',
			
 
				-                           'pre', 'code', 'samp',  # preformatted content
			
 
				-                           'q', 'blockquote',  # quotes
			
 
				-                           ]
			
 
				-    HTML_Text_format_tag = ['b', 'i', 'u', 'small', 'sub', 'sup', ]
			
 
				-    HTML_Text_semantic_tag = ['strong', 'em',
			
 
				-                              'mark', 'cite', 'dfn',
			
 
				-                              'del', 'ins', ]
			
 
				-    HTML_Table_tag = ['table',
			
 
				-                      'thead', 'tfoot', 'tbody',
			
 
				-                      'tr', 'td', 'caption', ]
			
 
				-
			
 
				-    HTML_List_tag = ['ul', 'li', 'ol',  # simple list
			
 
				-                     'dl', 'dt', 'dd', ]  # definition list
			
 
				-
			
 
				-    # Rules
			
 
				-    Tag_whitelist = HTML_Heading_tag \
			
 
				-                    + HTML_Text_parts_tag \
			
 
				-                    + HTML_Text_format_tag \
			
 
				-                    + HTML_Text_semantic_tag \
			
 
				-                    + HTML_Table_tag \
			
 
				-                    + HTML_List_tag
			
 
				-
			
 
				-    Tag_blacklist = ['script', 'style']
			
 
				-
			
 
				-    # TODO - G.M - 2017-12-01 - Think about removing class/id Blacklist
			
 
				-    # These elements are no longer required.
			
 
				-    Class_blacklist = []
			
 
				-    Id_blacklist = []
			
 
				-
			
 
				-    Attrs_whitelist = ['href']
			
 
				-
			
 
				+    # whitelist : keep tag and content
			
 
				+    Tag_whitelist = TAG_WHITELIST
			
 
				+    Attrs_whitelist = ATTRS_WHITELIST
			
 
				+    # blacklist : remove content
			
 
				+    Tag_blacklist = TAG_BLACKLIST
			
 
				+    Class_blacklist = CLASS_BLACKLIST
			
 
				+    Id_blacklist = ID_BLACKLIST
			
 
				 
			
 
				 class HtmlSanitizer(object):
			
 
				     """
			
@@ -50,7 +27,7 @@ class HtmlSanitizer(object):
 
				     """
			
 
				 
			
 
				     @classmethod
			
 
				-    def sanitize(cls, html_body: str) -> str:
			
 
				+    def sanitize(cls, html_body: str) -> typing.Optional[str]:
			
 
				         soup = BeautifulSoup(html_body, 'html.parser')
			
 
				         for tag in soup.findAll():
			
 
				             if cls._tag_to_extract(tag):
			
@@ -62,7 +39,17 @@ class HtmlSanitizer(object):
 
				                         del tag.attrs[attr]
			
 
				             else:
			
 
				                 tag.unwrap()
			
 
				-        return str(soup)
			
 
				+
			
 
				+        if cls._is_content_empty(soup):
			
 
				+            return None
			
 
				+        else:
			
 
				+            return str(soup)
			
 
				+
			
 
				+    @classmethod
			
 
				+    def _is_content_empty(cls, soup):
			
 
				+        img = soup.find('img')
			
 
				+        txt = soup.get_text().replace('\n', '').strip()
			
 
				+        return (not img and not txt)
			
 
				 
			
 
				     @classmethod
			
 
				     def _tag_to_extract(cls, tag: Tag) -> bool:
			
--- a/tracim/tracim/lib/email_processing/sanitizer_config/attrs_whitelist.py
+++ b/tracim/tracim/lib/email_processing/sanitizer_config/attrs_whitelist.py
@@ -0,0 +1 @@
 
				+ATTRS_WHITELIST = ['href']
			
--- a/tracim/tracim/lib/email_processing/sanitizer_config/class_blacklist.py
+++ b/tracim/tracim/lib/email_processing/sanitizer_config/class_blacklist.py
@@ -0,0 +1 @@
 
				+CLASS_BLACKLIST =  []
			
--- a/tracim/tracim/lib/email_processing/sanitizer_config/id_blacklist.py
+++ b/tracim/tracim/lib/email_processing/sanitizer_config/id_blacklist.py
@@ -0,0 +1 @@
 
				+ID_BLACKLIST = []
			
--- a/tracim/tracim/lib/email_processing/sanitizer_config/tag_blacklist.py
+++ b/tracim/tracim/lib/email_processing/sanitizer_config/tag_blacklist.py
@@ -0,0 +1 @@
 
				+TAG_BLACKLIST = ['script', 'style']
			
--- a/tracim/tracim/lib/email_processing/sanitizer_config/tag_whitelist.py
+++ b/tracim/tracim/lib/email_processing/sanitizer_config/tag_whitelist.py
@@ -0,0 +1,16 @@
 
				+TAG_WHITELIST = [
			
 
				+    'b', 'blockquote', 'br',
			
 
				+    'caption', 'cite', 'code',
			
 
				+    'dd', 'del', 'dfn', 'dl', 'dt',
			
 
				+    'em',
			
 
				+    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
			
 
				+    'i', 'img', 'ins',
			
 
				+    'li',
			
 
				+    'mark',
			
 
				+    'ol',
			
 
				+    'p', 'pre',
			
 
				+    'q',
			
 
				+    'samp', 'small', 'strong', 'sub', 'sup',
			
 
				+    'table', 'tbody', 'td', 'tfoot', 'thead', 'tr',
			
 
				+    'u', 'ul'
			
 
				+]