Procházet zdrojové kódy

Merge pull request #521 from inkhey/feature/somes_fixes

Bastien Sevajol před 6 roky
rodič
revize
4308f87bca
No account linked to committer's email

+ 18 - 4
tracim/tracim/lib/email_fetcher.py Zobrazit soubor

@@ -331,13 +331,27 @@ class MailFetcher(object):
331 331
         #  if no from address for example) and catch it here
332 332
         while mails:
333 333
             mail = mails.pop()
334
+            body =  mail.get_body(
335
+                use_html_parsing=self.use_html_parsing,
336
+                use_txt_parsing=self.use_txt_parsing,
337
+            )
338
+            from_address = mail.get_from_address()
339
+
340
+            # don't create element for 'empty' mail
341
+            if not body:
342
+                logger.warning(
343
+                    self,
344
+                    'Mail from {} has not valable content'.format(
345
+                        from_address
346
+                    ),
347
+                )
348
+                continue
349
+
334 350
             msg = {'token': self.token,
335
-                   'user_mail': mail.get_from_address(),
351
+                   'user_mail': from_address,
336 352
                    'content_id': mail.get_key(),
337 353
                    'payload': {
338
-                       'content': mail.get_body(
339
-                           use_html_parsing=self.use_html_parsing,
340
-                           use_txt_parsing=self.use_txt_parsing),
354
+                       'content': body,
341 355
                    }}
342 356
             try:
343 357
                 logger.debug(

+ 2 - 1
tracim/tracim/lib/email_processing/models.py Zobrazit soubor

@@ -109,7 +109,8 @@ class HtmlBodyMailParts(BodyMailParts):
109 109
         if len(self._list) > 0:
110 110
             txt = BeautifulSoup(value.text, 'html.parser').get_text()
111 111
             txt = txt.replace('\n', '').strip()
112
-            if not txt:
112
+            img = BeautifulSoup(value.text, 'html.parser').find('img')
113
+            if not txt and not img:
113 114
                 value.part_type = self._list[-1].part_type
114 115
         BodyMailParts._check_value(value)
115 116
         BodyMailParts._append(self, value)

+ 25 - 38
tracim/tracim/lib/email_processing/sanitizer.py Zobrazit soubor

@@ -1,42 +1,19 @@
1
+import typing
1 2
 from bs4 import BeautifulSoup, Tag
2
-
3
+from tracim.lib.email_processing.sanitizer_config.attrs_whitelist import ATTRS_WHITELIST  # nopep8
4
+from tracim.lib.email_processing.sanitizer_config.class_blacklist import CLASS_BLACKLIST  # nopep8
5
+from tracim.lib.email_processing.sanitizer_config.id_blacklist import ID_BLACKLIST  # nopep8
6
+from tracim.lib.email_processing.sanitizer_config.tag_blacklist import TAG_BLACKLIST  # nopep8
7
+from tracim.lib.email_processing.sanitizer_config.tag_whitelist import TAG_WHITELIST  # nopep8
3 8
 
4 9
 class HtmlSanitizerConfig(object):
5
-    # some Default_html_tags type
6
-    HTML_Heading_tag = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
7
-    HTML_Text_parts_tag = ['p',
8
-                           'br', 'hr',
9
-                           'pre', 'code', 'samp',  # preformatted content
10
-                           'q', 'blockquote',  # quotes
11
-                           ]
12
-    HTML_Text_format_tag = ['b', 'i', 'u', 'small', 'sub', 'sup', ]
13
-    HTML_Text_semantic_tag = ['strong', 'em',
14
-                              'mark', 'cite', 'dfn',
15
-                              'del', 'ins', ]
16
-    HTML_Table_tag = ['table',
17
-                      'thead', 'tfoot', 'tbody',
18
-                      'tr', 'td', 'caption', ]
19
-
20
-    HTML_List_tag = ['ul', 'li', 'ol',  # simple list
21
-                     'dl', 'dt', 'dd', ]  # definition list
22
-
23
-    # Rules
24
-    Tag_whitelist = HTML_Heading_tag \
25
-                    + HTML_Text_parts_tag \
26
-                    + HTML_Text_format_tag \
27
-                    + HTML_Text_semantic_tag \
28
-                    + HTML_Table_tag \
29
-                    + HTML_List_tag
30
-
31
-    Tag_blacklist = ['script', 'style']
32
-
33
-    # TODO - G.M - 2017-12-01 - Think about removing class/id Blacklist
34
-    # These elements are no longer required.
35
-    Class_blacklist = []
36
-    Id_blacklist = []
37
-
38
-    Attrs_whitelist = ['href']
39
-
10
+    # whitelist : keep tag and content
11
+    Tag_whitelist = TAG_WHITELIST
12
+    Attrs_whitelist = ATTRS_WHITELIST
13
+    # blacklist : remove content
14
+    Tag_blacklist = TAG_BLACKLIST
15
+    Class_blacklist = CLASS_BLACKLIST
16
+    Id_blacklist = ID_BLACKLIST
40 17
 
41 18
 class HtmlSanitizer(object):
42 19
     """
@@ -50,7 +27,7 @@ class HtmlSanitizer(object):
50 27
     """
51 28
 
52 29
     @classmethod
53
-    def sanitize(cls, html_body: str) -> str:
30
+    def sanitize(cls, html_body: str) -> typing.Optional[str]:
54 31
         soup = BeautifulSoup(html_body, 'html.parser')
55 32
         for tag in soup.findAll():
56 33
             if cls._tag_to_extract(tag):
@@ -62,7 +39,17 @@ class HtmlSanitizer(object):
62 39
                         del tag.attrs[attr]
63 40
             else:
64 41
                 tag.unwrap()
65
-        return str(soup)
42
+
43
+        if cls._is_content_empty(soup):
44
+            return None
45
+        else:
46
+            return str(soup)
47
+
48
+    @classmethod
49
+    def _is_content_empty(cls, soup):
50
+        img = soup.find('img')
51
+        txt = soup.get_text().replace('\n', '').strip()
52
+        return (not img and not txt)
66 53
 
67 54
     @classmethod
68 55
     def _tag_to_extract(cls, tag: Tag) -> bool:

+ 1 - 0
tracim/tracim/lib/email_processing/sanitizer_config/attrs_whitelist.py Zobrazit soubor

@@ -0,0 +1 @@
1
+ATTRS_WHITELIST = ['href']

+ 1 - 0
tracim/tracim/lib/email_processing/sanitizer_config/class_blacklist.py Zobrazit soubor

@@ -0,0 +1 @@
1
+CLASS_BLACKLIST =  []

+ 1 - 0
tracim/tracim/lib/email_processing/sanitizer_config/id_blacklist.py Zobrazit soubor

@@ -0,0 +1 @@
1
+ID_BLACKLIST = []

+ 1 - 0
tracim/tracim/lib/email_processing/sanitizer_config/tag_blacklist.py Zobrazit soubor

@@ -0,0 +1 @@
1
+TAG_BLACKLIST = ['script', 'style']

+ 16 - 0
tracim/tracim/lib/email_processing/sanitizer_config/tag_whitelist.py Zobrazit soubor

@@ -0,0 +1,16 @@
1
+TAG_WHITELIST = [
2
+    'b', 'blockquote', 'br',
3
+    'caption', 'cite', 'code',
4
+    'dd', 'del', 'dfn', 'dl', 'dt',
5
+    'em',
6
+    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
7
+    'i', 'img', 'ins',
8
+    'li',
9
+    'mark',
10
+    'ol',
11
+    'p', 'pre',
12
+    'q',
13
+    'samp', 'small', 'strong', 'sub', 'sup',
14
+    'table', 'tbody', 'td', 'tfoot', 'thead', 'tr',
15
+    'u', 'ul'
16
+]