浏览代码

Refactoring presanitize part to allow more freedom in Quote/Signature checker

Guénaël Muller 7 年前
父节点
当前提交
02cc3c2d0b
共有 1 个文件被更改,包括 38 次插入33 次删除
  1. 38 33
      tracim/tracim/lib/email_body_parser.py

+ 38 - 33
tracim/tracim/lib/email_body_parser.py 查看文件

101
 class SignatureIndexError(Exception):
101
 class SignatureIndexError(Exception):
102
     pass
102
     pass
103
 
103
 
104
+
104
 class ProprietaryHTMLProperties(object):
105
 class ProprietaryHTMLProperties(object):
105
     # Gmail
106
     # Gmail
106
     Gmail_extras_class = 'gmail_extra'
107
     Gmail_extras_class = 'gmail_extra'
120
     # see : https://github.com/roundcube/roundcubemail/issues/6049
121
     # see : https://github.com/roundcube/roundcubemail/issues/6049
121
     Roundcube_quote_prefix_class = 'reply-intro'
122
     Roundcube_quote_prefix_class = 'reply-intro'
122
 
123
 
124
+
125
+
123
 class HtmlChecker(object):
126
 class HtmlChecker(object):
124
 
127
 
125
     @classmethod
128
     @classmethod
181
                 ProprietaryHTMLProperties.Gmail_extras_class):
184
                 ProprietaryHTMLProperties.Gmail_extras_class):
182
             for child in elem.children:
185
             for child in elem.children:
183
                 if cls._has_attr_value(
186
                 if cls._has_attr_value(
184
-                    child,
185
-                    'class',
186
-                    ProprietaryHTMLProperties.Gmail_quote_class):
187
+                        child,
188
+                        'class',
189
+                        ProprietaryHTMLProperties.Gmail_quote_class):
187
                     return True
190
                     return True
188
         return False
191
         return False
189
 
192
 
283
         return False
286
         return False
284
 
287
 
285
 
288
 
289
+class PreSanitizeConfig(object):
290
+    Ignored_tags = ['br', 'hr', 'script', 'style']
291
+    meta_tag = ['body','div']
292
+
286
 class ParsedHTMLMail(object):
293
 class ParsedHTMLMail(object):
287
     """
294
     """
288
     Parse HTML Mail depending on some rules.
295
     Parse HTML Mail depending on some rules.
297
         return str(self._parse_mail())
304
         return str(self._parse_mail())
298
 
305
 
299
     def get_elements(self) -> BodyMailParts:
306
     def get_elements(self) -> BodyMailParts:
300
-        tree = self._make_sanitized_tree()
307
+        tree = self._get_proper_main_body_tree()
301
         return self._distinct_elements(tree)
308
         return self._distinct_elements(tree)
302
 
309
 
303
     def _parse_mail(self) -> BodyMailParts:
310
     def _parse_mail(self) -> BodyMailParts:
305
         elements = self._process_elements(elements)
312
         elements = self._process_elements(elements)
306
         return elements
313
         return elements
307
 
314
 
308
-    def _make_sanitized_tree(self) -> BeautifulSoup:
315
+    def _get_proper_main_body_tree(self) -> BeautifulSoup:
309
         """
316
         """
310
-        Get only html body content and remove some unneeded elements
311
-        :return:
317
+        Get html body tree without some kind of wrapper.
318
+        We need to have text, quote and signature parts at the same tree level
312
         """
319
         """
313
         tree = BeautifulSoup(self.src_html_body, 'html.parser')
320
         tree = BeautifulSoup(self.src_html_body, 'html.parser')
314
 
321
 
317
         if subtree:
324
         if subtree:
318
             tree = BeautifulSoup(str(subtree), 'html.parser')
325
             tree = BeautifulSoup(str(subtree), 'html.parser')
319
 
326
 
320
-        # if some sort of "meta_div", unwrap it
327
+        # if some kind of "meta_div", unwrap it
321
         while len(tree.findAll(recursive=None)) == 1 and \
328
         while len(tree.findAll(recursive=None)) == 1 and \
322
-                tree.find().name.lower() in ['body', 'div']:
329
+                tree.find().name.lower() in PreSanitizeConfig.meta_tag:
323
             tree.find().unwrap()
330
             tree.find().unwrap()
324
 
331
 
325
-        # drop some html elem
326
         for tag in tree.findAll():
332
         for tag in tree.findAll():
327
             # HACK - G.M - 2017-11-28 - Unwrap outlook.com mail
333
             # HACK - G.M - 2017-11-28 - Unwrap outlook.com mail
328
             # if Text -> Signature -> Quote Mail
334
             # if Text -> Signature -> Quote Mail
331
                 if ProprietaryHTMLProperties.Outlook_com_wrapper_id\
337
                 if ProprietaryHTMLProperties.Outlook_com_wrapper_id\
332
                         in tag.attrs['id']:
338
                         in tag.attrs['id']:
333
                     tag.unwrap()
339
                     tag.unwrap()
334
-            # Hack - G.M - 2017-11-28 : remove tag with no enclosure
335
-            # <br> and <hr> tag alone broke html.parser tree,
336
-            # Using another parser may be a solution.
337
-            if tag.name.lower() in ['br', 'hr']:
338
-                tag.unwrap()
339
-                continue
340
-            if tag.name.lower() in ['script', 'style']:
341
-                tag.extract()
342
-
343
         return tree
340
         return tree
344
 
341
 
345
     @classmethod
342
     @classmethod
346
     def _distinct_elements(cls, tree: BeautifulSoup) -> BodyMailParts:
343
     def _distinct_elements(cls, tree: BeautifulSoup) -> BodyMailParts:
347
-        elements = BodyMailParts()
348
-        for tag in list(tree):
349
-            txt = str(tag)
344
+        parts = BodyMailParts()
345
+        for elem in list(tree):
346
+            part_txt = str(elem)
350
             part_type = BodyMailPartType.Main
347
             part_type = BodyMailPartType.Main
351
-            if isinstance(tag, NavigableString):
352
-                txt = tag.replace('\n', '').strip()
353
-            if not txt:
354
-                continue
355
-            if HtmlMailQuoteChecker.is_quote(tag):
348
+            # sanitize NavigableString
349
+            if isinstance(elem, NavigableString):
350
+                part_txt = part_txt.replace('\n', '').strip()
351
+
352
+            if HtmlMailQuoteChecker.is_quote(elem):
356
                 part_type = BodyMailPartType.Quote
353
                 part_type = BodyMailPartType.Quote
357
-            elif HtmlMailSignatureChecker.is_signature(tag):
354
+            elif HtmlMailSignatureChecker.is_signature(elem):
358
                 part_type = BodyMailPartType.Signature
355
                 part_type = BodyMailPartType.Signature
359
-            element = BodyMailPart(txt, part_type)
360
-            elements.append(element)
356
+            else:
357
+                # INFO - G.M - 2017-11-28 - ignore unwanted parts
358
+                if not part_txt:
359
+                    continue
360
+                if isinstance(elem, Tag) \
361
+                        and elem.name.lower() in PreSanitizeConfig.Ignored_tags:
362
+                    continue
363
+
364
+            part = BodyMailPart(part_txt, part_type)
365
+            parts.append(part)
361
             # INFO - G.M - 2017-11-28 - Outlook.com special case
366
             # INFO - G.M - 2017-11-28 - Outlook.com special case
362
             # all after quote tag is quote
367
             # all after quote tag is quote
363
-            if HtmlMailQuoteChecker._is_outlook_com_quote(tag):
364
-                elements.follow = True
365
-        return elements
368
+            if HtmlMailQuoteChecker._is_outlook_com_quote(elem):
369
+                parts.follow = True
370
+        return parts
366
 
371
 
367
     @classmethod
372
     @classmethod
368
     def _process_elements(cls, elements: BodyMailParts) -> BodyMailParts:
373
     def _process_elements(cls, elements: BodyMailParts) -> BodyMailParts: