Browse Source

Refactoring presanitize part to allow more freedom in Quote/Signature checker

Guénaël Muller 6 years ago
parent
commit
02cc3c2d0b
1 changed file with 38 additions and 33 deletions
  1. 38 33
      tracim/tracim/lib/email_body_parser.py

+ 38 - 33
tracim/tracim/lib/email_body_parser.py View File

@@ -101,6 +101,7 @@ class BodyMailParts(object):
101 101
 class SignatureIndexError(Exception):
102 102
     pass
103 103
 
104
+
104 105
 class ProprietaryHTMLProperties(object):
105 106
     # Gmail
106 107
     Gmail_extras_class = 'gmail_extra'
@@ -120,6 +121,8 @@ class ProprietaryHTMLProperties(object):
120 121
     # see : https://github.com/roundcube/roundcubemail/issues/6049
121 122
     Roundcube_quote_prefix_class = 'reply-intro'
122 123
 
124
+
125
+
123 126
 class HtmlChecker(object):
124 127
 
125 128
     @classmethod
@@ -181,9 +184,9 @@ class HtmlMailQuoteChecker(HtmlChecker):
181 184
                 ProprietaryHTMLProperties.Gmail_extras_class):
182 185
             for child in elem.children:
183 186
                 if cls._has_attr_value(
184
-                    child,
185
-                    'class',
186
-                    ProprietaryHTMLProperties.Gmail_quote_class):
187
+                        child,
188
+                        'class',
189
+                        ProprietaryHTMLProperties.Gmail_quote_class):
187 190
                     return True
188 191
         return False
189 192
 
@@ -283,6 +286,10 @@ class HtmlMailSignatureChecker(HtmlChecker):
283 286
         return False
284 287
 
285 288
 
289
+class PreSanitizeConfig(object):
290
+    Ignored_tags = ['br', 'hr', 'script', 'style']
291
+    meta_tag = ['body','div']
292
+
286 293
 class ParsedHTMLMail(object):
287 294
     """
288 295
     Parse HTML Mail depending of some rules.
@@ -297,7 +304,7 @@ class ParsedHTMLMail(object):
297 304
         return str(self._parse_mail())
298 305
 
299 306
     def get_elements(self) -> BodyMailParts:
300
-        tree = self._make_sanitized_tree()
307
+        tree = self._get_proper_main_body_tree()
301 308
         return self._distinct_elements(tree)
302 309
 
303 310
     def _parse_mail(self) -> BodyMailParts:
@@ -305,10 +312,10 @@ class ParsedHTMLMail(object):
305 312
         elements = self._process_elements(elements)
306 313
         return elements
307 314
 
308
-    def _make_sanitized_tree(self) -> BeautifulSoup:
315
+    def _get_proper_main_body_tree(self) -> BeautifulSoup:
309 316
         """
310
-        Get only html body content and remove some unneeded elements
311
-        :return:
317
+        Get html body tree without some kind of wrapper.
318
+        We need to have text, quote and signature parts at the same tree level
312 319
         """
313 320
         tree = BeautifulSoup(self.src_html_body, 'html.parser')
314 321
 
@@ -317,12 +324,11 @@ class ParsedHTMLMail(object):
317 324
         if subtree:
318 325
             tree = BeautifulSoup(str(subtree), 'html.parser')
319 326
 
320
-        # if some sort of "meta_div", unwrap it
327
+        # if some kind of "meta_div", unwrap it
321 328
         while len(tree.findAll(recursive=None)) == 1 and \
322
-                tree.find().name.lower() in ['body', 'div']:
329
+                tree.find().name.lower() in PreSanitizeConfig.meta_tag:
323 330
             tree.find().unwrap()
324 331
 
325
-        # drop some html elem
326 332
         for tag in tree.findAll():
327 333
             # HACK - G.M - 2017-11-28 - Unwrap outlook.com mail
328 334
             # if Text -> Signature -> Quote Mail
@@ -331,38 +337,37 @@ class ParsedHTMLMail(object):
331 337
                 if ProprietaryHTMLProperties.Outlook_com_wrapper_id\
332 338
                         in tag.attrs['id']:
333 339
                     tag.unwrap()
334
-            # Hack - G.M - 2017-11-28 : remove tag with no enclosure
335
-            # <br> and <hr> tag alone broke html.parser tree,
336
-            # Using another parser may be a solution.
337
-            if tag.name.lower() in ['br', 'hr']:
338
-                tag.unwrap()
339
-                continue
340
-            if tag.name.lower() in ['script', 'style']:
341
-                tag.extract()
342
-
343 340
         return tree
344 341
 
345 342
     @classmethod
346 343
     def _distinct_elements(cls, tree: BeautifulSoup) -> BodyMailParts:
347
-        elements = BodyMailParts()
348
-        for tag in list(tree):
349
-            txt = str(tag)
344
+        parts = BodyMailParts()
345
+        for elem in list(tree):
346
+            part_txt = str(elem)
350 347
             part_type = BodyMailPartType.Main
351
-            if isinstance(tag, NavigableString):
352
-                txt = tag.replace('\n', '').strip()
353
-            if not txt:
354
-                continue
355
-            if HtmlMailQuoteChecker.is_quote(tag):
348
+            # sanitize NavigableString
349
+            if isinstance(elem, NavigableString):
350
+                part_txt = part_txt.replace('\n', '').strip()
351
+
352
+            if HtmlMailQuoteChecker.is_quote(elem):
356 353
                 part_type = BodyMailPartType.Quote
357
-            elif HtmlMailSignatureChecker.is_signature(tag):
354
+            elif HtmlMailSignatureChecker.is_signature(elem):
358 355
                 part_type = BodyMailPartType.Signature
359
-            element = BodyMailPart(txt, part_type)
360
-            elements.append(element)
356
+            else:
357
+                # INFO - G.M -2017-11-28 - ignore unwanted parts
358
+                if not part_txt:
359
+                    continue
360
+                if isinstance(elem, Tag) \
361
+                        and elem.name.lower() in PreSanitizeConfig.Ignored_tags:
362
+                    continue
363
+
364
+            part = BodyMailPart(part_txt, part_type)
365
+            parts.append(part)
361 366
             # INFO - G.M - 2017-11-28 - Outlook.com special case
362 367
             # all after quote tag is quote
363
-            if HtmlMailQuoteChecker._is_outlook_com_quote(tag):
364
-                elements.follow = True
365
-        return elements
368
+            if HtmlMailQuoteChecker._is_outlook_com_quote(elem):
369
+                parts.follow = True
370
+        return parts
366 371
 
367 372
     @classmethod
368 373
     def _process_elements(cls, elements: BodyMailParts) -> BodyMailParts: