Browse Source

Use last part_type instead of removing empty html elements

Guénaël Muller 7 years ago
parent
commit
110b180291

+ 13 - 0
tracim/tracim/lib/email_processing/models.py View File

@@ -1,3 +1,5 @@
1
+from bs4 import BeautifulSoup
2
+
1 3
 # -*- coding: utf-8 -*-
2 4
 class BodyMailPartType(object):
3 5
     Signature = 'sign'
@@ -96,3 +98,14 @@ class BodyMailParts(object):
96 98
             s_mail += elem.text
97 99
         return str(s_mail)
98 100
 
101
+class HtmlBodyMailParts(BodyMailParts):
102
+
103
+    def append(self, value):
104
+        # INFO - G.M - 2017-12-01 - Override part_type if elem has no content.
105
+        # Choose last elem part_type instead of the proposed one.
106
+        if len(self._list) > 0:
107
+            txt = BeautifulSoup(value.text).get_text().replace('\n','').strip()
108
+            if not txt:
109
+                value.part_type = self._list[-1].part_type
110
+        BodyMailParts._check_value(value)
111
+        BodyMailParts._append(self,value)

+ 11 - 22
tracim/tracim/lib/email_processing/parser.py View File

@@ -8,14 +8,13 @@ from tracim.lib.email_processing.checkers import HtmlMailQuoteChecker
8 8
 from tracim.lib.email_processing.checkers import HtmlMailSignatureChecker
9 9
 from tracim.lib.email_processing.models import BodyMailPartType
10 10
 from tracim.lib.email_processing.models import BodyMailPart
11
-from tracim.lib.email_processing.models import BodyMailParts
11
+from tracim.lib.email_processing.models import HtmlBodyMailParts
12 12
 
13 13
 class PreSanitizeConfig(object):
14 14
     """
15
-    To avoid problems, html need to be a bit during parsing to distinct
15
+    To avoid problems, html needs to be sanitized a bit during parsing to distinguish
16 16
     Main,Quote and Signature elements
17 17
     """
18
-    Ignored_tags = ['br', 'hr', 'script', 'style']
19 18
     meta_tag = ['body', 'div']
20 19
 
21 20
 
@@ -32,11 +31,11 @@ class ParsedHTMLMail(object):
32 31
     def __str__(self):
33 32
         return str(self._parse_mail())
34 33
 
35
-    def get_elements(self) -> BodyMailParts:
34
+    def get_elements(self) -> HtmlBodyMailParts:
36 35
         tree = self._get_proper_main_body_tree()
37 36
         return self._distinct_elements(tree)
38 37
 
39
-    def _parse_mail(self) -> BodyMailParts:
38
+    def _parse_mail(self) -> HtmlBodyMailParts:
40 39
         elements = self.get_elements()
41 40
         elements = self._process_elements(elements)
42 41
         return elements
@@ -69,26 +68,16 @@ class ParsedHTMLMail(object):
69 68
         return tree
70 69
 
71 70
     @classmethod
72
-    def _distinct_elements(cls, tree: BeautifulSoup) -> BodyMailParts:
73
-        parts = BodyMailParts()
71
+    def _distinct_elements(cls, tree: BeautifulSoup) -> HtmlBodyMailParts:
72
+        parts = HtmlBodyMailParts()
74 73
         for elem in list(tree):
75 74
             part_txt = str(elem)
76 75
             part_type = BodyMailPartType.Main
77
-            # sanitize NavigableString
78
-            if isinstance(elem, NavigableString):
79
-                part_txt = part_txt.replace('\n', '').strip()
80 76
 
81 77
             if HtmlMailQuoteChecker.is_quote(elem):
82 78
                 part_type = BodyMailPartType.Quote
83 79
             elif HtmlMailSignatureChecker.is_signature(elem):
84 80
                 part_type = BodyMailPartType.Signature
85
-            else:
86
-                # INFO - G.M -2017-11-28 - ignore unwanted parts
87
-                if not part_txt:
88
-                    continue
89
-                if isinstance(elem, Tag) \
90
-                        and elem.name.lower() in PreSanitizeConfig.Ignored_tags:
91
-                    continue
92 81
 
93 82
             part = BodyMailPart(part_txt, part_type)
94 83
             parts.append(part)
@@ -99,7 +88,7 @@ class ParsedHTMLMail(object):
99 88
         return parts
100 89
 
101 90
     @classmethod
102
-    def _process_elements(cls, elements: BodyMailParts) -> BodyMailParts:
91
+    def _process_elements(cls, elements: HtmlBodyMailParts) -> HtmlBodyMailParts:
103 92
         if len(elements) >= 2:
104 93
             # Case 1 and 2, only one main and one quote
105 94
             if elements.get_nb_part_type('main') == 1 and \
@@ -119,19 +108,19 @@ class ParsedHTMLMail(object):
119 108
         return elements
120 109
 
121 110
     @classmethod
122
-    def _process_quote_first_case(cls, elements: BodyMailParts) -> None:
111
+    def _process_quote_first_case(cls, elements: HtmlBodyMailParts) -> None:
123 112
         elements.drop_part_type(BodyMailPartType.Signature)
124 113
 
125 114
     @classmethod
126
-    def _process_main_first_case(cls, elements: BodyMailParts) -> None:
115
+    def _process_main_first_case(cls, elements: HtmlBodyMailParts) -> None:
127 116
         elements.drop_part_type(BodyMailPartType.Quote)
128 117
         elements.drop_part_type(BodyMailPartType.Signature)
129 118
 
130 119
     @classmethod
131
-    def _process_multiples_elems_case(cls, elements: BodyMailParts) -> None:
120
+    def _process_multiples_elems_case(cls, elements: HtmlBodyMailParts) -> None:
132 121
         elements.drop_part_type(BodyMailPartType.Signature)
133 122
 
134 123
     @classmethod
135
-    def _process_default_case(cls, elements: BodyMailParts) -> None:
124
+    def _process_default_case(cls, elements: HtmlBodyMailParts) -> None:
136 125
         elements.drop_part_type(BodyMailPartType.Quote)
137 126
         elements.drop_part_type(BodyMailPartType.Signature)