Bläddra i källkod

Use last part_type instead of removing empty html elements

Guénaël Muller 7 år sedan
förälder
incheckning
110b180291

+ 13 - 0
tracim/tracim/lib/email_processing/models.py Visa fil

1
+from bs4 import BeautifulSoup
2
+
1
 # -*- coding: utf-8 -*-
3
 # -*- coding: utf-8 -*-
2
 class BodyMailPartType(object):
4
 class BodyMailPartType(object):
3
     Signature = 'sign'
5
     Signature = 'sign'
96
             s_mail += elem.text
98
             s_mail += elem.text
97
         return str(s_mail)
99
         return str(s_mail)
98
 
100
 
101
+class HtmlBodyMailParts(BodyMailParts):
102
+
103
+    def append(self, value):
104
+        # INFO - G.M - 2017-12-01 - Override part_type if elem has no content.
105
+        # Choose last elem part_type instead of the proposed one.
106
+        if len(self._list) > 0:
107
+            txt = BeautifulSoup(value.text).get_text().replace('\n','').strip()
108
+            if not txt:
109
+                value.part_type = self._list[-1].part_type
110
+        BodyMailParts._check_value(value)
111
+        BodyMailParts._append(self,value)

+ 11 - 22
tracim/tracim/lib/email_processing/parser.py Visa fil

8
 from tracim.lib.email_processing.checkers import HtmlMailSignatureChecker
8
 from tracim.lib.email_processing.checkers import HtmlMailSignatureChecker
9
 from tracim.lib.email_processing.models import BodyMailPartType
9
 from tracim.lib.email_processing.models import BodyMailPartType
10
 from tracim.lib.email_processing.models import BodyMailPart
10
 from tracim.lib.email_processing.models import BodyMailPart
11
-from tracim.lib.email_processing.models import BodyMailParts
11
+from tracim.lib.email_processing.models import HtmlBodyMailParts
12
 
12
 
13
 class PreSanitizeConfig(object):
13
 class PreSanitizeConfig(object):
14
     """
14
     """
15
-    To avoid problems, html need to be a bit during parsing to distinct
15
+    To avoid problems, html needs to be sanitized a bit during parsing to distinguish
16
     Main,Quote and Signature elements
16
     Main,Quote and Signature elements
17
     """
17
     """
18
-    Ignored_tags = ['br', 'hr', 'script', 'style']
19
     meta_tag = ['body', 'div']
18
     meta_tag = ['body', 'div']
20
 
19
 
21
 
20
 
32
     def __str__(self):
31
     def __str__(self):
33
         return str(self._parse_mail())
32
         return str(self._parse_mail())
34
 
33
 
35
-    def get_elements(self) -> BodyMailParts:
34
+    def get_elements(self) -> HtmlBodyMailParts:
36
         tree = self._get_proper_main_body_tree()
35
         tree = self._get_proper_main_body_tree()
37
         return self._distinct_elements(tree)
36
         return self._distinct_elements(tree)
38
 
37
 
39
-    def _parse_mail(self) -> BodyMailParts:
38
+    def _parse_mail(self) -> HtmlBodyMailParts:
40
         elements = self.get_elements()
39
         elements = self.get_elements()
41
         elements = self._process_elements(elements)
40
         elements = self._process_elements(elements)
42
         return elements
41
         return elements
69
         return tree
68
         return tree
70
 
69
 
71
     @classmethod
70
     @classmethod
72
-    def _distinct_elements(cls, tree: BeautifulSoup) -> BodyMailParts:
73
-        parts = BodyMailParts()
71
+    def _distinct_elements(cls, tree: BeautifulSoup) -> HtmlBodyMailParts:
72
+        parts = HtmlBodyMailParts()
74
         for elem in list(tree):
73
         for elem in list(tree):
75
             part_txt = str(elem)
74
             part_txt = str(elem)
76
             part_type = BodyMailPartType.Main
75
             part_type = BodyMailPartType.Main
77
-            # sanitize NavigableString
78
-            if isinstance(elem, NavigableString):
79
-                part_txt = part_txt.replace('\n', '').strip()
80
 
76
 
81
             if HtmlMailQuoteChecker.is_quote(elem):
77
             if HtmlMailQuoteChecker.is_quote(elem):
82
                 part_type = BodyMailPartType.Quote
78
                 part_type = BodyMailPartType.Quote
83
             elif HtmlMailSignatureChecker.is_signature(elem):
79
             elif HtmlMailSignatureChecker.is_signature(elem):
84
                 part_type = BodyMailPartType.Signature
80
                 part_type = BodyMailPartType.Signature
85
-            else:
86
-                # INFO - G.M -2017-11-28 - ignore unwanted parts
87
-                if not part_txt:
88
-                    continue
89
-                if isinstance(elem, Tag) \
90
-                        and elem.name.lower() in PreSanitizeConfig.Ignored_tags:
91
-                    continue
92
 
81
 
93
             part = BodyMailPart(part_txt, part_type)
82
             part = BodyMailPart(part_txt, part_type)
94
             parts.append(part)
83
             parts.append(part)
99
         return parts
88
         return parts
100
 
89
 
101
     @classmethod
90
     @classmethod
102
-    def _process_elements(cls, elements: BodyMailParts) -> BodyMailParts:
91
+    def _process_elements(cls, elements: HtmlBodyMailParts) -> HtmlBodyMailParts:
103
         if len(elements) >= 2:
92
         if len(elements) >= 2:
104
             # Case 1 and 2, only one main and one quote
93
             # Case 1 and 2, only one main and one quote
105
             if elements.get_nb_part_type('main') == 1 and \
94
             if elements.get_nb_part_type('main') == 1 and \
119
         return elements
108
         return elements
120
 
109
 
121
     @classmethod
110
     @classmethod
122
-    def _process_quote_first_case(cls, elements: BodyMailParts) -> None:
111
+    def _process_quote_first_case(cls, elements: HtmlBodyMailParts) -> None:
123
         elements.drop_part_type(BodyMailPartType.Signature)
112
         elements.drop_part_type(BodyMailPartType.Signature)
124
 
113
 
125
     @classmethod
114
     @classmethod
126
-    def _process_main_first_case(cls, elements: BodyMailParts) -> None:
115
+    def _process_main_first_case(cls, elements: HtmlBodyMailParts) -> None:
127
         elements.drop_part_type(BodyMailPartType.Quote)
116
         elements.drop_part_type(BodyMailPartType.Quote)
128
         elements.drop_part_type(BodyMailPartType.Signature)
117
         elements.drop_part_type(BodyMailPartType.Signature)
129
 
118
 
130
     @classmethod
119
     @classmethod
131
-    def _process_multiples_elems_case(cls, elements: BodyMailParts) -> None:
120
+    def _process_multiples_elems_case(cls, elements: HtmlBodyMailParts) -> None:
132
         elements.drop_part_type(BodyMailPartType.Signature)
121
         elements.drop_part_type(BodyMailPartType.Signature)
133
 
122
 
134
     @classmethod
123
     @classmethod
135
-    def _process_default_case(cls, elements: BodyMailParts) -> None:
124
+    def _process_default_case(cls, elements: HtmlBodyMailParts) -> None:
136
         elements.drop_part_type(BodyMailPartType.Quote)
125
         elements.drop_part_type(BodyMailPartType.Quote)
137
         elements.drop_part_type(BodyMailPartType.Signature)
126
         elements.drop_part_type(BodyMailPartType.Signature)