Browse Source

Refactoring: Email Processing dir

Guénaël Muller 6 years ago
parent
commit
9e21ac90c9

+ 0 - 418
tracim/tracim/lib/email_body_parser.py View File

@@ -1,418 +0,0 @@
1
-import typing
2
-
3
-from bs4 import BeautifulSoup
4
-from bs4 import Tag
5
-from bs4 import NavigableString
6
-
7
-# BodyParts and Body Parts Objects #
8
-
9
-
10
-class BodyMailPartType(object):
11
-    Signature = 'sign'
12
-    Main = 'main'
13
-    Quote = 'quote'
14
-
15
-
16
-class BodyMailPart(object):
17
-    def __init__(
18
-            self,
19
-            text: str,
20
-            part_type: str
21
-    )-> None:
22
-        self.text = text
23
-        self.part_type = part_type
24
-
25
-
26
-class BodyMailParts(object):
27
-    """
28
-    Data Structure to Distinct part of a Mail body into a "list" of BodyMailPart
29
-    When 2 similar BodyMailPart (same part_type) are added one after the other,
30
-    it doesn't create a new Part, it just merge those elements into one.
31
-    It should always have only one Signature type part, normally
32
-    at the end of the body.
33
-    This object doesn't provide other set method than append() in order to
34
-    preserve object coherence.
35
-    """
36
-    def __init__(self) -> None:
37
-        self._list = []  # type; List[BodyMailPart]
38
-        # INFO - G.M -
39
-        # automatically merge new value with last item if true, without any
40
-        # part_type check, same type as the older one, useful when some tag
41
-        # say "all elem after me is Signature"
42
-        self.follow = False
43
-
44
-    def __len__(self) -> int:
45
-        return len(self._list)
46
-
47
-    def __getitem__(self, index) -> BodyMailPart:
48
-        return self._list[index]
49
-
50
-    def __delitem__(self, index) -> None:
51
-        del self._list[index]
52
-        # FIXME - G.M - 2017-11-27 - Preserve BodyMailParts consistence
53
-        # check elem after and before index and merge them if necessary.
54
-
55
-    def append(self, value) -> None:
56
-        BodyMailParts._check_value(value)
57
-        self._append(value)
58
-
59
-    def _append(self, value) -> None:
60
-        same_type_as_last = len(self._list) > 0 and \
61
-                            self._list[-1].part_type == value.part_type
62
-        if same_type_as_last or self.follow:
63
-            self._list[-1].text += value.text
64
-        else:
65
-            self._list.append(value)
66
-
67
-    @classmethod
68
-    def _check_value(cls, value) -> None:
69
-        if not isinstance(value, BodyMailPart):
70
-            raise TypeError()
71
-
72
-    def drop_part_type(self, part_type: str) -> None:
73
-        """
74
-        Drop all elem of one part_type
75
-        :param part_type: part_type to completely remove
76
-        :return: None
77
-        """
78
-        new_list = [x for x in self._list if x.part_type != part_type]
79
-        self._list = []
80
-        # INFO - G.M - 2017-11-27 - use append() to have a consistent list
81
-        for elem in new_list:
82
-            self.append(elem)
83
-
84
-    def get_nb_part_type(self, part_type: str) -> int:
85
-        """
86
-        Get number of elements of one part_type
87
-        :param part_type: part_type to check
88
-        :return: number of part_type elements
89
-        """
90
-        count = 0
91
-        for elem in self._list:
92
-            if elem.part_type == part_type:
93
-                count += 1
94
-        return count
95
-
96
-    def __str__(self) -> str:
97
-        s_mail = ''
98
-        for elem in self._list:
99
-            s_mail += elem.text
100
-        return str(s_mail)
101
-
102
-# Elements Checkers #
103
-
104
-
105
-class ProprietaryHTMLAttrValues(object):
106
-    """
107
-    This are all Proprietary (mail client specific) html attr value we need to
108
-    check Html Elements
109
-    """
110
-    # Gmail
111
-    Gmail_extras_class = 'gmail_extra'
112
-    Gmail_quote_class = 'gmail_quote'
113
-    Gmail_signature_class = 'gmail_signature'
114
-    # Thunderbird
115
-    Thunderbird_quote_prefix_class = 'moz-cite-prefix'
116
-    Thunderbird_signature_class = 'moz-signature'
117
-    # Outlook.com
118
-    Outlook_com_quote_id = 'divRplyFwdMsg'
119
-    Outlook_com_signature_id = 'Signature'
120
-    Outlook_com_wrapper_id = 'divtagdefaultwrapper'
121
-    # Yahoo
122
-    Yahoo_quote_class = 'yahoo_quoted'
123
-    # Roundcube
124
-    # INFO - G.M - 2017-11-29 - New tag
125
-    # see : https://github.com/roundcube/roundcubemail/issues/6049
126
-    Roundcube_quote_prefix_class = 'reply-intro'
127
-
128
-
129
-class HtmlChecker(object):
130
-
131
-    @classmethod
132
-    def _has_attr_value(
133
-            cls,
134
-            elem: typing.Union[Tag, NavigableString],
135
-            attribute_name: str,
136
-            attribute_value: str,
137
-    )-> bool:
138
-        if isinstance(elem, Tag) and \
139
-                        attribute_name in elem.attrs and \
140
-                        attribute_value in elem.attrs[attribute_name]:
141
-            return True
142
-        return False
143
-
144
-
145
-class HtmlMailQuoteChecker(HtmlChecker):
146
-
147
-    @classmethod
148
-    def is_quote(
149
-            cls,
150
-            elem: typing.Union[Tag, NavigableString]
151
-    ) -> bool:
152
-        return cls._is_standard_quote(elem) \
153
-               or cls._is_thunderbird_quote(elem) \
154
-               or cls._is_gmail_quote(elem) \
155
-               or cls._is_outlook_com_quote(elem) \
156
-               or cls._is_yahoo_quote(elem) \
157
-               or cls._is_roundcube_quote(elem)
158
-
159
-    @classmethod
160
-    def _is_standard_quote(
161
-            cls,
162
-            elem: typing.Union[Tag, NavigableString]
163
-    ) -> bool:
164
-        if isinstance(elem, Tag) \
165
-                and elem.name.lower() == 'blockquote':
166
-            return True
167
-        return False
168
-
169
-    @classmethod
170
-    def _is_thunderbird_quote(
171
-            cls,
172
-            elem: typing.Union[Tag, NavigableString]
173
-    ) -> bool:
174
-        return cls._has_attr_value(
175
-            elem,
176
-            'class',
177
-            ProprietaryHTMLAttrValues.Thunderbird_quote_prefix_class)
178
-
179
-    @classmethod
180
-    def _is_gmail_quote(
181
-            cls,
182
-            elem: typing.Union[Tag, NavigableString]
183
-    ) -> bool:
184
-        if cls._has_attr_value(
185
-                elem,
186
-                'class',
187
-                ProprietaryHTMLAttrValues.Gmail_extras_class):
188
-            for child in elem.children:
189
-                if cls._has_attr_value(
190
-                        child,
191
-                        'class',
192
-                        ProprietaryHTMLAttrValues.Gmail_quote_class):
193
-                    return True
194
-        return False
195
-
196
-    @classmethod
197
-    def _is_outlook_com_quote(
198
-        cls,
199
-        elem: typing.Union[Tag, NavigableString]
200
-    ) -> bool:
201
-        if cls._has_attr_value(
202
-                elem,
203
-                'id',
204
-                ProprietaryHTMLAttrValues.Outlook_com_quote_id):
205
-            return True
206
-        return False
207
-
208
-    @classmethod
209
-    def _is_yahoo_quote(
210
-            cls,
211
-            elem: typing.Union[Tag, NavigableString]
212
-    ) -> bool:
213
-        return cls._has_attr_value(
214
-            elem,
215
-            'class',
216
-            ProprietaryHTMLAttrValues.Yahoo_quote_class)
217
-
218
-    @classmethod
219
-    def _is_roundcube_quote(
220
-            cls,
221
-            elem: typing.Union[Tag, NavigableString]
222
-    ) -> bool:
223
-        return cls._has_attr_value(
224
-            elem,
225
-            'id',
226
-            ProprietaryHTMLAttrValues.Roundcube_quote_prefix_class)
227
-
228
-
229
-class HtmlMailSignatureChecker(HtmlChecker):
230
-
231
-    @classmethod
232
-    def is_signature(
233
-            cls,
234
-            elem: typing.Union[Tag, NavigableString]
235
-    ) -> bool:
236
-        return cls._is_thunderbird_signature(elem) \
237
-               or cls._is_gmail_signature(elem) \
238
-               or cls._is_outlook_com_signature(elem)
239
-
240
-    @classmethod
241
-    def _is_thunderbird_signature(
242
-            cls,
243
-            elem: typing.Union[Tag, NavigableString]
244
-    ) -> bool:
245
-        return cls._has_attr_value(
246
-            elem,
247
-            'class',
248
-            ProprietaryHTMLAttrValues.Thunderbird_signature_class)
249
-
250
-    @classmethod
251
-    def _is_gmail_signature(
252
-            cls,
253
-            elem: typing.Union[Tag, NavigableString]
254
-    ) -> bool:
255
-        if cls._has_attr_value(
256
-                elem,
257
-                'class',
258
-                ProprietaryHTMLAttrValues.Gmail_signature_class):
259
-            return True
260
-        if cls._has_attr_value(
261
-                elem,
262
-                'class',
263
-                ProprietaryHTMLAttrValues.Gmail_extras_class):
264
-            for child in elem.children:
265
-                if cls._has_attr_value(
266
-                        child,
267
-                        'class',
268
-                        ProprietaryHTMLAttrValues.Gmail_signature_class):
269
-                    return True
270
-        if isinstance(elem, Tag) and elem.name.lower() == 'div':
271
-            for child in elem.children:
272
-                if cls._has_attr_value(
273
-                        child,
274
-                        'class',
275
-                        ProprietaryHTMLAttrValues.Gmail_signature_class):
276
-                    return True
277
-        return False
278
-
279
-    @classmethod
280
-    def _is_outlook_com_signature(
281
-            cls,
282
-            elem: typing.Union[Tag, NavigableString]
283
-    ) -> bool:
284
-        if cls._has_attr_value(
285
-                elem,
286
-                'id',
287
-                ProprietaryHTMLAttrValues.Outlook_com_signature_id):
288
-            return True
289
-        return False
290
-
291
-# ParsedHTMLMail #
292
-
293
-
294
-class PreSanitizeConfig(object):
295
-    """
296
-    To avoid problems, html need to be a bit during parsing to distinct
297
-    Main,Quote and Signature elements
298
-    """
299
-    Ignored_tags = ['br', 'hr', 'script', 'style']
300
-    meta_tag = ['body', 'div']
301
-
302
-
303
-class ParsedHTMLMail(object):
304
-    """
305
-    Parse HTML Mail depending of some rules.
306
-    Distinct part of html mail body using BodyMailParts object and
307
-    process differents rules using HtmlChecker(s)
308
-    """
309
-
310
-    def __init__(self, html_body: str):
311
-        self.src_html_body = html_body
312
-
313
-    def __str__(self):
314
-        return str(self._parse_mail())
315
-
316
-    def get_elements(self) -> BodyMailParts:
317
-        tree = self._get_proper_main_body_tree()
318
-        return self._distinct_elements(tree)
319
-
320
-    def _parse_mail(self) -> BodyMailParts:
321
-        elements = self.get_elements()
322
-        elements = self._process_elements(elements)
323
-        return elements
324
-
325
-    def _get_proper_main_body_tree(self) -> BeautifulSoup:
326
-        """
327
-        Get html body tree without some kind of wrapper.
328
-        We need to have text, quote and signature parts at the same tree level
329
-        """
330
-        tree = BeautifulSoup(self.src_html_body, 'html.parser')
331
-
332
-        # Only parse body part of html if available
333
-        subtree = tree.find('body')
334
-        if subtree:
335
-            tree = BeautifulSoup(str(subtree), 'html.parser')
336
-
337
-        # if some kind of "meta_div", unwrap it
338
-        while len(tree.findAll(recursive=None)) == 1 and \
339
-                tree.find().name.lower() in PreSanitizeConfig.meta_tag:
340
-            tree.find().unwrap()
341
-
342
-        for tag in tree.findAll():
343
-            # HACK - G.M - 2017-11-28 - Unwrap outlook.com mail
344
-            # if Text -> Signature -> Quote Mail
345
-            # Text and signature are wrapped into divtagdefaultwrapper
346
-            if tag.attrs.get('id'):
347
-                if ProprietaryHTMLAttrValues.Outlook_com_wrapper_id\
348
-                        in tag.attrs['id']:
349
-                    tag.unwrap()
350
-        return tree
351
-
352
-    @classmethod
353
-    def _distinct_elements(cls, tree: BeautifulSoup) -> BodyMailParts:
354
-        parts = BodyMailParts()
355
-        for elem in list(tree):
356
-            part_txt = str(elem)
357
-            part_type = BodyMailPartType.Main
358
-            # sanitize NavigableString
359
-            if isinstance(elem, NavigableString):
360
-                part_txt = part_txt.replace('\n', '').strip()
361
-
362
-            if HtmlMailQuoteChecker.is_quote(elem):
363
-                part_type = BodyMailPartType.Quote
364
-            elif HtmlMailSignatureChecker.is_signature(elem):
365
-                part_type = BodyMailPartType.Signature
366
-            else:
367
-                # INFO - G.M -2017-11-28 - ignore unwanted parts
368
-                if not part_txt:
369
-                    continue
370
-                if isinstance(elem, Tag) \
371
-                        and elem.name.lower() in PreSanitizeConfig.Ignored_tags:
372
-                    continue
373
-
374
-            part = BodyMailPart(part_txt, part_type)
375
-            parts.append(part)
376
-            # INFO - G.M - 2017-11-28 - Outlook.com special case
377
-            # all after quote tag is quote
378
-            if HtmlMailQuoteChecker._is_outlook_com_quote(elem):
379
-                parts.follow = True
380
-        return parts
381
-
382
-    @classmethod
383
-    def _process_elements(cls, elements: BodyMailParts) -> BodyMailParts:
384
-        if len(elements) >= 2:
385
-            # Case 1 and 2, only one main and one quote
386
-            if elements.get_nb_part_type('main') == 1 and \
387
-                            elements.get_nb_part_type('quote') == 1:
388
-                # Case 1 : Main first
389
-                if elements[0].part_type == BodyMailPartType.Main:
390
-                    cls._process_main_first_case(elements)
391
-                # Case 2 : Quote first
392
-                if elements[0].part_type == BodyMailPartType.Quote:
393
-                    cls._process_quote_first_case(elements)
394
-            else:
395
-                # Case 3 : Multiple quotes and/or main
396
-                cls._process_multiples_elems_case(elements)
397
-        else:
398
-            cls._process_default_case(elements)
399
-            # default case (only one element or empty list)
400
-        return elements
401
-
402
-    @classmethod
403
-    def _process_quote_first_case(cls, elements: BodyMailParts) -> None:
404
-        elements.drop_part_type(BodyMailPartType.Signature)
405
-
406
-    @classmethod
407
-    def _process_main_first_case(cls, elements: BodyMailParts) -> None:
408
-        elements.drop_part_type(BodyMailPartType.Quote)
409
-        elements.drop_part_type(BodyMailPartType.Signature)
410
-
411
-    @classmethod
412
-    def _process_multiples_elems_case(cls, elements: BodyMailParts) -> None:
413
-        elements.drop_part_type(BodyMailPartType.Signature)
414
-
415
-    @classmethod
416
-    def _process_default_case(cls, elements: BodyMailParts) -> None:
417
-        elements.drop_part_type(BodyMailPartType.Quote)
418
-        elements.drop_part_type(BodyMailPartType.Signature)

+ 5 - 9
tracim/tracim/lib/email_fetcher.py View File

@@ -1,23 +1,19 @@
1 1
 # -*- coding: utf-8 -*-
2 2
 
3
-import sys
4
-import time
5 3
 import imaplib
6
-import datetime
7
-import json
4
+import time
8 5
 import typing
9
-from email.message import Message
10
-from email.header import Header, decode_header, make_header
11
-from email.utils import parseaddr, parsedate_tz, mktime_tz
12 6
 from email import message_from_bytes
7
+from email.header import decode_header, make_header
8
+from email.message import Message
9
+from email.utils import parseaddr
13 10
 
14 11
 import markdown
15 12
 import requests
16 13
 from bs4 import BeautifulSoup, Tag
17 14
 from email_reply_parser import EmailReplyParser
18
-
19 15
 from tracim.lib.base import logger
20
-from tracim.lib.email_body_parser import ParsedHTMLMail
16
+from tracim.lib.email_processing.parser import ParsedHTMLMail
21 17
 
22 18
 TRACIM_SPECIAL_KEY_HEADER = 'X-Tracim-Key'
23 19
 # TODO BS 20171124: Think about replacing this dict config with an object

+ 0 - 0
tracim/tracim/lib/email_processing/__init__.py View File


+ 192 - 0
tracim/tracim/lib/email_processing/checkers.py View File

@@ -0,0 +1,192 @@
1
+# -*- coding: utf-8 -*-
2
+import typing
3
+
4
+from bs4 import Tag, NavigableString
5
+
6
+
7
+class ProprietaryHTMLAttrValues(object):
8
+    """
9
+    These are all the proprietary (mail-client-specific) HTML attribute
10
+    values we need in order to check HTML elements.
11
+    """
12
+    # Gmail
13
+    Gmail_extras_class = 'gmail_extra'
14
+    Gmail_quote_class = 'gmail_quote'
15
+    Gmail_signature_class = 'gmail_signature'
16
+    # Thunderbird
17
+    Thunderbird_quote_prefix_class = 'moz-cite-prefix'
18
+    Thunderbird_signature_class = 'moz-signature'
19
+    # Outlook.com
20
+    Outlook_com_quote_id = 'divRplyFwdMsg'
21
+    Outlook_com_signature_id = 'Signature'
22
+    Outlook_com_wrapper_id = 'divtagdefaultwrapper'
23
+    # Yahoo
24
+    Yahoo_quote_class = 'yahoo_quoted'
25
+    # Roundcube
26
+    # INFO - G.M - 2017-11-29 - New tag
27
+    # see : https://github.com/roundcube/roundcubemail/issues/6049
28
+    Roundcube_quote_prefix_class = 'reply-intro'
29
+
30
+
31
+class HtmlChecker(object):
32
+
33
+    @classmethod
34
+    def _has_attr_value(
35
+            cls,
36
+            elem: typing.Union[Tag, NavigableString],
37
+            attribute_name: str,
38
+            attribute_value: str,
39
+    )-> bool:
40
+        if isinstance(elem, Tag) and \
41
+                        attribute_name in elem.attrs and \
42
+                        attribute_value in elem.attrs[attribute_name]:
43
+            return True
44
+        return False
45
+
46
+
47
+class HtmlMailQuoteChecker(HtmlChecker):
48
+
49
+    @classmethod
50
+    def is_quote(
51
+            cls,
52
+            elem: typing.Union[Tag, NavigableString]
53
+    ) -> bool:
54
+        return cls._is_standard_quote(elem) \
55
+               or cls._is_thunderbird_quote(elem) \
56
+               or cls._is_gmail_quote(elem) \
57
+               or cls._is_outlook_com_quote(elem) \
58
+               or cls._is_yahoo_quote(elem) \
59
+               or cls._is_roundcube_quote(elem)
60
+
61
+    @classmethod
62
+    def _is_standard_quote(
63
+            cls,
64
+            elem: typing.Union[Tag, NavigableString]
65
+    ) -> bool:
66
+        if isinstance(elem, Tag) \
67
+                and elem.name.lower() == 'blockquote':
68
+            return True
69
+        return False
70
+
71
+    @classmethod
72
+    def _is_thunderbird_quote(
73
+            cls,
74
+            elem: typing.Union[Tag, NavigableString]
75
+    ) -> bool:
76
+        return cls._has_attr_value(
77
+            elem,
78
+            'class',
79
+            ProprietaryHTMLAttrValues.Thunderbird_quote_prefix_class)
80
+
81
+    @classmethod
82
+    def _is_gmail_quote(
83
+            cls,
84
+            elem: typing.Union[Tag, NavigableString]
85
+    ) -> bool:
86
+        if cls._has_attr_value(
87
+                elem,
88
+                'class',
89
+                ProprietaryHTMLAttrValues.Gmail_extras_class):
90
+            for child in elem.children:
91
+                if cls._has_attr_value(
92
+                        child,
93
+                        'class',
94
+                        ProprietaryHTMLAttrValues.Gmail_quote_class):
95
+                    return True
96
+        return False
97
+
98
+    @classmethod
99
+    def _is_outlook_com_quote(
100
+        cls,
101
+        elem: typing.Union[Tag, NavigableString]
102
+    ) -> bool:
103
+        if cls._has_attr_value(
104
+                elem,
105
+                'id',
106
+                ProprietaryHTMLAttrValues.Outlook_com_quote_id):
107
+            return True
108
+        return False
109
+
110
+    @classmethod
111
+    def _is_yahoo_quote(
112
+            cls,
113
+            elem: typing.Union[Tag, NavigableString]
114
+    ) -> bool:
115
+        return cls._has_attr_value(
116
+            elem,
117
+            'class',
118
+            ProprietaryHTMLAttrValues.Yahoo_quote_class)
119
+
120
+    @classmethod
121
+    def _is_roundcube_quote(
122
+            cls,
123
+            elem: typing.Union[Tag, NavigableString]
124
+    ) -> bool:
125
+        return cls._has_attr_value(
126
+            elem,
127
+            'id',
128
+            ProprietaryHTMLAttrValues.Roundcube_quote_prefix_class)
129
+
130
+
131
+class HtmlMailSignatureChecker(HtmlChecker):
132
+
133
+    @classmethod
134
+    def is_signature(
135
+            cls,
136
+            elem: typing.Union[Tag, NavigableString]
137
+    ) -> bool:
138
+        return cls._is_thunderbird_signature(elem) \
139
+               or cls._is_gmail_signature(elem) \
140
+               or cls._is_outlook_com_signature(elem)
141
+
142
+    @classmethod
143
+    def _is_thunderbird_signature(
144
+            cls,
145
+            elem: typing.Union[Tag, NavigableString]
146
+    ) -> bool:
147
+        return cls._has_attr_value(
148
+            elem,
149
+            'class',
150
+            ProprietaryHTMLAttrValues.Thunderbird_signature_class)
151
+
152
+    @classmethod
153
+    def _is_gmail_signature(
154
+            cls,
155
+            elem: typing.Union[Tag, NavigableString]
156
+    ) -> bool:
157
+        if cls._has_attr_value(
158
+                elem,
159
+                'class',
160
+                ProprietaryHTMLAttrValues.Gmail_signature_class):
161
+            return True
162
+        if cls._has_attr_value(
163
+                elem,
164
+                'class',
165
+                ProprietaryHTMLAttrValues.Gmail_extras_class):
166
+            for child in elem.children:
167
+                if cls._has_attr_value(
168
+                        child,
169
+                        'class',
170
+                        ProprietaryHTMLAttrValues.Gmail_signature_class):
171
+                    return True
172
+        if isinstance(elem, Tag) and elem.name.lower() == 'div':
173
+            for child in elem.children:
174
+                if cls._has_attr_value(
175
+                        child,
176
+                        'class',
177
+                        ProprietaryHTMLAttrValues.Gmail_signature_class):
178
+                    return True
179
+        return False
180
+
181
+    @classmethod
182
+    def _is_outlook_com_signature(
183
+            cls,
184
+            elem: typing.Union[Tag, NavigableString]
185
+    ) -> bool:
186
+        if cls._has_attr_value(
187
+                elem,
188
+                'id',
189
+                ProprietaryHTMLAttrValues.Outlook_com_signature_id):
190
+            return True
191
+        return False
192
+

+ 93 - 0
tracim/tracim/lib/email_processing/models.py View File

@@ -0,0 +1,93 @@
1
+# -*- coding: utf-8 -*-
2
+class BodyMailPartType(object):
3
+    Signature = 'sign'
4
+    Main = 'main'
5
+    Quote = 'quote'
6
+
7
+
8
+class BodyMailPart(object):
9
+    def __init__(
10
+            self,
11
+            text: str,
12
+            part_type: str
13
+    )-> None:
14
+        self.text = text
15
+        self.part_type = part_type
16
+
17
+
18
+class BodyMailParts(object):
19
+    """
20
+    Data structure that splits a mail body into a "list" of BodyMailPart.
21
+    When 2 similar BodyMailPart (same part_type) are added one after the other,
22
+    it doesn't create a new part, it just merges those elements into one.
23
+    It should always have only one Signature type part, normally
24
+    at the end of the body.
25
+    This object doesn't provide any set method other than append() in order to
26
+    preserve object coherence.
27
+    """
28
+    def __init__(self) -> None:
29
+        self._list = []  # type; List[BodyMailPart]
30
+        # INFO - G.M -
31
+        # if true, automatically merge each new value into the last item
32
+        # without any part_type check (it takes the type of the older one);
33
+        # useful when some tag says "all elems after me are Signature"
34
+        self.follow = False
35
+
36
+    def __len__(self) -> int:
37
+        return len(self._list)
38
+
39
+    def __getitem__(self, index) -> BodyMailPart:
40
+        return self._list[index]
41
+
42
+    def __delitem__(self, index) -> None:
43
+        del self._list[index]
44
+        # FIXME - G.M - 2017-11-27 - Preserve BodyMailParts consistency:
45
+        # check elems before and after index and merge them if necessary.
46
+
47
+    def append(self, value) -> None:
48
+        BodyMailParts._check_value(value)
49
+        self._append(value)
50
+
51
+    def _append(self, value) -> None:
52
+        same_type_as_last = len(self._list) > 0 and \
53
+                            self._list[-1].part_type == value.part_type
54
+        if same_type_as_last or self.follow:
55
+            self._list[-1].text += value.text
56
+        else:
57
+            self._list.append(value)
58
+
59
+    @classmethod
60
+    def _check_value(cls, value) -> None:
61
+        if not isinstance(value, BodyMailPart):
62
+            raise TypeError()
63
+
64
+    def drop_part_type(self, part_type: str) -> None:
65
+        """
66
+        Drop all elem of one part_type
67
+        :param part_type: part_type to completely remove
68
+        :return: None
69
+        """
70
+        new_list = [x for x in self._list if x.part_type != part_type]
71
+        self._list = []
72
+        # INFO - G.M - 2017-11-27 - use append() to have a consistent list
73
+        for elem in new_list:
74
+            self.append(elem)
75
+
76
+    def get_nb_part_type(self, part_type: str) -> int:
77
+        """
78
+        Get number of elements of one part_type
79
+        :param part_type: part_type to check
80
+        :return: number of part_type elements
81
+        """
82
+        count = 0
83
+        for elem in self._list:
84
+            if elem.part_type == part_type:
85
+                count += 1
86
+        return count
87
+
88
+    def __str__(self) -> str:
89
+        s_mail = ''
90
+        for elem in self._list:
91
+            s_mail += elem.text
92
+        return str(s_mail)
93
+

+ 137 - 0
tracim/tracim/lib/email_processing/parser.py View File

@@ -0,0 +1,137 @@
1
+# -*- coding: utf-8 -*-
2
+from bs4 import BeautifulSoup
3
+from bs4 import NavigableString
4
+from bs4 import Tag
5
+
6
+from tracim.lib.email_processing.checkers import ProprietaryHTMLAttrValues
7
+from tracim.lib.email_processing.checkers import HtmlMailQuoteChecker
8
+from tracim.lib.email_processing.checkers import HtmlMailSignatureChecker
9
+from tracim.lib.email_processing.models import BodyMailPartType
10
+from tracim.lib.email_processing.models import BodyMailPart
11
+from tracim.lib.email_processing.models import BodyMailParts
12
+
13
+class PreSanitizeConfig(object):
14
+    """
15
+    To avoid problems, html needs to be sanitized a bit during parsing in
16
+    order to distinguish Main, Quote and Signature elements
17
+    """
18
+    Ignored_tags = ['br', 'hr', 'script', 'style']
19
+    meta_tag = ['body', 'div']
20
+
21
+
22
+class ParsedHTMLMail(object):
23
+    """
24
+    Parse an HTML mail according to some rules.
25
+    Split the html mail body into parts using a BodyMailParts object and
26
+    apply different rules using HtmlChecker(s)
27
+    """
28
+
29
+    def __init__(self, html_body: str):
30
+        self.src_html_body = html_body
31
+
32
+    def __str__(self):
33
+        return str(self._parse_mail())
34
+
35
+    def get_elements(self) -> BodyMailParts:
36
+        tree = self._get_proper_main_body_tree()
37
+        return self._distinct_elements(tree)
38
+
39
+    def _parse_mail(self) -> BodyMailParts:
40
+        elements = self.get_elements()
41
+        elements = self._process_elements(elements)
42
+        return elements
43
+
44
+    def _get_proper_main_body_tree(self) -> BeautifulSoup:
45
+        """
46
+        Get html body tree without some kind of wrapper.
47
+        We need to have text, quote and signature parts at the same tree level
48
+        """
49
+        tree = BeautifulSoup(self.src_html_body, 'html.parser')
50
+
51
+        # Only parse body part of html if available
52
+        subtree = tree.find('body')
53
+        if subtree:
54
+            tree = BeautifulSoup(str(subtree), 'html.parser')
55
+
56
+        # if some kind of "meta_div", unwrap it
57
+        while len(tree.findAll(recursive=None)) == 1 and \
58
+                tree.find().name.lower() in PreSanitizeConfig.meta_tag:
59
+            tree.find().unwrap()
60
+
61
+        for tag in tree.findAll():
62
+            # HACK - G.M - 2017-11-28 - Unwrap outlook.com mail
63
+            # if Text -> Signature -> Quote Mail
64
+            # Text and signature are wrapped into divtagdefaultwrapper
65
+            if tag.attrs.get('id'):
66
+                if ProprietaryHTMLAttrValues.Outlook_com_wrapper_id\
67
+                        in tag.attrs['id']:
68
+                    tag.unwrap()
69
+        return tree
70
+
71
+    @classmethod
72
+    def _distinct_elements(cls, tree: BeautifulSoup) -> BodyMailParts:
73
+        parts = BodyMailParts()
74
+        for elem in list(tree):
75
+            part_txt = str(elem)
76
+            part_type = BodyMailPartType.Main
77
+            # sanitize NavigableString
78
+            if isinstance(elem, NavigableString):
79
+                part_txt = part_txt.replace('\n', '').strip()
80
+
81
+            if HtmlMailQuoteChecker.is_quote(elem):
82
+                part_type = BodyMailPartType.Quote
83
+            elif HtmlMailSignatureChecker.is_signature(elem):
84
+                part_type = BodyMailPartType.Signature
85
+            else:
86
+                # INFO - G.M -2017-11-28 - ignore unwanted parts
87
+                if not part_txt:
88
+                    continue
89
+                if isinstance(elem, Tag) \
90
+                        and elem.name.lower() in PreSanitizeConfig.Ignored_tags:
91
+                    continue
92
+
93
+            part = BodyMailPart(part_txt, part_type)
94
+            parts.append(part)
95
+            # INFO - G.M - 2017-11-28 - Outlook.com special case
96
+            # all after quote tag is quote
97
+            if HtmlMailQuoteChecker._is_outlook_com_quote(elem):
98
+                parts.follow = True
99
+        return parts
100
+
101
+    @classmethod
102
+    def _process_elements(cls, elements: BodyMailParts) -> BodyMailParts:
103
+        if len(elements) >= 2:
104
+            # Case 1 and 2, only one main and one quote
105
+            if elements.get_nb_part_type('main') == 1 and \
106
+                            elements.get_nb_part_type('quote') == 1:
107
+                # Case 1 : Main first
108
+                if elements[0].part_type == BodyMailPartType.Main:
109
+                    cls._process_main_first_case(elements)
110
+                # Case 2 : Quote first
111
+                if elements[0].part_type == BodyMailPartType.Quote:
112
+                    cls._process_quote_first_case(elements)
113
+            else:
114
+                # Case 3 : Multiple quotes and/or main
115
+                cls._process_multiples_elems_case(elements)
116
+        else:
117
+            cls._process_default_case(elements)
118
+            # default case (only one element or empty list)
119
+        return elements
120
+
121
+    @classmethod
122
+    def _process_quote_first_case(cls, elements: BodyMailParts) -> None:
123
+        elements.drop_part_type(BodyMailPartType.Signature)
124
+
125
+    @classmethod
126
+    def _process_main_first_case(cls, elements: BodyMailParts) -> None:
127
+        elements.drop_part_type(BodyMailPartType.Quote)
128
+        elements.drop_part_type(BodyMailPartType.Signature)
129
+
130
+    @classmethod
131
+    def _process_multiples_elems_case(cls, elements: BodyMailParts) -> None:
132
+        elements.drop_part_type(BodyMailPartType.Signature)
133
+
134
+    @classmethod
135
+    def _process_default_case(cls, elements: BodyMailParts) -> None:
136
+        elements.drop_part_type(BodyMailPartType.Quote)
137
+        elements.drop_part_type(BodyMailPartType.Signature)

+ 7 - 8
tracim/tracim/tests/library/test_email_body_parser.py View File

@@ -1,12 +1,11 @@
1
-from tracim.lib.email_body_parser import HtmlMailQuoteChecker
2
-from tracim.lib.email_body_parser import HtmlMailSignatureChecker
3
-from tracim.lib.email_body_parser import BodyMailParts
4
-from tracim.lib.email_body_parser import BodyMailPart
5
-from tracim.lib.email_body_parser import BodyMailPartType
6
-from tracim.lib.email_body_parser import ParsedHTMLMail
7
-from tracim.tests import TestStandard
8
-from bs4 import BeautifulSoup,Tag
1
+from bs4 import BeautifulSoup
9 2
 from nose.tools import raises
3
+from tracim.lib.email_processing.checkers import HtmlMailQuoteChecker, \
4
+    HtmlMailSignatureChecker
5
+from tracim.lib.email_processing.parser import ParsedHTMLMail
6
+from tracim.lib.email_processing.models import BodyMailPartType, BodyMailPart, \
7
+    BodyMailParts
8
+from tracim.tests import TestStandard
10 9
 
11 10
 
12 11
 class TestHtmlMailQuoteChecker(TestStandard):