浏览代码

Refactoring: Email Processing dir

Guénaël Muller 7 年前
父节点
当前提交
9e21ac90c9

+ 0 - 418
tracim/tracim/lib/email_body_parser.py 查看文件

1
-import typing
2
-
3
-from bs4 import BeautifulSoup
4
-from bs4 import Tag
5
-from bs4 import NavigableString
6
-
7
-# BodyParts and Body Parts Objects #
8
-
9
-
10
-class BodyMailPartType(object):
11
-    Signature = 'sign'
12
-    Main = 'main'
13
-    Quote = 'quote'
14
-
15
-
16
-class BodyMailPart(object):
17
-    def __init__(
18
-            self,
19
-            text: str,
20
-            part_type: str
21
-    )-> None:
22
-        self.text = text
23
-        self.part_type = part_type
24
-
25
-
26
-class BodyMailParts(object):
27
-    """
28
-    Data Structure to Distinct part of a Mail body into a "list" of BodyMailPart
29
-    When 2 similar BodyMailPart (same part_type) are added one after the other,
30
-    it doesn't create a new Part, it just merge those elements into one.
31
-    It should always have only one Signature type part, normally
32
-    at the end of the body.
33
-    This object doesn't provide other set method than append() in order to
34
-    preserve object coherence.
35
-    """
36
-    def __init__(self) -> None:
37
-        self._list = []  # type; List[BodyMailPart]
38
-        # INFO - G.M -
39
-        # automatically merge new value with last item if true, without any
40
-        # part_type check, same type as the older one, useful when some tag
41
-        # say "all elem after me is Signature"
42
-        self.follow = False
43
-
44
-    def __len__(self) -> int:
45
-        return len(self._list)
46
-
47
-    def __getitem__(self, index) -> BodyMailPart:
48
-        return self._list[index]
49
-
50
-    def __delitem__(self, index) -> None:
51
-        del self._list[index]
52
-        # FIXME - G.M - 2017-11-27 - Preserve BodyMailParts consistence
53
-        # check elem after and before index and merge them if necessary.
54
-
55
-    def append(self, value) -> None:
56
-        BodyMailParts._check_value(value)
57
-        self._append(value)
58
-
59
-    def _append(self, value) -> None:
60
-        same_type_as_last = len(self._list) > 0 and \
61
-                            self._list[-1].part_type == value.part_type
62
-        if same_type_as_last or self.follow:
63
-            self._list[-1].text += value.text
64
-        else:
65
-            self._list.append(value)
66
-
67
-    @classmethod
68
-    def _check_value(cls, value) -> None:
69
-        if not isinstance(value, BodyMailPart):
70
-            raise TypeError()
71
-
72
-    def drop_part_type(self, part_type: str) -> None:
73
-        """
74
-        Drop all elem of one part_type
75
-        :param part_type: part_type to completely remove
76
-        :return: None
77
-        """
78
-        new_list = [x for x in self._list if x.part_type != part_type]
79
-        self._list = []
80
-        # INFO - G.M - 2017-11-27 - use append() to have a consistent list
81
-        for elem in new_list:
82
-            self.append(elem)
83
-
84
-    def get_nb_part_type(self, part_type: str) -> int:
85
-        """
86
-        Get number of elements of one part_type
87
-        :param part_type: part_type to check
88
-        :return: number of part_type elements
89
-        """
90
-        count = 0
91
-        for elem in self._list:
92
-            if elem.part_type == part_type:
93
-                count += 1
94
-        return count
95
-
96
-    def __str__(self) -> str:
97
-        s_mail = ''
98
-        for elem in self._list:
99
-            s_mail += elem.text
100
-        return str(s_mail)
101
-
102
-# Elements Checkers #
103
-
104
-
105
-class ProprietaryHTMLAttrValues(object):
106
-    """
107
-    This are all Proprietary (mail client specific) html attr value we need to
108
-    check Html Elements
109
-    """
110
-    # Gmail
111
-    Gmail_extras_class = 'gmail_extra'
112
-    Gmail_quote_class = 'gmail_quote'
113
-    Gmail_signature_class = 'gmail_signature'
114
-    # Thunderbird
115
-    Thunderbird_quote_prefix_class = 'moz-cite-prefix'
116
-    Thunderbird_signature_class = 'moz-signature'
117
-    # Outlook.com
118
-    Outlook_com_quote_id = 'divRplyFwdMsg'
119
-    Outlook_com_signature_id = 'Signature'
120
-    Outlook_com_wrapper_id = 'divtagdefaultwrapper'
121
-    # Yahoo
122
-    Yahoo_quote_class = 'yahoo_quoted'
123
-    # Roundcube
124
-    # INFO - G.M - 2017-11-29 - New tag
125
-    # see : https://github.com/roundcube/roundcubemail/issues/6049
126
-    Roundcube_quote_prefix_class = 'reply-intro'
127
-
128
-
129
-class HtmlChecker(object):
130
-
131
-    @classmethod
132
-    def _has_attr_value(
133
-            cls,
134
-            elem: typing.Union[Tag, NavigableString],
135
-            attribute_name: str,
136
-            attribute_value: str,
137
-    )-> bool:
138
-        if isinstance(elem, Tag) and \
139
-                        attribute_name in elem.attrs and \
140
-                        attribute_value in elem.attrs[attribute_name]:
141
-            return True
142
-        return False
143
-
144
-
145
-class HtmlMailQuoteChecker(HtmlChecker):
146
-
147
-    @classmethod
148
-    def is_quote(
149
-            cls,
150
-            elem: typing.Union[Tag, NavigableString]
151
-    ) -> bool:
152
-        return cls._is_standard_quote(elem) \
153
-               or cls._is_thunderbird_quote(elem) \
154
-               or cls._is_gmail_quote(elem) \
155
-               or cls._is_outlook_com_quote(elem) \
156
-               or cls._is_yahoo_quote(elem) \
157
-               or cls._is_roundcube_quote(elem)
158
-
159
-    @classmethod
160
-    def _is_standard_quote(
161
-            cls,
162
-            elem: typing.Union[Tag, NavigableString]
163
-    ) -> bool:
164
-        if isinstance(elem, Tag) \
165
-                and elem.name.lower() == 'blockquote':
166
-            return True
167
-        return False
168
-
169
-    @classmethod
170
-    def _is_thunderbird_quote(
171
-            cls,
172
-            elem: typing.Union[Tag, NavigableString]
173
-    ) -> bool:
174
-        return cls._has_attr_value(
175
-            elem,
176
-            'class',
177
-            ProprietaryHTMLAttrValues.Thunderbird_quote_prefix_class)
178
-
179
-    @classmethod
180
-    def _is_gmail_quote(
181
-            cls,
182
-            elem: typing.Union[Tag, NavigableString]
183
-    ) -> bool:
184
-        if cls._has_attr_value(
185
-                elem,
186
-                'class',
187
-                ProprietaryHTMLAttrValues.Gmail_extras_class):
188
-            for child in elem.children:
189
-                if cls._has_attr_value(
190
-                        child,
191
-                        'class',
192
-                        ProprietaryHTMLAttrValues.Gmail_quote_class):
193
-                    return True
194
-        return False
195
-
196
-    @classmethod
197
-    def _is_outlook_com_quote(
198
-        cls,
199
-        elem: typing.Union[Tag, NavigableString]
200
-    ) -> bool:
201
-        if cls._has_attr_value(
202
-                elem,
203
-                'id',
204
-                ProprietaryHTMLAttrValues.Outlook_com_quote_id):
205
-            return True
206
-        return False
207
-
208
-    @classmethod
209
-    def _is_yahoo_quote(
210
-            cls,
211
-            elem: typing.Union[Tag, NavigableString]
212
-    ) -> bool:
213
-        return cls._has_attr_value(
214
-            elem,
215
-            'class',
216
-            ProprietaryHTMLAttrValues.Yahoo_quote_class)
217
-
218
-    @classmethod
219
-    def _is_roundcube_quote(
220
-            cls,
221
-            elem: typing.Union[Tag, NavigableString]
222
-    ) -> bool:
223
-        return cls._has_attr_value(
224
-            elem,
225
-            'id',
226
-            ProprietaryHTMLAttrValues.Roundcube_quote_prefix_class)
227
-
228
-
229
-class HtmlMailSignatureChecker(HtmlChecker):
230
-
231
-    @classmethod
232
-    def is_signature(
233
-            cls,
234
-            elem: typing.Union[Tag, NavigableString]
235
-    ) -> bool:
236
-        return cls._is_thunderbird_signature(elem) \
237
-               or cls._is_gmail_signature(elem) \
238
-               or cls._is_outlook_com_signature(elem)
239
-
240
-    @classmethod
241
-    def _is_thunderbird_signature(
242
-            cls,
243
-            elem: typing.Union[Tag, NavigableString]
244
-    ) -> bool:
245
-        return cls._has_attr_value(
246
-            elem,
247
-            'class',
248
-            ProprietaryHTMLAttrValues.Thunderbird_signature_class)
249
-
250
-    @classmethod
251
-    def _is_gmail_signature(
252
-            cls,
253
-            elem: typing.Union[Tag, NavigableString]
254
-    ) -> bool:
255
-        if cls._has_attr_value(
256
-                elem,
257
-                'class',
258
-                ProprietaryHTMLAttrValues.Gmail_signature_class):
259
-            return True
260
-        if cls._has_attr_value(
261
-                elem,
262
-                'class',
263
-                ProprietaryHTMLAttrValues.Gmail_extras_class):
264
-            for child in elem.children:
265
-                if cls._has_attr_value(
266
-                        child,
267
-                        'class',
268
-                        ProprietaryHTMLAttrValues.Gmail_signature_class):
269
-                    return True
270
-        if isinstance(elem, Tag) and elem.name.lower() == 'div':
271
-            for child in elem.children:
272
-                if cls._has_attr_value(
273
-                        child,
274
-                        'class',
275
-                        ProprietaryHTMLAttrValues.Gmail_signature_class):
276
-                    return True
277
-        return False
278
-
279
-    @classmethod
280
-    def _is_outlook_com_signature(
281
-            cls,
282
-            elem: typing.Union[Tag, NavigableString]
283
-    ) -> bool:
284
-        if cls._has_attr_value(
285
-                elem,
286
-                'id',
287
-                ProprietaryHTMLAttrValues.Outlook_com_signature_id):
288
-            return True
289
-        return False
290
-
291
-# ParsedHTMLMail #
292
-
293
-
294
-class PreSanitizeConfig(object):
295
-    """
296
-    To avoid problems, html need to be a bit during parsing to distinct
297
-    Main,Quote and Signature elements
298
-    """
299
-    Ignored_tags = ['br', 'hr', 'script', 'style']
300
-    meta_tag = ['body', 'div']
301
-
302
-
303
-class ParsedHTMLMail(object):
304
-    """
305
-    Parse HTML Mail depending of some rules.
306
-    Distinct part of html mail body using BodyMailParts object and
307
-    process differents rules using HtmlChecker(s)
308
-    """
309
-
310
-    def __init__(self, html_body: str):
311
-        self.src_html_body = html_body
312
-
313
-    def __str__(self):
314
-        return str(self._parse_mail())
315
-
316
-    def get_elements(self) -> BodyMailParts:
317
-        tree = self._get_proper_main_body_tree()
318
-        return self._distinct_elements(tree)
319
-
320
-    def _parse_mail(self) -> BodyMailParts:
321
-        elements = self.get_elements()
322
-        elements = self._process_elements(elements)
323
-        return elements
324
-
325
-    def _get_proper_main_body_tree(self) -> BeautifulSoup:
326
-        """
327
-        Get html body tree without some kind of wrapper.
328
-        We need to have text, quote and signature parts at the same tree level
329
-        """
330
-        tree = BeautifulSoup(self.src_html_body, 'html.parser')
331
-
332
-        # Only parse body part of html if available
333
-        subtree = tree.find('body')
334
-        if subtree:
335
-            tree = BeautifulSoup(str(subtree), 'html.parser')
336
-
337
-        # if some kind of "meta_div", unwrap it
338
-        while len(tree.findAll(recursive=None)) == 1 and \
339
-                tree.find().name.lower() in PreSanitizeConfig.meta_tag:
340
-            tree.find().unwrap()
341
-
342
-        for tag in tree.findAll():
343
-            # HACK - G.M - 2017-11-28 - Unwrap outlook.com mail
344
-            # if Text -> Signature -> Quote Mail
345
-            # Text and signature are wrapped into divtagdefaultwrapper
346
-            if tag.attrs.get('id'):
347
-                if ProprietaryHTMLAttrValues.Outlook_com_wrapper_id\
348
-                        in tag.attrs['id']:
349
-                    tag.unwrap()
350
-        return tree
351
-
352
-    @classmethod
353
-    def _distinct_elements(cls, tree: BeautifulSoup) -> BodyMailParts:
354
-        parts = BodyMailParts()
355
-        for elem in list(tree):
356
-            part_txt = str(elem)
357
-            part_type = BodyMailPartType.Main
358
-            # sanitize NavigableString
359
-            if isinstance(elem, NavigableString):
360
-                part_txt = part_txt.replace('\n', '').strip()
361
-
362
-            if HtmlMailQuoteChecker.is_quote(elem):
363
-                part_type = BodyMailPartType.Quote
364
-            elif HtmlMailSignatureChecker.is_signature(elem):
365
-                part_type = BodyMailPartType.Signature
366
-            else:
367
-                # INFO - G.M -2017-11-28 - ignore unwanted parts
368
-                if not part_txt:
369
-                    continue
370
-                if isinstance(elem, Tag) \
371
-                        and elem.name.lower() in PreSanitizeConfig.Ignored_tags:
372
-                    continue
373
-
374
-            part = BodyMailPart(part_txt, part_type)
375
-            parts.append(part)
376
-            # INFO - G.M - 2017-11-28 - Outlook.com special case
377
-            # all after quote tag is quote
378
-            if HtmlMailQuoteChecker._is_outlook_com_quote(elem):
379
-                parts.follow = True
380
-        return parts
381
-
382
-    @classmethod
383
-    def _process_elements(cls, elements: BodyMailParts) -> BodyMailParts:
384
-        if len(elements) >= 2:
385
-            # Case 1 and 2, only one main and one quote
386
-            if elements.get_nb_part_type('main') == 1 and \
387
-                            elements.get_nb_part_type('quote') == 1:
388
-                # Case 1 : Main first
389
-                if elements[0].part_type == BodyMailPartType.Main:
390
-                    cls._process_main_first_case(elements)
391
-                # Case 2 : Quote first
392
-                if elements[0].part_type == BodyMailPartType.Quote:
393
-                    cls._process_quote_first_case(elements)
394
-            else:
395
-                # Case 3 : Multiple quotes and/or main
396
-                cls._process_multiples_elems_case(elements)
397
-        else:
398
-            cls._process_default_case(elements)
399
-            # default case (only one element or empty list)
400
-        return elements
401
-
402
-    @classmethod
403
-    def _process_quote_first_case(cls, elements: BodyMailParts) -> None:
404
-        elements.drop_part_type(BodyMailPartType.Signature)
405
-
406
-    @classmethod
407
-    def _process_main_first_case(cls, elements: BodyMailParts) -> None:
408
-        elements.drop_part_type(BodyMailPartType.Quote)
409
-        elements.drop_part_type(BodyMailPartType.Signature)
410
-
411
-    @classmethod
412
-    def _process_multiples_elems_case(cls, elements: BodyMailParts) -> None:
413
-        elements.drop_part_type(BodyMailPartType.Signature)
414
-
415
-    @classmethod
416
-    def _process_default_case(cls, elements: BodyMailParts) -> None:
417
-        elements.drop_part_type(BodyMailPartType.Quote)
418
-        elements.drop_part_type(BodyMailPartType.Signature)

+ 5 - 9
tracim/tracim/lib/email_fetcher.py 查看文件

1
 # -*- coding: utf-8 -*-
1
 # -*- coding: utf-8 -*-
2
 
2
 
3
-import sys
4
-import time
5
 import imaplib
3
 import imaplib
6
-import datetime
7
-import json
4
+import time
8
 import typing
5
 import typing
9
-from email.message import Message
10
-from email.header import Header, decode_header, make_header
11
-from email.utils import parseaddr, parsedate_tz, mktime_tz
12
 from email import message_from_bytes
6
 from email import message_from_bytes
7
+from email.header import decode_header, make_header
8
+from email.message import Message
9
+from email.utils import parseaddr
13
 
10
 
14
 import markdown
11
 import markdown
15
 import requests
12
 import requests
16
 from bs4 import BeautifulSoup, Tag
13
 from bs4 import BeautifulSoup, Tag
17
 from email_reply_parser import EmailReplyParser
14
 from email_reply_parser import EmailReplyParser
18
-
19
 from tracim.lib.base import logger
15
 from tracim.lib.base import logger
20
-from tracim.lib.email_body_parser import ParsedHTMLMail
16
+from tracim.lib.email_processing.parser import ParsedHTMLMail
21
 
17
 
22
 TRACIM_SPECIAL_KEY_HEADER = 'X-Tracim-Key'
18
 TRACIM_SPECIAL_KEY_HEADER = 'X-Tracim-Key'
23
 # TODO BS 20171124: Think about replace thin dict config by object
19
 # TODO BS 20171124: Think about replace thin dict config by object

+ 0 - 0
tracim/tracim/lib/email_processing/__init__.py 查看文件


+ 192 - 0
tracim/tracim/lib/email_processing/checkers.py 查看文件

1
+# -*- coding: utf-8 -*-
2
+import typing
3
+
4
+from bs4 import Tag, NavigableString
5
+
6
+
7
+class ProprietaryHTMLAttrValues(object):
8
+    """
9
+    These are all the proprietary (mail-client-specific) HTML attribute values
10
+    we need in order to check HTML elements.
11
+    """
12
+    # Gmail
13
+    Gmail_extras_class = 'gmail_extra'
14
+    Gmail_quote_class = 'gmail_quote'
15
+    Gmail_signature_class = 'gmail_signature'
16
+    # Thunderbird
17
+    Thunderbird_quote_prefix_class = 'moz-cite-prefix'
18
+    Thunderbird_signature_class = 'moz-signature'
19
+    # Outlook.com
20
+    Outlook_com_quote_id = 'divRplyFwdMsg'
21
+    Outlook_com_signature_id = 'Signature'
22
+    Outlook_com_wrapper_id = 'divtagdefaultwrapper'
23
+    # Yahoo
24
+    Yahoo_quote_class = 'yahoo_quoted'
25
+    # Roundcube
26
+    # INFO - G.M - 2017-11-29 - New tag
27
+    # see : https://github.com/roundcube/roundcubemail/issues/6049
28
+    Roundcube_quote_prefix_class = 'reply-intro'
29
+
30
+
31
+class HtmlChecker(object):
32
+
33
+    @classmethod
34
+    def _has_attr_value(
35
+            cls,
36
+            elem: typing.Union[Tag, NavigableString],
37
+            attribute_name: str,
38
+            attribute_value: str,
39
+    )-> bool:
40
+        if isinstance(elem, Tag) and \
41
+                        attribute_name in elem.attrs and \
42
+                        attribute_value in elem.attrs[attribute_name]:
43
+            return True
44
+        return False
45
+
46
+
47
+class HtmlMailQuoteChecker(HtmlChecker):
48
+
49
+    @classmethod
50
+    def is_quote(
51
+            cls,
52
+            elem: typing.Union[Tag, NavigableString]
53
+    ) -> bool:
54
+        return cls._is_standard_quote(elem) \
55
+               or cls._is_thunderbird_quote(elem) \
56
+               or cls._is_gmail_quote(elem) \
57
+               or cls._is_outlook_com_quote(elem) \
58
+               or cls._is_yahoo_quote(elem) \
59
+               or cls._is_roundcube_quote(elem)
60
+
61
+    @classmethod
62
+    def _is_standard_quote(
63
+            cls,
64
+            elem: typing.Union[Tag, NavigableString]
65
+    ) -> bool:
66
+        if isinstance(elem, Tag) \
67
+                and elem.name.lower() == 'blockquote':
68
+            return True
69
+        return False
70
+
71
+    @classmethod
72
+    def _is_thunderbird_quote(
73
+            cls,
74
+            elem: typing.Union[Tag, NavigableString]
75
+    ) -> bool:
76
+        return cls._has_attr_value(
77
+            elem,
78
+            'class',
79
+            ProprietaryHTMLAttrValues.Thunderbird_quote_prefix_class)
80
+
81
+    @classmethod
82
+    def _is_gmail_quote(
83
+            cls,
84
+            elem: typing.Union[Tag, NavigableString]
85
+    ) -> bool:
86
+        if cls._has_attr_value(
87
+                elem,
88
+                'class',
89
+                ProprietaryHTMLAttrValues.Gmail_extras_class):
90
+            for child in elem.children:
91
+                if cls._has_attr_value(
92
+                        child,
93
+                        'class',
94
+                        ProprietaryHTMLAttrValues.Gmail_quote_class):
95
+                    return True
96
+        return False
97
+
98
+    @classmethod
99
+    def _is_outlook_com_quote(
100
+        cls,
101
+        elem: typing.Union[Tag, NavigableString]
102
+    ) -> bool:
103
+        if cls._has_attr_value(
104
+                elem,
105
+                'id',
106
+                ProprietaryHTMLAttrValues.Outlook_com_quote_id):
107
+            return True
108
+        return False
109
+
110
+    @classmethod
111
+    def _is_yahoo_quote(
112
+            cls,
113
+            elem: typing.Union[Tag, NavigableString]
114
+    ) -> bool:
115
+        return cls._has_attr_value(
116
+            elem,
117
+            'class',
118
+            ProprietaryHTMLAttrValues.Yahoo_quote_class)
119
+
120
+    @classmethod
121
+    def _is_roundcube_quote(
122
+            cls,
123
+            elem: typing.Union[Tag, NavigableString]
124
+    ) -> bool:
125
+        return cls._has_attr_value(
126
+            elem,
127
+            'id',
128
+            ProprietaryHTMLAttrValues.Roundcube_quote_prefix_class)
129
+
130
+
131
+class HtmlMailSignatureChecker(HtmlChecker):
132
+
133
+    @classmethod
134
+    def is_signature(
135
+            cls,
136
+            elem: typing.Union[Tag, NavigableString]
137
+    ) -> bool:
138
+        return cls._is_thunderbird_signature(elem) \
139
+               or cls._is_gmail_signature(elem) \
140
+               or cls._is_outlook_com_signature(elem)
141
+
142
+    @classmethod
143
+    def _is_thunderbird_signature(
144
+            cls,
145
+            elem: typing.Union[Tag, NavigableString]
146
+    ) -> bool:
147
+        return cls._has_attr_value(
148
+            elem,
149
+            'class',
150
+            ProprietaryHTMLAttrValues.Thunderbird_signature_class)
151
+
152
+    @classmethod
153
+    def _is_gmail_signature(
154
+            cls,
155
+            elem: typing.Union[Tag, NavigableString]
156
+    ) -> bool:
157
+        if cls._has_attr_value(
158
+                elem,
159
+                'class',
160
+                ProprietaryHTMLAttrValues.Gmail_signature_class):
161
+            return True
162
+        if cls._has_attr_value(
163
+                elem,
164
+                'class',
165
+                ProprietaryHTMLAttrValues.Gmail_extras_class):
166
+            for child in elem.children:
167
+                if cls._has_attr_value(
168
+                        child,
169
+                        'class',
170
+                        ProprietaryHTMLAttrValues.Gmail_signature_class):
171
+                    return True
172
+        if isinstance(elem, Tag) and elem.name.lower() == 'div':
173
+            for child in elem.children:
174
+                if cls._has_attr_value(
175
+                        child,
176
+                        'class',
177
+                        ProprietaryHTMLAttrValues.Gmail_signature_class):
178
+                    return True
179
+        return False
180
+
181
+    @classmethod
182
+    def _is_outlook_com_signature(
183
+            cls,
184
+            elem: typing.Union[Tag, NavigableString]
185
+    ) -> bool:
186
+        if cls._has_attr_value(
187
+                elem,
188
+                'id',
189
+                ProprietaryHTMLAttrValues.Outlook_com_signature_id):
190
+            return True
191
+        return False
192
+

+ 93 - 0
tracim/tracim/lib/email_processing/models.py 查看文件

1
+# -*- coding: utf-8 -*-
2
+class BodyMailPartType(object):
3
+    Signature = 'sign'
4
+    Main = 'main'
5
+    Quote = 'quote'
6
+
7
+
8
+class BodyMailPart(object):
9
+    def __init__(
10
+            self,
11
+            text: str,
12
+            part_type: str
13
+    )-> None:
14
+        self.text = text
15
+        self.part_type = part_type
16
+
17
+
18
+class BodyMailParts(object):
19
+    """
20
+    Data structure that splits a mail body into a "list" of BodyMailPart.
21
+    When 2 similar BodyMailPart (same part_type) are added one after the other,
22
+    it doesn't create a new part; it just merges those elements into one.
23
+    It should always have only one Signature type part, normally
24
+    at the end of the body.
25
+    This object doesn't provide any setter other than append(), in order to
26
+    preserve object coherence.
27
+    """
28
+    def __init__(self) -> None:
29
+        self._list = []  # type; List[BodyMailPart]
30
+        # INFO - G.M -
31
+        # if true, each new value is automatically merged into the last item
32
+        # without any part_type check (it takes the type of the older one);
33
+        # useful when some tag says "all elements after me are Signature"
34
+        self.follow = False
35
+
36
+    def __len__(self) -> int:
37
+        return len(self._list)
38
+
39
+    def __getitem__(self, index) -> BodyMailPart:
40
+        return self._list[index]
41
+
42
+    def __delitem__(self, index) -> None:
43
+        del self._list[index]
44
+        # FIXME - G.M - 2017-11-27 - Preserve BodyMailParts consistency:
45
+        # check elements before and after index and merge them if necessary.
46
+
47
+    def append(self, value) -> None:
48
+        BodyMailParts._check_value(value)
49
+        self._append(value)
50
+
51
+    def _append(self, value) -> None:
52
+        same_type_as_last = len(self._list) > 0 and \
53
+                            self._list[-1].part_type == value.part_type
54
+        if same_type_as_last or self.follow:
55
+            self._list[-1].text += value.text
56
+        else:
57
+            self._list.append(value)
58
+
59
+    @classmethod
60
+    def _check_value(cls, value) -> None:
61
+        if not isinstance(value, BodyMailPart):
62
+            raise TypeError()
63
+
64
+    def drop_part_type(self, part_type: str) -> None:
65
+        """
66
+        Drop all elem of one part_type
67
+        :param part_type: part_type to completely remove
68
+        :return: None
69
+        """
70
+        new_list = [x for x in self._list if x.part_type != part_type]
71
+        self._list = []
72
+        # INFO - G.M - 2017-11-27 - use append() to have a consistent list
73
+        for elem in new_list:
74
+            self.append(elem)
75
+
76
+    def get_nb_part_type(self, part_type: str) -> int:
77
+        """
78
+        Get number of elements of one part_type
79
+        :param part_type: part_type to check
80
+        :return: number of part_type elements
81
+        """
82
+        count = 0
83
+        for elem in self._list:
84
+            if elem.part_type == part_type:
85
+                count += 1
86
+        return count
87
+
88
+    def __str__(self) -> str:
89
+        s_mail = ''
90
+        for elem in self._list:
91
+            s_mail += elem.text
92
+        return str(s_mail)
93
+

+ 137 - 0
tracim/tracim/lib/email_processing/parser.py 查看文件

1
+# -*- coding: utf-8 -*-
2
+from bs4 import BeautifulSoup
3
+from bs4 import NavigableString
4
+from bs4 import Tag
5
+
6
+from tracim.lib.email_processing.checkers import ProprietaryHTMLAttrValues
7
+from tracim.lib.email_processing.checkers import HtmlMailQuoteChecker
8
+from tracim.lib.email_processing.checkers import HtmlMailSignatureChecker
9
+from tracim.lib.email_processing.models import BodyMailPartType
10
+from tracim.lib.email_processing.models import BodyMailPart
11
+from tracim.lib.email_processing.models import BodyMailParts
12
+
13
+class PreSanitizeConfig(object):
14
+    """
15
+    To avoid problems, html needs to be sanitized a bit during parsing to
16
+    distinguish Main, Quote and Signature elements
17
+    """
18
+    Ignored_tags = ['br', 'hr', 'script', 'style']
19
+    meta_tag = ['body', 'div']
20
+
21
+
22
+class ParsedHTMLMail(object):
23
+    """
24
+    Parse HTML mail according to some rules.
25
+    Split the html mail body into parts using a BodyMailParts object and
26
+    apply different rules using HtmlChecker(s)
27
+    """
28
+
29
+    def __init__(self, html_body: str):
30
+        self.src_html_body = html_body
31
+
32
+    def __str__(self):
33
+        return str(self._parse_mail())
34
+
35
+    def get_elements(self) -> BodyMailParts:
36
+        tree = self._get_proper_main_body_tree()
37
+        return self._distinct_elements(tree)
38
+
39
+    def _parse_mail(self) -> BodyMailParts:
40
+        elements = self.get_elements()
41
+        elements = self._process_elements(elements)
42
+        return elements
43
+
44
+    def _get_proper_main_body_tree(self) -> BeautifulSoup:
45
+        """
46
+        Get html body tree without some kind of wrapper.
47
+        We need to have text, quote and signature parts at the same tree level
48
+        """
49
+        tree = BeautifulSoup(self.src_html_body, 'html.parser')
50
+
51
+        # Only parse body part of html if available
52
+        subtree = tree.find('body')
53
+        if subtree:
54
+            tree = BeautifulSoup(str(subtree), 'html.parser')
55
+
56
+        # if some kind of "meta_div", unwrap it
57
+        while len(tree.findAll(recursive=None)) == 1 and \
58
+                tree.find().name.lower() in PreSanitizeConfig.meta_tag:
59
+            tree.find().unwrap()
60
+
61
+        for tag in tree.findAll():
62
+            # HACK - G.M - 2017-11-28 - Unwrap outlook.com mail
63
+            # if Text -> Signature -> Quote Mail
64
+            # Text and signature are wrapped into divtagdefaultwrapper
65
+            if tag.attrs.get('id'):
66
+                if ProprietaryHTMLAttrValues.Outlook_com_wrapper_id\
67
+                        in tag.attrs['id']:
68
+                    tag.unwrap()
69
+        return tree
70
+
71
+    @classmethod
72
+    def _distinct_elements(cls, tree: BeautifulSoup) -> BodyMailParts:
73
+        parts = BodyMailParts()
74
+        for elem in list(tree):
75
+            part_txt = str(elem)
76
+            part_type = BodyMailPartType.Main
77
+            # sanitize NavigableString
78
+            if isinstance(elem, NavigableString):
79
+                part_txt = part_txt.replace('\n', '').strip()
80
+
81
+            if HtmlMailQuoteChecker.is_quote(elem):
82
+                part_type = BodyMailPartType.Quote
83
+            elif HtmlMailSignatureChecker.is_signature(elem):
84
+                part_type = BodyMailPartType.Signature
85
+            else:
86
+                # INFO - G.M -2017-11-28 - ignore unwanted parts
87
+                if not part_txt:
88
+                    continue
89
+                if isinstance(elem, Tag) \
90
+                        and elem.name.lower() in PreSanitizeConfig.Ignored_tags:
91
+                    continue
92
+
93
+            part = BodyMailPart(part_txt, part_type)
94
+            parts.append(part)
95
+            # INFO - G.M - 2017-11-28 - Outlook.com special case
96
+            # all after quote tag is quote
97
+            if HtmlMailQuoteChecker._is_outlook_com_quote(elem):
98
+                parts.follow = True
99
+        return parts
100
+
101
+    @classmethod
102
+    def _process_elements(cls, elements: BodyMailParts) -> BodyMailParts:
103
+        if len(elements) >= 2:
104
+            # Case 1 and 2, only one main and one quote
105
+            if elements.get_nb_part_type('main') == 1 and \
106
+                            elements.get_nb_part_type('quote') == 1:
107
+                # Case 1 : Main first
108
+                if elements[0].part_type == BodyMailPartType.Main:
109
+                    cls._process_main_first_case(elements)
110
+                # Case 2 : Quote first
111
+                if elements[0].part_type == BodyMailPartType.Quote:
112
+                    cls._process_quote_first_case(elements)
113
+            else:
114
+                # Case 3 : Multiple quotes and/or main
115
+                cls._process_multiples_elems_case(elements)
116
+        else:
117
+            cls._process_default_case(elements)
118
+            # default case (only one element or empty list)
119
+        return elements
120
+
121
+    @classmethod
122
+    def _process_quote_first_case(cls, elements: BodyMailParts) -> None:
123
+        elements.drop_part_type(BodyMailPartType.Signature)
124
+
125
+    @classmethod
126
+    def _process_main_first_case(cls, elements: BodyMailParts) -> None:
127
+        elements.drop_part_type(BodyMailPartType.Quote)
128
+        elements.drop_part_type(BodyMailPartType.Signature)
129
+
130
+    @classmethod
131
+    def _process_multiples_elems_case(cls, elements: BodyMailParts) -> None:
132
+        elements.drop_part_type(BodyMailPartType.Signature)
133
+
134
+    @classmethod
135
+    def _process_default_case(cls, elements: BodyMailParts) -> None:
136
+        elements.drop_part_type(BodyMailPartType.Quote)
137
+        elements.drop_part_type(BodyMailPartType.Signature)

+ 7 - 8
tracim/tracim/tests/library/test_email_body_parser.py 查看文件

1
-from tracim.lib.email_body_parser import HtmlMailQuoteChecker
2
-from tracim.lib.email_body_parser import HtmlMailSignatureChecker
3
-from tracim.lib.email_body_parser import BodyMailParts
4
-from tracim.lib.email_body_parser import BodyMailPart
5
-from tracim.lib.email_body_parser import BodyMailPartType
6
-from tracim.lib.email_body_parser import ParsedHTMLMail
7
-from tracim.tests import TestStandard
8
-from bs4 import BeautifulSoup,Tag
1
+from bs4 import BeautifulSoup
9
 from nose.tools import raises
2
 from nose.tools import raises
3
+from tracim.lib.email_processing.checkers import HtmlMailQuoteChecker, \
4
+    HtmlMailSignatureChecker
5
+from tracim.lib.email_processing.parser import ParsedHTMLMail
6
+from tracim.lib.email_processing.models import BodyMailPartType, BodyMailPart, \
7
+    BodyMailParts
8
+from tracim.tests import TestStandard
10
 
9
 
11
 
10
 
12
 class TestHtmlMailQuoteChecker(TestStandard):
11
 class TestHtmlMailQuoteChecker(TestStandard):