|
@@ -1,418 +0,0 @@
|
1
|
|
-import typing
|
2
|
|
-
|
3
|
|
-from bs4 import BeautifulSoup
|
4
|
|
-from bs4 import Tag
|
5
|
|
-from bs4 import NavigableString
|
6
|
|
-
|
7
|
|
-# BodyParts and Body Parts Objects #
|
8
|
|
-
|
9
|
|
-
|
10
|
|
class BodyMailPartType(object):
    """
    Names of the different kinds of part a mail body is split into.
    Each value is a plain string, meant to be used as the part_type of a
    BodyMailPart.
    """
    # signature block of the mail
    Signature = 'sign'
    # main content of the mail
    Main = 'main'
    # quoted content of the mail
    Quote = 'quote'
|
14
|
|
-
|
15
|
|
-
|
16
|
|
class BodyMailPart(object):
    """
    One chunk of a mail body: its text and the kind of part it is.
    """
    def __init__(self, text: str, part_type: str) -> None:
        """
        :param text: content of this part
        :param part_type: kind of this part (see BodyMailPartType values)
        """
        self.part_type = part_type
        self.text = text
|
24
|
|
-
|
25
|
|
-
|
26
|
|
class BodyMailParts(object):
    """
    Data structure splitting a mail body into a "list" of BodyMailPart.
    When 2 similar BodyMailPart (same part_type) are added one after the
    other, no new part is created: the text of the new one is merged into
    the previous one.
    It should always have only one Signature type part, normally
    at the end of the body.
    This object doesn't provide any other set method than append() in order
    to preserve object coherence.
    """
    def __init__(self) -> None:
        self._list = []  # type: typing.List[BodyMailPart]
        # INFO - G.M -
        # when True, every new value is merged into the last item without
        # any part_type check (it takes the type of the older one). Useful
        # when some tag says "all elem after me is Signature"
        self.follow = False

    def __len__(self) -> int:
        return len(self._list)

    def __getitem__(self, index) -> BodyMailPart:
        return self._list[index]

    def __delitem__(self, index) -> None:
        del self._list[index]
        # FIXME - G.M - 2017-11-27 - Preserve BodyMailParts consistence
        # check elem after and before index and merge them if necessary.

    def append(self, value) -> None:
        """
        Add a part at the end, merging it into the last part when relevant.
        :param value: BodyMailPart to add
        :raise TypeError: if value is not a BodyMailPart
        :return: None
        """
        BodyMailParts._check_value(value)
        self._append(value)

    def _append(self, value) -> None:
        # Merging needs an existing last item: on an empty list the value
        # is always stored as a new part, even when follow is set
        # (indexing [-1] on an empty list would raise IndexError).
        can_merge = bool(self._list) and (
            self.follow or
            self._list[-1].part_type == value.part_type
        )
        if can_merge:
            self._list[-1].text += value.text
        else:
            self._list.append(value)

    @classmethod
    def _check_value(cls, value) -> None:
        if not isinstance(value, BodyMailPart):
            raise TypeError(
                'BodyMailParts only accepts BodyMailPart items'
            )

    def drop_part_type(self, part_type: str) -> None:
        """
        Drop all elem of one part_type
        :param part_type: part_type to completely remove
        :return: None
        """
        kept = [x for x in self._list if x.part_type != part_type]
        self._list = []
        # follow only concerns values coming from the parsing: while
        # rebuilding, it must not merge remaining parts of different types
        # together, so it is disabled then restored.
        previous_follow = self.follow
        self.follow = False
        try:
            # INFO - G.M - 2017-11-27 - use append() to have a consistent
            # list (neighbours of the same type are merged)
            for elem in kept:
                self.append(elem)
        finally:
            self.follow = previous_follow

    def get_nb_part_type(self, part_type: str) -> int:
        """
        Get number of elements of one part_type
        :param part_type: part_type to check
        :return: number of part_type elements
        """
        return sum(1 for elem in self._list if elem.part_type == part_type)

    def __str__(self) -> str:
        # Texts of all parts, concatenated in order without separator
        return ''.join(elem.text for elem in self._list)
|
101
|
|
-
|
102
|
|
-# Elements Checkers #
|
103
|
|
-
|
104
|
|
-
|
105
|
|
class ProprietaryHTMLAttrValues(object):
    """
    Proprietary (mail client specific) html attribute values looked for
    when checking html elements. The name suffix (_class or _id) is the
    name given by the original author, see the checkers for the attribute
    each value is actually compared against.
    """
    # --- Gmail ---
    Gmail_extras_class = 'gmail_extra'
    Gmail_quote_class = 'gmail_quote'
    Gmail_signature_class = 'gmail_signature'

    # --- Thunderbird ---
    Thunderbird_quote_prefix_class = 'moz-cite-prefix'
    Thunderbird_signature_class = 'moz-signature'

    # --- Outlook.com ---
    Outlook_com_quote_id = 'divRplyFwdMsg'
    Outlook_com_signature_id = 'Signature'
    Outlook_com_wrapper_id = 'divtagdefaultwrapper'

    # --- Yahoo ---
    Yahoo_quote_class = 'yahoo_quoted'

    # --- Roundcube ---
    # INFO - G.M - 2017-11-29 - New tag
    # see : https://github.com/roundcube/roundcubemail/issues/6049
    Roundcube_quote_prefix_class = 'reply-intro'
|
127
|
|
-
|
128
|
|
-
|
129
|
|
class HtmlChecker(object):
    """
    Base class of the html element checkers, providing the attribute test
    they share.
    """

    @classmethod
    def _has_attr_value(
            cls,
            elem: typing.Union[Tag, NavigableString],
            attribute_name: str,
            attribute_value: str,
    )-> bool:
        """
        Check if an element carries a given value in one of its attributes
        :param elem: element to check, anything but a Tag never matches
        :param attribute_name: name of the html attribute to look into
        :param attribute_value: value searched in this attribute
        :return: True if the value is found, False otherwise
        """
        if not isinstance(elem, Tag):
            return False
        if attribute_name not in elem.attrs:
            return False
        # The "in" operator is applied to the stored attribute value as
        # is: element membership if it is a list, substring match if it
        # is a str.
        return attribute_value in elem.attrs[attribute_name]
|
143
|
|
-
|
144
|
|
-
|
145
|
|
class HtmlMailQuoteChecker(HtmlChecker):
    """
    Tell if an html element is a quote, according to standard html or to
    the markers of some mail clients.
    """

    @classmethod
    def is_quote(
            cls,
            elem: typing.Union[Tag, NavigableString]
    ) -> bool:
        """
        :param elem: element to check
        :return: True as soon as one of the quote rules matches
        """
        # Rules are tried in this order, stopping at the first match
        rules = (
            cls._is_standard_quote,
            cls._is_thunderbird_quote,
            cls._is_gmail_quote,
            cls._is_outlook_com_quote,
            cls._is_yahoo_quote,
            cls._is_roundcube_quote,
        )
        return any(rule(elem) for rule in rules)

    @classmethod
    def _is_standard_quote(
            cls,
            elem: typing.Union[Tag, NavigableString]
    ) -> bool:
        # Any <blockquote> tag, whatever the case of its name
        return isinstance(elem, Tag) and elem.name.lower() == 'blockquote'

    @classmethod
    def _is_thunderbird_quote(
            cls,
            elem: typing.Union[Tag, NavigableString]
    ) -> bool:
        searched = ProprietaryHTMLAttrValues.Thunderbird_quote_prefix_class
        return cls._has_attr_value(elem, 'class', searched)

    @classmethod
    def _is_gmail_quote(
            cls,
            elem: typing.Union[Tag, NavigableString]
    ) -> bool:
        # A gmail "extra" element is a quote only if one of its direct
        # children is a gmail quote
        extras = ProprietaryHTMLAttrValues.Gmail_extras_class
        if not cls._has_attr_value(elem, 'class', extras):
            return False
        quote = ProprietaryHTMLAttrValues.Gmail_quote_class
        return any(
            cls._has_attr_value(child, 'class', quote)
            for child in elem.children
        )

    @classmethod
    def _is_outlook_com_quote(
            cls,
            elem: typing.Union[Tag, NavigableString]
    ) -> bool:
        searched = ProprietaryHTMLAttrValues.Outlook_com_quote_id
        return cls._has_attr_value(elem, 'id', searched)

    @classmethod
    def _is_yahoo_quote(
            cls,
            elem: typing.Union[Tag, NavigableString]
    ) -> bool:
        searched = ProprietaryHTMLAttrValues.Yahoo_quote_class
        return cls._has_attr_value(elem, 'class', searched)

    @classmethod
    def _is_roundcube_quote(
            cls,
            elem: typing.Union[Tag, NavigableString]
    ) -> bool:
        # The value is looked for in the 'id' attribute, even though the
        # constant name ends with "_class"
        searched = ProprietaryHTMLAttrValues.Roundcube_quote_prefix_class
        return cls._has_attr_value(elem, 'id', searched)
|
227
|
|
-
|
228
|
|
-
|
229
|
|
class HtmlMailSignatureChecker(HtmlChecker):
    """
    Tell if an html element is a signature, according to the markers of
    some mail clients.
    """

    @classmethod
    def is_signature(
            cls,
            elem: typing.Union[Tag, NavigableString]
    ) -> bool:
        """
        :param elem: element to check
        :return: True as soon as one of the signature rules matches
        """
        # Rules are tried in this order, stopping at the first match
        rules = (
            cls._is_thunderbird_signature,
            cls._is_gmail_signature,
            cls._is_outlook_com_signature,
        )
        return any(rule(elem) for rule in rules)

    @classmethod
    def _is_thunderbird_signature(
            cls,
            elem: typing.Union[Tag, NavigableString]
    ) -> bool:
        searched = ProprietaryHTMLAttrValues.Thunderbird_signature_class
        return cls._has_attr_value(elem, 'class', searched)

    @classmethod
    def _is_gmail_signature(
            cls,
            elem: typing.Union[Tag, NavigableString]
    ) -> bool:
        signature = ProprietaryHTMLAttrValues.Gmail_signature_class
        # The element itself is tagged as a gmail signature
        if cls._has_attr_value(elem, 'class', signature):
            return True
        # Otherwise, only a gmail "extra" element or a <div> can be a
        # signature, when one of its direct children is tagged as such
        is_extra = cls._has_attr_value(
            elem,
            'class',
            ProprietaryHTMLAttrValues.Gmail_extras_class
        )
        is_div = isinstance(elem, Tag) and elem.name.lower() == 'div'
        if not (is_extra or is_div):
            return False
        return any(
            cls._has_attr_value(child, 'class', signature)
            for child in elem.children
        )

    @classmethod
    def _is_outlook_com_signature(
            cls,
            elem: typing.Union[Tag, NavigableString]
    ) -> bool:
        searched = ProprietaryHTMLAttrValues.Outlook_com_signature_id
        return cls._has_attr_value(elem, 'id', searched)
|
290
|
|
-
|
291
|
|
-# ParsedHTMLMail #
|
292
|
|
-
|
293
|
|
-
|
294
|
|
class PreSanitizeConfig(object):
    """
    To avoid problems, html needs to be a bit sanitized during parsing in
    order to distinguish Main, Quote and Signature elements: these are the
    tag names concerned.
    """
    # tags dropped when they are neither a quote nor a signature
    Ignored_tags = ['br', 'hr', 'script', 'style']
    # wrapper tags removed when they are the only tag at the tree top level
    meta_tag = ['body', 'div']
|
301
|
|
-
|
302
|
|
-
|
303
|
|
class ParsedHTMLMail(object):
    """
    Parse HTML Mail depending of some rules.
    Distinct part of html mail body using BodyMailParts object and
    process different rules using HtmlChecker(s)
    """

    def __init__(self, html_body: str):
        """
        :param html_body: html source of the mail body, parsed lazily
        """
        self.src_html_body = html_body

    def __str__(self):
        # Parsing is done again at each call, nothing is cached
        return str(self._parse_mail())

    def get_elements(self) -> BodyMailParts:
        """
        Split the mail body into typed parts, without dropping any type
        :return: parts found, in document order
        """
        tree = self._get_proper_main_body_tree()
        return self._distinct_elements(tree)

    def _parse_mail(self) -> BodyMailParts:
        return self._process_elements(self.get_elements())

    def _get_proper_main_body_tree(self) -> BeautifulSoup:
        """
        Get html body tree without some kind of wrapper.
        We need to have text, quote and signature parts at the same tree level
        """
        tree = BeautifulSoup(self.src_html_body, 'html.parser')

        # Only parse body part of html if available
        subtree = tree.find('body')
        if subtree:
            tree = BeautifulSoup(str(subtree), 'html.parser')

        # if some kind of "meta_div", unwrap it: as long as the top level
        # holds a single tag and this tag is a wrapper, replace it by its
        # content (recursive=False: only direct children are counted)
        while len(tree.findAll(recursive=False)) == 1 and \
                tree.find().name.lower() in PreSanitizeConfig.meta_tag:
            tree.find().unwrap()

        for tag in tree.findAll():
            # HACK - G.M - 2017-11-28 - Unwrap outlook.com mail
            # if Text -> Signature -> Quote Mail
            # Text and signature are wrapped into divtagdefaultwrapper
            tag_id = tag.attrs.get('id')
            if tag_id and \
                    ProprietaryHTMLAttrValues.Outlook_com_wrapper_id in tag_id:
                tag.unwrap()
        return tree

    @classmethod
    def _distinct_elements(cls, tree: BeautifulSoup) -> BodyMailParts:
        """
        Convert each top level element of the tree into a typed part
        :param tree: tree with text, quote and signature at top level
        :return: parts found, in document order
        """
        parts = BodyMailParts()
        for elem in list(tree):
            part_txt = str(elem)
            part_type = BodyMailPartType.Main
            # sanitize NavigableString
            if isinstance(elem, NavigableString):
                part_txt = part_txt.replace('\n', '').strip()

            if HtmlMailQuoteChecker.is_quote(elem):
                part_type = BodyMailPartType.Quote
            elif HtmlMailSignatureChecker.is_signature(elem):
                part_type = BodyMailPartType.Signature
            else:
                # INFO - G.M -2017-11-28 - ignore unwanted parts
                if not part_txt:
                    continue
                if isinstance(elem, Tag) \
                        and elem.name.lower() in PreSanitizeConfig.Ignored_tags:
                    continue

            parts.append(BodyMailPart(part_txt, part_type))
            # INFO - G.M - 2017-11-28 - Outlook.com special case
            # all after quote tag is quote
            if HtmlMailQuoteChecker._is_outlook_com_quote(elem):
                parts.follow = True
        return parts

    @classmethod
    def _process_elements(cls, elements: BodyMailParts) -> BodyMailParts:
        """
        Drop the parts which should not be kept, depending on the layout
        of the mail
        :param elements: parts of the mail, modified in place
        :return: the same elements object
        """
        if len(elements) >= 2:
            nb_main = elements.get_nb_part_type(BodyMailPartType.Main)
            nb_quote = elements.get_nb_part_type(BodyMailPartType.Quote)
            # Case 1 and 2, only one main and one quote
            if nb_main == 1 and nb_quote == 1:
                first_type = elements[0].part_type
                # NOTE(review): when the first part is neither Main nor
                # Quote (i.e. a Signature), nothing is dropped here -
                # confirm this is intended.
                if first_type == BodyMailPartType.Main:
                    # Case 1 : Main first
                    cls._process_main_first_case(elements)
                elif first_type == BodyMailPartType.Quote:
                    # Case 2 : Quote first
                    cls._process_quote_first_case(elements)
            else:
                # Case 3 : Multiple quotes and/or main
                cls._process_multiples_elems_case(elements)
        else:
            # default case (only one element or empty list)
            cls._process_default_case(elements)
        return elements

    @classmethod
    def _process_quote_first_case(cls, elements: BodyMailParts) -> None:
        # Quote is kept, only the signature is removed
        elements.drop_part_type(BodyMailPartType.Signature)

    @classmethod
    def _process_main_first_case(cls, elements: BodyMailParts) -> None:
        # Only the main part is kept
        elements.drop_part_type(BodyMailPartType.Quote)
        elements.drop_part_type(BodyMailPartType.Signature)

    @classmethod
    def _process_multiples_elems_case(cls, elements: BodyMailParts) -> None:
        # Quotes are kept, only the signature is removed
        elements.drop_part_type(BodyMailPartType.Signature)

    @classmethod
    def _process_default_case(cls, elements: BodyMailParts) -> None:
        # Only the main part is kept
        elements.drop_part_type(BodyMailPartType.Quote)
        elements.drop_part_type(BodyMailPartType.Signature)
|