|
@@ -101,6 +101,7 @@ class BodyMailParts(object):
|
101
|
101
|
class SignatureIndexError(Exception):
|
102
|
102
|
pass
|
103
|
103
|
|
|
104
|
+
|
104
|
105
|
class ProprietaryHTMLProperties(object):
|
105
|
106
|
# Gmail
|
106
|
107
|
Gmail_extras_class = 'gmail_extra'
|
|
@@ -120,6 +121,8 @@ class ProprietaryHTMLProperties(object):
|
120
|
121
|
# see : https://github.com/roundcube/roundcubemail/issues/6049
|
121
|
122
|
Roundcube_quote_prefix_class = 'reply-intro'
|
122
|
123
|
|
|
124
|
+
|
|
125
|
+
|
123
|
126
|
class HtmlChecker(object):
|
124
|
127
|
|
125
|
128
|
@classmethod
|
|
@@ -181,9 +184,9 @@ class HtmlMailQuoteChecker(HtmlChecker):
|
181
|
184
|
ProprietaryHTMLProperties.Gmail_extras_class):
|
182
|
185
|
for child in elem.children:
|
183
|
186
|
if cls._has_attr_value(
|
184
|
|
- child,
|
185
|
|
- 'class',
|
186
|
|
- ProprietaryHTMLProperties.Gmail_quote_class):
|
|
187
|
+ child,
|
|
188
|
+ 'class',
|
|
189
|
+ ProprietaryHTMLProperties.Gmail_quote_class):
|
187
|
190
|
return True
|
188
|
191
|
return False
|
189
|
192
|
|
|
@@ -283,6 +286,10 @@ class HtmlMailSignatureChecker(HtmlChecker):
|
283
|
286
|
return False
|
284
|
287
|
|
285
|
288
|
|
|
289
|
+class PreSanitizeConfig(object):
|
|
290
|
+ Ignored_tags = ['br', 'hr', 'script', 'style']
|
|
291
|
+ meta_tag = ['body','div']
|
|
292
|
+
|
286
|
293
|
class ParsedHTMLMail(object):
|
287
|
294
|
"""
|
288
|
295
|
Parse HTML Mail depending of some rules.
|
|
@@ -297,7 +304,7 @@ class ParsedHTMLMail(object):
|
297
|
304
|
return str(self._parse_mail())
|
298
|
305
|
|
299
|
306
|
def get_elements(self) -> BodyMailParts:
|
300
|
|
- tree = self._make_sanitized_tree()
|
|
307
|
+ tree = self._get_proper_main_body_tree()
|
301
|
308
|
return self._distinct_elements(tree)
|
302
|
309
|
|
303
|
310
|
def _parse_mail(self) -> BodyMailParts:
|
|
@@ -305,10 +312,10 @@ class ParsedHTMLMail(object):
|
305
|
312
|
elements = self._process_elements(elements)
|
306
|
313
|
return elements
|
307
|
314
|
|
308
|
|
- def _make_sanitized_tree(self) -> BeautifulSoup:
|
|
315
|
+ def _get_proper_main_body_tree(self) -> BeautifulSoup:
|
309
|
316
|
"""
|
310
|
|
- Get only html body content and remove some unneeded elements
|
311
|
|
- :return:
|
|
317
|
+ Get html body tree without some kind of wrapper.
|
|
318
|
+ We need to have text, quote and signature parts at the same tree level
|
312
|
319
|
"""
|
313
|
320
|
tree = BeautifulSoup(self.src_html_body, 'html.parser')
|
314
|
321
|
|
|
@@ -317,12 +324,11 @@ class ParsedHTMLMail(object):
|
317
|
324
|
if subtree:
|
318
|
325
|
tree = BeautifulSoup(str(subtree), 'html.parser')
|
319
|
326
|
|
320
|
|
- # if some sort of "meta_div", unwrap it
|
|
327
|
+ # if some kind of "meta_div", unwrap it
|
321
|
328
|
while len(tree.findAll(recursive=None)) == 1 and \
|
322
|
|
- tree.find().name.lower() in ['body', 'div']:
|
|
329
|
+ tree.find().name.lower() in PreSanitizeConfig.meta_tag:
|
323
|
330
|
tree.find().unwrap()
|
324
|
331
|
|
325
|
|
- # drop some html elem
|
326
|
332
|
for tag in tree.findAll():
|
327
|
333
|
# HACK - G.M - 2017-11-28 - Unwrap outlook.com mail
|
328
|
334
|
# if Text -> Signature -> Quote Mail
|
|
@@ -331,38 +337,37 @@ class ParsedHTMLMail(object):
|
331
|
337
|
if ProprietaryHTMLProperties.Outlook_com_wrapper_id\
|
332
|
338
|
in tag.attrs['id']:
|
333
|
339
|
tag.unwrap()
|
334
|
|
- # Hack - G.M - 2017-11-28 : remove tag with no enclosure
|
335
|
|
- # <br> and <hr> tag alone broke html.parser tree,
|
336
|
|
- # Using another parser may be a solution.
|
337
|
|
- if tag.name.lower() in ['br', 'hr']:
|
338
|
|
- tag.unwrap()
|
339
|
|
- continue
|
340
|
|
- if tag.name.lower() in ['script', 'style']:
|
341
|
|
- tag.extract()
|
342
|
|
-
|
343
|
340
|
return tree
|
344
|
341
|
|
345
|
342
|
@classmethod
|
346
|
343
|
def _distinct_elements(cls, tree: BeautifulSoup) -> BodyMailParts:
|
347
|
|
- elements = BodyMailParts()
|
348
|
|
- for tag in list(tree):
|
349
|
|
- txt = str(tag)
|
|
344
|
+ parts = BodyMailParts()
|
|
345
|
+ for elem in list(tree):
|
|
346
|
+ part_txt = str(elem)
|
350
|
347
|
part_type = BodyMailPartType.Main
|
351
|
|
- if isinstance(tag, NavigableString):
|
352
|
|
- txt = tag.replace('\n', '').strip()
|
353
|
|
- if not txt:
|
354
|
|
- continue
|
355
|
|
- if HtmlMailQuoteChecker.is_quote(tag):
|
|
348
|
+ # sanitize NavigableString
|
|
349
|
+ if isinstance(elem, NavigableString):
|
|
350
|
+ part_txt = part_txt.replace('\n', '').strip()
|
|
351
|
+
|
|
352
|
+ if HtmlMailQuoteChecker.is_quote(elem):
|
356
|
353
|
part_type = BodyMailPartType.Quote
|
357
|
|
- elif HtmlMailSignatureChecker.is_signature(tag):
|
|
354
|
+ elif HtmlMailSignatureChecker.is_signature(elem):
|
358
|
355
|
part_type = BodyMailPartType.Signature
|
359
|
|
- element = BodyMailPart(txt, part_type)
|
360
|
|
- elements.append(element)
|
|
356
|
+ else:
|
|
357
|
+ # INFO - G.M -2017-11-28 - ignore unwanted parts
|
|
358
|
+ if not part_txt:
|
|
359
|
+ continue
|
|
360
|
+ if isinstance(elem, Tag) \
|
|
361
|
+ and elem.name.lower() in PreSanitizeConfig.Ignored_tags:
|
|
362
|
+ continue
|
|
363
|
+
|
|
364
|
+ part = BodyMailPart(part_txt, part_type)
|
|
365
|
+ parts.append(part)
|
361
|
366
|
# INFO - G.M - 2017-11-28 - Outlook.com special case
|
362
|
367
|
# all after quote tag is quote
|
363
|
|
- if HtmlMailQuoteChecker._is_outlook_com_quote(tag):
|
364
|
|
- elements.follow = True
|
365
|
|
- return elements
|
|
368
|
+ if HtmlMailQuoteChecker._is_outlook_com_quote(elem):
|
|
369
|
+ parts.follow = True
|
|
370
|
+ return parts
|
366
|
371
|
|
367
|
372
|
@classmethod
|
368
|
373
|
def _process_elements(cls, elements: BodyMailParts) -> BodyMailParts:
|