parser.py 4.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. # -*- coding: utf-8 -*-
  2. from bs4 import BeautifulSoup
  3. from tracim.lib.email_processing.checkers import ProprietaryHTMLAttrValues
  4. from tracim.lib.email_processing.checkers import HtmlMailQuoteChecker
  5. from tracim.lib.email_processing.checkers import HtmlMailSignatureChecker
  6. from tracim.lib.email_processing.models import BodyMailPartType
  7. from tracim.lib.email_processing.models import BodyMailPart
  8. from tracim.lib.email_processing.models import HtmlBodyMailParts
  class PreSanitizeConfig(object):
      """
      Settings for the light clean-up applied to the html while parsing.

      Some sanitizing is needed during parsing so that Main, Quote and
      Signature elements can be told apart.
      """

      # Lower-case names of the tags treated as plain wrappers around the
      # mail content.
      meta_tag = [
          'body',
          'div',
      ]
  class ParsedHTMLMail(object):
      """
      Parse an HTML mail body according to a set of rules.

      The body is split into parts stored in an HtmlBodyMailParts object,
      each part is typed as Main, Quote or Signature using the
      HtmlChecker(s), and some part types are then dropped depending on
      how the parts are laid out.
      """

      def __init__(self, html_body: str):
          """
          :param html_body: html source of the mail body
          """
          self.src_html_body = html_body

      def __str__(self):
          """
          Return the processed mail parts as a string.

          The source html is parsed again on every call.
          """
          return str(self._parse_mail())

      def get_elements(self) -> HtmlBodyMailParts:
          """
          Return the typed parts of the mail body, without applying the
          drop rules of _process_elements.
          """
          tree = self._get_proper_main_body_tree()
          return self._distinct_elements(tree)

      def _parse_mail(self) -> HtmlBodyMailParts:
          """
          Split the mail body into typed parts, then apply the drop rules.
          """
          return self._process_elements(self.get_elements())

      def _get_proper_main_body_tree(self) -> BeautifulSoup:
          """
          Get html body tree without some kind of wrapper.
          We need to have text, quote and signature parts at the same tree
          level.

          :return: parsed tree with the wrapper tags unwrapped
          """
          tree = BeautifulSoup(self.src_html_body, 'html.parser')

          # Only parse body part of html if available
          body = tree.find('body')
          if body:
              tree = BeautifulSoup(str(body), 'html.parser')

          # While the tree holds exactly one top-level tag and that tag is
          # some kind of "meta_div", unwrap it.
          while True:
              top_level_tags = tree.find_all(recursive=False)
              if len(top_level_tags) != 1:
                  break
              wrapper = top_level_tags[0]
              if wrapper.name.lower() not in PreSanitizeConfig.meta_tag:
                  break
              wrapper.unwrap()

          # HACK - G.M - 2017-11-28 - Unwrap outlook.com mail
          # if Text -> Signature -> Quote Mail
          # Text and signature are wrapped into divtagdefaultwrapper
          wrapper_id = ProprietaryHTMLAttrValues.Outlook_com_wrapper_id
          for tag in tree.find_all():
              tag_id = tag.attrs.get('id')
              if tag_id and wrapper_id in tag_id:
                  tag.unwrap()
          return tree

      @classmethod
      def _distinct_elements(cls, tree: BeautifulSoup) -> HtmlBodyMailParts:
          """
          Build one typed BodyMailPart for each element of the tree.

          An element is typed Quote when HtmlMailQuoteChecker.is_quote
          accepts it, else Signature when
          HtmlMailSignatureChecker.is_signature accepts it, else Main.

          :param tree: tree as returned by _get_proper_main_body_tree
          :return: the parts, in iteration order of the tree
          """
          parts = HtmlBodyMailParts()
          # list() takes a snapshot of the elements before iterating
          for elem in list(tree):
              part_txt = str(elem)
              if HtmlMailQuoteChecker.is_quote(elem):
                  part_type = BodyMailPartType.Quote
              elif HtmlMailSignatureChecker.is_signature(elem):
                  part_type = BodyMailPartType.Signature
              else:
                  part_type = BodyMailPartType.Main
              parts.append(BodyMailPart(part_txt, part_type))
              # INFO - G.M - 2017-11-28 - Outlook.com special case
              # all after quote tag is quote
              # NOTE(review): the effect of ``follow`` is implemented in
              # HtmlBodyMailParts - confirm there.
              if HtmlMailQuoteChecker._is_outlook_com_quote(elem):
                  parts.follow = True
          return parts

      @classmethod
      def _process_elements(
              cls,
              elements: HtmlBodyMailParts,
      ) -> HtmlBodyMailParts:
          """
          Drop part types from elements depending on their layout.

          - Case 1, one main and one quote, main first:
            quote and signature parts are dropped.
          - Case 2, one main and one quote, quote first:
            signature parts are dropped.
          - Case 3, any other layout with two elements or more:
            signature parts are dropped.
          - Default, only one element or empty list:
            quote and signature parts are dropped.

          :param elements: parts to process, modified in place
          :return: the same elements object
          """
          if len(elements) >= 2:
              # Case 1 and 2, only one main and one quote
              if elements.get_nb_part_type('main') == 1 and \
                      elements.get_nb_part_type('quote') == 1:
                  first_type = elements[0].part_type
                  if first_type == BodyMailPartType.Main:
                      # Case 1 : Main first
                      cls._process_main_first_case(elements)
                  elif first_type == BodyMailPartType.Quote:
                      # Case 2 : Quote first
                      cls._process_quote_first_case(elements)
              else:
                  # Case 3 : Multiple quotes and/or main
                  cls._process_multiples_elems_case(elements)
          else:
              # default case (only one element or empty list)
              cls._process_default_case(elements)
          return elements

      @classmethod
      def _process_quote_first_case(cls, elements: HtmlBodyMailParts) -> None:
          """
          Quote first: drop the signature parts only.
          """
          elements.drop_part_type(BodyMailPartType.Signature)

      @classmethod
      def _process_main_first_case(cls, elements: HtmlBodyMailParts) -> None:
          """
          Main first: drop the quote parts and the signature parts.
          """
          elements.drop_part_type(BodyMailPartType.Quote)
          elements.drop_part_type(BodyMailPartType.Signature)

      @classmethod
      def _process_multiples_elems_case(
              cls,
              elements: HtmlBodyMailParts,
      ) -> None:
          """
          Multiple quotes and/or main: drop the signature parts only.
          """
          elements.drop_part_type(BodyMailPartType.Signature)

      @classmethod
      def _process_default_case(cls, elements: HtmlBodyMailParts) -> None:
          """
          Default: drop the quote parts and the signature parts.
          """
          elements.drop_part_type(BodyMailPartType.Quote)
          elements.drop_part_type(BodyMailPartType.Signature)