# email_body_parser.py (13KB)
  1. import typing
  2. from bs4 import BeautifulSoup
  3. from bs4 import Tag
  4. from bs4 import NavigableString
  class BodyMailPartType:
      """Labels naming the kind of section a mail body part belongs to."""

      # Default label given to content that is neither quote nor signature.
      Main = 'main'
      # Label for content recognised as quoted text.
      Quote = 'quote'
      # Label for content recognised as a signature.
      Signature = 'sign'
  class BodyMailPart:
      """One piece of a mail body: its text plus the label of its kind."""

      def __init__(self, text: str, part_type: str) -> None:
          """
          :param text: content of this part
          :param part_type: label of this part; the rest of the file passes
              BodyMailPartType values here
          """
          self.part_type = part_type
          self.text = text
  class BodyMailParts(object):
      """
      Data structure splitting a mail body into a "list" of BodyMailPart.

      When two similar BodyMailPart (same part_type) are added one after the
      other, no new part is created: their texts are merged into one part.
      It should always have only one Signature type part, normally
      at the end of the body.
      This object doesn't provide any set method other than append(), in
      order to preserve object coherence.
      """

      def __init__(self) -> None:
          self._list = []  # type: typing.List[BodyMailPart]
          # INFO - G.M -
          # When true, every new value is merged into the last item without
          # any part_type check (it takes the type of the older one). Useful
          # when some tag says "every element after me is Signature".
          self.follow = False

      def __len__(self) -> int:
          return len(self._list)

      def __getitem__(self, index) -> 'BodyMailPart':
          return self._list[index]

      def __delitem__(self, index) -> None:
          del self._list[index]
          # FIXME - G.M - 2017-11-27 - Preserve BodyMailParts consistency:
          # check elements before and after index and merge them if necessary.

      def append(self, value) -> None:
          """
          Add a part, merging it into the last one when appropriate.

          :param value: BodyMailPart to add
          :raises TypeError: if value is not a BodyMailPart
          """
          BodyMailParts._check_value(value)
          self._append(value)

      def _append(self, value) -> None:
          """
          Merge value into the last part, or store it as a new part.

          A merge happens when follow is set or when the last part has the
          same part_type. With an empty list there is nothing to merge into,
          so the value is always stored, even when follow is set (the
          previous code raised IndexError in that situation).
          """
          if self._list:
              last = self._list[-1]
              if self.follow or last.part_type == value.part_type:
                  last.text += value.text
                  return
          self._list.append(value)

      @classmethod
      def _check_value(cls, value) -> None:
          """
          :raises TypeError: if value is not a BodyMailPart
          """
          if not isinstance(value, BodyMailPart):
              raise TypeError(
                  'BodyMailPart expected, got {}'.format(type(value).__name__)
              )

      def drop_part_type(self, part_type: str) -> None:
          """
          Drop all elements of one part_type.

          Remaining neighbours that end up with the same part_type are
          merged. The follow flag is ignored while the list is rebuilt and
          put back afterwards; otherwise the rebuild would crash on the
          empty list or glue parts of different types together.

          :param part_type: part_type to completely remove
          :return: None
          """
          kept = [x for x in self._list if x.part_type != part_type]
          self._list = []
          previous_follow = self.follow
          self.follow = False
          try:
              # INFO - G.M - 2017-11-27 - re-add through the merge logic to
              # have a consistent list. These elements were already
              # validated when they first entered the list.
              for elem in kept:
                  self._append(elem)
          finally:
              self.follow = previous_follow

      def get_nb_part_type(self, part_type: str) -> int:
          """
          Get number of elements of one part_type.

          :param part_type: part_type to check
          :return: number of part_type elements
          """
          return sum(1 for elem in self._list if elem.part_type == part_type)

      def __str__(self) -> str:
          """Return the texts of all parts, concatenated in order."""
          return ''.join(elem.text for elem in self._list)
  class SignatureIndexError(Exception):
      """Error type declared here; no visible code in this file raises it."""
  class ProprietaryHTMLProperties:
      """
      HTML class names and ids that mail clients put on the quote,
      signature or wrapper elements of the mails they produce.
      """

      # --- Gmail ---
      Gmail_extras_class = 'gmail_extra'
      Gmail_quote_class = 'gmail_quote'
      Gmail_signature_class = 'gmail_signature'

      # --- Thunderbird ---
      Thunderbird_quote_prefix_class = 'moz-cite-prefix'
      Thunderbird_signature_class = 'moz-signature'

      # --- Outlook.com ---
      Outlook_com_quote_id = 'divRplyFwdMsg'
      Outlook_com_signature_id = 'Signature'
      Outlook_com_wrapper_id = 'divtagdefaultwrapper'

      # --- Yahoo ---
      Yahoo_quote_class = 'yahoo_quoted'

      # --- Roundcube ---
      # INFO - G.M - 2017-11-29 - New tag
      # see : https://github.com/roundcube/roundcubemail/issues/6049
      # NOTE: despite the "class" in its name, the quote checker of this
      # file compares this value against the 'id' attribute.
      Roundcube_quote_prefix_class = 'reply-intro'
  class HtmlChecker:
      """Base class providing the attribute test shared by the checkers."""

      @classmethod
      def _has_attr_value(
              cls,
              elem: typing.Union[Tag, NavigableString],
              attribute_name: str,
              attribute_value: str,
      ) -> bool:
          """
          Tell whether elem is a Tag whose attribute contains a value.

          The comparison is Python's ``in`` applied to whatever is stored
          under the attribute. NOTE(review): presumably Beautiful Soup
          stores ``class`` as a list of tokens (so this is an exact token
          match) and ``id`` as a plain string (so this is a substring
          match) - confirm against the Beautiful Soup documentation.

          :param elem: node to inspect; anything that is not a Tag is False
          :param attribute_name: name of the HTML attribute to look up
          :param attribute_value: value searched inside that attribute
          :return: True if the attribute exists and contains the value
          """
          if not isinstance(elem, Tag):
              return False
          attributes = elem.attrs
          if attribute_name not in attributes:
              return False
          return attribute_value in attributes[attribute_name]
  class HtmlMailQuoteChecker(HtmlChecker):
      """Recognise the elements mail clients use to mark quoted content."""

      @classmethod
      def is_quote(
              cls,
              elem: typing.Union[Tag, NavigableString]
      ) -> bool:
          """
          Tell whether elem matches at least one of the quote rules below.

          Rules are tried in order and evaluation stops at the first match.
          """
          rules = (
              cls._is_standard_quote,
              cls._is_thunderbird_quote,
              cls._is_gmail_quote,
              cls._is_outlook_com_quote,
              cls._is_yahoo_quote,
              cls._is_roundcube_quote,
          )
          return any(rule(elem) for rule in rules)

      @classmethod
      def _is_standard_quote(
              cls,
              elem: typing.Union[Tag, NavigableString]
      ) -> bool:
          """True for a <blockquote> tag (tag name compared in lowercase)."""
          return isinstance(elem, Tag) and elem.name.lower() == 'blockquote'

      @classmethod
      def _is_thunderbird_quote(
              cls,
              elem: typing.Union[Tag, NavigableString]
      ) -> bool:
          """True when elem carries the Thunderbird quote-prefix class."""
          wanted = ProprietaryHTMLProperties.Thunderbird_quote_prefix_class
          return cls._has_attr_value(elem, 'class', wanted)

      @classmethod
      def _is_gmail_quote(
              cls,
              elem: typing.Union[Tag, NavigableString]
      ) -> bool:
          """
          True when elem has the Gmail "extra" class and one of its direct
          children has the Gmail quote class.
          """
          extras = ProprietaryHTMLProperties.Gmail_extras_class
          if not cls._has_attr_value(elem, 'class', extras):
              return False
          quote = ProprietaryHTMLProperties.Gmail_quote_class
          return any(
              cls._has_attr_value(child, 'class', quote)
              for child in elem.children
          )

      @classmethod
      def _is_outlook_com_quote(
              cls,
              elem: typing.Union[Tag, NavigableString]
      ) -> bool:
          """True when the id of elem contains the Outlook.com quote id."""
          wanted = ProprietaryHTMLProperties.Outlook_com_quote_id
          return cls._has_attr_value(elem, 'id', wanted)

      @classmethod
      def _is_yahoo_quote(
              cls,
              elem: typing.Union[Tag, NavigableString]
      ) -> bool:
          """True when elem carries the Yahoo quote class."""
          wanted = ProprietaryHTMLProperties.Yahoo_quote_class
          return cls._has_attr_value(elem, 'class', wanted)

      @classmethod
      def _is_roundcube_quote(
              cls,
              elem: typing.Union[Tag, NavigableString]
      ) -> bool:
          """
          True when the id of elem contains the Roundcube reply-intro value
          (the constant is named "..._class" but it is checked as an id).
          """
          wanted = ProprietaryHTMLProperties.Roundcube_quote_prefix_class
          return cls._has_attr_value(elem, 'id', wanted)
  class HtmlMailSignatureChecker(HtmlChecker):
      """Recognise the elements mail clients use to mark a signature."""

      @classmethod
      def is_signature(
              cls,
              elem: typing.Union[Tag, NavigableString]
      ) -> bool:
          """
          Tell whether elem matches at least one signature rule below.

          Rules are tried in order and evaluation stops at the first match.
          """
          rules = (
              cls._is_thunderbird_signature,
              cls._is_gmail_signature,
              cls._is_outlook_com_signature,
          )
          return any(rule(elem) for rule in rules)

      @classmethod
      def _is_thunderbird_signature(
              cls,
              elem: typing.Union[Tag, NavigableString]
      ) -> bool:
          """True when elem carries the Thunderbird signature class."""
          wanted = ProprietaryHTMLProperties.Thunderbird_signature_class
          return cls._has_attr_value(elem, 'class', wanted)

      @classmethod
      def _is_gmail_signature(
              cls,
              elem: typing.Union[Tag, NavigableString]
      ) -> bool:
          """
          True when elem itself has the Gmail signature class, or when elem
          is a Gmail "extra" element or a <div> and one of its direct
          children has the Gmail signature class.
          """
          signature = ProprietaryHTMLProperties.Gmail_signature_class
          if cls._has_attr_value(elem, 'class', signature):
              return True
          extras = ProprietaryHTMLProperties.Gmail_extras_class
          may_wrap_signature = cls._has_attr_value(elem, 'class', extras) \
              or (isinstance(elem, Tag) and elem.name.lower() == 'div')
          if not may_wrap_signature:
              return False
          return any(
              cls._has_attr_value(child, 'class', signature)
              for child in elem.children
          )

      @classmethod
      def _is_outlook_com_signature(
              cls,
              elem: typing.Union[Tag, NavigableString]
      ) -> bool:
          """True when the id of elem contains the Outlook.com signature id."""
          wanted = ProprietaryHTMLProperties.Outlook_com_signature_id
          return cls._has_attr_value(elem, 'id', wanted)
  class PreSanitizeConfig:
      """Tag-name lists consulted while the HTML body is split into parts."""

      # Wrapper tags that get unwrapped when they are the only top-level tag.
      meta_tag = ['body', 'div']
      # Tags skipped when they are neither a quote nor a signature.
      Ignored_tags = ['br', 'hr', 'script', 'style']
  class ParsedHTMLMail(object):
      """
      Parse an HTML mail according to some rules.

      Splits the html mail body into distinct parts using a BodyMailParts
      object, then applies rules that decide which kinds of part are kept.
      """

      def __init__(self, html_body: str):
          """
          :param html_body: raw HTML of the mail body
          """
          self.src_html_body = html_body

      def __str__(self):
          """Return the processed body; the mail is parsed on every call."""
          return str(self._parse_mail())

      def get_elements(self) -> BodyMailParts:
          """
          Split the body into typed parts, before any part is dropped.

          :return: parts found at the top level of the cleaned body tree
          """
          tree = self._get_proper_main_body_tree()
          return self._distinct_elements(tree)

      def _parse_mail(self) -> BodyMailParts:
          """Split the body into parts, then apply the dropping rules."""
          elements = self.get_elements()
          return self._process_elements(elements)

      def _get_proper_main_body_tree(self) -> BeautifulSoup:
          """
          Get html body tree without some kind of wrapper.

          We need to have text, quote and signature parts at the same
          tree level.
          """
          tree = BeautifulSoup(self.src_html_body, 'html.parser')

          # Only parse body part of html if available
          subtree = tree.find('body')
          if subtree:
              tree = BeautifulSoup(str(subtree), 'html.parser')

          # if some kind of "meta_div", unwrap it: as long as the tree has
          # a single top-level tag and it is a wrapper tag, remove it.
          while len(tree.find_all(recursive=False)) == 1 and \
                  tree.find().name.lower() in PreSanitizeConfig.meta_tag:
              tree.find().unwrap()

          wrapper_id = ProprietaryHTMLProperties.Outlook_com_wrapper_id
          for tag in tree.find_all():
              # HACK - G.M - 2017-11-28 - Unwrap outlook.com mail
              # if Text -> Signature -> Quote Mail
              # Text and signature are wrapped into divtagdefaultwrapper
              tag_id = tag.attrs.get('id')
              if tag_id and wrapper_id in tag_id:
                  tag.unwrap()
          return tree

      @classmethod
      def _distinct_elements(cls, tree: BeautifulSoup) -> BodyMailParts:
          """
          Turn each top-level node of tree into a typed part.

          :param tree: cleaned body tree
          :return: parts in document order, merged by BodyMailParts
          """
          parts = BodyMailParts()
          for elem in tree:
              part_txt = str(elem)
              part_type = BodyMailPartType.Main
              # sanitize NavigableString
              if isinstance(elem, NavigableString):
                  part_txt = part_txt.replace('\n', '').strip()

              if HtmlMailQuoteChecker.is_quote(elem):
                  part_type = BodyMailPartType.Quote
              elif HtmlMailSignatureChecker.is_signature(elem):
                  part_type = BodyMailPartType.Signature
              else:
                  # INFO - G.M -2017-11-28 - ignore unwanted parts
                  if not part_txt:
                      continue
                  if isinstance(elem, Tag) \
                          and elem.name.lower() in PreSanitizeConfig.Ignored_tags:
                      continue

              parts.append(BodyMailPart(part_txt, part_type))
              # INFO - G.M - 2017-11-28 - Outlook.com special case
              # all after quote tag is quote
              if HtmlMailQuoteChecker._is_outlook_com_quote(elem):
                  parts.follow = True
          return parts

      @classmethod
      def _process_elements(cls, elements: BodyMailParts) -> BodyMailParts:
          """
          Drop the parts that must not be kept, depending on the layout.

          :param elements: parts to filter; modified in place
          :return: the same object, for convenience
          """
          if len(elements) < 2:
              # default case (only one element or empty list)
              cls._process_default_case(elements)
              return elements

          one_main_one_quote = \
              elements.get_nb_part_type(BodyMailPartType.Main) == 1 and \
              elements.get_nb_part_type(BodyMailPartType.Quote) == 1
          first_type = elements[0].part_type

          if one_main_one_quote and first_type == BodyMailPartType.Main:
              # Case 1 : only one main and one quote, Main first
              cls._process_main_first_case(elements)
          elif one_main_one_quote and first_type == BodyMailPartType.Quote:
              # Case 2 : only one main and one quote, Quote first
              cls._process_quote_first_case(elements)
          else:
              # Case 3 : Multiple quotes and/or main.
              # Also reached with one main and one quote when the first part
              # is a Signature: that layout used to skip every handler and
              # keep the signature, unlike all the other cases.
              cls._process_multiples_elems_case(elements)
          return elements

      @classmethod
      def _process_quote_first_case(cls, elements: BodyMailParts) -> None:
          """Quote before main: only the signature parts are removed."""
          elements.drop_part_type(BodyMailPartType.Signature)

      @classmethod
      def _process_main_first_case(cls, elements: BodyMailParts) -> None:
          """Main before quote: quote and signature parts are removed."""
          elements.drop_part_type(BodyMailPartType.Quote)
          elements.drop_part_type(BodyMailPartType.Signature)

      @classmethod
      def _process_multiples_elems_case(cls, elements: BodyMailParts) -> None:
          """Several mains and/or quotes: only signatures are removed."""
          elements.drop_part_type(BodyMailPartType.Signature)

      @classmethod
      def _process_default_case(cls, elements: BodyMailParts) -> None:
          """Zero or one part: quote and signature parts are removed."""
          elements.drop_part_type(BodyMailPartType.Quote)
          elements.drop_part_type(BodyMailPartType.Signature)