# email_fetcher.py
# -*- coding: utf-8 -*-
import imaplib
import json
import time
import typing
from email import message_from_bytes
from email.header import decode_header
from email.header import make_header
from email.message import Message
from email.utils import parseaddr

import markdown
import requests
from email_reply_parser import EmailReplyParser

from tracim.lib.base import logger
from tracim.lib.email_processing.parser import ParsedHTMLMail
from tracim.lib.email_processing.sanitizer import HtmlSanitizer

  17. TRACIM_SPECIAL_KEY_HEADER = 'X-Tracim-Key'
  18. CONTENT_TYPE_TEXT_PLAIN = 'text/plain'
  19. CONTENT_TYPE_TEXT_HTML = 'text/html'
  20. class DecodedMail(object):
  21. def __init__(self, message: Message) -> None:
  22. self._message = message
  23. def _decode_header(self, header_title: str) -> typing.Optional[str]:
  24. # FIXME : Handle exception
  25. if header_title in self._message:
  26. return str(make_header(decode_header(self._message[header_title])))
  27. else:
  28. return None
  29. def get_subject(self) -> typing.Optional[str]:
  30. return self._decode_header('subject')
  31. def get_from_address(self) -> str:
  32. return parseaddr(self._message['From'])[1]
  33. def get_to_address(self) -> str:
  34. return parseaddr(self._message['To'])[1]
  35. def get_first_ref(self) -> str:
  36. return parseaddr(self._message['References'])[1]
  37. def get_special_key(self) -> typing.Optional[str]:
  38. return self._decode_header(TRACIM_SPECIAL_KEY_HEADER)
  39. def get_body(
  40. self,
  41. use_html_parsing=True,
  42. use_txt_parsing=True,
  43. ) -> typing.Optional[str]:
  44. body_part = self._get_mime_body_message()
  45. body = None
  46. if body_part:
  47. charset = body_part.get_content_charset('iso-8859-1')
  48. content_type = body_part.get_content_type()
  49. if content_type == CONTENT_TYPE_TEXT_PLAIN:
  50. txt_body = body_part.get_payload(decode=True).decode(
  51. charset)
  52. if use_txt_parsing:
  53. txt_body = EmailReplyParser.parse_reply(txt_body)
  54. html_body = markdown.markdown(txt_body)
  55. body = HtmlSanitizer.sanitize(html_body)
  56. elif content_type == CONTENT_TYPE_TEXT_HTML:
  57. html_body = body_part.get_payload(decode=True).decode(
  58. charset)
  59. if use_html_parsing:
  60. html_body = str(ParsedHTMLMail(html_body))
  61. body = HtmlSanitizer.sanitize(html_body)
  62. return body
  63. def _get_mime_body_message(self) -> typing.Optional[Message]:
  64. # TODO - G.M - 2017-11-16 - Use stdlib msg.get_body feature for py3.6+
  65. part = None
  66. # Check for html
  67. for part in self._message.walk():
  68. content_type = part.get_content_type()
  69. content_dispo = str(part.get('Content-Disposition'))
  70. if content_type == CONTENT_TYPE_TEXT_HTML \
  71. and 'attachment' not in content_dispo:
  72. return part
  73. # check for plain text
  74. for part in self._message.walk():
  75. content_type = part.get_content_type()
  76. content_dispo = str(part.get('Content-Disposition'))
  77. if content_type == CONTENT_TYPE_TEXT_PLAIN \
  78. and 'attachment' not in content_dispo:
  79. return part
  80. return part
  81. def get_key(self) -> typing.Optional[str]:
  82. """
  83. key is the string contain in some mail header we need to retrieve.
  84. First try checking special header, them check 'to' header
  85. and finally check first(oldest) mail-id of 'references' header
  86. """
  87. first_ref = self.get_first_ref()
  88. to_address = self.get_to_address()
  89. special_key = self.get_special_key()
  90. if special_key:
  91. return special_key
  92. if to_address:
  93. return DecodedMail.find_key_from_mail_address(to_address)
  94. if first_ref:
  95. return DecodedMail.find_key_from_mail_address(first_ref)
  96. return None
  97. @classmethod
  98. def find_key_from_mail_address(
  99. cls,
  100. mail_address: str,
  101. ) -> typing.Optional[str]:
  102. """ Parse mail_adress-like string
  103. to retrieve key.
  104. :param mail_address: user+key@something like string
  105. :return: key
  106. """
  107. username = mail_address.split('@')[0]
  108. username_data = username.split('+')
  109. if len(username_data) == 2:
  110. return username_data[1]
  111. return None
  112. class MailFetcher(object):
  113. def __init__(
  114. self,
  115. host: str,
  116. port: str,
  117. user: str,
  118. password: str,
  119. use_ssl: bool,
  120. folder: str,
  121. delay: int,
  122. endpoint: str,
  123. token: str,
  124. use_html_parsing: bool,
  125. use_txt_parsing: bool,
  126. ) -> None:
  127. """
  128. Fetch mail from a mailbox folder through IMAP and add their content to
  129. Tracim through http according to mail Headers.
  130. Fetch is regular.
  131. :param host: imap server hostname
  132. :param port: imap connection port
  133. :param user: user login of mailbox
  134. :param password: user password of mailbox
  135. :param use_ssl: use imap over ssl connection
  136. :param folder: mail folder where new mail are fetched
  137. :param delay: seconds to wait before fetching new mail again
  138. :param endpoint: tracim http endpoint where decoded mail are send.
  139. :param token: token to authenticate http connexion
  140. :param use_html_parsing: parse html mail
  141. :param use_txt_parsing: parse txt mail
  142. """
  143. self._connection = None
  144. self.host = host
  145. self.port = port
  146. self.user = user
  147. self.password = password
  148. self.use_ssl = use_ssl
  149. self.folder = folder
  150. self.delay = delay
  151. self.endpoint = endpoint
  152. self.token = token
  153. self.use_html_parsing = use_html_parsing
  154. self.use_txt_parsing = use_txt_parsing
  155. self._is_active = True
  156. def run(self) -> None:
  157. logger.info(self, 'Starting MailFetcher')
  158. while self._is_active:
  159. logger.debug(self, 'sleep for {}'.format(self.delay))
  160. time.sleep(self.delay)
  161. try:
  162. self._connect()
  163. messages = self._fetch()
  164. # TODO - G.M - 2017-11-22 retry sending unsended mail
  165. # These mails are return by _notify_tracim, flag them with "unseen" # nopep8
  166. # or store them until new _notify_tracim call
  167. cleaned_mails = [DecodedMail(msg) for msg in messages]
  168. self._notify_tracim(cleaned_mails)
  169. self._disconnect()
  170. except Exception as e:
  171. # TODO - G.M - 2017-11-23 - Identify possible exceptions
  172. log = 'IMAP error: {}'
  173. logger.warning(self, log.format(e.__str__()))
  174. def stop(self) -> None:
  175. self._is_active = False
  176. def _connect(self) -> None:
  177. # TODO - G.M - 2017-11-15 Verify connection/disconnection
  178. # Are old connexion properly close this way ?
  179. if self._connection:
  180. logger.debug(self, 'Disconnect from IMAP')
  181. self._disconnect()
  182. # TODO - G.M - 2017-11-23 Support for predefined SSLContext ?
  183. # without ssl_context param, tracim use default security configuration
  184. # which is great in most case.
  185. if self.use_ssl:
  186. logger.debug(self, 'Connect IMAP {}:{} using SSL'.format(
  187. self.host,
  188. self.port,
  189. ))
  190. self._connection = imaplib.IMAP4_SSL(self.host, self.port)
  191. else:
  192. logger.debug(self, 'Connect IMAP {}:{}'.format(
  193. self.host,
  194. self.port,
  195. ))
  196. self._connection = imaplib.IMAP4(self.host, self.port)
  197. try:
  198. logger.debug(self, 'Login IMAP with login {}'.format(
  199. self.user,
  200. ))
  201. self._connection.login(self.user, self.password)
  202. except Exception as e:
  203. log = 'IMAP login error: {}'
  204. logger.error(self, log.format(e.__str__()))
  205. def _disconnect(self) -> None:
  206. if self._connection:
  207. self._connection.close()
  208. self._connection.logout()
  209. self._connection = None
  210. def _fetch(self) -> typing.List[Message]:
  211. """
  212. Get news message from mailbox
  213. :return: list of new mails
  214. """
  215. messages = []
  216. # select mailbox
  217. logger.debug(self, 'Fetch messages from folder {}'.format(
  218. self.folder,
  219. ))
  220. rv, data = self._connection.select(self.folder)
  221. logger.debug(self, 'Response status {}'.format(
  222. rv,
  223. ))
  224. if rv == 'OK':
  225. # get mails
  226. # TODO - G.M - 2017-11-15 Which files to select as new file ?
  227. # Unseen file or All file from a directory (old one should be
  228. # moved/ deleted from mailbox during this process) ?
  229. logger.debug(self, 'Fetch unseen messages')
  230. rv, data = self._connection.search(None, "(UNSEEN)")
  231. logger.debug(self, 'Response status {}'.format(
  232. rv,
  233. ))
  234. if rv == 'OK':
  235. # get mail content
  236. logger.debug(self, 'Found {} unseen mails'.format(
  237. len(data[0].split()),
  238. ))
  239. for num in data[0].split():
  240. # INFO - G.M - 2017-11-23 - Fetch (RFC288) to retrieve all
  241. # complete mails see example : https://docs.python.org/fr/3.5/library/imaplib.html#imap4-example . # nopep8
  242. # Be careful, This method remove also mails from Unseen
  243. # mails
  244. logger.debug(self, 'Fetch mail "{}"'.format(
  245. num,
  246. ))
  247. rv, data = self._connection.fetch(num, '(RFC822)')
  248. logger.debug(self, 'Response status {}'.format(
  249. rv,
  250. ))
  251. if rv == 'OK':
  252. msg = message_from_bytes(data[0][1])
  253. messages.append(msg)
  254. else:
  255. log = 'IMAP : Unable to get mail : {}'
  256. logger.error(self, log.format(str(rv)))
  257. else:
  258. log = 'IMAP : Unable to get unseen mail : {}'
  259. logger.error(self, log.format(str(rv)))
  260. else:
  261. log = 'IMAP : Unable to open mailbox : {}'
  262. logger.error(self, log.format(str(rv)))
  263. return messages
  264. def _notify_tracim(
  265. self,
  266. mails: typing.List[DecodedMail],
  267. ) -> typing.List[DecodedMail]:
  268. """
  269. Send http request to tracim endpoint
  270. :param mails: list of mails to send
  271. :return: unsended mails
  272. """
  273. logger.debug(self, 'Notify tracim about {} new responses'.format(
  274. len(mails),
  275. ))
  276. unsended_mails = []
  277. # TODO BS 20171124: Look around mail.get_from_address(), mail.get_key()
  278. # , mail.get_body() etc ... for raise InvalidEmailError if missing
  279. # required informations (actually get_from_address raise IndexError
  280. # if no from address for example) and catch it here
  281. while mails:
  282. mail = mails.pop()
  283. msg = {'token': self.token,
  284. 'user_mail': mail.get_from_address(),
  285. 'content_id': mail.get_key(),
  286. 'payload': {
  287. 'content': mail.get_body(
  288. use_html_parsing=self.use_html_parsing,
  289. use_txt_parsing=self.use_txt_parsing),
  290. }}
  291. try:
  292. logger.debug(
  293. self,
  294. 'Contact API on {} with body {}'.format(
  295. self.endpoint,
  296. json.dumps(msg),
  297. ),
  298. )
  299. r = requests.post(self.endpoint, json=msg)
  300. if r.status_code not in [200, 204]:
  301. details = r.json().get('msg')
  302. log = 'bad status code {} response when sending mail to tracim: {}' # nopep8
  303. logger.error(self, log.format(
  304. str(r.status_code),
  305. details,
  306. ))
  307. # TODO - G.M - Verify exception correctly works
  308. except requests.exceptions.Timeout as e:
  309. log = 'Timeout error to transmit fetched mail to tracim : {}'
  310. logger.error(self, log.format(str(e)))
  311. unsended_mails.append(mail)
  312. except requests.exceptions.RequestException as e:
  313. log = 'Fail to transmit fetched mail to tracim : {}'
  314. logger.error(self, log.format(str(e)))
  315. return unsended_mails