# -*- coding: utf-8 -*-
# email_fetcher.py
import imaplib
import json
import time
import typing
from email import message_from_bytes
from email.header import decode_header
from email.header import make_header
from email.message import Message
from email.utils import getaddresses
from email.utils import parseaddr

import markdown
import requests
from email_reply_parser import EmailReplyParser

from tracim.lib.base import logger
from tracim.lib.email_processing.parser import ParsedHTMLMail
from tracim.lib.email_processing.sanitizer import HtmlSanitizer

  17. TRACIM_SPECIAL_KEY_HEADER = 'X-Tracim-Key'
  18. CONTENT_TYPE_TEXT_PLAIN = 'text/plain'
  19. CONTENT_TYPE_TEXT_HTML = 'text/html'
  20. class MessageContainer(object):
  21. def __init__(self, message: Message, uid: int) -> None:
  22. self.message = message
  23. self.uid = uid
  24. class DecodedMail(object):
  25. def __init__(self, message: Message, uid: int=None) -> None:
  26. self._message = message
  27. self.uid = uid
  28. def _decode_header(self, header_title: str) -> typing.Optional[str]:
  29. # FIXME : Handle exception
  30. if header_title in self._message:
  31. return str(make_header(decode_header(self._message[header_title])))
  32. else:
  33. return None
  34. def get_subject(self) -> typing.Optional[str]:
  35. return self._decode_header('subject')
  36. def get_from_address(self) -> str:
  37. return parseaddr(self._message['From'])[1]
  38. def get_to_address(self) -> str:
  39. return parseaddr(self._message['To'])[1]
  40. def get_first_ref(self) -> str:
  41. return parseaddr(self._message['References'])[1]
  42. def get_special_key(self) -> typing.Optional[str]:
  43. return self._decode_header(TRACIM_SPECIAL_KEY_HEADER)
  44. def get_body(
  45. self,
  46. use_html_parsing=True,
  47. use_txt_parsing=True,
  48. ) -> typing.Optional[str]:
  49. body_part = self._get_mime_body_message()
  50. body = None
  51. if body_part:
  52. charset = body_part.get_content_charset('iso-8859-1')
  53. content_type = body_part.get_content_type()
  54. if content_type == CONTENT_TYPE_TEXT_PLAIN:
  55. txt_body = body_part.get_payload(decode=True).decode(
  56. charset)
  57. if use_txt_parsing:
  58. txt_body = EmailReplyParser.parse_reply(txt_body)
  59. html_body = markdown.markdown(txt_body)
  60. body = HtmlSanitizer.sanitize(html_body)
  61. elif content_type == CONTENT_TYPE_TEXT_HTML:
  62. html_body = body_part.get_payload(decode=True).decode(
  63. charset)
  64. if use_html_parsing:
  65. html_body = str(ParsedHTMLMail(html_body))
  66. body = HtmlSanitizer.sanitize(html_body)
  67. return body
  68. def _get_mime_body_message(self) -> typing.Optional[Message]:
  69. # TODO - G.M - 2017-11-16 - Use stdlib msg.get_body feature for py3.6+
  70. part = None
  71. # Check for html
  72. for part in self._message.walk():
  73. content_type = part.get_content_type()
  74. content_dispo = str(part.get('Content-Disposition'))
  75. if content_type == CONTENT_TYPE_TEXT_HTML \
  76. and 'attachment' not in content_dispo:
  77. return part
  78. # check for plain text
  79. for part in self._message.walk():
  80. content_type = part.get_content_type()
  81. content_dispo = str(part.get('Content-Disposition'))
  82. if content_type == CONTENT_TYPE_TEXT_PLAIN \
  83. and 'attachment' not in content_dispo:
  84. return part
  85. return part
  86. def get_key(self) -> typing.Optional[str]:
  87. """
  88. key is the string contain in some mail header we need to retrieve.
  89. First try checking special header, them check 'to' header
  90. and finally check first(oldest) mail-id of 'references' header
  91. """
  92. first_ref = self.get_first_ref()
  93. to_address = self.get_to_address()
  94. special_key = self.get_special_key()
  95. if special_key:
  96. return special_key
  97. if to_address:
  98. return DecodedMail.find_key_from_mail_address(to_address)
  99. if first_ref:
  100. return DecodedMail.find_key_from_mail_address(first_ref)
  101. return None
  102. @classmethod
  103. def find_key_from_mail_address(
  104. cls,
  105. mail_address: str,
  106. ) -> typing.Optional[str]:
  107. """ Parse mail_adress-like string
  108. to retrieve key.
  109. :param mail_address: user+key@something like string
  110. :return: key
  111. """
  112. username = mail_address.split('@')[0]
  113. username_data = username.split('+')
  114. if len(username_data) == 2:
  115. return username_data[1]
  116. return None
  117. class MailFetcher(object):
  118. def __init__(
  119. self,
  120. host: str,
  121. port: str,
  122. user: str,
  123. password: str,
  124. use_ssl: bool,
  125. folder: str,
  126. delay: int,
  127. endpoint: str,
  128. token: str,
  129. use_html_parsing: bool,
  130. use_txt_parsing: bool,
  131. ) -> None:
  132. """
  133. Fetch mail from a mailbox folder through IMAP and add their content to
  134. Tracim through http according to mail Headers.
  135. Fetch is regular.
  136. :param host: imap server hostname
  137. :param port: imap connection port
  138. :param user: user login of mailbox
  139. :param password: user password of mailbox
  140. :param use_ssl: use imap over ssl connection
  141. :param folder: mail folder where new mail are fetched
  142. :param delay: seconds to wait before fetching new mail again
  143. :param endpoint: tracim http endpoint where decoded mail are send.
  144. :param token: token to authenticate http connexion
  145. :param use_html_parsing: parse html mail
  146. :param use_txt_parsing: parse txt mail
  147. """
  148. self._connection = None
  149. self.host = host
  150. self.port = port
  151. self.user = user
  152. self.password = password
  153. self.use_ssl = use_ssl
  154. self.folder = folder
  155. self.delay = delay
  156. self.endpoint = endpoint
  157. self.token = token
  158. self.use_html_parsing = use_html_parsing
  159. self.use_txt_parsing = use_txt_parsing
  160. self._is_active = True
  161. def run(self) -> None:
  162. logger.info(self, 'Starting MailFetcher')
  163. while self._is_active:
  164. logger.debug(self, 'sleep for {}'.format(self.delay))
  165. time.sleep(self.delay)
  166. try:
  167. self._connect()
  168. messages = self._fetch()
  169. cleaned_mails = [DecodedMail(m.message, m.uid)
  170. for m in messages]
  171. self._notify_tracim(cleaned_mails)
  172. self._disconnect()
  173. except Exception as e:
  174. # TODO - G.M - 2017-11-23 - Identify possible exceptions
  175. log = 'IMAP error: {}'
  176. logger.warning(self, log.format(e.__str__()))
  177. def stop(self) -> None:
  178. self._is_active = False
  179. def _connect(self) -> None:
  180. # TODO - G.M - 2017-11-15 Verify connection/disconnection
  181. # Are old connexion properly close this way ?
  182. if self._connection:
  183. logger.debug(self, 'Disconnect from IMAP')
  184. self._disconnect()
  185. # TODO - G.M - 2017-11-23 Support for predefined SSLContext ?
  186. # without ssl_context param, tracim use default security configuration
  187. # which is great in most case.
  188. if self.use_ssl:
  189. logger.debug(self, 'Connect IMAP {}:{} using SSL'.format(
  190. self.host,
  191. self.port,
  192. ))
  193. self._connection = imaplib.IMAP4_SSL(self.host, self.port)
  194. else:
  195. logger.debug(self, 'Connect IMAP {}:{}'.format(
  196. self.host,
  197. self.port,
  198. ))
  199. self._connection = imaplib.IMAP4(self.host, self.port)
  200. try:
  201. logger.debug(self, 'Login IMAP with login {}'.format(
  202. self.user,
  203. ))
  204. self._connection.login(self.user, self.password)
  205. except Exception as e:
  206. log = 'IMAP login error: {}'
  207. logger.error(self, log.format(e.__str__()))
  208. def _disconnect(self) -> None:
  209. if self._connection:
  210. self._connection.close()
  211. self._connection.logout()
  212. self._connection = None
  213. def _fetch(self) -> typing.List[MessageContainer]:
  214. """
  215. Get news message from mailbox
  216. :return: list of new mails
  217. """
  218. messages = []
  219. # select mailbox
  220. logger.debug(self, 'Fetch messages from folder {}'.format(
  221. self.folder,
  222. ))
  223. rv, data = self._connection.select(self.folder)
  224. logger.debug(self, 'Response status {}'.format(
  225. rv,
  226. ))
  227. if rv == 'OK':
  228. # get mails
  229. # TODO - G.M - 2017-11-15 Which files to select as new file ?
  230. # Unseen file or All file from a directory (old one should be
  231. # moved/ deleted from mailbox during this process) ?
  232. logger.debug(self, 'Fetch unseen messages')
  233. rv, data = self._connection.search(None, "(UNSEEN)")
  234. logger.debug(self, 'Response status {}'.format(
  235. rv,
  236. ))
  237. if rv == 'OK':
  238. # get mail content
  239. logger.debug(self, 'Found {} unseen mails'.format(
  240. len(data[0].split()),
  241. ))
  242. for uid in data[0].split():
  243. # INFO - G.M - 2017-12-08 - Fetch BODY.PEEK[]
  244. # Retrieve all mail(body and header) but don't set mail
  245. # as seen because of PEEK
  246. # see rfc3501
  247. logger.debug(self, 'Fetch mail "{}"'.format(
  248. uid,
  249. ))
  250. rv, data = self._connection.fetch(uid, 'BODY.PEEK[]')
  251. logger.debug(self, 'Response status {}'.format(
  252. rv,
  253. ))
  254. if rv == 'OK':
  255. msg = message_from_bytes(data[0][1])
  256. msg_container = MessageContainer(msg, uid)
  257. messages.append(msg_container)
  258. else:
  259. log = 'IMAP : Unable to get mail : {}'
  260. logger.error(self, log.format(str(rv)))
  261. else:
  262. log = 'IMAP : Unable to get unseen mail : {}'
  263. logger.error(self, log.format(str(rv)))
  264. else:
  265. log = 'IMAP : Unable to open mailbox : {}'
  266. logger.error(self, log.format(str(rv)))
  267. return messages
  268. def _notify_tracim(
  269. self,
  270. mails: typing.List[DecodedMail],
  271. ) -> typing.List[DecodedMail]:
  272. """
  273. Send http request to tracim endpoint
  274. :param mails: list of mails to send
  275. :return: unsended mails
  276. """
  277. logger.debug(self, 'Notify tracim about {} new responses'.format(
  278. len(mails),
  279. ))
  280. unsended_mails = []
  281. # TODO BS 20171124: Look around mail.get_from_address(), mail.get_key()
  282. # , mail.get_body() etc ... for raise InvalidEmailError if missing
  283. # required informations (actually get_from_address raise IndexError
  284. # if no from address for example) and catch it here
  285. while mails:
  286. mail = mails.pop()
  287. msg = {'token': self.token,
  288. 'user_mail': mail.get_from_address(),
  289. 'content_id': mail.get_key(),
  290. 'payload': {
  291. 'content': mail.get_body(
  292. use_html_parsing=self.use_html_parsing,
  293. use_txt_parsing=self.use_txt_parsing),
  294. }}
  295. try:
  296. logger.debug(
  297. self,
  298. 'Contact API on {} with body {}'.format(
  299. self.endpoint,
  300. json.dumps(msg),
  301. ),
  302. )
  303. r = requests.post(self.endpoint, json=msg)
  304. if r.status_code not in [200, 204]:
  305. details = r.json().get('msg')
  306. log = 'bad status code {} response when sending mail to tracim: {}' # nopep8
  307. logger.error(self, log.format(
  308. str(r.status_code),
  309. details,
  310. ))
  311. else:
  312. self._set_flag(mail.uid)
  313. # TODO - G.M - Verify exception correctly works
  314. except requests.exceptions.Timeout as e:
  315. log = 'Timeout error to transmit fetched mail to tracim : {}'
  316. logger.error(self, log.format(str(e)))
  317. unsended_mails.append(mail)
  318. except requests.exceptions.RequestException as e:
  319. log = 'Fail to transmit fetched mail to tracim : {}'
  320. logger.error(self, log.format(str(e)))
  321. return unsended_mails
  322. def _set_flag(self, uid):
  323. assert uid is not None
  324. rv, data = self._connection.store(
  325. uid,
  326. '+FLAGS',
  327. '\\Seen'
  328. )
  329. if rv == 'OK':
  330. log = 'Message {} set as seen.'.format(uid)
  331. logger.debug(self, log)
  332. else:
  333. log = 'Can not set Message {} as seen : {}'.format(uid, rv)
  334. logger.error(self, log)