email_fetcher.py 16KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453
  1. # -*- coding: utf-8 -*-
  2. import time
  3. import json
  4. import typing
  5. import socket
  6. import ssl
  7. from email import message_from_bytes
  8. from email.header import decode_header
  9. from email.header import make_header
  10. from email.message import Message
  11. from email.utils import parseaddr
  12. import filelock
  13. import markdown
  14. import requests
  15. import imapclient
  16. from email_reply_parser import EmailReplyParser
  17. from tracim.lib.base import logger
  18. from tracim.lib.email_processing.parser import ParsedHTMLMail
  19. from tracim.lib.email_processing.sanitizer import HtmlSanitizer
  20. TRACIM_SPECIAL_KEY_HEADER = 'X-Tracim-Key'
  21. CONTENT_TYPE_TEXT_PLAIN = 'text/plain'
  22. CONTENT_TYPE_TEXT_HTML = 'text/html'
  23. IMAP_CHECKED_FLAG = imapclient.FLAGGED
  24. IMAP_SEEN_FLAG = imapclient.SEEN
  25. MAIL_FETCHER_FILELOCK_TIMEOUT = 10
  26. MAIL_FETCHER_CONNECTION_TIMEOUT = 60*3
  27. MAIL_FETCHER_IDLE_RESPONSE_TIMEOUT = 60*9 # this should be not more
  28. # that 29 minutes according to rfc2177.(server wait 30min by default)
  29. class MessageContainer(object):
  30. def __init__(self, message: Message, uid: int) -> None:
  31. self.message = message
  32. self.uid = uid
  33. class DecodedMail(object):
  34. def __init__(self, message: Message, uid: int=None) -> None:
  35. self._message = message
  36. self.uid = uid
  37. def _decode_header(self, header_title: str) -> typing.Optional[str]:
  38. # FIXME : Handle exception
  39. if header_title in self._message:
  40. return str(make_header(decode_header(self._message[header_title])))
  41. else:
  42. return None
  43. def get_subject(self) -> typing.Optional[str]:
  44. return self._decode_header('subject')
  45. def get_from_address(self) -> str:
  46. return parseaddr(self._message['From'])[1]
  47. def get_to_address(self) -> str:
  48. return parseaddr(self._message['To'])[1]
  49. def get_first_ref(self) -> str:
  50. return parseaddr(self._message['References'])[1]
  51. def get_special_key(self) -> typing.Optional[str]:
  52. return self._decode_header(TRACIM_SPECIAL_KEY_HEADER)
  53. def get_body(
  54. self,
  55. use_html_parsing=True,
  56. use_txt_parsing=True,
  57. ) -> typing.Optional[str]:
  58. body_part = self._get_mime_body_message()
  59. body = None
  60. if body_part:
  61. charset = body_part.get_content_charset('iso-8859-1')
  62. content_type = body_part.get_content_type()
  63. if content_type == CONTENT_TYPE_TEXT_PLAIN:
  64. txt_body = body_part.get_payload(decode=True).decode(
  65. charset)
  66. if use_txt_parsing:
  67. txt_body = EmailReplyParser.parse_reply(txt_body)
  68. html_body = markdown.markdown(txt_body)
  69. body = HtmlSanitizer.sanitize(html_body)
  70. elif content_type == CONTENT_TYPE_TEXT_HTML:
  71. html_body = body_part.get_payload(decode=True).decode(
  72. charset)
  73. if use_html_parsing:
  74. html_body = str(ParsedHTMLMail(html_body))
  75. body = HtmlSanitizer.sanitize(html_body)
  76. return body
  77. def _get_mime_body_message(self) -> typing.Optional[Message]:
  78. # TODO - G.M - 2017-11-16 - Use stdlib msg.get_body feature for py3.6+
  79. part = None
  80. # Check for html
  81. for part in self._message.walk():
  82. content_type = part.get_content_type()
  83. content_dispo = str(part.get('Content-Disposition'))
  84. if content_type == CONTENT_TYPE_TEXT_HTML \
  85. and 'attachment' not in content_dispo:
  86. return part
  87. # check for plain text
  88. for part in self._message.walk():
  89. content_type = part.get_content_type()
  90. content_dispo = str(part.get('Content-Disposition'))
  91. if content_type == CONTENT_TYPE_TEXT_PLAIN \
  92. and 'attachment' not in content_dispo:
  93. return part
  94. return part
  95. def get_key(self) -> typing.Optional[str]:
  96. """
  97. key is the string contain in some mail header we need to retrieve.
  98. First try checking special header, them check 'to' header
  99. and finally check first(oldest) mail-id of 'references' header
  100. """
  101. first_ref = self.get_first_ref()
  102. to_address = self.get_to_address()
  103. special_key = self.get_special_key()
  104. if special_key:
  105. return special_key
  106. if to_address:
  107. return DecodedMail.find_key_from_mail_address(to_address)
  108. if first_ref:
  109. return DecodedMail.find_key_from_mail_address(first_ref)
  110. return None
  111. @classmethod
  112. def find_key_from_mail_address(
  113. cls,
  114. mail_address: str,
  115. ) -> typing.Optional[str]:
  116. """ Parse mail_adress-like string
  117. to retrieve key.
  118. :param mail_address: user+key@something like string
  119. :return: key
  120. """
  121. username = mail_address.split('@')[0]
  122. username_data = username.split('+')
  123. if len(username_data) == 2:
  124. return username_data[1]
  125. return None
  126. class BadIMAPFetchResponse(Exception):
  127. pass
  128. class MailFetcher(object):
  129. def __init__(
  130. self,
  131. host: str,
  132. port: str,
  133. user: str,
  134. password: str,
  135. use_ssl: bool,
  136. folder: str,
  137. use_idle: bool,
  138. connection_max_lifetime: int,
  139. heartbeat: int,
  140. endpoint: str,
  141. token: str,
  142. use_html_parsing: bool,
  143. use_txt_parsing: bool,
  144. lockfile_path: str,
  145. ) -> None:
  146. """
  147. Fetch mail from a mailbox folder through IMAP and add their content to
  148. Tracim through http according to mail Headers.
  149. Fetch is regular.
  150. :param host: imap server hostname
  151. :param port: imap connection port
  152. :param user: user login of mailbox
  153. :param password: user password of mailbox
  154. :param use_ssl: use imap over ssl connection
  155. :param folder: mail folder where new mail are fetched
  156. :param use_idle: use IMAP IDLE(server notification) when available
  157. :param heartbeat: seconds to wait before fetching new mail again
  158. :param connection_max_lifetime: maximum duration allowed for a
  159. connection . connection are automatically renew when their
  160. lifetime excess this duration.
  161. :param endpoint: tracim http endpoint where decoded mail are send.
  162. :param token: token to authenticate http connexion
  163. :param use_html_parsing: parse html mail
  164. :param use_txt_parsing: parse txt mail
  165. """
  166. self.host = host
  167. self.port = port
  168. self.user = user
  169. self.password = password
  170. self.use_ssl = use_ssl
  171. self.folder = folder
  172. self.heartbeat = heartbeat
  173. self.use_idle = use_idle
  174. self.connection_max_lifetime = connection_max_lifetime
  175. self.endpoint = endpoint
  176. self.token = token
  177. self.use_html_parsing = use_html_parsing
  178. self.use_txt_parsing = use_txt_parsing
  179. self.lock = filelock.FileLock(lockfile_path)
  180. self._is_active = True
  181. def run(self) -> None:
  182. logger.info(self, 'Starting MailFetcher')
  183. while self._is_active:
  184. imapc = None
  185. sleep_after_connection = True
  186. try:
  187. imapc = imapclient.IMAPClient(
  188. self.host,
  189. self.port,
  190. ssl=self.use_ssl,
  191. timeout=MAIL_FETCHER_CONNECTION_TIMEOUT
  192. )
  193. imapc.login(self.user, self.password)
  194. logger.debug(self, 'Select folder {}'.format(
  195. self.folder,
  196. ))
  197. imapc.select_folder(self.folder)
  198. # force renew connection when deadline is reached
  199. deadline = time.time() + self.connection_max_lifetime
  200. while True:
  201. if not self._is_active:
  202. logger.warning(self, 'Mail Fetcher process aborted')
  203. sleep_after_connection = False
  204. break
  205. if time.time() > deadline:
  206. logger.debug(
  207. self,
  208. "MailFetcher Connection Lifetime limit excess"
  209. ", Try Re-new connection")
  210. sleep_after_connection = False
  211. break
  212. # check for new mails
  213. self._check_mail(imapc)
  214. if self.use_idle and imapc.has_capability('IDLE'):
  215. # IDLE_mode wait until event from server
  216. logger.debug(self, 'wail for event(IDLE)')
  217. imapc.idle()
  218. imapc.idle_check(
  219. timeout=MAIL_FETCHER_IDLE_RESPONSE_TIMEOUT
  220. )
  221. imapc.idle_done()
  222. else:
  223. if self.use_idle and not imapc.has_capability('IDLE'):
  224. log = 'IDLE mode activated but server do not' \
  225. 'support it, use polling instead.'
  226. logger.warning(self, log)
  227. # normal polling mode : sleep a define duration
  228. logger.debug(self,
  229. 'sleep for {}'.format(self.heartbeat))
  230. time.sleep(self.heartbeat)
  231. # Socket
  232. except (socket.error,
  233. socket.gaierror,
  234. socket.herror) as e:
  235. log = 'Socket fail with IMAP connection {}'
  236. logger.error(self, log.format(e.__str__()))
  237. except socket.timeout as e:
  238. log = 'Socket timeout on IMAP connection {}'
  239. logger.error(self, log.format(e.__str__()))
  240. # SSL
  241. except ssl.SSLError as e:
  242. log = 'SSL error on IMAP connection'
  243. logger.error(self, log.format(e.__str__()))
  244. except ssl.CertificateError as e:
  245. log = 'SSL Certificate verification failed on IMAP connection'
  246. logger.error(self, log.format(e.__str__()))
  247. # Filelock
  248. except filelock.Timeout as e:
  249. log = 'Mail Fetcher Lock Timeout {}'
  250. logger.warning(self, log.format(e.__str__()))
  251. # IMAP
  252. # TODO - G.M - 10-01-2017 - Support imapclient exceptions
  253. # when Imapclient stable will be 2.0+
  254. except BadIMAPFetchResponse as e:
  255. log = 'Imap Fetch command return bad response.' \
  256. 'Is someone else connected to the mailbox ?: ' \
  257. '{}'
  258. logger.error(self, log.format(e.__str__()))
  259. # Others
  260. except Exception as e:
  261. log = 'Mail Fetcher error {}'
  262. logger.error(self, log.format(e.__str__()))
  263. finally:
  264. # INFO - G.M - 2018-01-09 - Connection closing
  265. # Properly close connection according to
  266. # https://github.com/mjs/imapclient/pull/279/commits/043e4bd0c5c775c5a08cb5f1baa93876a46732ee
  267. # TODO : Use __exit__ method instead when imapclient stable will
  268. # be 2.0+ .
  269. if imapc:
  270. logger.debug(self, 'Try logout')
  271. try:
  272. imapc.logout()
  273. except Exception:
  274. try:
  275. imapc.shutdown()
  276. except Exception as e:
  277. log = "Can't logout, connection broken ? {}"
  278. logger.error(self, log.format(e.__str__()))
  279. if sleep_after_connection:
  280. logger.debug(self, 'sleep for {}'.format(self.heartbeat))
  281. time.sleep(self.heartbeat)
  282. log = 'Mail Fetcher stopped'
  283. logger.debug(self, log)
  284. def _check_mail(self, imapc: imapclient.IMAPClient) -> None:
  285. with self.lock.acquire(
  286. timeout=MAIL_FETCHER_FILELOCK_TIMEOUT
  287. ):
  288. messages = self._fetch(imapc)
  289. cleaned_mails = [DecodedMail(m.message, m.uid)
  290. for m in messages]
  291. self._notify_tracim(cleaned_mails, imapc)
  292. def stop(self) -> None:
  293. self._is_active = False
  294. def _fetch(
  295. self,
  296. imapc: imapclient.IMAPClient,
  297. ) -> typing.List[MessageContainer]:
  298. """
  299. Get news message from mailbox
  300. :return: list of new mails
  301. """
  302. messages = []
  303. logger.debug(self, 'Fetch unflagged messages')
  304. uids = imapc.search(['UNFLAGGED'])
  305. logger.debug(self, 'Found {} unflagged mails'.format(
  306. len(uids),
  307. ))
  308. for msgid, data in imapc.fetch(uids, ['BODY.PEEK[]']).items():
  309. # INFO - G.M - 2017-12-08 - Fetch BODY.PEEK[]
  310. # Retrieve all mail(body and header) but don't set mail
  311. # as seen because of PEEK
  312. # see rfc3501
  313. logger.debug(self, 'Fetch mail "{}"'.format(
  314. msgid,
  315. ))
  316. try:
  317. msg = message_from_bytes(data[b'BODY[]'])
  318. except KeyError as e:
  319. # INFO - G.M - 12-01-2018 - Fetch may return events response
  320. # In some specific case, fetch command may return events
  321. # response unrelated to fetch request.
  322. # This should happen only when someone-else use the mailbox
  323. # at the same time of the fetcher.
  324. # see https://github.com/mjs/imapclient/issues/334
  325. except_msg = 'fetch response : {}'.format(str(data))
  326. raise BadIMAPFetchResponse(except_msg) from e
  327. msg_container = MessageContainer(msg, msgid)
  328. messages.append(msg_container)
  329. return messages
  330. def _notify_tracim(
  331. self,
  332. mails: typing.List[DecodedMail],
  333. imapc: imapclient.IMAPClient
  334. ) -> None:
  335. """
  336. Send http request to tracim endpoint
  337. :param mails: list of mails to send
  338. :return: none
  339. """
  340. logger.debug(self, 'Notify tracim about {} new responses'.format(
  341. len(mails),
  342. ))
  343. # TODO BS 20171124: Look around mail.get_from_address(), mail.get_key()
  344. # , mail.get_body() etc ... for raise InvalidEmailError if missing
  345. # required informations (actually get_from_address raise IndexError
  346. # if no from address for example) and catch it here
  347. while mails:
  348. mail = mails.pop()
  349. body = mail.get_body(
  350. use_html_parsing=self.use_html_parsing,
  351. use_txt_parsing=self.use_txt_parsing,
  352. )
  353. from_address = mail.get_from_address()
  354. # don't create element for 'empty' mail
  355. if not body:
  356. logger.warning(
  357. self,
  358. 'Mail from {} has not valable content'.format(
  359. from_address
  360. ),
  361. )
  362. continue
  363. msg = {'token': self.token,
  364. 'user_mail': from_address,
  365. 'content_id': mail.get_key(),
  366. 'payload': {
  367. 'content': body,
  368. }}
  369. try:
  370. logger.debug(
  371. self,
  372. 'Contact API on {} with body {}'.format(
  373. self.endpoint,
  374. json.dumps(msg),
  375. ),
  376. )
  377. r = requests.post(self.endpoint, json=msg)
  378. if r.status_code not in [200, 204]:
  379. details = r.json().get('msg')
  380. log = 'bad status code {} response when sending mail to tracim: {}' # nopep8
  381. logger.error(self, log.format(
  382. str(r.status_code),
  383. details,
  384. ))
  385. # Flag all correctly checked mail
  386. if r.status_code in [200, 204, 400]:
  387. imapc.add_flags((mail.uid,), IMAP_CHECKED_FLAG)
  388. imapc.add_flags((mail.uid,), IMAP_SEEN_FLAG)
  389. # TODO - G.M - Verify exception correctly works
  390. except requests.exceptions.Timeout as e:
  391. log = 'Timeout error to transmit fetched mail to tracim : {}'
  392. logger.error(self, log.format(str(e)))
  393. except requests.exceptions.RequestException as e:
  394. log = 'Fail to transmit fetched mail to tracim : {}'
  395. logger.error(self, log.format(str(e)))