email_fetcher.py 15KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415
  1. # -*- coding: utf-8 -*-
  2. import time
  3. import imaplib
  4. import json
  5. import typing
  6. from email import message_from_bytes
  7. from email.header import decode_header
  8. from email.header import make_header
  9. from email.message import Message
  10. from email.utils import parseaddr
  11. import filelock
  12. import markdown
  13. import requests
  14. from email_reply_parser import EmailReplyParser
  15. from tracim.lib.base import logger
  16. from tracim.lib.email_processing.parser import ParsedHTMLMail
  17. from tracim.lib.email_processing.sanitizer import HtmlSanitizer
  18. TRACIM_SPECIAL_KEY_HEADER = 'X-Tracim-Key'
  19. CONTENT_TYPE_TEXT_PLAIN = 'text/plain'
  20. CONTENT_TYPE_TEXT_HTML = 'text/html'
  21. IMAP_SEEN_FLAG = '\\Seen'
  22. IMAP_CHECKED_FLAG = '\\Flagged'
  23. class MessageContainer(object):
  24. def __init__(self, message: Message, uid: int) -> None:
  25. self.message = message
  26. self.uid = uid
  27. class DecodedMail(object):
  28. def __init__(self, message: Message, uid: int=None) -> None:
  29. self._message = message
  30. self.uid = uid
  31. def _decode_header(self, header_title: str) -> typing.Optional[str]:
  32. # FIXME : Handle exception
  33. if header_title in self._message:
  34. return str(make_header(decode_header(self._message[header_title])))
  35. else:
  36. return None
  37. def get_subject(self) -> typing.Optional[str]:
  38. return self._decode_header('subject')
  39. def get_from_address(self) -> str:
  40. return parseaddr(self._message['From'])[1]
  41. def get_to_address(self) -> str:
  42. return parseaddr(self._message['To'])[1]
  43. def get_first_ref(self) -> str:
  44. return parseaddr(self._message['References'])[1]
  45. def get_special_key(self) -> typing.Optional[str]:
  46. return self._decode_header(TRACIM_SPECIAL_KEY_HEADER)
  47. def get_body(
  48. self,
  49. use_html_parsing=True,
  50. use_txt_parsing=True,
  51. ) -> typing.Optional[str]:
  52. body_part = self._get_mime_body_message()
  53. body = None
  54. if body_part:
  55. charset = body_part.get_content_charset('iso-8859-1')
  56. content_type = body_part.get_content_type()
  57. if content_type == CONTENT_TYPE_TEXT_PLAIN:
  58. txt_body = body_part.get_payload(decode=True).decode(
  59. charset)
  60. if use_txt_parsing:
  61. txt_body = EmailReplyParser.parse_reply(txt_body)
  62. html_body = markdown.markdown(txt_body)
  63. body = HtmlSanitizer.sanitize(html_body)
  64. elif content_type == CONTENT_TYPE_TEXT_HTML:
  65. html_body = body_part.get_payload(decode=True).decode(
  66. charset)
  67. if use_html_parsing:
  68. html_body = str(ParsedHTMLMail(html_body))
  69. body = HtmlSanitizer.sanitize(html_body)
  70. return body
  71. def _get_mime_body_message(self) -> typing.Optional[Message]:
  72. # TODO - G.M - 2017-11-16 - Use stdlib msg.get_body feature for py3.6+
  73. part = None
  74. # Check for html
  75. for part in self._message.walk():
  76. content_type = part.get_content_type()
  77. content_dispo = str(part.get('Content-Disposition'))
  78. if content_type == CONTENT_TYPE_TEXT_HTML \
  79. and 'attachment' not in content_dispo:
  80. return part
  81. # check for plain text
  82. for part in self._message.walk():
  83. content_type = part.get_content_type()
  84. content_dispo = str(part.get('Content-Disposition'))
  85. if content_type == CONTENT_TYPE_TEXT_PLAIN \
  86. and 'attachment' not in content_dispo:
  87. return part
  88. return part
  89. def get_key(self) -> typing.Optional[str]:
  90. """
  91. key is the string contain in some mail header we need to retrieve.
  92. First try checking special header, them check 'to' header
  93. and finally check first(oldest) mail-id of 'references' header
  94. """
  95. first_ref = self.get_first_ref()
  96. to_address = self.get_to_address()
  97. special_key = self.get_special_key()
  98. if special_key:
  99. return special_key
  100. if to_address:
  101. return DecodedMail.find_key_from_mail_address(to_address)
  102. if first_ref:
  103. return DecodedMail.find_key_from_mail_address(first_ref)
  104. return None
  105. @classmethod
  106. def find_key_from_mail_address(
  107. cls,
  108. mail_address: str,
  109. ) -> typing.Optional[str]:
  110. """ Parse mail_adress-like string
  111. to retrieve key.
  112. :param mail_address: user+key@something like string
  113. :return: key
  114. """
  115. username = mail_address.split('@')[0]
  116. username_data = username.split('+')
  117. if len(username_data) == 2:
  118. return username_data[1]
  119. return None
  120. class MailFetcher(object):
  121. def __init__(
  122. self,
  123. host: str,
  124. port: str,
  125. user: str,
  126. password: str,
  127. use_ssl: bool,
  128. folder: str,
  129. delay: int,
  130. endpoint: str,
  131. token: str,
  132. use_html_parsing: bool,
  133. use_txt_parsing: bool,
  134. filelock_path: str,
  135. ) -> None:
  136. """
  137. Fetch mail from a mailbox folder through IMAP and add their content to
  138. Tracim through http according to mail Headers.
  139. Fetch is regular.
  140. :param host: imap server hostname
  141. :param port: imap connection port
  142. :param user: user login of mailbox
  143. :param password: user password of mailbox
  144. :param use_ssl: use imap over ssl connection
  145. :param folder: mail folder where new mail are fetched
  146. :param delay: seconds to wait before fetching new mail again
  147. :param endpoint: tracim http endpoint where decoded mail are send.
  148. :param token: token to authenticate http connexion
  149. :param use_html_parsing: parse html mail
  150. :param use_txt_parsing: parse txt mail
  151. """
  152. self._connection = None
  153. self.host = host
  154. self.port = port
  155. self.user = user
  156. self.password = password
  157. self.use_ssl = use_ssl
  158. self.folder = folder
  159. self.delay = delay
  160. self.endpoint = endpoint
  161. self.token = token
  162. self.use_html_parsing = use_html_parsing
  163. self.use_txt_parsing = use_txt_parsing
  164. self.lock = filelock.FileLock(filelock_path)
  165. self._is_active = True
  166. def run(self) -> None:
  167. logger.info(self, 'Starting MailFetcher')
  168. while self._is_active:
  169. logger.debug(self, 'sleep for {}'.format(self.delay))
  170. time.sleep(self.delay)
  171. try:
  172. self._connect()
  173. with self.lock.acquire(timeout=10):
  174. messages = self._fetch()
  175. cleaned_mails = [DecodedMail(m.message, m.uid)
  176. for m in messages]
  177. self._notify_tracim(cleaned_mails)
  178. self._disconnect()
  179. except Exception as e:
  180. # TODO - G.M - 2017-11-23 - Identify possible exceptions
  181. log = 'IMAP error: {}'
  182. logger.warning(self, log.format(e.__str__()))
  183. def stop(self) -> None:
  184. self._is_active = False
  185. del self.lock
  186. def _connect(self) -> None:
  187. # TODO - G.M - 2017-11-15 Verify connection/disconnection
  188. # Are old connexion properly close this way ?
  189. if self._connection:
  190. logger.debug(self, 'Disconnect from IMAP')
  191. self._disconnect()
  192. # TODO - G.M - 2017-11-23 Support for predefined SSLContext ?
  193. # without ssl_context param, tracim use default security configuration
  194. # which is great in most case.
  195. if self.use_ssl:
  196. logger.debug(self, 'Connect IMAP {}:{} using SSL'.format(
  197. self.host,
  198. self.port,
  199. ))
  200. self._connection = imaplib.IMAP4_SSL(self.host, self.port)
  201. else:
  202. logger.debug(self, 'Connect IMAP {}:{}'.format(
  203. self.host,
  204. self.port,
  205. ))
  206. self._connection = imaplib.IMAP4(self.host, self.port)
  207. try:
  208. logger.debug(self, 'Login IMAP with login {}'.format(
  209. self.user,
  210. ))
  211. self._connection.login(self.user, self.password)
  212. except Exception as e:
  213. log = 'IMAP login error: {}'
  214. logger.error(self, log.format(e.__str__()))
  215. def _disconnect(self) -> None:
  216. if self._connection:
  217. self._connection.close()
  218. self._connection.logout()
  219. self._connection = None
  220. def _fetch(self) -> typing.List[MessageContainer]:
  221. """
  222. Get news message from mailbox
  223. :return: list of new mails
  224. """
  225. messages = []
  226. # select mailbox
  227. logger.debug(self, 'Fetch messages from folder {}'.format(
  228. self.folder,
  229. ))
  230. rv, data = self._connection.select(self.folder)
  231. logger.debug(self, 'Response status {}'.format(
  232. rv,
  233. ))
  234. if rv == 'OK':
  235. # get mails
  236. # TODO - G.M - 2017-11-15 Which files to select as new file ?
  237. # Unseen file or All file from a directory (old one should be
  238. # moved/ deleted from mailbox during this process) ?
  239. logger.debug(self, 'Fetch unseen messages')
  240. rv, data = self._connection.search(None, "(UNSEEN)")
  241. logger.debug(self, 'Response status {}'.format(
  242. rv,
  243. ))
  244. if rv == 'OK':
  245. # get mail content
  246. logger.debug(self, 'Found {} unseen mails'.format(
  247. len(data[0].split()),
  248. ))
  249. for uid in data[0].split():
  250. # INFO - G.M - 2017-12-08 - Fetch BODY.PEEK[]
  251. # Retrieve all mail(body and header) but don't set mail
  252. # as seen because of PEEK
  253. # see rfc3501
  254. logger.debug(self, 'Fetch mail "{}"'.format(
  255. uid,
  256. ))
  257. rv, data = self._connection.fetch(uid, 'BODY.PEEK[]')
  258. logger.debug(self, 'Response status {}'.format(
  259. rv,
  260. ))
  261. if rv == 'OK':
  262. msg = message_from_bytes(data[0][1])
  263. msg_container = MessageContainer(msg, uid)
  264. messages.append(msg_container)
  265. self._set_flag(uid, IMAP_SEEN_FLAG)
  266. else:
  267. log = 'IMAP : Unable to get mail : {}'
  268. logger.error(self, log.format(str(rv)))
  269. else:
  270. log = 'IMAP : Unable to get unseen mail : {}'
  271. logger.error(self, log.format(str(rv)))
  272. else:
  273. log = 'IMAP : Unable to open mailbox : {}'
  274. logger.error(self, log.format(str(rv)))
  275. return messages
  276. def _notify_tracim(
  277. self,
  278. mails: typing.List[DecodedMail],
  279. ) -> None:
  280. """
  281. Send http request to tracim endpoint
  282. :param mails: list of mails to send
  283. :return: unsended mails
  284. """
  285. logger.debug(self, 'Notify tracim about {} new responses'.format(
  286. len(mails),
  287. ))
  288. unsended_mails = []
  289. # TODO BS 20171124: Look around mail.get_from_address(), mail.get_key()
  290. # , mail.get_body() etc ... for raise InvalidEmailError if missing
  291. # required informations (actually get_from_address raise IndexError
  292. # if no from address for example) and catch it here
  293. while mails:
  294. mail = mails.pop()
  295. msg = {'token': self.token,
  296. 'user_mail': mail.get_from_address(),
  297. 'content_id': mail.get_key(),
  298. 'payload': {
  299. 'content': mail.get_body(
  300. use_html_parsing=self.use_html_parsing,
  301. use_txt_parsing=self.use_txt_parsing),
  302. }}
  303. try:
  304. logger.debug(
  305. self,
  306. 'Contact API on {} with body {}'.format(
  307. self.endpoint,
  308. json.dumps(msg),
  309. ),
  310. )
  311. r = requests.post(self.endpoint, json=msg)
  312. if r.status_code not in [200, 204]:
  313. details = r.json().get('msg')
  314. log = 'bad status code {} response when sending mail to tracim: {}' # nopep8
  315. logger.error(self, log.format(
  316. str(r.status_code),
  317. details,
  318. ))
  319. # Flag all correctly checked mail, unseen the others
  320. if r.status_code in [200, 204, 400]:
  321. self._set_flag(mail.uid, IMAP_CHECKED_FLAG)
  322. else:
  323. self._unset_flag(mail.uid, IMAP_SEEN_FLAG)
  324. # TODO - G.M - Verify exception correctly works
  325. except requests.exceptions.Timeout as e:
  326. log = 'Timeout error to transmit fetched mail to tracim : {}'
  327. logger.error(self, log.format(str(e)))
  328. unsended_mails.append(mail)
  329. self._unset_flag(mail.uid, IMAP_SEEN_FLAG)
  330. except requests.exceptions.RequestException as e:
  331. log = 'Fail to transmit fetched mail to tracim : {}'
  332. logger.error(self, log.format(str(e)))
  333. self._unset_flag(mail.uid, IMAP_SEEN_FLAG)
  334. def _set_flag(
  335. self,
  336. uid: int,
  337. flag: str,
  338. ) -> None:
  339. assert uid is not None
  340. rv, data = self._connection.store(
  341. uid,
  342. '+FLAGS',
  343. flag,
  344. )
  345. if rv == 'OK':
  346. log = 'Message {uid} set as {flag}.'.format(
  347. uid=uid,
  348. flag=flag)
  349. logger.debug(self, log)
  350. else:
  351. log = 'Can not set Message {uid} as {flag} : {rv}'.format(
  352. uid=uid,
  353. flag=flag,
  354. rv=rv)
  355. logger.error(self, log)
  356. def _unset_flag(
  357. self,
  358. uid: int,
  359. flag: str,
  360. ) -> None:
  361. assert uid is not None
  362. rv, data = self._connection.store(
  363. uid,
  364. '-FLAGS',
  365. flag,
  366. )
  367. if rv == 'OK':
  368. log = 'Message {uid} unset as {flag}.'.format(
  369. uid=uid,
  370. flag=flag)
  371. logger.debug(self, log)
  372. else:
  373. log = 'Can not unset Message {uid} as {flag} : {rv}'.format(
  374. uid=uid,
  375. flag=flag,
  376. rv=rv)
  377. logger.error(self, log)