소스 검색

Merge pull request #506 from inkhey/feature/better_html_email_parsing

Bastien Sevajol 6 년 전
부모
커밋
7708553bb6
No account linked to committer's email

+ 2 - 0
tracim/development.ini.base 파일 보기

@@ -226,6 +226,8 @@ email.reply.imap.use_ssl = true
226 226
 email.reply.token = mysecuretoken
227 227
 # Delay in seconds between each check
228 228
 email.reply.check.heartbeat = 60
229
+email.reply.use_html_parsing = true
230
+email.reply.use_txt_parsing = true
229 231
 
230 232
 ## Radicale (CalDav server) configuration
231 233
 # radicale.server.host = 0.0.0.0

+ 9 - 1
tracim/tracim/config/app_cfg.py 파일 보기

@@ -128,7 +128,7 @@ def start_daemons(manager: DaemonsManager):
128 128
         manager.run('mail_sender', MailSenderDaemon)
129 129
 
130 130
     if cfg.EMAIL_REPLY_ACTIVATED:
131
-        manager.run('mail_fetcher',MailFetcherDaemon)
131
+        manager.run('mail_fetcher', MailFetcherDaemon)
132 132
 
133 133
 
134 134
 def configure_depot():
@@ -384,6 +384,14 @@ class CFG(object):
384 384
         self.EMAIL_REPLY_IMAP_USE_SSL = asbool(tg.config.get(
385 385
             'email.reply.imap.use_ssl',
386 386
         ))
387
+        self.EMAIL_REPLY_USE_HTML_PARSING = asbool(tg.config.get(
388
+            'email.reply.use_html_parsing',
389
+            True,
390
+        ))
391
+        self.EMAIL_REPLY_USE_TXT_PARSING = asbool(tg.config.get(
392
+            'email.reply.use_txt_parsing',
393
+            True,
394
+        ))
387 395
 
388 396
         self.TRACKER_JS_PATH = tg.config.get(
389 397
             'js_tracker_path',

+ 2 - 0
tracim/tracim/lib/daemons.py 파일 보기

@@ -177,6 +177,8 @@ class MailFetcherDaemon(Daemon):
177 177
             # FIXME - G.M - 2017-11-15 - proper tracim url formatting
178 178
             endpoint=cfg.WEBSITE_BASE_URL + "/events",
179 179
             token=cfg.EMAIL_REPLY_TOKEN,
180
+            use_html_parsing=cfg.EMAIL_REPLY_USE_HTML_PARSING,
181
+            use_txt_parsing=cfg.EMAIL_REPLY_USE_TXT_PARSING,
180 182
         )
181 183
         self._fetcher.run()
182 184
 

+ 26 - 59
tracim/tracim/lib/email_fetcher.py 파일 보기

@@ -4,32 +4,20 @@ import time
4 4
 import imaplib
5 5
 import json
6 6
 import typing
7
-from email.message import Message
7
+from email import message_from_bytes
8 8
 from email.header import decode_header
9 9
 from email.header import make_header
10
+from email.message import Message
10 11
 from email.utils import parseaddr
11
-from email import message_from_bytes
12 12
 
13 13
 import markdown
14 14
 import requests
15
-from bs4 import BeautifulSoup
16
-from bs4 import Tag
17 15
 from email_reply_parser import EmailReplyParser
18
-
19 16
 from tracim.lib.base import logger
17
+from tracim.lib.email_processing.parser import ParsedHTMLMail
18
+from tracim.lib.email_processing.sanitizer import HtmlSanitizer
20 19
 
21 20
 TRACIM_SPECIAL_KEY_HEADER = 'X-Tracim-Key'
22
-# TODO BS 20171124: Think about replace thin dict config by object
23
-BEAUTIFULSOUP_HTML_BODY_PARSE_CONFIG = {
24
-    'tag_blacklist': ['script', 'style', 'blockquote'],
25
-    'class_blacklist': ['moz-cite-prefix', 'gmail_extra', 'gmail_quote',
26
-                        'yahoo_quoted'],
27
-    'id_blacklist': ['reply-intro'],
28
-    'tag_whitelist': ['a', 'b', 'strong', 'i', 'br', 'ul', 'li', 'ol',
29
-                      'em', 'i', 'u',
30
-                      'thead', 'tr', 'td', 'tbody', 'table', 'p', 'pre'],
31
-    'attrs_whitelist': ['href'],
32
-}
33 21
 CONTENT_TYPE_TEXT_PLAIN = 'text/plain'
34 22
 CONTENT_TYPE_TEXT_HTML = 'text/html'
35 23
 
@@ -60,7 +48,11 @@ class DecodedMail(object):
60 48
     def get_special_key(self) -> typing.Optional[str]:
61 49
         return self._decode_header(TRACIM_SPECIAL_KEY_HEADER)
62 50
 
63
-    def get_body(self) -> typing.Optional[str]:
51
+    def get_body(
52
+            self,
53
+            use_html_parsing=True,
54
+            use_txt_parsing=True,
55
+    ) -> typing.Optional[str]:
64 56
         body_part = self._get_mime_body_message()
65 57
         body = None
66 58
         if body_part:
@@ -69,53 +61,20 @@ class DecodedMail(object):
69 61
             if content_type == CONTENT_TYPE_TEXT_PLAIN:
70 62
                 txt_body = body_part.get_payload(decode=True).decode(
71 63
                     charset)
72
-                body = DecodedMail._parse_txt_body(txt_body)
64
+                if use_txt_parsing:
65
+                    txt_body = EmailReplyParser.parse_reply(txt_body)
66
+                html_body = markdown.markdown(txt_body)
67
+                body = HtmlSanitizer.sanitize(html_body)
73 68
 
74 69
             elif content_type == CONTENT_TYPE_TEXT_HTML:
75 70
                 html_body = body_part.get_payload(decode=True).decode(
76 71
                     charset)
77
-                body = DecodedMail._parse_html_body(html_body)
78
-
79
-        return body
72
+                if use_html_parsing:
73
+                    html_body = str(ParsedHTMLMail(html_body))
74
+                body = HtmlSanitizer.sanitize(html_body)
80 75
 
81
-    @classmethod
82
-    def _parse_txt_body(cls, txt_body: str) -> str:
83
-        txt_body = EmailReplyParser.parse_reply(txt_body)
84
-        html_body = markdown.markdown(txt_body)
85
-        body = DecodedMail._parse_html_body(html_body)
86 76
         return body
87 77
 
88
-    @classmethod
89
-    def _parse_html_body(cls, html_body: str) -> str:
90
-        soup = BeautifulSoup(html_body, 'html.parser')
91
-        config = BEAUTIFULSOUP_HTML_BODY_PARSE_CONFIG
92
-        for tag in soup.findAll():
93
-            if DecodedMail._tag_to_extract(tag):
94
-                tag.extract()
95
-            elif tag.name.lower() in config['tag_whitelist']:
96
-                attrs = dict(tag.attrs)
97
-                for attr in attrs:
98
-                    if attr not in config['attrs_whitelist']:
99
-                        del tag.attrs[attr]
100
-            else:
101
-                tag.unwrap()
102
-        return str(soup)
103
-
104
-    @classmethod
105
-    def _tag_to_extract(cls, tag: Tag) -> bool:
106
-        config = BEAUTIFULSOUP_HTML_BODY_PARSE_CONFIG
107
-        if tag.name.lower() in config['tag_blacklist']:
108
-            return True
109
-        if 'class' in tag.attrs:
110
-            for elem in config['class_blacklist']:
111
-                if elem in tag.attrs['class']:
112
-                    return True
113
-        if 'id' in tag.attrs:
114
-            for elem in config['id_blacklist']:
115
-                if elem in tag.attrs['id']:
116
-                    return True
117
-        return False
118
-
119 78
     def _get_mime_body_message(self) -> typing.Optional[Message]:
120 79
         # TODO - G.M - 2017-11-16 - Use stdlib msg.get_body feature for py3.6+
121 80
         part = None
@@ -185,6 +144,8 @@ class MailFetcher(object):
185 144
         delay: int,
186 145
         endpoint: str,
187 146
         token: str,
147
+        use_html_parsing: bool,
148
+        use_txt_parsing: bool,
188 149
     ) -> None:
189 150
         """
190 151
         Fetch mail from a mailbox folder through IMAP and add their content to
@@ -199,6 +160,8 @@ class MailFetcher(object):
199 160
         :param delay: seconds to wait before fetching new mail again
200 161
         :param endpoint: tracim http endpoint where decoded mails are sent.
201 162
         :param token: token to authenticate http connexion
163
+        :param use_html_parsing: parse html mail
164
+        :param use_txt_parsing: parse txt mail
202 165
         """
203 166
         self._connection = None
204 167
         self.host = host
@@ -210,6 +173,8 @@ class MailFetcher(object):
210 173
         self.delay = delay
211 174
         self.endpoint = endpoint
212 175
         self.token = token
176
+        self.use_html_parsing = use_html_parsing
177
+        self.use_txt_parsing = use_txt_parsing
213 178
 
214 179
         self._is_active = True
215 180
 
@@ -222,7 +187,7 @@ class MailFetcher(object):
222 187
                 self._connect()
223 188
                 messages = self._fetch()
224 189
                 # TODO - G.M -  2017-11-22 retry sending unsended mail
225
-                # These mails are return by _notify_tracim, flag them with "unseen"
190
+                # These mails are returned by _notify_tracim, flag them with "unseen" # nopep8
226 191
                 # or store them until new _notify_tracim call
227 192
                 cleaned_mails = [DecodedMail(msg) for msg in messages]
228 193
                 self._notify_tracim(cleaned_mails)
@@ -350,7 +315,9 @@ class MailFetcher(object):
350 315
                    'user_mail': mail.get_from_address(),
351 316
                    'content_id': mail.get_key(),
352 317
                    'payload': {
353
-                       'content': mail.get_body(),
318
+                       'content': mail.get_body(
319
+                           use_html_parsing=self.use_html_parsing,
320
+                           use_txt_parsing=self.use_txt_parsing),
354 321
                    }}
355 322
             try:
356 323
                 logger.debug(

+ 0 - 0
tracim/tracim/lib/email_processing/__init__.py 파일 보기


+ 211 - 0
tracim/tracim/lib/email_processing/checkers.py 파일 보기

@@ -0,0 +1,211 @@
1
+# -*- coding: utf-8 -*-
2
+import typing
3
+
4
+from bs4 import Tag, NavigableString
5
+
6
+
7
+class ProprietaryHTMLAttrValues(object):
8
+    """
9
+    These are all the proprietary (mail-client-specific) HTML attribute values
10
+    we need in order to check HTML elements.
11
+    """
12
+    # Gmail
13
+    Gmail_extras_class = 'gmail_extra'
14
+    Gmail_quote_class = 'gmail_quote'
15
+    Gmail_signature_class = 'gmail_signature'
16
+    # Thunderbird
17
+    Thunderbird_quote_prefix_class = 'moz-cite-prefix'
18
+    Thunderbird_signature_class = 'moz-signature'
19
+    # Outlook.com
20
+    Outlook_com_quote_id = 'divRplyFwdMsg'
21
+    Outlook_com_signature_id = 'Signature'
22
+    Outlook_com_wrapper_id = 'divtagdefaultwrapper'
23
+    # Yahoo
24
+    Yahoo_quote_class = 'yahoo_quoted'
25
+    # Roundcube
26
+    # INFO - G.M - 2017-11-29 - New tag
27
+    # see : https://github.com/roundcube/roundcubemail/issues/6049
28
+    Roundcube_quote_prefix_class = 'reply-intro'
29
+
30
+
31
+class HtmlChecker(object):
32
+
33
+    @classmethod
34
+    def _has_attr_value(
35
+            cls,
36
+            elem: typing.Union[Tag, NavigableString],
37
+            attribute_name: str,
38
+            attribute_value: str,
39
+    )-> bool:
40
+        """
41
+        Check if elem contains attribute named attribute_name with
42
+        attribute_value : example <a id="ident"> elem contain attribute
43
+        with id as attribute_name and ident as attribute_value.
44
+        Checking is not case_sensitive.
45
+
46
+        :param elem: Tag or String Html Element
47
+        :param attribute_name: Html attribute name
48
+        :param attribute_value: Html attribute value
49
+        :return: True only if the element contains this attribute with this value.
50
+        """
51
+        if isinstance(elem, Tag) and attribute_name in elem.attrs:
52
+            # INFO - G.M - 2017-12-01 - attrs[value] can be string or list
53
+            # use get_attribute_list to always check in a list
54
+            # see https://www.crummy.com/software/BeautifulSoup/bs4/doc/#multi-valued-attributes # nopep8
55
+            values_lower = [value.lower()
56
+                            for value
57
+                            in elem.get_attribute_list(attribute_name)]
58
+            return attribute_value.lower() in values_lower
59
+        return False
60
+
61
+
62
+class HtmlMailQuoteChecker(HtmlChecker):
63
+    """
64
+    Check if one HTML Element from Body Mail look-like a quote or not.
65
+    """
66
+    @classmethod
67
+    def is_quote(
68
+            cls,
69
+            elem: typing.Union[Tag, NavigableString]
70
+    ) -> bool:
71
+        return cls._is_standard_quote(elem) \
72
+               or cls._is_thunderbird_quote(elem) \
73
+               or cls._is_gmail_quote(elem) \
74
+               or cls._is_outlook_com_quote(elem) \
75
+               or cls._is_yahoo_quote(elem) \
76
+               or cls._is_roundcube_quote(elem)
77
+
78
+    @classmethod
79
+    def _is_standard_quote(
80
+            cls,
81
+            elem: typing.Union[Tag, NavigableString]
82
+    ) -> bool:
83
+        if isinstance(elem, Tag) \
84
+                and elem.name.lower() == 'blockquote':
85
+            return True
86
+        return False
87
+
88
+    @classmethod
89
+    def _is_thunderbird_quote(
90
+            cls,
91
+            elem: typing.Union[Tag, NavigableString]
92
+    ) -> bool:
93
+        return cls._has_attr_value(
94
+            elem,
95
+            'class',
96
+            ProprietaryHTMLAttrValues.Thunderbird_quote_prefix_class)
97
+
98
+    @classmethod
99
+    def _is_gmail_quote(
100
+            cls,
101
+            elem: typing.Union[Tag, NavigableString]
102
+    ) -> bool:
103
+        if cls._has_attr_value(
104
+                elem,
105
+                'class',
106
+                ProprietaryHTMLAttrValues.Gmail_extras_class):
107
+            for child in elem.children:
108
+                if cls._has_attr_value(
109
+                        child,
110
+                        'class',
111
+                        ProprietaryHTMLAttrValues.Gmail_quote_class):
112
+                    return True
113
+        return False
114
+
115
+    @classmethod
116
+    def _is_outlook_com_quote(
117
+        cls,
118
+        elem: typing.Union[Tag, NavigableString]
119
+    ) -> bool:
120
+        if cls._has_attr_value(
121
+                elem,
122
+                'id',
123
+                ProprietaryHTMLAttrValues.Outlook_com_quote_id):
124
+            return True
125
+        return False
126
+
127
+    @classmethod
128
+    def _is_yahoo_quote(
129
+            cls,
130
+            elem: typing.Union[Tag, NavigableString]
131
+    ) -> bool:
132
+        return cls._has_attr_value(
133
+            elem,
134
+            'class',
135
+            ProprietaryHTMLAttrValues.Yahoo_quote_class)
136
+
137
+    @classmethod
138
+    def _is_roundcube_quote(
139
+            cls,
140
+            elem: typing.Union[Tag, NavigableString]
141
+    ) -> bool:
142
+        return cls._has_attr_value(
143
+            elem,
144
+            'id',
145
+            ProprietaryHTMLAttrValues.Roundcube_quote_prefix_class)
146
+
147
+
148
+class HtmlMailSignatureChecker(HtmlChecker):
149
+    """
150
+    Check whether an HTML element from a mail body looks like a signature or not.
151
+    """
152
+
153
+    @classmethod
154
+    def is_signature(
155
+            cls,
156
+            elem: typing.Union[Tag, NavigableString]
157
+    ) -> bool:
158
+        return cls._is_thunderbird_signature(elem) \
159
+               or cls._is_gmail_signature(elem) \
160
+               or cls._is_outlook_com_signature(elem)
161
+
162
+    @classmethod
163
+    def _is_thunderbird_signature(
164
+            cls,
165
+            elem: typing.Union[Tag, NavigableString]
166
+    ) -> bool:
167
+        return cls._has_attr_value(
168
+            elem,
169
+            'class',
170
+            ProprietaryHTMLAttrValues.Thunderbird_signature_class)
171
+
172
+    @classmethod
173
+    def _is_gmail_signature(
174
+            cls,
175
+            elem: typing.Union[Tag, NavigableString]
176
+    ) -> bool:
177
+        if cls._has_attr_value(
178
+                elem,
179
+                'class',
180
+                ProprietaryHTMLAttrValues.Gmail_signature_class):
181
+            return True
182
+        if cls._has_attr_value(
183
+                elem,
184
+                'class',
185
+                ProprietaryHTMLAttrValues.Gmail_extras_class):
186
+            for child in elem.children:
187
+                if cls._has_attr_value(
188
+                        child,
189
+                        'class',
190
+                        ProprietaryHTMLAttrValues.Gmail_signature_class):
191
+                    return True
192
+        if isinstance(elem, Tag) and elem.name.lower() == 'div':
193
+            for child in elem.children:
194
+                if cls._has_attr_value(
195
+                        child,
196
+                        'class',
197
+                        ProprietaryHTMLAttrValues.Gmail_signature_class):
198
+                    return True
199
+        return False
200
+
201
+    @classmethod
202
+    def _is_outlook_com_signature(
203
+            cls,
204
+            elem: typing.Union[Tag, NavigableString]
205
+    ) -> bool:
206
+        if cls._has_attr_value(
207
+                elem,
208
+                'id',
209
+                ProprietaryHTMLAttrValues.Outlook_com_signature_id):
210
+            return True
211
+        return False

+ 115 - 0
tracim/tracim/lib/email_processing/models.py 파일 보기

@@ -0,0 +1,115 @@
1
+from bs4 import BeautifulSoup
2
+
3
+# -*- coding: utf-8 -*-
4
+
5
+
6
+class BodyMailPartType(object):
7
+    Signature = 'sign'
8
+    Main = 'main'
9
+    Quote = 'quote'
10
+
11
+
12
+class BodyMailPart(object):
13
+    def __init__(
14
+            self,
15
+            text: str,
16
+            part_type: str
17
+    )-> None:
18
+        self.text = text
19
+        self.part_type = part_type
20
+
21
+
22
+class BodyMailParts(object):
23
+    """
24
+    Data structure that splits a mail body into a "list" of BodyMailPart.
25
+    When 2 similar BodyMailPart (same part_type) are added one after the other,
26
+    it doesn't create a new part; it just merges those elements into one.
27
+    It should always have only one Signature type part, normally
28
+    at the end of the body.
29
+    This object doesn't provide other set method than append() in order to
30
+    preserve object coherence.
31
+    """
32
+    def __init__(self) -> None:
33
+        self._list = []  # type; List[BodyMailPart]
34
+        # INFO - G.M -
35
+        # automatically merge new value with last item if true, without any
36
+        # part_type check, same type as the older one, useful when some tag
37
+        # say "all elem after me is Signature"
38
+        self.follow = False
39
+
40
+    def __len__(self) -> int:
41
+        return len(self._list)
42
+
43
+    def __getitem__(self, index) -> BodyMailPart:
44
+        return self._list[index]
45
+
46
+    def __delitem__(self, index) -> None:
47
+        del self._list[index]
48
+        # FIXME - G.M - 2017-11-27 - Preserve BodyMailParts consistency
49
+        # check elem after and before index and merge them if necessary.
50
+
51
+    def append(self, value) -> None:
52
+        BodyMailParts._check_value(value)
53
+        self._append(value)
54
+
55
+    def _append(self, value, follow=None) -> None:
56
+        if follow is None:
57
+            follow = self.follow
58
+
59
+        if len(self._list) < 1:
60
+            self._list.append(value)
61
+        else:
62
+            if self._list[-1].part_type == value.part_type or follow:
63
+                self._list[-1].text += value.text
64
+            else:
65
+                self._list.append(value)
66
+
67
+    @classmethod
68
+    def _check_value(cls, value) -> None:
69
+        if not isinstance(value, BodyMailPart):
70
+            raise TypeError()
71
+
72
+    def drop_part_type(self, part_type: str) -> None:
73
+        """
74
+        Drop all elem of one part_type
75
+        :param part_type: part_type to completely remove
76
+        :return: None
77
+        """
78
+        new_list = [x for x in self._list if x.part_type != part_type]
79
+        self._list = []
80
+
81
+        # INFO - G.M - 2017-11-27 - use append() to have a consistent list
82
+        for elem in new_list:
83
+            self._append(elem, follow=False)
84
+
85
+    def get_nb_part_type(self, part_type: str) -> int:
86
+        """
87
+        Get number of elements of one part_type
88
+        :param part_type: part_type to check
89
+        :return: number of part_type elements
90
+        """
91
+        count = 0
92
+        for elem in self._list:
93
+            if elem.part_type == part_type:
94
+                count += 1
95
+        return count
96
+
97
+    def __str__(self) -> str:
98
+        s_mail = ''
99
+        for elem in self._list:
100
+            s_mail += elem.text
101
+        return str(s_mail)
102
+
103
+
104
+class HtmlBodyMailParts(BodyMailParts):
105
+
106
+    def append(self, value):
107
+        # INFO - G.M - 2017-12-01 - Override part_type if elem has no content.
108
+        # Choose last elem part_type instead of the proposed one.
109
+        if len(self._list) > 0:
110
+            txt = BeautifulSoup(value.text, 'html.parser').get_text()
111
+            txt = txt.replace('\n', '').strip()
112
+            if not txt:
113
+                value.part_type = self._list[-1].part_type
114
+        BodyMailParts._check_value(value)
115
+        BodyMailParts._append(self, value)

+ 128 - 0
tracim/tracim/lib/email_processing/parser.py 파일 보기

@@ -0,0 +1,128 @@
1
+# -*- coding: utf-8 -*-
2
+from bs4 import BeautifulSoup
3
+
4
+from tracim.lib.email_processing.checkers import ProprietaryHTMLAttrValues
5
+from tracim.lib.email_processing.checkers import HtmlMailQuoteChecker
6
+from tracim.lib.email_processing.checkers import HtmlMailSignatureChecker
7
+from tracim.lib.email_processing.models import BodyMailPartType
8
+from tracim.lib.email_processing.models import BodyMailPart
9
+from tracim.lib.email_processing.models import HtmlBodyMailParts
10
+
11
+
12
+class PreSanitizeConfig(object):
13
+    """
14
+    To avoid problems, HTML needs to be sanitized a bit during parsing in order
15
+    to distinguish Main, Quote and Signature elements.
16
+    """
17
+    meta_tag = ['body', 'div']
18
+
19
+
20
+class ParsedHTMLMail(object):
21
+    """
22
+    Parse an HTML mail according to a set of rules.
23
+    Distinguish the parts of the HTML mail body using a BodyMailParts object
24
+    and apply the different rules using HtmlChecker(s).
25
+    """
26
+
27
+    def __init__(self, html_body: str):
28
+        self.src_html_body = html_body
29
+
30
+    def __str__(self):
31
+        return str(self._parse_mail())
32
+
33
+    def get_elements(self) -> HtmlBodyMailParts:
34
+        tree = self._get_proper_main_body_tree()
35
+        return self._distinct_elements(tree)
36
+
37
+    def _parse_mail(self) -> HtmlBodyMailParts:
38
+        elements = self.get_elements()
39
+        elements = self._process_elements(elements)
40
+        return elements
41
+
42
+    def _get_proper_main_body_tree(self) -> BeautifulSoup:
43
+        """
44
+        Get html body tree without some kind of wrapper.
45
+        We need to have text, quote and signature parts at the same tree level
46
+        """
47
+        tree = BeautifulSoup(self.src_html_body, 'html.parser')
48
+
49
+        # Only parse body part of html if available
50
+        subtree = tree.find('body')
51
+        if subtree:
52
+            tree = BeautifulSoup(str(subtree), 'html.parser')
53
+
54
+        # if some kind of "meta_div", unwrap it
55
+        while len(tree.findAll(recursive=None)) == 1 and \
56
+                tree.find().name.lower() in PreSanitizeConfig.meta_tag:
57
+            tree.find().unwrap()
58
+
59
+        for tag in tree.findAll():
60
+            # HACK - G.M - 2017-11-28 - Unwrap outlook.com mail
61
+            # if Text -> Signature -> Quote Mail
62
+            # Text and signature are wrapped into divtagdefaultwrapper
63
+            if tag.attrs.get('id'):
64
+                if ProprietaryHTMLAttrValues.Outlook_com_wrapper_id\
65
+                        in tag.attrs['id']:
66
+                    tag.unwrap()
67
+        return tree
68
+
69
+    @classmethod
70
+    def _distinct_elements(cls, tree: BeautifulSoup) -> HtmlBodyMailParts:
71
+        parts = HtmlBodyMailParts()
72
+        for elem in list(tree):
73
+            part_txt = str(elem)
74
+            part_type = BodyMailPartType.Main
75
+
76
+            if HtmlMailQuoteChecker.is_quote(elem):
77
+                part_type = BodyMailPartType.Quote
78
+            elif HtmlMailSignatureChecker.is_signature(elem):
79
+                part_type = BodyMailPartType.Signature
80
+
81
+            part = BodyMailPart(part_txt, part_type)
82
+            parts.append(part)
83
+            # INFO - G.M - 2017-11-28 - Outlook.com special case
84
+            # all after quote tag is quote
85
+            if HtmlMailQuoteChecker._is_outlook_com_quote(elem):
86
+                parts.follow = True
87
+        return parts
88
+
89
+    @classmethod
90
+    def _process_elements(
91
+            cls,
92
+            elements: HtmlBodyMailParts,
93
+    ) -> HtmlBodyMailParts:
94
+        if len(elements) >= 2:
95
+            # Case 1 and 2, only one main and one quote
96
+            if elements.get_nb_part_type('main') == 1 and \
97
+                            elements.get_nb_part_type('quote') == 1:
98
+                # Case 1 : Main first
99
+                if elements[0].part_type == BodyMailPartType.Main:
100
+                    cls._process_main_first_case(elements)
101
+                # Case 2 : Quote first
102
+                if elements[0].part_type == BodyMailPartType.Quote:
103
+                    cls._process_quote_first_case(elements)
104
+            else:
105
+                # Case 3 : Multiple quotes and/or main
106
+                cls._process_multiples_elems_case(elements)
107
+        else:
108
+            cls._process_default_case(elements)
109
+            # default case (only one element or empty list)
110
+        return elements
111
+
112
+    @classmethod
113
+    def _process_quote_first_case(cls, elements: HtmlBodyMailParts) -> None:
114
+        elements.drop_part_type(BodyMailPartType.Signature)
115
+
116
+    @classmethod
117
+    def _process_main_first_case(cls, elements: HtmlBodyMailParts) -> None:
118
+        elements.drop_part_type(BodyMailPartType.Quote)
119
+        elements.drop_part_type(BodyMailPartType.Signature)
120
+
121
+    @classmethod
122
+    def _process_multiples_elems_case(cls, elements: HtmlBodyMailParts) -> None:
123
+        elements.drop_part_type(BodyMailPartType.Signature)
124
+
125
+    @classmethod
126
+    def _process_default_case(cls, elements: HtmlBodyMailParts) -> None:
127
+        elements.drop_part_type(BodyMailPartType.Quote)
128
+        elements.drop_part_type(BodyMailPartType.Signature)

+ 79 - 0
tracim/tracim/lib/email_processing/sanitizer.py 파일 보기

@@ -0,0 +1,79 @@
1
+from bs4 import BeautifulSoup, Tag
2
+
3
+
4
+class HtmlSanitizerConfig(object):
5
+    # some Default_html_tags type
6
+    HTML_Heading_tag = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
7
+    HTML_Text_parts_tag = ['p',
8
+                           'br', 'hr',
9
+                           'pre', 'code', 'samp',  # preformatted content
10
+                           'q', 'blockquote',  # quotes
11
+                           ]
12
+    HTML_Text_format_tag = ['b', 'i', 'u', 'small', 'sub', 'sup', ]
13
+    HTML_Text_semantic_tag = ['strong', 'em',
14
+                              'mark', 'cite', 'dfn',
15
+                              'del', 'ins', ]
16
+    HTML_Table_tag = ['table',
17
+                      'thead', 'tfoot', 'tbody',
18
+                      'tr', 'td', 'caption', ]
19
+
20
+    HTML_List_tag = ['ul', 'li', 'ol',  # simple list
21
+                     'dl', 'dt', 'dd', ]  # definition list
22
+
23
+    # Rules
24
+    Tag_whitelist = HTML_Heading_tag \
25
+                    + HTML_Text_parts_tag \
26
+                    + HTML_Text_format_tag \
27
+                    + HTML_Text_semantic_tag \
28
+                    + HTML_Table_tag \
29
+                    + HTML_List_tag
30
+
31
+    Tag_blacklist = ['script', 'style']
32
+
33
+    # TODO - G.M - 2017-12-01 - Think about removing class/id Blacklist
34
+    # These elements are no longer required.
35
+    Class_blacklist = []
36
+    Id_blacklist = []
37
+
38
+    Attrs_whitelist = ['href']
39
+
40
+
41
+class HtmlSanitizer(object):
42
+    """
43
+    Sanitize Html Rules :
44
+    - Tag :
45
+      - Remove Tag_blacklist tag
46
+      - Keep Tag_whitelist tag
47
+      - Unwrap others tags
48
+    - Attrs :
49
+      - Remove non-whitelisted attributes
50
+    """
51
+
52
+    @classmethod
53
+    def sanitize(cls, html_body: str) -> str:
54
+        soup = BeautifulSoup(html_body, 'html.parser')
55
+        for tag in soup.findAll():
56
+            if cls._tag_to_extract(tag):
57
+                tag.extract()
58
+            elif tag.name.lower() in HtmlSanitizerConfig.Tag_whitelist:
59
+                attrs = dict(tag.attrs)
60
+                for attr in attrs:
61
+                    if attr not in HtmlSanitizerConfig.Attrs_whitelist:
62
+                        del tag.attrs[attr]
63
+            else:
64
+                tag.unwrap()
65
+        return str(soup)
66
+
67
+    @classmethod
68
+    def _tag_to_extract(cls, tag: Tag) -> bool:
69
+        if tag.name.lower() in HtmlSanitizerConfig.Tag_blacklist:
70
+            return True
71
+        if 'class' in tag.attrs:
72
+            for elem in HtmlSanitizerConfig.Class_blacklist:
73
+                if elem in tag.attrs['class']:
74
+                    return True
75
+        if 'id' in tag.attrs:
76
+            for elem in HtmlSanitizerConfig.Id_blacklist:
77
+                if elem in tag.attrs['id']:
78
+                    return True
79
+        return False

+ 759 - 0
tracim/tracim/tests/library/test_email_body_parser.py 파일 보기

@@ -0,0 +1,759 @@
1
+from bs4 import BeautifulSoup
2
+from nose.tools import raises
3
+
4
+from tracim.lib.email_processing.checkers import HtmlMailQuoteChecker
5
+from tracim.lib.email_processing.checkers import HtmlMailSignatureChecker
6
+from tracim.lib.email_processing.parser import ParsedHTMLMail
7
+from tracim.lib.email_processing.models import BodyMailPartType
8
+from tracim.lib.email_processing.models import BodyMailPart
9
+from tracim.lib.email_processing.models import BodyMailParts
10
+from tracim.tests import TestStandard
11
+
12
+
13
+class TestHtmlMailQuoteChecker(TestStandard):
14
+    def test_unit__is_standard_quote_ok(self):
15
+        soup = BeautifulSoup('<blockquote></blockquote>', 'html.parser')
16
+        main_elem = soup.find()
17
+        assert HtmlMailQuoteChecker._is_standard_quote(main_elem) is True
18
+
19
+    def test_unit__is_standard_quote_no(self):
20
+        soup = BeautifulSoup('<a></a>', 'html.parser')
21
+        main_elem = soup.find()
22
+        assert HtmlMailQuoteChecker._is_standard_quote(main_elem) is False
23
+
24
+    def test_unit__is_thunderbird_quote_ok(self):
25
+        soup = BeautifulSoup('<div class="moz-cite-prefix"></div>',
26
+                             'html.parser')
27
+        main_elem = soup.find()
28
+        assert HtmlMailQuoteChecker._is_thunderbird_quote(main_elem) is True
29
+
30
+    def test_unit__is_thunderbird_quote_no(self):
31
+        soup = BeautifulSoup('<div class="nothing"></div>', 'html.parser')
32
+        main_elem = soup.find()
33
+        assert HtmlMailQuoteChecker._is_thunderbird_quote(main_elem) is False
34
+
35
+    def test_unit__is_gmail_quote_ok(self):
36
+        html = '<div class="gmail_extra">' + \
37
+              '<a></a><div class="gmail_quote"></div>' + \
38
+              '</div>'
39
+        soup = BeautifulSoup(html, 'html.parser')
40
+        main_elem = soup.find()
41
+        assert HtmlMailQuoteChecker._is_gmail_quote(main_elem) is True
42
+
43
+    def test_unit__is_gmail_quote_no(self):
44
+        soup = BeautifulSoup('<div class="nothing"></div>', 'html.parser')
45
+        main_elem = soup.find()
46
+        assert HtmlMailQuoteChecker._is_gmail_quote(main_elem) is False
47
+
48
+    def test_unit__is_gmail_quote_no_2(self):
49
+        html = '<div class="gmail_extra">' + \
50
+              '<a></a><div class="gmail_signature"></div>' + \
51
+              '</div>'
52
+        soup = BeautifulSoup(html, 'html.parser')
53
+        main_elem = soup.find()
54
+        assert HtmlMailQuoteChecker._is_gmail_quote(main_elem) is False
55
+
56
+    def test_unit__is_outlook_com_quote_ok(self):
57
+        soup = BeautifulSoup('<div id="divRplyFwdMsg"></div>', 'html.parser')
58
+        main_elem = soup.find()
59
+        assert HtmlMailQuoteChecker._is_outlook_com_quote(main_elem) is True
60
+
61
+    def test_unit__is_outlook_com_quote_no(self):
62
+        soup = BeautifulSoup('<div id="Signature"></div>', 'html.parser')
63
+        main_elem = soup.find()
64
+        assert HtmlMailQuoteChecker._is_outlook_com_quote(main_elem) is False
65
+
66
+    # TODO - G.M - 2017-11-24 - Check Yahoo and New roundcube html mail with
67
+    # correct mail example
68
+
69
+
70
+class TestHtmlMailSignatureChecker(TestStandard):
71
+    def test_unit__is_thunderbird_signature_ok(self):
72
+        soup = BeautifulSoup('<div class="moz-signature"></div>', 'html.parser')
73
+        main_elem = soup.find()
74
+        assert HtmlMailSignatureChecker._is_thunderbird_signature(main_elem) is True  # nopep8
75
+
76
+    def test_unit__is_thunderbird_signature_no(self):
77
+        soup = BeautifulSoup('<div class="other"></div>', 'html.parser')
78
+        main_elem = soup.find()
79
+        assert HtmlMailSignatureChecker._is_thunderbird_signature(main_elem) is False  # nopep8
80
+
81
+    def test_unit__is_gmail_signature_ok(self):
82
+        html = '<div class="gmail_extra">' + \
83
+               '<a></a><div class="gmail_quote"></div>' + \
84
+               '</div>'
85
+        soup = BeautifulSoup(html, 'html.parser')
86
+        main_elem = soup.find()
87
+        assert HtmlMailSignatureChecker._is_gmail_signature(main_elem) is False
88
+
89
+    def test_unit__is_gmail_signature_no(self):
90
+        soup = BeautifulSoup('<div class="nothing"></div>', 'html.parser')
91
+        main_elem = soup.find()
92
+        assert HtmlMailSignatureChecker._is_gmail_signature(main_elem) is False
93
+
94
+    def test_unit__is_gmail_signature_yes(self):
95
+        html = '<div class="gmail_extra">' + \
96
+               '<a></a><div class="gmail_signature"></div>' + \
97
+               '</div>'
98
+        soup = BeautifulSoup(html, 'html.parser')
99
+        main_elem = soup.find()
100
+        assert HtmlMailSignatureChecker._is_gmail_signature(main_elem) is True
101
+
102
+    def test_unit__is_gmail_signature_yes_2(self):
103
+        html = '<div class="gmail_signature">' + \
104
+               '</div>'
105
+        soup = BeautifulSoup(html, 'html.parser')
106
+        main_elem = soup.find()
107
+        assert HtmlMailSignatureChecker._is_gmail_signature(main_elem) is True
108
+
109
+    def test_unit__is_outlook_com_signature_no(self):
110
+        soup = BeautifulSoup('<div id="divRplyFwdMsg"></div>', 'html.parser')
111
+        main_elem = soup.find()
112
+        assert HtmlMailSignatureChecker._is_outlook_com_signature(main_elem) \
113
+               is False
114
+
115
+    def test_unit__is_outlook_com_signature_ok(self):
116
+        soup = BeautifulSoup('<div id="Signature"></div>', 'html.parser')
117
+        main_elem = soup.find()
118
+        assert HtmlMailSignatureChecker._is_outlook_com_signature(main_elem) \
119
+               is True
120
+
121
+
122
+class TestBodyMailsParts(TestStandard):
123
+
124
+    def test_unit__std_list_methods(self):
125
+        mail_parts = BodyMailParts()
126
+        assert len(mail_parts) == 0
127
+        a = BodyMailPart('a', BodyMailPartType.Main)
128
+        mail_parts._list.append(a)
129
+        assert len(mail_parts) == 1
130
+        assert mail_parts[0] == a
131
+        del mail_parts[0]
132
+        assert len(mail_parts) == 0
133
+
134
+    def test_unit__append_same_type(self):
135
+        mail_parts = BodyMailParts()
136
+        a = BodyMailPart('a', BodyMailPartType.Main)
137
+        mail_parts._append(a)
138
+        b = BodyMailPart('b', BodyMailPartType.Main)
139
+        mail_parts._append(b)
140
+        assert len(mail_parts) == 1
141
+        assert mail_parts[0].part_type == BodyMailPartType.Main
142
+        assert mail_parts[0].text == 'ab'
143
+
144
+    def test_unit__append_different_type(self):
145
+        mail_parts = BodyMailParts()
146
+        a = BodyMailPart('a', BodyMailPartType.Main)
147
+        mail_parts.append(a)
148
+        b = BodyMailPart('b', BodyMailPartType.Quote)
149
+        mail_parts._append(b)
150
+        assert len(mail_parts) == 2
151
+        assert mail_parts[0] == a
152
+        assert mail_parts[1] == b
153
+
154
+    def test_unit__append_follow(self):
155
+        mail_parts = BodyMailParts()
156
+        mail_parts.follow = True
157
+        a = BodyMailPart('a', BodyMailPartType.Main)
158
+        mail_parts._append(a)
159
+        b = BodyMailPart('b', BodyMailPartType.Quote)
160
+        mail_parts._append(b)
161
+        assert len(mail_parts) == 1
162
+        assert mail_parts[0].part_type == BodyMailPartType.Main
163
+        assert mail_parts[0].text == 'ab'
164
+
165
+    def test_unit__append_dont_follow_when_first(self):
166
+        mail_parts = BodyMailParts()
167
+        a = BodyMailPart('a', BodyMailPartType.Main)
168
+        mail_parts._append(a, follow=True)
169
+        assert len(mail_parts) == 1
170
+        assert mail_parts[0].part_type == BodyMailPartType.Main
171
+        assert mail_parts[0].text == 'a'
172
+
173
+    @raises(TypeError)
174
+    def test_unit__check_value__type_error(self):
175
+        mail_parts = BodyMailParts()
176
+        mail_parts._check_value('a')
177
+
178
+    def test_unit__check_value__ok(self):
179
+        mail_parts = BodyMailParts()
180
+        a = BodyMailPart('a', BodyMailPartType.Main)
181
+        mail_parts._check_value(a)
182
+
183
+    def test_unit__drop_part_type(self):
184
+        mail_parts = BodyMailParts()
185
+        a = BodyMailPart('a', BodyMailPartType.Main)
186
+        mail_parts._list.append(a)
187
+        b = BodyMailPart('b', BodyMailPartType.Quote)
188
+        mail_parts._list.append(b)
189
+        c = BodyMailPart('c', BodyMailPartType.Signature)
190
+        mail_parts._list.append(c)
191
+        mail_parts.drop_part_type(BodyMailPartType.Quote)
192
+        assert len(mail_parts) == 2
193
+        assert mail_parts[0].text == 'a'
194
+        assert mail_parts[0].part_type == BodyMailPartType.Main
195
+        assert len(mail_parts) == 2
196
+        assert mail_parts[1].text == 'c'
197
+        assert mail_parts[1].part_type == BodyMailPartType.Signature
198
+
199
+    def test_unit__drop_part_type_verify_no_follow_incidence(self):
200
+        mail_parts = BodyMailParts()
201
+        a = BodyMailPart('a', BodyMailPartType.Main)
202
+        mail_parts._list.append(a)
203
+        b = BodyMailPart('b', BodyMailPartType.Quote)
204
+        mail_parts._list.append(b)
205
+        c = BodyMailPart('c', BodyMailPartType.Signature)
206
+        mail_parts._list.append(c)
207
+        mail_parts.follow = True
208
+        mail_parts.drop_part_type(BodyMailPartType.Quote)
209
+        assert len(mail_parts) == 2
210
+        assert mail_parts[0].text == 'a'
211
+        assert mail_parts[0].part_type == BodyMailPartType.Main
212
+        assert len(mail_parts) == 2
213
+        assert mail_parts[1].text == 'c'
214
+        assert mail_parts[1].part_type == BodyMailPartType.Signature
215
+
216
+    def test_unit__drop_part_type_consistence(self):
217
+        mail_parts = BodyMailParts()
218
+        a = BodyMailPart('a', BodyMailPartType.Main)
219
+        mail_parts._list.append(a)
220
+        b = BodyMailPart('b', BodyMailPartType.Quote)
221
+        mail_parts._list.append(b)
222
+        c = BodyMailPart('c', BodyMailPartType.Main)
223
+        mail_parts._list.append(c)
224
+        mail_parts.drop_part_type(BodyMailPartType.Quote)
225
+        assert len(mail_parts) == 1
226
+        assert mail_parts[0].text == 'ac'
227
+        assert mail_parts[0].part_type == BodyMailPartType.Main
228
+
229
+    def test_unit__get_nb_part_type(self):
230
+        mail_parts = BodyMailParts()
231
+        assert mail_parts.get_nb_part_type(BodyMailPartType.Main) == 0
232
+        assert mail_parts.get_nb_part_type(BodyMailPartType.Quote) == 0
233
+        assert mail_parts.get_nb_part_type(BodyMailPartType.Signature) == 0
234
+        a = BodyMailPart('a', BodyMailPartType.Main)
235
+        mail_parts._list.append(a)
236
+        assert mail_parts.get_nb_part_type(BodyMailPartType.Main) == 1
237
+        b = BodyMailPart('b', BodyMailPartType.Quote)
238
+        mail_parts._list.append(b)
239
+        assert mail_parts.get_nb_part_type(BodyMailPartType.Quote) == 1
240
+        c = BodyMailPart('c', BodyMailPartType.Signature)
241
+        mail_parts._list.append(c)
242
+        assert mail_parts.get_nb_part_type(BodyMailPartType.Main) == 1
243
+        assert mail_parts.get_nb_part_type(BodyMailPartType.Quote) == 1
244
+        assert mail_parts.get_nb_part_type(BodyMailPartType.Signature) == 1
245
+
246
+    def test_unit__str(self):
247
+        mail_parts = BodyMailParts()
248
+        a = BodyMailPart('a', BodyMailPartType.Main)
249
+        mail_parts._list.append(a)
250
+        b = BodyMailPart('b', BodyMailPartType.Quote)
251
+        mail_parts._list.append(b)
252
+        c = BodyMailPart('c', BodyMailPartType.Signature)
253
+        mail_parts._list.append(c)
254
+        assert str(mail_parts) == 'abc'
255
+
256
+
257
+class TestParsedMail(TestStandard):
258
+
259
+    def test_other__check_gmail_mail_text_only(self):
260
+        text_only = '''<div dir="ltr">Voici le texte<br></div>'''
261
+        mail = ParsedHTMLMail(text_only)
262
+        elements = mail.get_elements()
263
+        assert len(elements) == 1
264
+        assert elements[0].part_type == BodyMailPartType.Main
265
+
266
+    def test_other__check_gmail_mail_text_signature(self):
267
+        text_and_signature = '''
268
+        <div dir="ltr">POF<br clear="all"><div><br>-- <br>
269
+        <div class="gmail_signature" data-smartmail="gmail_signature">
270
+        <div dir="ltr">Voici Ma signature. En HTML <br><ol>
271
+        <li>Plop</li>
272
+        <li>Plip</li>
273
+        <li>Plop<br>
274
+        </li></ol></div></div></div></div>
275
+        '''
276
+        mail = ParsedHTMLMail(text_and_signature)
277
+        elements = mail.get_elements()
278
+        assert len(elements) == 2
279
+        assert elements[0].part_type == BodyMailPartType.Main
280
+        assert elements[1].part_type == BodyMailPartType.Signature
281
+
282
+    def test_other__check_gmail_mail_text_quote(self):
283
+        text_and_quote = '''
284
+        <div dir="ltr">Réponse<br>
285
+        <div class="gmail_extra"><br>
286
+        <div class="gmail_quote">Le 28 novembre 2017 à 10:29, John Doe <span
287
+        dir="ltr">&lt;<a href="mailto:bidule@localhost.fr"
288
+        target="_blank">bidule@localhost.fr</a>&gt;</span>
289
+        a écrit :<br>
290
+        <blockquote class="gmail_quote" style="margin:0 0 0
291
+        .8ex;border-left:1px #ccc solid;padding-left:1ex">Voici ma réponse<br>
292
+        <br><br>
293
+        Le 28/11/2017 à 10:05, Foo Bar a écrit&nbsp;:<br>
294
+        <blockquote class="gmail_quote" style="margin:0 0 0
295
+        .8ex;border-left:1px #ccc solid;padding-left:1ex">
296
+        Voici le texte<span class="HOEnZb"><font color="#888888"><br>
297
+        </font></span></blockquote>
298
+        <span class="HOEnZb"><font color="#888888">
299
+        <br>
300
+        -- <br>
301
+        TEST DE signature<br>
302
+        </font></span></blockquote>
303
+        </div><br></div></div>
304
+        '''
305
+        mail = ParsedHTMLMail(text_and_quote)
306
+        elements = mail.get_elements()
307
+        assert len(elements) == 2
308
+        assert elements[0].part_type == BodyMailPartType.Main
309
+        assert elements[1].part_type == BodyMailPartType.Quote
310
+
311
+    def test_other__check_gmail_mail_text_quote_text(self):
312
+        text_quote_text = '''
313
+              <div dir="ltr">Avant<br>
314
+              <div class="gmail_extra"><br>
315
+              <div class="gmail_quote">Le 28 novembre 2017 à 10:29, John Doe 
316
+              <span dir="ltr">&lt;<a href="mailto:bidule@localhost.fr"
317
+              target="_blank">bidule@localhost.fr</a>&gt;</span>
318
+              a écrit :<br>
319
+              <blockquote class="gmail_quote" style="margin:0 0 0
320
+              .8ex;border-left:1px #ccc solid;padding-left:1ex">Voici ma
321
+              réponse<br>
322
+              <br>
323
+              <br>
324
+              Le 28/11/2017 à 10:05, Foo Bar a écrit&nbsp;:<br>
325
+              <blockquote class="gmail_quote" style="margin:0 0 0
326
+              .8ex;border-left:1px #ccc solid;padding-left:1ex">
327
+              Voici le texte<span class="HOEnZb"><font color="#888888"><br>
328
+              </font></span></blockquote>
329
+              <span class="HOEnZb"><font color="#888888">
330
+              <br>
331
+              -- <br>
332
+              TEST DE signature<br>
333
+              </font></span></blockquote>
334
+              </div>
335
+              <br>
336
+              </div>
337
+              <div class="gmail_extra">Aprés<br>
338
+              </div>
339
+              </div>
340
+              '''
341
+
342
+        mail = ParsedHTMLMail(text_quote_text)
343
+        elements = mail.get_elements()
344
+        assert len(elements) == 3
345
+        assert elements[0].part_type == BodyMailPartType.Main
346
+        assert elements[1].part_type == BodyMailPartType.Quote
347
+        assert elements[2].part_type == BodyMailPartType.Main
348
+
349
+    def test_other__check_gmail_mail_text_quote_signature(self):
350
+        text_quote_signature = '''
351
+        <div dir="ltr">Hey !<br>
352
+                 </div>
353
+                 <div class="gmail_extra"><br>
354
+                 <div class="gmail_quote">Le 28 novembre 2017 à 10:29,
355
+                  John Doe <span
356
+                 dir="ltr">&lt;<a href="mailto:bidule@localhost.fr"
357
+                 target="_blank">bidule@localhost.fr</a>&gt;</span>
358
+                 a écrit :<br>
359
+                 <blockquote class="gmail_quote" style="margin:0 0 0
360
+                 .8ex;border-left:1px #ccc solid;padding-left:1ex">Voici ma
361
+                 réponse<br>
362
+                 <br>
363
+                 <br>
364
+                  Le 28/11/2017 à 10:05, Foo Bar a écrit&nbsp;:<br>
365
+                  <blockquote class="gmail_quote" style="margin:0 0 0
366
+                  .8ex;border-left:1px #ccc solid;padding-left:1ex">
367
+                  Voici le texte<span class="HOEnZb"><font color="#888888"><br>
368
+                  </font></span></blockquote>
369
+                  <span class="HOEnZb"><font color="#888888">
370
+                  <br>
371
+                  -- <br>
372
+                  TEST DE signature<br>
373
+                  </font></span></blockquote>
374
+                  </div>
375
+                  <br>
376
+                  <br clear="all">
377
+                  <br>
378
+                  -- <br>
379
+                  <div class="gmail_signature" data-smartmail="gmail_signature">
380
+                  <div dir="ltr">Voici Ma signature. En HTML <br>
381
+                  <ol>
382
+                  <li>Plop</li>
383
+                  <li>Plip</li>
384
+                  <li>Plop<br>
385
+                  </li>
386
+                  </ol>
387
+                  </div>
388
+                  </div>
389
+                  </div>
390
+                 '''
391
+
392
+        # INFO - G.M - 2017-11-28 -
393
+        # Currently, a Quote + Signature block in Gmail is treated as a single
394
+        # Quote block.
395
+        mail = ParsedHTMLMail(text_quote_signature)
396
+        elements = mail.get_elements()
397
+        assert len(elements) == 2
398
+        assert elements[0].part_type == BodyMailPartType.Main
399
+        assert elements[1].part_type == BodyMailPartType.Quote
400
+
401
+    def test_other__check_gmail_mail_text_quote_text_signature(self):
402
+        text_quote_text_sign = '''
403
+        <div dir="ltr">Test<br>
404
+        <div class="gmail_extra"><br>
405
+        <div class="gmail_quote">Le 28 novembre 2017 à 10:29, John Doe <span
406
+        dir="ltr">&lt;<a href="mailto:bidule@localhost.fr"
407
+        target="_blank">bidule@localhost.fr</a>&gt;</span>
408
+        a écrit :<br>
409
+        <blockquote class="gmail_quote" style="margin:0 0 0
410
+        .8ex;border-left:1px #ccc solid;padding-left:1ex">Voici ma
411
+        réponse<br>
412
+        <br>
413
+        <br>
414
+        Le 28/11/2017 à 10:05, Foo Bar a écrit&nbsp;:<br>
415
+        <blockquote class="gmail_quote" style="margin:0 0 0
416
+        .8ex;border-left:1px #ccc solid;padding-left:1ex">
417
+        Voici le texte<span class="HOEnZb"><font color="#888888"><br>
418
+        </font></span></blockquote>
419
+        <span class="HOEnZb"><font color="#888888">
420
+        <br>
421
+        -- <br>
422
+        TEST DE signature<br>
423
+        </font></span></blockquote>
424
+        </div>
425
+        <br>
426
+        <br>
427
+        </div>
428
+        <div class="gmail_extra">RE test<br clear="all">
429
+        </div>
430
+        <div class="gmail_extra"><br>
431
+        -- <br>
432
+        <div class="gmail_signature" data-smartmail="gmail_signature">
433
+        <div dir="ltr">Voici Ma signature. En HTML <br>
434
+        <ol>
435
+        <li>Plop</li>
436
+        <li>Plip</li>
437
+        <li>Plop<br>
438
+        </li>
439
+        </ol>
440
+        </div>
441
+        </div>
442
+        </div>
443
+        </div>
444
+        '''
445
+
446
+        mail = ParsedHTMLMail(text_quote_text_sign)
447
+        elements = mail.get_elements()
448
+        assert len(elements) == 4
449
+        assert elements[0].part_type == BodyMailPartType.Main
450
+        assert elements[1].part_type == BodyMailPartType.Quote
451
+        assert elements[2].part_type == BodyMailPartType.Main
452
+        assert elements[3].part_type == BodyMailPartType.Signature
453
+
454
+    def test_other__check_thunderbird_mail_text_only(self):
455
+
456
+        text_only = '''Coucou<br><br><br>'''
457
+        mail = ParsedHTMLMail(text_only)
458
+        elements = mail.get_elements()
459
+        assert len(elements) == 1
460
+        assert elements[0].part_type == BodyMailPartType.Main
461
+
462
+    def test_other__check_thunderbird_mail_text_signature(self):
463
+        text_and_signature = '''
464
+        <p>Test<br>
465
+        </p>
466
+        <div class="moz-signature">-- <br>
467
+          TEST DE signature</div>
468
+        '''
469
+        mail = ParsedHTMLMail(text_and_signature)
470
+        elements = mail.get_elements()
471
+        assert len(elements) == 2
472
+        assert elements[0].part_type == BodyMailPartType.Main
473
+        assert elements[1].part_type == BodyMailPartType.Signature
474
+
475
+    def test_other__check_thunderbird_mail_text_quote(self):
476
+        text_and_quote = '''
477
+            <p>Pof<br>
478
+            </p>
479
+            <br>
480
+            <div class="moz-cite-prefix">Le 28/11/2017 à 11:21, John Doe a
481
+              écrit&nbsp;:<br>
482
+            </div>
483
+            <blockquote type="cite"
484
+              cite="mid:658592c1-14de-2958-5187-3571edea0aac@localhost.fr">
485
+              <meta http-equiv="Context-Type" 
486
+              content="text/html; charset=utf-8">
487
+              <p>Test<br>
488
+              </p>
489
+              <div class="moz-signature">-- <br>
490
+                TEST DE signature</div>
491
+            </blockquote>
492
+            <br>
493
+        '''
494
+        mail = ParsedHTMLMail(text_and_quote)
495
+        elements = mail.get_elements()
496
+        assert len(elements) == 2
497
+        assert elements[0].part_type == BodyMailPartType.Main
498
+        assert elements[1].part_type == BodyMailPartType.Quote
499
+
500
+    def test_other__check_thunderbird_mail_text_quote_text(self):
501
+        text_quote_text = '''
502
+        <p>Pof<br>
503
+        </p>
504
+        <br>
505
+        <div class="moz-cite-prefix">Le 28/11/2017 à 11:54, 
506
+         Bidule a
507
+          écrit&nbsp;:<br>
508
+        </div>
509
+        <blockquote type="cite"
510
+          cite="mid:b541b451-bb31-77a4-45b9-ad89969d7962@localhost.fr">
511
+          <meta http-equiv="Context-Type" 
512
+          content="text/html; charset=utf-8">
513
+          <p>Pof<br>
514
+          </p>
515
+          <br>
516
+          <div class="moz-cite-prefix">Le 28/11/2017 à 11:21, John Doe a
517
+            écrit&nbsp;:<br>
518
+          </div>
519
+          <blockquote type="cite"
520
+            cite="mid:658592c1-14de-2958-5187-3571edea0aac@localhost.fr">
521
+            <p>Test<br>
522
+            </p>
523
+            <div class="moz-signature">-- <br>
524
+              TEST DE signature</div>
525
+          </blockquote>
526
+          <br>
527
+        </blockquote>
528
+        Pif<br>
529
+        '''
530
+
531
+        mail = ParsedHTMLMail(text_quote_text)
532
+        elements = mail.get_elements()
533
+        assert len(elements) == 3
534
+        assert elements[0].part_type == BodyMailPartType.Main
535
+        assert elements[1].part_type == BodyMailPartType.Quote
536
+        assert elements[2].part_type == BodyMailPartType.Main
537
+
538
+    def test_other__check_thunderbird_mail_text_quote_signature(self):
539
+        text_quote_signature = '''
540
+        <p>Coucou<br>
541
+        </p>
542
+        <br>
543
+        <div class="moz-cite-prefix">Le 28/11/2017 à 11:22, Bidule a
544
+        écrit&nbsp;:<br>
545
+        </div>
546
+        <blockquote type="cite"
547
+        cite="mid:4e6923e2-796d-eccf-84b7-6824da4151ee@localhost.fr">Réponse<br>
548
+        <br>
549
+        Le 28/11/2017 à 11:21, John Doe a écrit&nbsp;: <br>
550
+        <blockquote type="cite"> <br>
551
+        Test <br>
552
+        <br>
553
+        --&nbsp;<br>
554
+        TEST DE signature <br>
555
+        </blockquote>
556
+        <br>
557
+        </blockquote>
558
+        <br>
559
+        <div class="moz-signature">-- <br>
560
+        TEST DE signature</div>
561
+        '''
562
+
563
+        mail = ParsedHTMLMail(text_quote_signature)
564
+        elements = mail.get_elements()
565
+        assert len(elements) == 3
566
+        assert elements[0].part_type == BodyMailPartType.Main
567
+        assert elements[1].part_type == BodyMailPartType.Quote
568
+        assert elements[2].part_type == BodyMailPartType.Signature
569
+
570
+    def test_other__check_thunderbird_mail_text_quote_text_signature(self):
571
+        text_quote_text_sign = '''
572
+        <p>Avant<br>
573
+        </p>
574
+        <br>
575
+        <div class="moz-cite-prefix">Le 28/11/2017 à 11:19, Bidule a
576
+          écrit&nbsp;:<br>
577
+        </div>
578
+        <blockquote type="cite"
579
+          cite="mid:635df73c-d3c9-f2e9-2304-24ff536bfa16@localhost.fr">Coucou 
580
+          <br><br>
581
+        </blockquote>
582
+        Aprés<br>
583
+        <br>
584
+        <div class="moz-signature">-- <br>
585
+          TEST DE signature</div>
586
+        '''
587
+
588
+        mail = ParsedHTMLMail(text_quote_text_sign)
589
+        elements = mail.get_elements()
590
+        assert len(elements) == 4
591
+        assert elements[0].part_type == BodyMailPartType.Main
592
+        assert elements[1].part_type == BodyMailPartType.Quote
593
+        assert elements[2].part_type == BodyMailPartType.Main
594
+        assert elements[3].part_type == BodyMailPartType.Signature
595
+
596
+    # INFO - G.M - 2017-11-28 - Test for outlook.com webapp html mail
597
+    # outlook.com UI doesn't seem to allow complex replies: the new message
598
+    # and signature always come before the quoted one.
599
+
600
+    def test_other__check_outlook_com_mail_text_only(self):
601
+
602
+        text_only = '''
603
+        <div id="divtagdefaultwrapper"
604
+        style="font-size:12pt;color:#000000;
605
+        font-family:Calibri,Helvetica,sans-serif;"
606
+        dir="ltr">
607
+        <p style="margin-top:0;margin-bottom:0">message<br>
608
+        </p>
609
+        </div>
610
+        '''
611
+        mail = ParsedHTMLMail(text_only)
612
+        elements = mail.get_elements()
613
+        assert len(elements) == 1
614
+        assert elements[0].part_type == BodyMailPartType.Main
615
+
616
+    def test_other__check_outlook_com_mail_text_signature(self):
617
+        text_and_signature = '''
618
+        <div id="divtagdefaultwrapper"
619
+        style="font-size:12pt;color:#000000;
620
+        font-family:Calibri,Helvetica,sans-serif;"
621
+          dir="ltr">
622
+          <p style="margin-top:0;margin-bottom:0">Test<br>
623
+          </p>
624
+          <p style="margin-top:0;margin-bottom:0"><br>
625
+          </p>
626
+          <div id="Signature">
627
+            <div id="divtagdefaultwrapper" style="font-size: 12pt; color:
628
+              rgb(0, 0, 0); background-color: rgb(255, 255, 255);
629
+              font-family:
630
+              Calibri,Arial,Helvetica,sans-serif,&quot;EmojiFont&quot;,&quot;Apple
631
+              Color Emoji&quot;,&quot;Segoe UI
632
+              Emoji&quot;,NotoColorEmoji,&quot;Segoe UI
633
+              Symbol&quot;,&quot;Android Emoji&quot;,EmojiSymbols;">
634
+              Envoyé à partir de <a href="http://aka.ms/weboutlook"
635
+                id="LPNoLP">Outlook</a></div>
636
+          </div>
637
+        </div>
638
+        '''
639
+        mail = ParsedHTMLMail(text_and_signature)
640
+        elements = mail.get_elements()
641
+        assert len(elements) == 2
642
+        assert elements[0].part_type == BodyMailPartType.Main
643
+        assert elements[1].part_type == BodyMailPartType.Signature
644
+
645
+    def test_other__check_outlook_com_mail_text_quote(self):
646
+        text_and_quote = '''
647
+        <div id="divtagdefaultwrapper"
648
+        style="font-size:12pt;color:#000000;font-family:Calibri,Helvetica,sans-serif;"
649
+        dir="ltr">
650
+        <p style="margin-top:0;margin-bottom:0">Salut !<br>
651
+        </p>
652
+        </div>
653
+        <hr style="display:inline-block;width:98%" tabindex="-1">
654
+        <div id="divRplyFwdMsg" dir="ltr"><font style="font-size:11pt"
655
+        color="#000000" face="Calibri, sans-serif"><b>De :</b> John Doe<br>
656
+        <b>Envoyé :</b> mardi 28 novembre 2017 12:44:59<br>
657
+        <b>À :</b> dev.bidule@localhost.fr<br>
658
+        <b>Objet :</b> voila</font>
659
+        <div>&nbsp;</div>
660
+        </div>
661
+        <style type="text/css" style="display:none">
662
+        <!--
663
+        p
664
+        &#x09;{margin-top:0;
665
+        &#x09;margin-bottom:0}
666
+        -->
667
+        </style>
668
+        <div dir="ltr">
669
+        <div id="x_divtagdefaultwrapper" dir="ltr" style="font-size:12pt;
670
+        color:#000000; font-family:Calibri,Helvetica,sans-serif">
671
+        Contenu
672
+        <p style="margin-top:0; margin-bottom:0"><br>
673
+        </p>
674
+        <div id="x_Signature">
675
+          <div id="x_divtagdefaultwrapper" dir="ltr"
676
+            style="font-size:12pt; color:rgb(0,0,0);
677
+            background-color:rgb(255,255,255);
678
+        font-family:Calibri,Arial,Helvetica,sans-serif,&quot;EmojiFont&quot;,&quot;Apple
679
+            Color Emoji&quot;,&quot;Segoe UI
680
+            Emoji&quot;,NotoColorEmoji,&quot;Segoe UI
681
+            Symbol&quot;,&quot;Android Emoji&quot;,EmojiSymbols">
682
+            DLMQDNLQNDMLQS<br>
683
+            qs<br>
684
+            dqsd<br>
685
+            d<br>
686
+            qsd<br>
687
+          </div>
688
+        </div>
689
+        </div>
690
+        </div>
691
+        '''
692
+        mail = ParsedHTMLMail(text_and_quote)
693
+        elements = mail.get_elements()
694
+        assert len(elements) == 2
695
+        assert elements[0].part_type == BodyMailPartType.Main
696
+        assert elements[1].part_type == BodyMailPartType.Quote
697
+
698
+    def test_other__check_outlook_com_mail_text_signature_quote(self):
699
+        text_signature_quote = '''
700
+        <div id="divtagdefaultwrapper"
701
+        style="font-size:12pt;color:#000000;font-family:Calibri,Helvetica,sans-serif;"
702
+        dir="ltr">
703
+        <p style="margin-top:0;margin-bottom:0">Salut !<br>
704
+        </p>
705
+        <p style="margin-top:0;margin-bottom:0"><br>
706
+        </p>
707
+        <div id="Signature">
708
+        <div id="divtagdefaultwrapper" dir="ltr" style="font-size: 12pt;
709
+        color: rgb(0, 0, 0); background-color: rgb(255, 255, 255);
710
+        font-family:
711
+        Calibri,Arial,Helvetica,sans-serif,&quot;EmojiFont&quot;,&quot;Apple
712
+        Color Emoji&quot;,&quot;Segoe UI
713
+        Emoji&quot;,NotoColorEmoji,&quot;Segoe UI
714
+        Symbol&quot;,&quot;Android Emoji&quot;,EmojiSymbols;">
715
+        Envoyée depuis Outlook<br>
716
+        </div>
717
+        </div>
718
+        </div>
719
+        <hr style="display:inline-block;width:98%" tabindex="-1">
720
+        <div id="divRplyFwdMsg" dir="ltr"><font style="font-size:11pt"
721
+        color="#000000" face="Calibri, sans-serif"><b>De :</b> John Doe
722
+        &lt;dev.bidule@localhost.fr&gt;<br>
723
+        <b>Envoyé :</b> mardi 28 novembre 2017 12:51:42<br>
724
+        <b>À :</b> John Doe<br>
725
+        <b>Objet :</b> Re: Test</font>
726
+        <div>&nbsp;</div>
727
+        </div>
728
+        <div style="background-color:#FFFFFF">
729
+        <p>Coucou<br>
730
+        </p>
731
+        <br>
732
+        <div class="x_moz-cite-prefix">Le 28/11/2017 à 12:39, John Doe a
733
+        écrit&nbsp;:<br>
734
+        </div>
735
+        <blockquote type="cite">
736
+        <div id="x_divtagdefaultwrapper" dir="ltr">
737
+        <p>Test<br>
738
+        </p>
739
+        <p><br>
740
+        </p>
741
+        <div id="x_Signature">
742
+        <div id="x_divtagdefaultwrapper">Envoyé à partir de <a
743
+        href="http://aka.ms/weboutlook" id="LPNoLP">
744
+        Outlook</a></div>
745
+        </div>
746
+        </div>
747
+        </blockquote>
748
+        <br>
749
+        <div class="x_moz-signature">-- <br>
750
+        TEST DE signature</div>
751
+        </div>
752
+        '''
753
+
754
+        mail = ParsedHTMLMail(text_signature_quote)
755
+        elements = mail.get_elements()
756
+        assert len(elements) == 3
757
+        assert elements[0].part_type == BodyMailPartType.Main
758
+        assert elements[1].part_type == BodyMailPartType.Signature
759
+        assert elements[2].part_type == BodyMailPartType.Quote