|
@@ -4,6 +4,7 @@ from bs4 import BeautifulSoup
|
4
|
4
|
from bs4 import Tag
|
5
|
5
|
from bs4 import NavigableString
|
6
|
6
|
|
|
7
|
+# BodyParts and Body Parts Objects #
|
7
|
8
|
|
8
|
9
|
class BodyMailPartType(object):
|
9
|
10
|
Signature = 'sign'
|
|
@@ -101,8 +102,13 @@ class BodyMailParts(object):
|
101
|
102
|
class SignatureIndexError(Exception):
|
102
|
103
|
pass
|
103
|
104
|
|
|
105
|
+# Elements Checkers #
|
104
|
106
|
|
105
|
|
-class ProprietaryHTMLProperties(object):
|
|
107
|
+class ProprietaryHTMLAttrValues(object):
|
|
108
|
+ """
|
|
109
|
+ These are all the proprietary (mail-client-specific) HTML attribute values we need to
|
|
110
|
+ check Html Elements
|
|
111
|
+ """
|
106
|
112
|
# Gmail
|
107
|
113
|
Gmail_extras_class = 'gmail_extra'
|
108
|
114
|
Gmail_quote_class = 'gmail_quote'
|
|
@@ -122,7 +128,6 @@ class ProprietaryHTMLProperties(object):
|
122
|
128
|
Roundcube_quote_prefix_class = 'reply-intro'
|
123
|
129
|
|
124
|
130
|
|
125
|
|
-
|
126
|
131
|
class HtmlChecker(object):
|
127
|
132
|
|
128
|
133
|
@classmethod
|
|
@@ -171,7 +176,7 @@ class HtmlMailQuoteChecker(HtmlChecker):
|
171
|
176
|
return cls._has_attr_value(
|
172
|
177
|
elem,
|
173
|
178
|
'class',
|
174
|
|
- ProprietaryHTMLProperties.Thunderbird_quote_prefix_class)
|
|
179
|
+ ProprietaryHTMLAttrValues.Thunderbird_quote_prefix_class)
|
175
|
180
|
|
176
|
181
|
@classmethod
|
177
|
182
|
def _is_gmail_quote(
|
|
@@ -181,12 +186,12 @@ class HtmlMailQuoteChecker(HtmlChecker):
|
181
|
186
|
if cls._has_attr_value(
|
182
|
187
|
elem,
|
183
|
188
|
'class',
|
184
|
|
- ProprietaryHTMLProperties.Gmail_extras_class):
|
|
189
|
+ ProprietaryHTMLAttrValues.Gmail_extras_class):
|
185
|
190
|
for child in elem.children:
|
186
|
191
|
if cls._has_attr_value(
|
187
|
192
|
child,
|
188
|
193
|
'class',
|
189
|
|
- ProprietaryHTMLProperties.Gmail_quote_class):
|
|
194
|
+ ProprietaryHTMLAttrValues.Gmail_quote_class):
|
190
|
195
|
return True
|
191
|
196
|
return False
|
192
|
197
|
|
|
@@ -198,7 +203,7 @@ class HtmlMailQuoteChecker(HtmlChecker):
|
198
|
203
|
if cls._has_attr_value(
|
199
|
204
|
elem,
|
200
|
205
|
'id',
|
201
|
|
- ProprietaryHTMLProperties.Outlook_com_quote_id):
|
|
206
|
+ ProprietaryHTMLAttrValues.Outlook_com_quote_id):
|
202
|
207
|
return True
|
203
|
208
|
return False
|
204
|
209
|
|
|
@@ -210,7 +215,7 @@ class HtmlMailQuoteChecker(HtmlChecker):
|
210
|
215
|
return cls._has_attr_value(
|
211
|
216
|
elem,
|
212
|
217
|
'class',
|
213
|
|
- ProprietaryHTMLProperties.Yahoo_quote_class)
|
|
218
|
+ ProprietaryHTMLAttrValues.Yahoo_quote_class)
|
214
|
219
|
|
215
|
220
|
@classmethod
|
216
|
221
|
def _is_roundcube_quote(
|
|
@@ -220,7 +225,7 @@ class HtmlMailQuoteChecker(HtmlChecker):
|
220
|
225
|
return cls._has_attr_value(
|
221
|
226
|
elem,
|
222
|
227
|
'id',
|
223
|
|
- ProprietaryHTMLProperties.Roundcube_quote_prefix_class)
|
|
228
|
+ ProprietaryHTMLAttrValues.Roundcube_quote_prefix_class)
|
224
|
229
|
|
225
|
230
|
|
226
|
231
|
class HtmlMailSignatureChecker(HtmlChecker):
|
|
@@ -242,7 +247,7 @@ class HtmlMailSignatureChecker(HtmlChecker):
|
242
|
247
|
return cls._has_attr_value(
|
243
|
248
|
elem,
|
244
|
249
|
'class',
|
245
|
|
- ProprietaryHTMLProperties.Thunderbird_signature_class)
|
|
250
|
+ ProprietaryHTMLAttrValues.Thunderbird_signature_class)
|
246
|
251
|
|
247
|
252
|
@classmethod
|
248
|
253
|
def _is_gmail_signature(
|
|
@@ -252,24 +257,24 @@ class HtmlMailSignatureChecker(HtmlChecker):
|
252
|
257
|
if cls._has_attr_value(
|
253
|
258
|
elem,
|
254
|
259
|
'class',
|
255
|
|
- ProprietaryHTMLProperties.Gmail_signature_class):
|
|
260
|
+ ProprietaryHTMLAttrValues.Gmail_signature_class):
|
256
|
261
|
return True
|
257
|
262
|
if cls._has_attr_value(
|
258
|
263
|
elem,
|
259
|
264
|
'class',
|
260
|
|
- ProprietaryHTMLProperties.Gmail_extras_class):
|
|
265
|
+ ProprietaryHTMLAttrValues.Gmail_extras_class):
|
261
|
266
|
for child in elem.children:
|
262
|
267
|
if cls._has_attr_value(
|
263
|
268
|
child,
|
264
|
269
|
'class',
|
265
|
|
- ProprietaryHTMLProperties.Gmail_signature_class):
|
|
270
|
+ ProprietaryHTMLAttrValues.Gmail_signature_class):
|
266
|
271
|
return True
|
267
|
272
|
if isinstance(elem, Tag) and elem.name.lower() == 'div':
|
268
|
273
|
for child in elem.children:
|
269
|
274
|
if cls._has_attr_value(
|
270
|
275
|
child,
|
271
|
276
|
'class',
|
272
|
|
- ProprietaryHTMLProperties.Gmail_signature_class):
|
|
277
|
+ ProprietaryHTMLAttrValues.Gmail_signature_class):
|
273
|
278
|
return True
|
274
|
279
|
return False
|
275
|
280
|
|
|
@@ -281,20 +286,27 @@ class HtmlMailSignatureChecker(HtmlChecker):
|
281
|
286
|
if cls._has_attr_value(
|
282
|
287
|
elem,
|
283
|
288
|
'id',
|
284
|
|
- ProprietaryHTMLProperties.Outlook_com_signature_id):
|
|
289
|
+ ProprietaryHTMLAttrValues.Outlook_com_signature_id):
|
285
|
290
|
return True
|
286
|
291
|
return False
|
287
|
292
|
|
|
293
|
+# ParsedHTMLMail #
|
|
294
|
+
|
288
|
295
|
|
289
|
296
|
class PreSanitizeConfig(object):
|
|
297
|
+ """
|
|
298
|
+ To avoid problems, the HTML needs to be sanitized a bit during parsing to distinguish
|
|
299
|
+ Main, Quote and Signature elements
|
|
300
|
+ """
|
290
|
301
|
Ignored_tags = ['br', 'hr', 'script', 'style']
|
291
|
|
- meta_tag = ['body','div']
|
|
302
|
+ meta_tag = ['body', 'div']
|
|
303
|
+
|
292
|
304
|
|
293
|
305
|
class ParsedHTMLMail(object):
|
294
|
306
|
"""
|
295
|
307
|
Parse HTML Mail depending of some rules.
|
296
|
308
|
Distinct part of html mail body using BodyMailParts object and
|
297
|
|
- process different rules.
|
|
309
|
+ process different rules using HtmlChecker(s)
|
298
|
310
|
"""
|
299
|
311
|
|
300
|
312
|
def __init__(self, html_body: str):
|
|
@@ -334,7 +346,7 @@ class ParsedHTMLMail(object):
|
334
|
346
|
# if Text -> Signature -> Quote Mail
|
335
|
347
|
# Text and signature are wrapped into divtagdefaultwrapper
|
336
|
348
|
if tag.attrs.get('id'):
|
337
|
|
- if ProprietaryHTMLProperties.Outlook_com_wrapper_id\
|
|
349
|
+ if ProprietaryHTMLAttrValues.Outlook_com_wrapper_id\
|
338
|
350
|
in tag.attrs['id']:
|
339
|
351
|
tag.unwrap()
|
340
|
352
|
return tree
|