|
@@ -88,16 +88,8 @@ class DecodedMail(object):
|
88
|
88
|
soup = BeautifulSoup(html_body)
|
89
|
89
|
config = BS_HTML_BODY_PARSE_CONFIG
|
90
|
90
|
for tag in soup.findAll():
|
91
|
|
- if tag.name.lower() in config['tag_blacklist']:
|
|
91
|
+ if DecodedMail._tag_to_extract(tag):
|
92
|
92
|
tag.extract()
|
93
|
|
- elif 'class' in tag.attrs:
|
94
|
|
- for elem in config['class_blacklist']:
|
95
|
|
- if elem in tag.attrs['class']:
|
96
|
|
- tag.extract()
|
97
|
|
- elif 'id' in tag.attrs:
|
98
|
|
- for elem in config['id_blacklist']:
|
99
|
|
- if elem in tag.attrs['id']:
|
100
|
|
- tag.extract()
|
101
|
93
|
elif tag.name.lower() in config['tag_whitelist']:
|
102
|
94
|
attrs = dict(tag.attrs)
|
103
|
95
|
for attr in attrs:
|
|
@@ -107,6 +99,22 @@ class DecodedMail(object):
|
107
|
99
|
tag.unwrap()
|
108
|
100
|
return str(soup)
|
109
|
101
|
|
|
102
|
@staticmethod
def _tag_to_extract(tag) -> bool:
    """
    Tell whether an HTML tag must be removed from the parsed body.

    The decision is driven by BS_HTML_BODY_PARSE_CONFIG. A tag is to be
    extracted when one of the following holds:

    - its lowercased name is listed in ``tag_blacklist``;
    - it has a ``class`` attribute and an entry of ``class_blacklist``
      is found ``in`` that attribute value;
    - it has an ``id`` attribute and an entry of ``id_blacklist`` is
      found ``in`` that attribute value.

    NOTE(review): the match relies on Python's ``in`` operator, so its
    meaning depends on the attribute value type. Presumably ``class`` is
    a list (exact item match) and ``id`` a plain string (substring
    match) - confirm against the BeautifulSoup version in use.

    :param tag: parsed tag exposing ``name`` and ``attrs``
    :return: True if the tag should be extracted, False otherwise
    """
    rules = BS_HTML_BODY_PARSE_CONFIG

    # Name-based blacklist takes precedence over attribute checks.
    if tag.name.lower() in rules['tag_blacklist']:
        return True

    # Attribute-based blacklists: 'class' is inspected before 'id', and
    # a blacklist is only read when the tag carries the attribute.
    attribute_rules = (
        ('class', 'class_blacklist'),
        ('id', 'id_blacklist'),
    )
    for attr_name, blacklist_key in attribute_rules:
        if attr_name not in tag.attrs:
            continue
        attr_value = tag.attrs[attr_name]
        if any(entry in attr_value for entry in rules[blacklist_key]):
            return True

    return False
|
|
116
|
+
|
|
117
|
+
|
110
|
118
|
def _get_mime_body_message(self) -> typing.Optional[Message]:
|
111
|
119
|
# FIXME - G.M - 2017-11-16 - Use stdlib msg.get_body feature for py3.6+
|
112
|
120
|
# FIXME - G.M - 2017-11-16 - Check support for non-multipart mail
|