Browse Source

Fix HTML mail parsing: unwrap all unextracted tags

Guénaël Muller 7 years ago
parent
commit
02d13bd955
1 changed file with 17 additions and 9 deletions
  1. 17 9
      tracim/tracim/lib/email_fetcher.py

+ 17 - 9
tracim/tracim/lib/email_fetcher.py View File

@@ -88,16 +88,8 @@ class DecodedMail(object):
88 88
         soup = BeautifulSoup(html_body)
89 89
         config = BS_HTML_BODY_PARSE_CONFIG
90 90
         for tag in soup.findAll():
91
-            if tag.name.lower() in config['tag_blacklist']:
91
+            if DecodedMail._tag_to_extract(tag):
92 92
                 tag.extract()
93
-            elif 'class' in tag.attrs:
94
-                for elem in config['class_blacklist']:
95
-                    if elem in tag.attrs['class']:
96
-                        tag.extract()
97
-            elif 'id' in tag.attrs:
98
-                for elem in config['id_blacklist']:
99
-                    if elem in tag.attrs['id']:
100
-                        tag.extract()
101 93
             elif tag.name.lower() in config['tag_whitelist']:
102 94
                 attrs = dict(tag.attrs)
103 95
                 for attr in attrs:
@@ -107,6 +99,22 @@ class DecodedMail(object):
107 99
                 tag.unwrap()
108 100
         return str(soup)
109 101
 
102
+    @staticmethod
103
+    def _tag_to_extract(tag) -> bool:
104
+        config = BS_HTML_BODY_PARSE_CONFIG
105
+        if tag.name.lower() in config['tag_blacklist']:
106
+            return True
107
+        if 'class' in tag.attrs:
108
+            for elem in config['class_blacklist']:
109
+                if elem in tag.attrs['class']:
110
+                    return True
111
+        if 'id' in tag.attrs:
112
+            for elem in config['id_blacklist']:
113
+                if elem in tag.attrs['id']:
114
+                    return True
115
+        return False
116
+
117
+
110 118
     def _get_mime_body_message(self) -> typing.Optional[Message]:
111 119
         # FIXME - G.M - 2017-11-16 - Use stdlib msg.get_body feature for py3.6+
112 120
         # FIXME - G.M - 2017-11-16 - Check support for non-multipart mail