|
@@ -8,14 +8,13 @@ from tracim.lib.email_processing.checkers import HtmlMailQuoteChecker
|
8
|
8
|
from tracim.lib.email_processing.checkers import HtmlMailSignatureChecker
|
9
|
9
|
from tracim.lib.email_processing.models import BodyMailPartType
|
10
|
10
|
from tracim.lib.email_processing.models import BodyMailPart
|
11
|
|
-from tracim.lib.email_processing.models import BodyMailParts
|
|
11
|
+from tracim.lib.email_processing.models import HtmlBodyMailParts
|
12
|
12
|
|
13
|
13
|
class PreSanitizeConfig(object):
|
14
|
14
|
"""
|
15
|
|
- To avoid problems, html need to be a bit during parsing to distinct
|
|
15
|
+ To avoid problems, html need to be sanitize a bit during parsing to distinct
|
16
|
16
|
Main,Quote and Signature elements
|
17
|
17
|
"""
|
18
|
|
- Ignored_tags = ['br', 'hr', 'script', 'style']
|
19
|
18
|
meta_tag = ['body', 'div']
|
20
|
19
|
|
21
|
20
|
|
|
@@ -32,11 +31,11 @@ class ParsedHTMLMail(object):
|
32
|
31
|
def __str__(self):
|
33
|
32
|
return str(self._parse_mail())
|
34
|
33
|
|
35
|
|
- def get_elements(self) -> BodyMailParts:
|
|
34
|
+ def get_elements(self) -> HtmlBodyMailParts:
|
36
|
35
|
tree = self._get_proper_main_body_tree()
|
37
|
36
|
return self._distinct_elements(tree)
|
38
|
37
|
|
39
|
|
- def _parse_mail(self) -> BodyMailParts:
|
|
38
|
+ def _parse_mail(self) -> HtmlBodyMailParts:
|
40
|
39
|
elements = self.get_elements()
|
41
|
40
|
elements = self._process_elements(elements)
|
42
|
41
|
return elements
|
|
@@ -69,26 +68,16 @@ class ParsedHTMLMail(object):
|
69
|
68
|
return tree
|
70
|
69
|
|
71
|
70
|
@classmethod
|
72
|
|
- def _distinct_elements(cls, tree: BeautifulSoup) -> BodyMailParts:
|
73
|
|
- parts = BodyMailParts()
|
|
71
|
+ def _distinct_elements(cls, tree: BeautifulSoup) -> HtmlBodyMailParts:
|
|
72
|
+ parts = HtmlBodyMailParts()
|
74
|
73
|
for elem in list(tree):
|
75
|
74
|
part_txt = str(elem)
|
76
|
75
|
part_type = BodyMailPartType.Main
|
77
|
|
- # sanitize NavigableString
|
78
|
|
- if isinstance(elem, NavigableString):
|
79
|
|
- part_txt = part_txt.replace('\n', '').strip()
|
80
|
76
|
|
81
|
77
|
if HtmlMailQuoteChecker.is_quote(elem):
|
82
|
78
|
part_type = BodyMailPartType.Quote
|
83
|
79
|
elif HtmlMailSignatureChecker.is_signature(elem):
|
84
|
80
|
part_type = BodyMailPartType.Signature
|
85
|
|
- else:
|
86
|
|
- # INFO - G.M -2017-11-28 - ignore unwanted parts
|
87
|
|
- if not part_txt:
|
88
|
|
- continue
|
89
|
|
- if isinstance(elem, Tag) \
|
90
|
|
- and elem.name.lower() in PreSanitizeConfig.Ignored_tags:
|
91
|
|
- continue
|
92
|
81
|
|
93
|
82
|
part = BodyMailPart(part_txt, part_type)
|
94
|
83
|
parts.append(part)
|
|
@@ -99,7 +88,7 @@ class ParsedHTMLMail(object):
|
99
|
88
|
return parts
|
100
|
89
|
|
101
|
90
|
@classmethod
|
102
|
|
- def _process_elements(cls, elements: BodyMailParts) -> BodyMailParts:
|
|
91
|
+ def _process_elements(cls, elements: HtmlBodyMailParts) -> HtmlBodyMailParts:
|
103
|
92
|
if len(elements) >= 2:
|
104
|
93
|
# Case 1 and 2, only one main and one quote
|
105
|
94
|
if elements.get_nb_part_type('main') == 1 and \
|
|
@@ -119,19 +108,19 @@ class ParsedHTMLMail(object):
|
119
|
108
|
return elements
|
120
|
109
|
|
121
|
110
|
@classmethod
|
122
|
|
- def _process_quote_first_case(cls, elements: BodyMailParts) -> None:
|
|
111
|
+ def _process_quote_first_case(cls, elements: HtmlBodyMailParts) -> None:
|
123
|
112
|
elements.drop_part_type(BodyMailPartType.Signature)
|
124
|
113
|
|
125
|
114
|
@classmethod
|
126
|
|
- def _process_main_first_case(cls, elements: BodyMailParts) -> None:
|
|
115
|
+ def _process_main_first_case(cls, elements: HtmlBodyMailParts) -> None:
|
127
|
116
|
elements.drop_part_type(BodyMailPartType.Quote)
|
128
|
117
|
elements.drop_part_type(BodyMailPartType.Signature)
|
129
|
118
|
|
130
|
119
|
@classmethod
|
131
|
|
- def _process_multiples_elems_case(cls, elements: BodyMailParts) -> None:
|
|
120
|
+ def _process_multiples_elems_case(cls, elements: HtmlBodyMailParts) -> None:
|
132
|
121
|
elements.drop_part_type(BodyMailPartType.Signature)
|
133
|
122
|
|
134
|
123
|
@classmethod
|
135
|
|
- def _process_default_case(cls, elements: BodyMailParts) -> None:
|
|
124
|
+ def _process_default_case(cls, elements: HtmlBodyMailParts) -> None:
|
136
|
125
|
elements.drop_part_type(BodyMailPartType.Quote)
|
137
|
126
|
elements.drop_part_type(BodyMailPartType.Signature)
|