|  | @@ -1,42 +1,19 @@
 | 
	
		
			
			|  | 1 | +import typing
 | 
	
		
			
			| 1 | 2 |  from bs4 import BeautifulSoup, Tag
 | 
	
		
			
			| 2 |  | -
 | 
	
		
			
			|  | 3 | +from tracim.lib.email_processing.sanitizer_config.attrs_whitelist import ATTRS_WHITELIST  # nopep8
 | 
	
		
			
			|  | 4 | +from tracim.lib.email_processing.sanitizer_config.class_blacklist import CLASS_BLACKLIST  # nopep8
 | 
	
		
			
			|  | 5 | +from tracim.lib.email_processing.sanitizer_config.id_blacklist import ID_BLACKLIST  # nopep8
 | 
	
		
			
			|  | 6 | +from tracim.lib.email_processing.sanitizer_config.tag_blacklist import TAG_BLACKLIST  # nopep8
 | 
	
		
			
			|  | 7 | +from tracim.lib.email_processing.sanitizer_config.tag_whitelist import TAG_WHITELIST  # nopep8
 | 
	
		
			
			| 3 | 8 |  
 | 
	
		
			
			| 4 | 9 |  class HtmlSanitizerConfig(object):
 | 
	
		
			
			| 5 |  | -    # some Default_html_tags type
 | 
	
		
			
			| 6 |  | -    HTML_Heading_tag = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
 | 
	
		
			
			| 7 |  | -    HTML_Text_parts_tag = ['p',
 | 
	
		
			
			| 8 |  | -                           'br', 'hr',
 | 
	
		
			
			| 9 |  | -                           'pre', 'code', 'samp',  # preformatted content
 | 
	
		
			
			| 10 |  | -                           'q', 'blockquote',  # quotes
 | 
	
		
			
			| 11 |  | -                           ]
 | 
	
		
			
			| 12 |  | -    HTML_Text_format_tag = ['b', 'i', 'u', 'small', 'sub', 'sup', ]
 | 
	
		
			
			| 13 |  | -    HTML_Text_semantic_tag = ['strong', 'em',
 | 
	
		
			
			| 14 |  | -                              'mark', 'cite', 'dfn',
 | 
	
		
			
			| 15 |  | -                              'del', 'ins', ]
 | 
	
		
			
			| 16 |  | -    HTML_Table_tag = ['table',
 | 
	
		
			
			| 17 |  | -                      'thead', 'tfoot', 'tbody',
 | 
	
		
			
			| 18 |  | -                      'tr', 'td', 'caption', ]
 | 
	
		
			
			| 19 |  | -
 | 
	
		
			
			| 20 |  | -    HTML_List_tag = ['ul', 'li', 'ol',  # simple list
 | 
	
		
			
			| 21 |  | -                     'dl', 'dt', 'dd', ]  # definition list
 | 
	
		
			
			| 22 |  | -
 | 
	
		
			
			| 23 |  | -    # Rules
 | 
	
		
			
			| 24 |  | -    Tag_whitelist = HTML_Heading_tag \
 | 
	
		
			
			| 25 |  | -                    + HTML_Text_parts_tag \
 | 
	
		
			
			| 26 |  | -                    + HTML_Text_format_tag \
 | 
	
		
			
			| 27 |  | -                    + HTML_Text_semantic_tag \
 | 
	
		
			
			| 28 |  | -                    + HTML_Table_tag \
 | 
	
		
			
			| 29 |  | -                    + HTML_List_tag
 | 
	
		
			
			| 30 |  | -
 | 
	
		
			
			| 31 |  | -    Tag_blacklist = ['script', 'style']
 | 
	
		
			
			| 32 |  | -
 | 
	
		
			
			| 33 |  | -    # TODO - G.M - 2017-12-01 - Think about removing class/id Blacklist
 | 
	
		
			
			| 34 |  | -    # These elements are no longer required.
 | 
	
		
			
			| 35 |  | -    Class_blacklist = []
 | 
	
		
			
			| 36 |  | -    Id_blacklist = []
 | 
	
		
			
			| 37 |  | -
 | 
	
		
			
			| 38 |  | -    Attrs_whitelist = ['href']
 | 
	
		
			
			| 39 |  | -
 | 
	
		
			
			|  | 10 | +    # whitelist : keep tag and content
 | 
	
		
			
			|  | 11 | +    Tag_whitelist = TAG_WHITELIST
 | 
	
		
			
			|  | 12 | +    Attrs_whitelist = ATTRS_WHITELIST
 | 
	
		
			
			|  | 13 | +    # blacklist : remove content
 | 
	
		
			
			|  | 14 | +    Tag_blacklist = TAG_BLACKLIST
 | 
	
		
			
			|  | 15 | +    Class_blacklist = CLASS_BLACKLIST
 | 
	
		
			
			|  | 16 | +    Id_blacklist = ID_BLACKLIST
 | 
	
		
			
			| 40 | 17 |  
 | 
	
		
			
			| 41 | 18 |  class HtmlSanitizer(object):
 | 
	
		
			
			| 42 | 19 |      """
 | 
	
	
		
			
			|  | @@ -50,7 +27,7 @@ class HtmlSanitizer(object):
 | 
	
		
			
			| 50 | 27 |      """
 | 
	
		
			
			| 51 | 28 |  
 | 
	
		
			
			| 52 | 29 |      @classmethod
 | 
	
		
			
			| 53 |  | -    def sanitize(cls, html_body: str) -> str:
 | 
	
		
			
			|  | 30 | +    def sanitize(cls, html_body: str) -> typing.Optional[str]:
 | 
	
		
			
			| 54 | 31 |          soup = BeautifulSoup(html_body, 'html.parser')
 | 
	
		
			
			| 55 | 32 |          for tag in soup.findAll():
 | 
	
		
			
			| 56 | 33 |              if cls._tag_to_extract(tag):
 | 
	
	
		
			
			|  | @@ -62,7 +39,17 @@ class HtmlSanitizer(object):
 | 
	
		
			
			| 62 | 39 |                          del tag.attrs[attr]
 | 
	
		
			
			| 63 | 40 |              else:
 | 
	
		
			
			| 64 | 41 |                  tag.unwrap()
 | 
	
		
			
			| 65 |  | -        return str(soup)
 | 
	
		
			
			|  | 42 | +
 | 
	
		
			
			|  | 43 | +        if cls._is_content_empty(soup):
 | 
	
		
			
			|  | 44 | +            return None
 | 
	
		
			
			|  | 45 | +        else:
 | 
	
		
			
			|  | 46 | +            return str(soup)
 | 
	
		
			
			|  | 47 | +
 | 
	
		
			
			|  | 48 | +    @classmethod
 | 
	
		
			
			|  | 49 | +    def _is_content_empty(cls, soup):
 | 
	
		
			
			|  | 50 | +        img = soup.find('img')
 | 
	
		
			
			|  | 51 | +        txt = soup.get_text().replace('\n', '').strip()
 | 
	
		
			
			|  | 52 | +        return (not img and not txt)
 | 
	
		
			
			| 66 | 53 |  
 | 
	
		
			
			| 67 | 54 |      @classmethod
 | 
	
		
			
			| 68 | 55 |      def _tag_to_extract(cls, tag: Tag) -> bool:
 |