|  | @@ -1,30 +1,64 @@
 | 
	
		
			
			| 1 | 1 |  from bs4 import BeautifulSoup, Tag
 | 
	
		
			
			| 2 | 2 |  
 | 
	
		
			
			| 3 |  | -# TODO BS 20171124: Think about replace thin dict config by object
 | 
	
		
			
			| 4 |  | -BEAUTIFULSOUP_HTML_BODY_SANITIZE_CONFIG = {
 | 
	
		
			
			| 5 |  | -    'tag_blacklist': ['script', 'style'],
 | 
	
		
			
			| 6 |  | -    'class_blacklist': [],
 | 
	
		
			
			| 7 |  | -    'id_blacklist': ['reply-intro'],
 | 
	
		
			
			| 8 |  | -    'tag_whitelist': ['a', 'b', 'strong', 'i', 'br', 'ul', 'li', 'ol',
 | 
	
		
			
			| 9 |  | -                      'em', 'i', 'u', 'blockquote', 'h1', 'h2', 'h3', 'h4',
 | 
	
		
			
			| 10 |  | -                      'thead', 'tr', 'td', 'tbody', 'table', 'p', 'pre'],
 | 
	
		
			
			| 11 |  | -    'attrs_whitelist': ['href'],
 | 
	
		
			
			| 12 |  | -}
 | 
	
		
			
			|  | 3 | +
 | 
	
		
			
			|  | 4 | +class HtmlSanitizerConfig(object):
 | 
	
		
			
			|  | 5 | +    # some Default_html_tags type
 | 
	
		
			
			|  | 6 | +    HTML_Heading_tag = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
 | 
	
		
			
			|  | 7 | +    HTML_Text_parts_tag = ['p',
 | 
	
		
			
			|  | 8 | +                           'br', 'hr',
 | 
	
		
			
			|  | 9 | +                           'pre', 'code', 'samp',  # preformatted content
 | 
	
		
			
			|  | 10 | +                           'q', 'blockquote',  # quotes
 | 
	
		
			
			|  | 11 | +                           ]
 | 
	
		
			
			|  | 12 | +    HTML_Text_format_tag = ['b', 'i', 'u', 'small', 'sub', 'sup', ]
 | 
	
		
			
			|  | 13 | +    HTML_Text_semantic_tag = ['strong', 'em',
 | 
	
		
			
			|  | 14 | +                              'mark', 'cite', 'dfn',
 | 
	
		
			
			|  | 15 | +                              'del', 'ins', ]
 | 
	
		
			
			|  | 16 | +    HTML_Table_tag = ['table',
 | 
	
		
			
			|  | 17 | +                      'thead', 'tfoot', 'tbody',
 | 
	
		
			
			|  | 18 | +                      'tr', 'td', 'caption', ]
 | 
	
		
			
			|  | 19 | +
 | 
	
		
			
			|  | 20 | +    HTML_List_tag = ['ul', 'li', 'ol',  # simple list
 | 
	
		
			
			|  | 21 | +                     'dl', 'dt', 'dd', ]  # definition list
 | 
	
		
			
			|  | 22 | +
 | 
	
		
			
			|  | 23 | +    # Rules
 | 
	
		
			
			|  | 24 | +    Tag_whitelist = HTML_Heading_tag \
 | 
	
		
			
			|  | 25 | +                    + HTML_Text_parts_tag \
 | 
	
		
			
			|  | 26 | +                    + HTML_Text_format_tag \
 | 
	
		
			
			|  | 27 | +                    + HTML_Text_semantic_tag \
 | 
	
		
			
			|  | 28 | +                    + HTML_Table_tag \
 | 
	
		
			
			|  | 29 | +                    + HTML_List_tag
 | 
	
		
			
			|  | 30 | +
 | 
	
		
			
			|  | 31 | +    Tag_blacklist = ['script', 'style']
 | 
	
		
			
			|  | 32 | +
 | 
	
		
			
			|  | 33 | +    # TODO - G.M - 2017-12-01 - Think about removing class/id Blacklist
 | 
	
		
			
			|  | 34 | +    # These elements are no longer required.
 | 
	
		
			
			|  | 35 | +    Class_blacklist = []
 | 
	
		
			
			|  | 36 | +    Id_blacklist = []
 | 
	
		
			
			|  | 37 | +
 | 
	
		
			
			|  | 38 | +    Attrs_whitelist = ['href']
 | 
	
		
			
			| 13 | 39 |  
 | 
	
		
			
			| 14 | 40 |  
 | 
	
		
			
			| 15 | 41 |  class HtmlSanitizer(object):
 | 
	
		
			
			|  | 42 | +    """
 | 
	
		
			
			|  | 43 | +    Sanitize Html Rules :
 | 
	
		
			
			|  | 44 | +    - Tag :
 | 
	
		
			
			|  | 45 | +      - Remove Tag_blacklist tag
 | 
	
		
			
			|  | 46 | +      - Keep Tag_whitelist tag
 | 
	
		
			
			|  | 47 | +      - Unwrap other tags
 | 
	
		
			
			|  | 48 | +    - Attrs :
 | 
	
		
			
			|  | 49 | +      - Remove non-whitelisted attributes
 | 
	
		
			
			|  | 50 | +    """
 | 
	
		
			
			| 16 | 51 |  
 | 
	
		
			
			| 17 | 52 |      @classmethod
 | 
	
		
			
			| 18 | 53 |      def sanitize(cls, html_body: str) -> str:
 | 
	
		
			
			| 19 | 54 |          soup = BeautifulSoup(html_body, 'html.parser')
 | 
	
		
			
			| 20 |  | -        config = BEAUTIFULSOUP_HTML_BODY_SANITIZE_CONFIG
 | 
	
		
			
			| 21 | 55 |          for tag in soup.findAll():
 | 
	
		
			
			| 22 | 56 |              if cls._tag_to_extract(tag):
 | 
	
		
			
			| 23 | 57 |                  tag.extract()
 | 
	
		
			
			| 24 |  | -            elif tag.name.lower() in config['tag_whitelist']:
 | 
	
		
			
			|  | 58 | +            elif tag.name.lower() in HtmlSanitizerConfig.Tag_whitelist:
 | 
	
		
			
			| 25 | 59 |                  attrs = dict(tag.attrs)
 | 
	
		
			
			| 26 | 60 |                  for attr in attrs:
 | 
	
		
			
			| 27 |  | -                    if attr not in config['attrs_whitelist']:
 | 
	
		
			
			|  | 61 | +                    if attr not in HtmlSanitizerConfig.Attrs_whitelist:
 | 
	
		
			
			| 28 | 62 |                          del tag.attrs[attr]
 | 
	
		
			
			| 29 | 63 |              else:
 | 
	
		
			
			| 30 | 64 |                  tag.unwrap()
 | 
	
	
		
			
			|  | @@ -32,15 +66,14 @@ class HtmlSanitizer(object):
 | 
	
		
			
			| 32 | 66 |  
 | 
	
		
			
			| 33 | 67 |      @classmethod
 | 
	
		
			
			| 34 | 68 |      def _tag_to_extract(cls, tag: Tag) -> bool:
 | 
	
		
			
			| 35 |  | -        config = BEAUTIFULSOUP_HTML_BODY_SANITIZE_CONFIG
 | 
	
		
			
			| 36 |  | -        if tag.name.lower() in config['tag_blacklist']:
 | 
	
		
			
			|  | 69 | +        if tag.name.lower() in HtmlSanitizerConfig.Tag_blacklist:
 | 
	
		
			
			| 37 | 70 |              return True
 | 
	
		
			
			| 38 | 71 |          if 'class' in tag.attrs:
 | 
	
		
			
			| 39 |  | -            for elem in config['class_blacklist']:
 | 
	
		
			
			|  | 72 | +            for elem in HtmlSanitizerConfig.Class_blacklist:
 | 
	
		
			
			| 40 | 73 |                  if elem in tag.attrs['class']:
 | 
	
		
			
			| 41 | 74 |                      return True
 | 
	
		
			
			| 42 | 75 |          if 'id' in tag.attrs:
 | 
	
		
			
			| 43 |  | -            for elem in config['id_blacklist']:
 | 
	
		
			
			|  | 76 | +            for elem in HtmlSanitizerConfig.Id_blacklist:
 | 
	
		
			
			| 44 | 77 |                  if elem in tag.attrs['id']:
 | 
	
		
			
			| 45 | 78 |                      return True
 | 
	
		
			
			| 46 | 79 |          return False
 |