-
Notifications
You must be signed in to change notification settings - Fork 2
/
normalizer.py
119 lines (97 loc) · 3.86 KB
/
normalizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Module docstring
"""
from abc import ABCMeta, abstractmethod
from bs4 import BeautifulSoup
class AbstractNormalizer(ABCMeta('_AbstractNormalizerBase', (object,), {})):
    """Interface implemented by every DOM normalizer in this module.

    The base class is created by calling ``ABCMeta`` directly so that the
    abstract-method check is enforced on both Python 2 and Python 3
    (Python 3 ignores a ``__metaclass__`` class attribute).
    """

    # Kept for Python 2, which reads the metaclass from this attribute.
    __metaclass__ = ABCMeta

    @abstractmethod
    def normalize(self, dom):
        """Return the normalized form of ``dom``.

        Concrete subclasses must override this method; a subclass that does
        not cannot be instantiated.
        """
        pass
class AttributeNormalizer(AbstractNormalizer):
    """Filter the attributes of every tag in an HTML string.

    With ``mode='white_list'`` only the attributes named in ``attr_list``
    are kept. With any other mode (treated as a black list) the attributes
    named in ``attr_list`` are dropped and all others are kept.
    """

    def __init__(self, attr_list=None, mode='white_list'):
        """Store the attribute names and the filtering mode.

        ``attr_list`` may be ``None``, which is treated as an empty list.
        """
        self.attr_list = attr_list
        self.mode = mode

    def normalize(self, dom):
        """Parse ``dom``, filter each tag's attributes and return the markup."""
        soup = BeautifulSoup(dom, 'html.parser')
        # A missing list means "no names": the white list then keeps nothing
        # and the black list keeps everything, instead of failing on
        # ``attr not in None``.
        listed = self.attr_list or ()
        keep_listed = self.mode == 'white_list'
        for tag in soup.find_all():
            tag.attrs = {
                attr: value
                for attr, value in tag.attrs.items()
                # Keep listed names in white-list mode, unlisted ones otherwise.
                if (attr in listed) == keep_listed
            }
        return str(soup)

    def __str__(self):
        return 'AttributeNormalizer: attr_list: %s, mode: %s' % (self.attr_list, self.mode)
class TagContentNormalizer(AbstractNormalizer):
    """Empty the content of selected tags while keeping the tags themselves."""

    def __init__(self, tag_list=None):
        """Store the names of the tags whose content should be cleared.

        With ``None`` or an empty collection no tag is touched.
        """
        self.tag_list = tag_list

    def normalize(self, dom):
        """Parse ``dom``, clear every tag named in ``tag_list`` and return the markup."""
        soup = BeautifulSoup(dom, 'html.parser')
        names = self.tag_list
        for tag in soup.find_all():
            if not names or tag.name not in names:
                continue
            tag.clear()
        return str(soup)

    def __str__(self):
        # Wrap in a 1-tuple so that a tuple-valued tag_list is formatted as a
        # whole instead of being unpacked as the format arguments.
        return 'TagContentNormalizer: tag_list: %s' % (self.tag_list,)
class TagNormalizer(AbstractNormalizer):
    """Remove selected tags, together with everything inside them."""

    def __init__(self, tag_list=None):
        """Store the names of the tags to remove.

        With ``None`` or an empty collection no tag is removed.
        """
        self.tag_list = tag_list

    def normalize(self, dom):
        """Parse ``dom``, decompose every tag named in ``tag_list`` and return the markup."""
        soup = BeautifulSoup(dom, 'html.parser')
        names = self.tag_list
        for tag in soup.find_all():
            if not names or tag.name not in names:
                continue
            tag.decompose()
        return str(soup)

    def __str__(self):
        # Wrap in a 1-tuple so that a tuple-valued tag_list is formatted as a
        # whole instead of being unpacked as the format arguments.
        return 'TagNormalizer: tag_list: %s' % (self.tag_list,)
#=============================================================================================
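# Remove every tag that satisfies one of the following:
# 1. its name matches and the value of the given attribute matches `value`
#    (by default: the attribute value starts with `value`)
# 2. no attribute is given, its name matches, and the tag's text content matches `value`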
class TagWithAttributeNormalizer(AbstractNormalizer):
    """Remove tags selected by name plus an attribute value or text content.

    When ``attr`` is set, a tag named ``name`` is removed if the value of
    that attribute matches ``value``. When ``attr`` is falsy, the tag is
    removed if any of its stripped strings matches ``value``. How a match is
    decided depends on ``mode`` (see ``is_attr_value``).
    """

    # Both spellings of the text type, so the check also holds on
    # interpreters where '' and u'' are different types.
    _TEXT_TYPES = (type(''), type(u''))

    def __init__(self, name, attr, value, mode='startswith'):
        """Store the tag name, attribute name, reference value and match mode."""
        self.name = name
        self.attr = attr
        self.value = value
        self.mode = mode

    def normalize(self, dom):
        """Parse ``dom``, decompose every matching tag and return the markup."""
        soup = BeautifulSoup(dom, 'html.parser')
        for tag in soup.find_all(self.name):
            if self._should_remove(tag):
                tag.decompose()
        return str(soup)

    def _should_remove(self, tag):
        """Return True when ``tag`` satisfies the configured removal rule."""
        if not self.attr:
            # No attribute configured: match against the tag's text instead.
            return any(self.is_attr_value(text) for text in tag.stripped_strings)
        if not tag.attrs or self.attr not in tag.attrs:
            return False
        attr_value = tag[self.attr]
        # isinstance rather than an exact type comparison: the parser may
        # hand back a list subclass for multi-valued attributes such as
        # ``class``, which an exact ``type(...) == list`` test would skip.
        if isinstance(attr_value, list):
            return any(self.is_attr_value(item) for item in attr_value)
        if isinstance(attr_value, self._TEXT_TYPES):
            return self.is_attr_value(attr_value)
        return False

    def is_attr_value(self, attr_value):
        """Return whether ``attr_value`` matches ``self.value`` under ``self.mode``.

        * ``'startswith'``: ``attr_value`` starts with ``self.value``.
        * ``'contains'``: ``self.value`` occurs in ``attr_value``.
        * ``'attribute'``: every ``':'``-separated part of ``self.value``
          occurs in ``attr_value``.

        Any other mode never matches.
        """
        if self.mode == 'startswith':
            return attr_value.startswith(self.value)
        if self.mode == 'contains':
            return self.value in attr_value
        if self.mode == 'attribute':
            return all(part in attr_value for part in self.value.split(':'))
        return False

    def __str__(self):
        return 'TagWithAttributeNormalizer: tag: %s, attr: %s, value: %s' % (self.name, self.attr, self.value)