-
Notifications
You must be signed in to change notification settings - Fork 2
/
DOMParser.php
95 lines (66 loc) · 1.95 KB
/
DOMParser.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
<?php
/**
 * Extracts plain text from an HTML document.
 *
 * Only the contents of <body> are visited. HTML comments and the subtrees of
 * banned tags are dropped, the alt text of childless <img> elements is
 * included, selected punctuation is replaced by spaces and every run of
 * whitespace is collapsed to a single space. The result is not trimmed, so it
 * may start or end with a single space.
 */
class DOMParser {
    /** Tag names whose whole subtree is discarded. */
    private $bannedTags = array('script', 'style');

    /**
     * Tag names that receive a separating space: before the children when the
     * element has any, after the element when it has none (e.g. <br>).
     */
    private $blockTags = array('option', 'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br', 'nobr');

    /** Text accumulated by the parse that is currently running. */
    private $stripped = '';

    /**
     * Recursively appends the text found under $element to $this->stripped.
     *
     * @param DOMNode $element Node to visit; disallowed nodes are skipped
     *                         together with everything below them.
     */
    private function stripTags($element) {
        if (!$this->isElementAllowed($element)) {
            return;
        }

        $isBlock = $this->hasTagIn($element, $this->blockTags);

        if ($element->hasChildNodes()) {
            if ($isBlock) {
                $this->stripped .= ' ';
            }
            foreach ($element->childNodes as $child) {
                $this->stripTags($child);
            }
            return;
        }

        // Childless node: a text node, or an empty element such as <img> or <br>.
        if ($this->isImg($element)) {
            $this->extractAltTag($element);
        }
        $this->stripped .= $element->nodeValue;
        if ($isBlock) {
            $this->stripped .= ' ';
        }
    }

    /**
     * Tells whether $node is an element whose tag name is listed in $tags.
     *
     * Text, comment and other non-element nodes have no tagName property, so
     * they are rejected before the property is read.
     *
     * @param DOMNode  $node
     * @param string[] $tags
     * @return bool
     */
    private function hasTagIn($node, array $tags) {
        return $node instanceof DOMElement && in_array($node->tagName, $tags, true);
    }

    /**
     * Tells whether a node should contribute to the output.
     *
     * @param DOMNode $element
     * @return bool false for HTML comments and banned tags, true otherwise.
     */
    private function isElementAllowed($element) {
        if ($element->nodeType === XML_COMMENT_NODE) {
            return false;
        }
        return !$this->hasTagIn($element, $this->bannedTags);
    }

    /**
     * @param DOMNode $element
     * @return bool true when the node is an <img> element.
     */
    private function isImg($element) {
        return $element instanceof DOMElement && $element->tagName === 'img';
    }

    /**
     * Appends the element's alt attribute, followed by a space, when present.
     *
     * @param DOMElement $element
     */
    private function extractAltTag($element) {
        if ($element->hasAttribute('alt')) {
            $this->stripped .= $element->getAttribute('alt') . " ";
        }
    }

    /**
     * Replaces punctuation in the accumulated text with spaces.
     *
     * Commas, pipes, hyphens and any period that is not directly followed by
     * a digit (so "1.5" survives) become a space; a question mark that
     * directly follows a space is removed together with that space.
     */
    private function stripPunctuation() {
        $this->stripped = preg_replace('/[,\|-]|(\.(?![0-9]))/', ' ', $this->stripped);
        $this->stripped = preg_replace('/ \?/', ' ', $this->stripped);
    }

    /** Collapses every run of whitespace in the accumulated text to one space. */
    private function stripWhitespace() {
        $this->stripped = preg_replace('/\s+/', ' ', $this->stripped);
    }

    /**
     * Reads an HTML file and returns its extracted text.
     *
     * @param string $fileName Path (or URL wrapper) handed to file_get_contents().
     * @return string Extracted text, or '' when the file cannot be read
     *                (file_get_contents() raises its own warning in that case).
     */
    public function parseFile($fileName) {
        $html = file_get_contents($fileName);
        if ($html === false) {
            return '';
        }
        return $this->parseString($html);
    }

    /**
     * Extracts the text of the <body> of an HTML string.
     *
     * @param string $html HTML markup; may be malformed.
     * @return string Extracted text, or '' for empty or unparseable input.
     */
    public function parseString($html) {
        $this->stripped = '';

        $html = (string) $html;
        if ($html === '') {
            // DOMDocument::loadHTML() rejects an empty string (ValueError on PHP 8).
            return '';
        }

        // Collect libxml parse errors internally so malformed markup does not
        // emit one PHP warning per error; the caller's setting is restored.
        $previous = libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        // NOTE(review): loadHTML() falls back to ISO-8859-1 when the markup
        // declares no charset, so undeclared UTF-8 input may come out
        // mis-decoded - confirm whether callers need this handled.
        $loaded = $doc->loadHTML($html);
        if (!$previous) {
            // Only discard the buffer when the caller was not collecting errors.
            libxml_clear_errors();
        }
        libxml_use_internal_errors($previous);

        if ($loaded === false) {
            return '';
        }

        foreach ($doc->getElementsByTagName('body') as $body) {
            $this->stripTags($body);
        }

        $this->stripPunctuation();
        $this->stripWhitespace();
        return $this->stripped;
    }
}
?>