-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathkicker_referee.pl
103 lines (98 loc) · 2.66 KB
/
kicker_referee.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#! /usr/bin/perl -w
use strict;
use warnings;
use HTML::Entities;
use utf8;
use open ':std', ':encoding(utf8)';
$| = 1;
##############################################################################
# A script to crawl referee reviews from kicker.de as nice and handy xml-files
##############################################################################
my $path = "/path/to/outputfile.xml";
# --> Define the path and filename of the output file
############################
# no changes below this line
############################
# ---------------------------------------------------------------------------
# Crawl loop: for every Bundesliga season from 2017/18 back to 1996/97, fetch
# the season overview page from kicker.de, collect the "Analyse" link of each
# match, download every match page and append one <text> record per match to
# the file named by $path.
# ---------------------------------------------------------------------------

# Download $url with curl and return the response body as a single string.
# The list form of the pipe open bypasses the shell, so characters such as
# '&', ';' or quotes inside a scraped URL are never interpreted as shell
# syntax. Returns an empty string (after a warning) if curl cannot be run.
sub fetch_page {
    my ($url) = @_;

    my $curl;
    if (!open $curl, '-|', 'curl', '-s', $url) {
        warn "Cannot run curl for $url: $!\n";
        return '';
    }
    my $body = do { local $/; <$curl> };
    close $curl;    # a failed download simply yields an empty/partial body

    return defined $body ? $body : '';
}

# Extract the fields of one match report from the HTML of its page.
# Returns a hash reference with the keys title, date, kickoff, team1, team2,
# home_goal, away_goal, referee, mark and p. Every value starts out as '' so
# that a field missing on this page is written as empty instead of silently
# repeating the value found on the previously processed match.
sub parse_game {
    my ($html) = @_;

    my %game = map { $_ => '' }
        qw(title date kickoff team1 team2 home_goal away_goal referee mark p);

    if ($html =~ /<title>(.+?)<\/title>/) {
        $game{title} = $1;
    }

    # Kick-off is given as DD.MM.YYYY HH:MM; the date is stored as YYYY-MM-DD.
    if ($html =~ /Anstoß:<\/b><\/div>\s+<div class="wert">(.+?)\.(.+?)\.(.+?) (.+?) Uhr/) {
        $game{date}    = "$3-$2-$1";
        $game{kickoff} = $4;
    }

    if ($html =~ /<h1><a href=".+?">(.+?)<\/a><\/h1>\s+<\/td>\s+<td class="lttabst"/) {
        $game{team1} = $1;
    }
    if ($html =~ /<h1><a href=".+?">(.+?)<\/a><\/h1>\s+<\/td>\s+<td class="lttablig/) {
        $game{team2} = $1;
    }

    # \d+ rather than \d: a side can score ten or more goals.
    if ($html =~ /class="boardH">(\d+)<\/div>/) {
        $game{home_goal} = $1;
    }
    if ($html =~ /class="boardA">(\d+)<\/div>/) {
        $game{away_goal} = $1;
    }

    # The referee section holds the referee's name, the mark ("Note") and the
    # review text; all three are taken from this fragment only.
    my $article = '';
    if ($html =~ /<div class="schiedsrichter">([\w\W]+?)<div class="spldesspiels">/) {
        $article = $1;
    }
    if ($article =~ m/<a class="link".+?>(.+?)<\/a>.+?Note (\S+)<br \/>/s) {
        $game{referee} = $1;
        $game{mark}    = $2;
    }
    if ($article =~ /<br \/>(.+?)<\/div>/s) {
        my $review = $1;
        $review =~ s/[\r\n]//g;    # keep the review text on a single line
        $game{p} = $review;
    }

    return \%game;
}

for my $season (reverse 1996 .. 2017) {
    # Seasons are addressed as e.g. "2017-18": start year plus the last two
    # digits of the following year.
    my $end       = substr($season + 1, -2, 2);
    my $start_url = "https://www.kicker.de/news/fussball/bundesliga/spieltag/1-bundesliga/$season-$end/-1/0/spieltag.html";
    print "\nHole die URLs… von Saison $season-$end\n";

    # Collect the match-report links of this season.
    my @urls;
    for my $line (split /\n/, fetch_page($start_url)) {
        if ($line =~ m/<td><a class="link" href="(.+?)">Analyse/) {
            push @urls, "https://www.kicker.de" . $1;
        }
    }

    my $total   = scalar @urls;
    my $counter = 0;

    open my $out, '>>', $path
        or die "Cannot open $path for appending: $!";

    for my $url_game (@urls) {
        my $game = parse_game(fetch_page($url_game));
        $counter++;
        print "\rLade Nr. $counter von $total…";

        my $record = join "\n",
            "<text>",
            "<url>$url_game</url>",
            "<title>$game->{title}</title>",
            "<team1>$game->{team1}</team1>",
            "<team2>$game->{team2}</team2>",
            "<date>$game->{date}</date>",
            "<kickoff>$game->{kickoff}</kickoff>",
            "<result>$game->{home_goal}:$game->{away_goal}</result>",
            "<referee>$game->{referee}</referee>",
            "<mark>$game->{mark}</mark>",
            "<p>$game->{p}</p>",
            "</text>",
            "";
        print {$out} $record
            or die "Cannot write to $path: $!";

        # Pause for a random fraction of a second between requests. sleep()
        # only handles whole seconds, so "sleep rand 1" never paused at all;
        # four-argument select() accepts fractional timeouts.
        select(undef, undef, undef, rand 1);
    }

    # Buffered write errors only surface on close, so check it.
    close $out
        or die "Cannot close $path: $!";
}
print "\nDone!\n";