-
Notifications
You must be signed in to change notification settings - Fork 2
/
sportsmole_livetext.pl
125 lines (117 loc) · 3.24 KB
/
sportsmole_livetext.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#! /usr/bin/perl -w
use strict;
use warnings;
use HTML::Entities;
use utf8;
use open ':std', ':encoding(utf8)';
use List::MoreUtils qw(uniq);
$| = 1;
#################################################################################
# A script to scrape live texts from sportsmole.co.uk as nice and handy xml-files
#################################################################################
# File-scoped working variables. They are declared up front so that code
# further down the file can refer to them under `use strict`.
my $url_overview;
my $url_game;
my @urls;
my $title;
my $date;
my $kickoff;
my $team1;
my $team2;
my $p;
my $time;
my $filename;
my $result;

# --> Define the start page (open this URL in your browser and choose via
#     the drop-down menu)
my $start_url = "https://www.sportsmole.co.uk/football/premier-league/2017-18/results.html";

# The path segment that follows "premier-league/" (here "2017-18") becomes
# the base name of the output file. Stop early if it cannot be extracted,
# instead of silently writing to a file called ".xml".
if ($start_url =~ m{premier-league/(.+?)/}) {
    $filename = $1;
}
else {
    die "Cannot derive an output filename from start URL '$start_url'\n";
}

# --> Define path and output filename
my $path = "/path/$filename.xml";
############################
# no changes below this line
############################
# Remove any output file left over from a previous run.
unlink($path);

# Download $url with curl and return the response body as a single string
# (empty string when curl produced no output).
# curl is started via the list form of a pipe open, so the URL is handed
# over as one argument and is never interpreted by a shell. This matters
# because most URLs used here are scraped from remote HTML.
my $fetch_html = sub {
    my ($url) = @_;
    open my $curl, '-|', 'curl', '-s', $url
        or die "Cannot run curl for $url: $!";
    my $body = do { local $/; <$curl> };
    close $curl
        or warn "\ncurl reported a failure for $url\n";
    return $body // '';
};

print "Fetching URLs…\n";
my $start_html = $fetch_html->($start_url);
my $counter = 0;
my %seen_overview;    # overview pages already downloaded

foreach my $line (split /\n/, $start_html) {
    # [^"]+ keeps the capture inside one href attribute, even when the
    # line contains several links.
    next unless $line =~ m/href="([^"]+_game_\d+\.html)"/;
    my $overview_url = "https://www.sportsmole.co.uk" . $1;

    # Do not download the same overview page twice.
    next if $seen_overview{$overview_url}++;

    my $overview_html = $fetch_html->($overview_url);

    # Only games whose overview page links to a "Live Commentary" page are
    # collected. The link is looked up freshly for every overview page, so
    # a page without such a link contributes nothing.
    next unless $overview_html =~ /class="game_match" href="([^"]+)"><div\nclass="game_match_name">Live Commentary<\/div>/;

    push @urls, "https://www.sportsmole.co.uk" . $1;
    $counter++;
    print "\rFetching URL no. $counter (be patient!)";
}

@urls = uniq(@urls);
my $counter_game = 0;
my $length = scalar @urls;
print " Done! $length URls fetched.\n";
# Escape the characters that are markup in XML (&, <, >) so that scraped
# text cannot break the structure of the output document.
# Returns the empty string for an undefined argument.
my $xml_escape = sub {
    my ($text) = @_;
    return '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
};

# Turn a text fragment taken from the HTML page into XML-safe text:
# HTML entities are decoded first, so that an already encoded "&amp;" is
# not escaped a second time, then the XML markup characters are escaped.
my $html_to_xml = sub {
    my ($fragment) = @_;
    return '' unless defined $fragment;
    my $decoded = decode_entities($fragment);
    return $xml_escape->($decoded);
};

# Three-argument open on a lexical handle; the encoding layer comes from
# the file's `use open` pragma.
open my $out, '>>', $path or die "Cannot open $path for appending: $!";
print {$out} "<corpus>\n";

foreach my $url_game (@urls) {
    # List-form pipe open: the URL is passed to curl as a single argument
    # and never goes through a shell.
    open my $curl, '-|', 'curl', '-s', $url_game
        or die "Cannot run curl for $url_game: $!";
    my $html = do { local $/; <$curl> } // '';
    close $curl
        or warn "\ncurl reported a failure for $url_game\n";

    $counter_game++;
    print "\rGet no. $counter_game of $length";

    # Per-game metadata. Declared inside the loop so that a page lacking a
    # field yields an empty element instead of the previous game's value.
    my ($title, $result, $date, $kickoff, $team1, $team2);

    # Line-based fields; when several lines match, the last match wins.
    foreach my $line (split /\n/, $html) {
        if ($line =~ /<title>(.+?)<\/title>/) {
            $title = $1;
        }
        if ($line =~ /class="game_header_score">(.+?)</) {
            $result = $1;
        }
        if ($line =~ /datetime="(.+?)T(.+?):00\+.+?"/) {
            $date    = $1;
            $kickoff = $2;
        }
    }

    # Team names: the plain form is tried first; if the name is wrapped in
    # spans, the text of the "desktop_only" span takes precedence.
    if ($html =~ /class="game_header_bar_team left">(.+?)</) {
        $team1 = $1;
    }
    if ($html =~ /class="game_header_bar_team left"><span[\w\W]+?desktop_only">(.+?)</) {
        $team1 = $1;
    }
    if ($html =~ /class="game_header_bar_team">(.+?)</) {
        $team2 = $1;
    }
    if ($html =~ /class="game_header_bar_team"><span[\w\W]+?desktop_only">(.+?)</) {
        $team2 = $1;
    }

    print {$out} "<text>\n"
        . "<url>"     . $xml_escape->($url_game) . "</url>\n"
        . "<title>"   . $html_to_xml->($title)   . "</title>\n"
        . "<team1>"   . $html_to_xml->($team1)   . "</team1>\n"
        . "<team2>"   . $html_to_xml->($team2)   . "</team2>\n"
        . "<date>"    . $html_to_xml->($date)    . "</date>\n"
        . "<kickoff>" . $html_to_xml->($kickoff) . "</kickoff>\n"
        . "<result>"  . $html_to_xml->($result)  . "</result>\n";

    # Every chunk following a class="livecomm" marker is one commentary
    # entry. Chunks without a "period" link are skipped.
    foreach my $paragraph (split /class="livecomm"/, $html) {
        next unless $paragraph =~ m/class="period">(.+?)<\/a>/;
        my $time = $1;

        my $p = '';
        if ($paragraph =~ m/class="post">([\w\W]+?)<\/span>/) {
            $p = decode_entities($1);
            $p =~ s/\n//g;       # join the entry into a single line
            $p =~ s/<p>/ /g;     # paragraph starts become spaces
            $p =~ s/<.+?>//g;    # drop all remaining tags
        }

        print {$out} "\t<time>" . $html_to_xml->($time) . "</time>\n"
            . "\t<p>" . $xml_escape->($p) . "</p>\n";
    }

    print {$out} "</text>\n";

    # Short random pause between requests (presumably to avoid hammering
    # the server).
    sleep rand 3;
}

print {$out} "</corpus>\n";
# Buffered write errors surface at close time, so check it.
close $out or die "Cannot close $path: $!";
print "\nDone!\n";