#!/usr/bin/env perl
# Reddit Image Scraper: A Perl script to download images hosted on
# the imgur.com hosting service and linked from a subreddit at reddit.com
# Copyright (C) 2013 Joshua Hull
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
use strict;
use warnings;
use feature 'say';
use JSON;
use WWW::Mechanize;
use URI;
use Getopt::Long qw(:config auto_abbrev);
# in order for LIMIT to see $opt_l, we need to process options in a BEGIN block;
# maybe we can come up with a less sloppy solution (i.e. fewer global variables)
my ($opt_u, $opt_q, $opt_l, $opt_s, $opt_t, $opt_a);
BEGIN {
    ## TODO we should probably do some validation checks
    GetOptions(
        "user"    => \$opt_u,
        "query"   => \$opt_q,
        "limit=i" => \$opt_l,
        "sort=s"  => \$opt_s,
        "time=s"  => \$opt_t,
        "after=s" => \$opt_a,
        "help"    => sub { say "Run 'perldoc' on this file to see the full documentation"; exit },
    ) or die "Error parsing command-line options\n";
    die '--time [-t] option can only be set when the --sort [-s] option has a value'
        if $opt_t and not $opt_s;
}
use constant {
    INTERVAL => 2,
    LIMIT    => $opt_l || 1000,
};
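# a possible cleanup (untested sketch): replace the LIMIT constant with a plain
# sub that reads $opt_l at runtime, which would let GetOptions run outside BEGIN:
#
#     sub LIMIT () { $opt_l || 1000 }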
my $version_string = "2013.10.18";
my $user_agent = "Reddit_Image_Scraper/".$version_string." by /u/hyperspace290";
my $mech = WWW::Mechanize->new(autocheck=>0,agent=>$user_agent);
# TODO break up this loop into smaller subroutines
foreach my $sub (@ARGV) {
    our $count = 0;
    my $url = URI->new('http://www.reddit.com/');
    my %url_form = (
        t     => $opt_t,
        after => $opt_a,
        sort  => $opt_s,
    );
    my $type = $opt_u ? 'user' : $opt_q ? 'search' : 'subreddit';
    if ($type eq 'search') {
        ## when adding support for i.minus.com,
        ## add the domain to the @domains list below
        my @domains = qw(imgur.com);
        my $search_query = $sub;
        $search_query .= ' (' . join(' OR ', map { "site:$_" } @domains) . ')';
        $url_form{q} = $search_query;
    }
    # build the .json listing URL for the subreddit, user, or search endpoint
    $url->path(
        $type eq 'subreddit' ? 'r/' . join('/', grep $_, ($sub, $opt_s)) . '.json'
        : $type eq 'user'    ? "user/$sub/.json"
        :                      "$type/.json"
    );
    $url->query_form(\%url_form);
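    # e.g. `./Reddit_Image_Scraper --sort=top --time=all pics` requests a URL
    # along the lines of http://www.reddit.com/r/pics/top.json?t=all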
    my @links;
    # build a valid directory name: collapse whitespace and strip characters
    # such as a slash, which would otherwise create subdirectories,
    # e.g. ./script -q "look at my dog/cat"
    my $directory = $type eq 'subreddit'
        ? $sub
        : join '_', grep { s/\s+/_/g; s/[^\w\-\.]//g; 1 } ($type, $sub);
    mkdir $directory unless -e $directory;
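    # for the example above, -q "look at my dog/cat" yields the
    # directory "search_look_at_my_dogcat"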
    do {
        $mech->get($url->as_string);
        retry_last_request(3) unless $mech->success;
        # reddit redirects requests for nonexistent subreddits to its search page
        die q{That subreddit doesn't exist}
            if $mech->uri->path eq '/subreddits/search';
        my $json_text = JSON->new->allow_nonref->utf8->relaxed->decode($mech->text);
        # pull in all non-self posts; we could also filter for imgur domains
        # only, but that depends on which sites we intend to scrape from
        my @posts = grep { !$_->{data}{is_self} } @{$json_text->{data}->{children}};
        foreach my $post (@posts) {
            my $uri = URI->new($post->{data}->{url}) or next;
            ## remove any parameters such as http://i.imgur.com/K7EdYrV.jpg?1
            $uri->query_form({});
            ## remove URI fragments like #foo
            $uri->fragment(undef);
            push @links, get_image_links($uri->as_string) if @links < LIMIT;
        }
        # fetch the next page of the listing, if any
        $url = get_next_page($json_text->{data}{after}, $url, $count);
        sleep INTERVAL;
    } while ($url and @links < LIMIT);
print "Extracted " . @links . " images from $type $sub\n";
my %album_seen;
foreach my $img (@links) {
my $album_dir = $img->{album_hash} || '';
my $file_name = $album_dir ? ++$album_seen{$album_dir} . $img->{ext} : $img->{file_name};
mkdir "$directory/$album_dir" unless -e "$directory/$album_dir";
my $path = join '/', grep $_, ($directory, $album_dir, $file_name);
unless (-e $path) {
print 'Downloading ' . $img->{url} . " to $path\n";
$mech->get($img->{url}, ':content_file' => "$path");
sleep INTERVAL;
}
else{
print "File already exists..." . $img->{file_name} . "\n";
}
}
undef @links;
}
sub get_next_page {
    my ($after, $uri) = @_;
    return undef unless $after;
    my %form = $uri->query_form;
    # $_[2] aliases the caller's $count, so the increment below advances the
    # running item count that reddit expects for user-page pagination
    my $count = $_[2];
    $_[2] = $_[2] + 100;
    if ($opt_u) {
        $uri->query_form({ limit => $form{limit}, after => $after, count => $count });
    }
    else {
        $uri->query_form({ limit => $form{limit}, after => $after });
    }
    return $uri;
}
sub get_image_links {
    my ($url) = @_;
    my @images;
    if (my ($album) = $url =~ m{ imgur\.com/a/ ([^/]+) /?$ }ix) {
        # album page: scrape the inline "images: { ... }, cover:" JSON
        # structure embedded in imgur's album HTML
        $mech->get($url);
        my ($json) = ($mech->content =~ m{ \b images \s* : \s* (.+?) , \s* cover \s* : }isx);
        my $struct = eval { $json && JSON->new->allow_nonref->utf8->relaxed->decode($json) };
        if ($struct) {
            foreach my $item (@{$struct->{items}}) {
                my %info = (album_hash => $album, %$item);
                $info{file_name} = $item->{hash} . $item->{ext};
                $info{url}       = 'http://imgur.com/' . $info{file_name};
                push @images, \%info if $info{url} && $info{file_name};
            }
        }
        $mech->back;
    }
    elsif ($url =~ m{ imgur\.com/ ([^/\.]+) (\. [^/\.]+)? /?$ }ix) {
        my %info;
        # a bare image page with no file extension: follow the link
        # through to the direct image URL
        unless ($2) {
            $mech->follow_link(url_regex => qr/$1/i);
            $url = $mech->uri;
        }
        $info{url} = $url;
        ($info{file_name}) = $url =~ m{ imgur\.com/ ([^/]+) }ix;
        push @images, \%info if $info{url} && $info{file_name};
    }
    return @images;
}
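# e.g. for an album URL such as http://imgur.com/a/abc12 (a hypothetical album
# ID), the returned hashrefs look roughly like:
#   { album_hash => 'abc12', hash => 'K7EdYrV', ext => '.jpg',
#     file_name => 'K7EdYrV.jpg', url => 'http://imgur.com/K7EdYrV.jpg' }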
sub retry_last_request {
    my $max_tries = shift;
    my $attempts  = 0;
    my $url = $mech->uri->as_string;
    until ($mech->success or $attempts == $max_tries) {
        $attempts++;
        print "Retrying ($attempts/$max_tries) $url...\n";
        $mech->get($url);
        sleep INTERVAL;
    }
    die "Failed to connect to $url\n" unless $mech->success;
    return;
}
sub VERSION_MESSAGE {
    print "Reddit_Image_Scraper version $version_string\n";
}
__END__

=head1 NAME

Reddit_Image_Scraper - a Perl script to download images hosted on the
imgur.com hosting service and linked from a subreddit at reddit.com.
Copyright (C) 2013 Joshua Hull.

=head1 SYNOPSIS

    # scrape all imgur pics from r/pics, sorted by most upvotes of all time
    ./Reddit_Image_Scraper --sort=top --time=all pics

    # scrape the first 1000 imgur pics from r/pics (the default sort is 'hot')
    ./Reddit_Image_Scraper --limit=1000 pics

    # scrape all images from r/pics starting AFTER the post with ID t3_1y5d0i
    ./Reddit_Image_Scraper --after=t3_1y5d0i pics

=head1 COMMANDS

=over 4

=item (arguments)

    ./Reddit_Image_Scraper EarthPorn

Multiple arguments can be processed. By default, each argument is treated as
a subreddit. Setting the --user [-u] flag treats the arguments as users
instead. The script currently cannot handle both subreddits and users in one
call.
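
For instance, several subreddits (the names below are only examples) can be
scraped in one run:

    ./Reddit_Image_Scraper EarthPorn pics funny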

=item -q, --query

Scrape only images matching the given query. The query string may contain
spaces, but it must then be enclosed in quotes (or the whitespace must be
escaped).
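
For example, with a quoted multi-word query (a made-up query string):

    ./Reddit_Image_Scraper -q "sunsets over mountains"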

=item -u, --user

By default, @ARGV is treated as a list of subreddits, but when the
C<-u|--user> flag is set, the arguments are treated as user pages instead.
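
For example (C<some_redditor> is a placeholder username):

    ./Reddit_Image_Scraper -u some_redditor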

=item -l, --limit

How many images do you want? The default is 1,000.

=item -s, --sort

Tells the script I<how> to sort a subreddit. Valid options include C<new>,
C<rising>, C<controversial>, and C<top>.

=item -t, --time

The time window to sort a subreddit by. Valid options include C<hour>,
C<week>, C<month>, C<year>, and C<all>.

C<-s|--sort> is required in order to use C<-t|--time>.

=item -a, --after

Scrape images AFTER the post with the given ID.

Q: How do I find the ID of a post?

A: IDs look like "t3_1y5d0i". If you look at the HTML, each post should have
a tag similar to this one:

    <div class=" thing id-t3_1y5d0i odd link " data-downs="26614"
        data-ups="30566" data-fullname="t3_1y5d0i" onclick="click_thing(this)">

The ID we want is in the C<data-fullname> attribute, so we can pass
C<-a|--after> as C<t3_1y5d0i>.

=item -h, --help

Prints a reminder to run perldoc on this file for the full documentation.

=back

=cut