Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ruby3 updates #49

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions lib/arachnid2.rb
Original file line number Diff line number Diff line change
Expand Up @@ -105,17 +105,16 @@ def initialize(url)
#
# @return nil
#
def crawl(opts = {}, with_watir = false)
def crawl(opts = {}, with_watir = false, &block)
if with_watir
crawl_watir(opts, &Proc.new)
crawl_watir(opts, &block)
else
Arachnid2::Typhoeus.new(@url).crawl(opts, &Proc.new)
Arachnid2::Typhoeus.new(@url).crawl(opts, &block)
end
end

def crawl_watir(opts)
Arachnid2::Watir.new(@url).crawl(opts, &Proc.new)
def crawl_watir(opts, &block)
Arachnid2::Watir.new(@url).crawl(opts, &block)
end
# https://mudge.name/2011/01/26/passing-blocks-in-ruby-without-block.html

end
18 changes: 9 additions & 9 deletions lib/arachnid2/typhoeus.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def initialize(url)
@cached_data = []
end

def crawl(opts = {})
def crawl(opts = {}, &block)
preflight(opts)
typhoeus_preflight

Expand All @@ -20,11 +20,11 @@ def crawl(opts = {})
break if time_to_stop?
@global_visited.insert(q)

found_in_cache = use_cache(q, opts, &Proc.new)
found_in_cache = use_cache(q, opts, &block)
return if found_in_cache

request = ::Typhoeus::Request.new(q, request_options)
requestable = after_request(request, &Proc.new)
requestable = after_request(request, &block)
@hydra.queue(request) if requestable
end # max_concurrency.times do

Expand All @@ -35,9 +35,9 @@ def crawl(opts = {})
end # def crawl(opts = {})

private
def after_request(request)
def after_request(request, &block)
request.on_complete do |response|
cacheable = use_response(response, &Proc.new)
cacheable = use_response(response, &block)
return unless cacheable

put_cached_data(response.effective_url, @options, response)
Expand All @@ -46,19 +46,19 @@ def after_request(request)
true
end

def use_response(response)
def use_response(response, &block)
links = process(response.effective_url, response.body)
return unless links

yield response
block.call response

vacuum(links, response.effective_url)
true
end

def use_cache(url, options)
def use_cache(url, options, &block)
data = load_data(url, options)
use_response(data, &Proc.new) if data
use_response(data, &block) if data

data
end
Expand Down
12 changes: 6 additions & 6 deletions lib/arachnid2/watir.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def initialize(url)
@domain = Adomain[@url]
end

def crawl(opts)
def crawl(opts, &block)
preflight(opts)
watir_preflight
@already_retried = false
Expand All @@ -23,17 +23,17 @@ def crawl(opts)

@global_visited.insert(q)

make_request(q, &Proc.new)
make_request(q, &block)
end # until @global_queue.empty?
ensure
@browser.close if @browser rescue nil
@headless.destroy if @headless rescue nil
end

private
def make_request(q)
def make_request(q, &block)
begin
links = browse_links(q, &Proc.new)
links = browse_links(q, &block)
return unless links

vacuum(links, browser.url)
Expand All @@ -53,10 +53,10 @@ def make_request(q)
end
end

def browse_links(url)
def browse_links(url, &block)
return unless navigate(url)

yield browser
block.call browser

process(browser.url, browser.body.html) if browser.body.exists?
end
Expand Down
Loading