-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcraigslister.rb
executable file
·129 lines (110 loc) · 3.19 KB
/
craigslister.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/ruby
# == Synopsis
# craigslister: Downloads listings from Craigslist and puts them in RSS
#
# == Usage
# craigslister [--nopics] [--minprice <min>] [--maxprice <max>]
# [--query <query>] [--queryfile <file>] [--historyfile <file>]
#
# You must specify either --query <query> or --queryfile <file>.
#
# == Author
# Neil Hooey <nhooey [at] gmail.com>
#
# == Copyright
# Copyright (c) 2009 Neil Hooey. Licensed under the same terms as Ruby.
require 'getoptlong'
require 'rexml/document'
require 'open-uri'
require 'net/http'
require 'timeout'
include REXML
# Help text printed by the main script when the required options are missing.
usage = <<EOF
usage: craigslister [--nopics] [--minprice <min>] [--maxprice <max>]
< --query <query> | --queryfile <file> >
[--historyfile <file>]
You must specify either --query <query> or --queryfile <file>.
EOF
# Skeleton RSS 2.0 document (one empty channel with fixed metadata).
# NOTE(review): not referenced by any code visible in this file; presumably
# meant as the template for the generated feed -- confirm.
xml_base = <<EOF
<rss version="2.0">
<channel>
<title>Craigslist Couch Watch</title>
<link></link>
<description>Craigslist Couch Watch</description>
<ttl>180</ttl>
<pubdate></pubdate>
<generator>Neil Hooey</generator>
<copyright>Neil Hooey</copyright>
<language>en-us</language>
</channel>
</rss>
EOF
# Matches a dollar amount such as "$1,200.50": a literal "$", an optional
# "digits," group, more digits, and an optional ".digits" fraction.
# Group 1 captures the whole amount without the "$".
regex_price = /\$(([\d]+,)?[\d]+(\.[\d]+)?)/
# Month tokens accepted at the start of a listing line (interpolated into
# regex_listing below).
regex_month = /Jan|Feb|Mar|Apr|May|June|July|Aug|Sept|Oct|Nov|Dec/
# Matches one listing paragraph of the search results HTML. Capture groups:
#   1 month, 2 day number, 3 link href, 4 link text before " - $",
#   5 price digits, 6 optional <font ...> tag, 7 text inside the trailing
#   parentheses (presumably the location -- confirm against real pages).
regex_listing = /<p>\s*(#{regex_month})\s+([0-9]+)\s*-\s*<a href="([^"]+)">\s*([^<]+) - \$([0-9]+) -\s*<\/a>(<font[^>]+>)?\s*\(([^\)]+)\)\s*/
# Parses the command-line flags in ARGV into a Hash keyed by the long option
# name without its leading dashes (e.g. 'query', 'minprice', 'debug').
#
# Post-processing applied to the parsed values:
# * 'minprice' / 'maxprice' default to the literal strings 'min' / 'max'.
# * 'nopics' is turned into a URL fragment: '' when --nopics was given,
#   '&hasPic=1' otherwise.
#
# Returns the options Hash, or nil when neither --query nor --queryfile was
# supplied. Exits the process with status 1 on any option-parsing error
# (missing argument, unknown option, ambiguous abbreviation, ...).
def get_cmd_options
  parser = GetoptLong.new(
    ['--nopics', '-p', GetoptLong::NO_ARGUMENT],
    ['--minprice', '-n', GetoptLong::REQUIRED_ARGUMENT],
    ['--maxprice', '-x', GetoptLong::REQUIRED_ARGUMENT],
    ['--query', '-q', GetoptLong::REQUIRED_ARGUMENT],
    ['--queryfile', '-f', GetoptLong::REQUIRED_ARGUMENT],
    ['--historyfile', '-h', GetoptLong::REQUIRED_ARGUMENT],
    ['--debug', '-d', GetoptLong::REQUIRED_ARGUMENT]
  )

  opts = {}
  begin
    # opt arrives as the canonical long name ("--query"); strip the dashes.
    parser.each { |opt, arg| opts[opt[2..-1]] = arg }
  rescue GetoptLong::Error
    # GetoptLong::Error is the parent of every parsing failure, so unknown
    # options exit cleanly too instead of surfacing as a backtrace.
    # GetoptLong reports the problem on stderr itself (unless quiet is set).
    exit 1
  end

  opts['minprice'] ||= 'min'
  opts['maxprice'] ||= 'max'
  # A given --nopics flag carries an empty-string argument, which is truthy.
  opts['nopics'] = opts['nopics'] ? '' : '&hasPic=1'

  return nil unless opts['query'] || opts['queryfile']

  opts
end
# Fetches the document at +url+ and returns its body as a String.
#
# A progress line is printed to stdout first. The URL is percent-escaped
# with the RFC 2396 rules before it is opened.
#
# HTTP errors, timeouts and connection resets are reported on stderr and
# then re-raised unchanged, so the caller still sees the original exception
# (OpenURI::HTTPError, Timeout::Error or Errno::ECONNRESET).
def get_html(url)
  print "Downloading listings...\n"
  # URI.escape and Kernel#open on URLs no longer exist in Ruby 3.0+; the
  # RFC 2396 parser provides the same escaping and URI.open the same fetch.
  escaped_url = URI::RFC2396_Parser.new.escape(url)
  # Block form closes the underlying IO as soon as the body has been read.
  URI.open(escaped_url) { |io| io.read }
rescue OpenURI::HTTPError => error
  $stderr.print "Opening URL: `#{url}' failed: #{error}\n"
  raise
rescue Timeout::Error => error
  $stderr.print "Opening URL: `#{url}' timed out: #{error}\n"
  raise
rescue Errno::ECONNRESET => error
  # The message must be an argument of print; on a line of its own it is a
  # discarded expression and nothing would be reported.
  $stderr.print "Opening URL: `#{url}' failed, connection reset: #{error}\n"
  raise
end
# Main script: read the options, obtain the search results HTML (from the
# network, or from a local file when --debug <file> is given), and dump every
# listing match.
opts = get_cmd_options
if opts.nil?
  # Neither --query nor --queryfile was supplied.
  print usage
  exit 1
end

# NOTE(review): not used by any code visible in this file.
filename_base_rss = 'base_rss.xml'

# Search URL assembled from the query, the price bounds and the picture
# filter fragment prepared by get_cmd_options.
page_url = [
  "http://newyork.craigslist.org/search/fuo?query=#{opts['query']}",
  "&srchType=T&minAsk=#{opts['minprice']}&maxAsk=#{opts['maxprice']}",
  opts['nopics'].to_s
].join

debug_path = opts['debug']
html_doc = debug_path ? File.read(debug_path) : get_html(page_url)

# Each match is the array of capture groups from regex_listing; print it in
# inspect form followed by two newlines.
html_doc.scan(regex_listing).each do |listing|
  p listing
  print "\n\n"
end