# TODO add filter for year of vehicle
# Requires beautifulsoup4 and the lxml parser: pip install beautifulsoup4 lxml
import csv
import json
import re
import urllib.request

from bs4 import BeautifulSoup as bs
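
# Default search parameters. The prompts below can override the zip code,
# radius, and terms; the terms are used as HiBid category slugs in the URL.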
miles = "500"
zipCode = "78414"
searchTerms = ['cars', 'trucks', 'trucks---rvs', 'trailers']
additionalSearch = ""
confirmation = "n"
pageIterator = 1  # current results page within a category
termIterator = 0  # index into searchTerms for the category being scraped
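
# Interactive configuration: keep prompting until the user confirms the search.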
while confirmation.lower() != "y":
    zipCode = input("Zip Code: (Default 78414) Press Enter to keep ") or "78414"
    miles = input("Distance: (Default 500) Press Enter to keep ") or "500"
    additionalSearch = input("Add Search Terms, comma separated: \n(Default covers most vehicles and trailers): ")
    if additionalSearch != "":
        for word in additionalSearch.split(","):
            searchTerms.append(word.strip())
    print("Searching with zipcode {}, distance {}, and terms {}\n\n"
          "Do you wish to continue? (y/n)".format(zipCode, miles, searchTerms))
    confirmation = input()
with open('texas_hibid_output.csv', 'w', newline='') as file:
    filewriter = csv.writer(file, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    filewriter.writerow(['Title', 'Quantity', 'Time Left', 'High Bid', 'Bid Count',
                         'URL', 'Company', 'City', 'State', 'Shipping?'])
    # Add headers so that user-agent is not null and the connection is not rejected
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    urllib.request.install_opener(opener)
    while True:
        url = ('https://texas.hibid.com/lots/{}/?status=open&zip={}&miles={}'
               '&apage={}&ipp=100').format(searchTerms[termIterator], zipCode,
                                           miles, pageIterator)
        response = urllib.request.urlopen(url)
        source = response.read()
        soup = bs(source, 'lxml')
        # The lot data is embedded in a <script> tag as "var lotModels = [...];"
        lots = soup.find_all("script", string=lambda text: text and "var lotModels" in text)
        result = [lot.get_text() for lot in lots]
        # Fix malformed JSON from the website so the JSON parser can handle it:
        # strip the JavaScript prefix and trailing characters around the array
        # (the offsets match the page's current formatting), then break the
        # records onto separate lines
        result = result[0][20:-4]
        result = re.sub(r"[}],\s*[{]", "},\n{\n", result)
        toJSON = json.loads(result)
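        # Each entry in toJSON is expected to carry at least the fields read
        # below, roughly (shape inferred from the accesses in this script):
        #   {"lead": ..., "quantity": ..., "eventItemId": ..., "companyName": ...,
        #    "auctionCity": ..., "auctionState": ..., "shippingOffered": ...,
        #    "lotStatus": {"timeLeft": ..., "highBid": ..., "bidCount": ...}}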
        # An empty result means the current category has no more open lots
        if not toJSON:
            print("-----{} category exhausted-----\n".format(searchTerms[termIterator].upper()))
            termIterator += 1  # Go to next term
            pageIterator = 1   # Reset page count to 1
            if termIterator == len(searchTerms):
                print("Search complete, csv file created in local directory")  # TODO
                break
            continue
        for item in toJSON:
            filewriter.writerow([item['lead'], item['quantity'],
                                 item['lotStatus']['timeLeft'],
                                 item['lotStatus']['highBid'],
                                 item['lotStatus']['bidCount'],
                                 "https://texas.hibid.com/lot/" + str(item['eventItemId']),
                                 str(item['companyName']),
                                 item['auctionCity'], item['auctionState'],
                                 item['shippingOffered']])
            # print("Title: {} ".format(item['lead']), end=" ")
            # print("High Bid: {}".format(item['lotStatus']['highBid']), end=" ")
            # print("Time Left: {}\n".format(item['lotStatus']['timeLeft']))
            # print("https://texas.hibid.com/lot/{}\n".format(item['eventItemId']))
print("{}".format(searchTerms[termIterator]).upper() + " Page: {} complete".format(pageIterator))
pageIterator += 1 # Go to next page
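
# A minimal sketch of the year filter the TODO above mentions, assuming the
# model year appears as a four-digit number in the lot title (item['lead']);
# the helper and its year range are illustrative only and are not wired into
# the scraping loop above:
def title_year(title):
    """Return the first plausible vehicle year found in a lot title, else None."""
    match = re.search(r"\b(19[5-9]\d|20[0-4]\d)\b", title)
    return int(match.group(1)) if match else None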