Skip to content

Commit

Permalink
Improving scheme.
Browse files Browse the repository at this point in the history
  • Loading branch information
Iury O. G. Figueiredo committed Jul 6, 2017
1 parent 791a691 commit 6943202
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 25 deletions.
20 changes: 11 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,21 @@ for the data thats extracted from the pages.
The basic example below is equivalent to scrapy's main example, although it scrapes not only the author's name
but also the author's complete description, which sits one layer down from the quotes' pages.

~~~python
Miners inherit from Python's list class, so they can be used to accumulate data from the pages. They can be placed anywhere too, which makes
it highly flexible to construct JSON structures for your fetched data.

~~~python
from sukhoi import Miner, core

class AuthorMiner(Miner):
def run(self, dom):
elem = dom.fst('div', ('class', 'author-description'))
self.pool.append(elem.text())
self.append(elem.text())

class QuoteMiner(Miner):
def run(self, dom):
elems = dom.find('div', ('class', 'quote'))
self.pool.extend(map(self.extract_quote, elems))
self.extend(map(self.extract_quote, elems))

elem = dom.fst('li', ('class', 'next'))
if elem: self.next(elem.fst('a').attr['href'])
Expand All @@ -55,7 +57,7 @@ if __name__ == '__main__':
quotes = QuoteMiner(URL)
core.gear.mainloop()

print repr(quotes.pool)
print quotes

~~~

Expand Down Expand Up @@ -86,12 +88,12 @@ from sukhoi import Miner, core
class AuthorMiner(Miner):
def run(self, dom):
elem = dom.fst('div', ('class', 'author-description'))
self.pool.append(elem.text())
self.append(elem.text())

class QuoteMiner(Miner):
def run(self, dom):
elems = dom.find('div', ('class', 'quote'))
self.pool.extend(map(self.extract_quote, elems))
self.extend(map(self.extract_quote, elems))

elem = dom.fst('li', ('class', 'next'))
if elem: self.next(elem.fst('a').attr['href'])
Expand Down Expand Up @@ -120,16 +122,15 @@ class TagMiner(Miner):
self.extract_quotes()

def extract_quotes(self):
self.pool.extend(map(lambda ind: (ind[0],
self.extend(map(lambda ind: (ind[0],
QuoteMiner(self.geturl(ind[1]))), self.acc))

if __name__ == '__main__':
URL = 'http://quotes.toscrape.com/'
tags = TagMiner(URL)
core.gear.mainloop()

print repr(tags.pool)

print tags
~~~

The structure would look like:
Expand All @@ -153,3 +154,4 @@ pip2 install sukhoi




9 changes: 5 additions & 4 deletions demo/by_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
class AuthorMiner(Miner):
def run(self, dom):
elem = dom.fst('div', ('class', 'author-description'))
self.pool.append(elem.text())
self.append(elem.text())

class QuoteMiner(Miner):
def run(self, dom):
elems = dom.find('div', ('class', 'quote'))
self.pool.extend(map(self.extract_quote, elems))
self.extend(map(self.extract_quote, elems))

elem = dom.fst('li', ('class', 'next'))
if elem: self.next(elem.fst('a').attr['href'])
Expand Down Expand Up @@ -41,15 +41,16 @@ def run(self, dom):
self.extract_quotes()

def extract_quotes(self):
self.pool.extend(map(lambda ind: (ind[0],
self.extend(map(lambda ind: (ind[0],
QuoteMiner(self.geturl(ind[1]))), self.acc))

if __name__ == '__main__':
URL = 'http://quotes.toscrape.com/'
tags = TagMiner(URL)
core.gear.mainloop()

print repr(tags.pool)
print tags




7 changes: 4 additions & 3 deletions demo/quotes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
class AuthorMiner(Miner):
def run(self, dom):
elem = dom.fst('div', ('class', 'author-description'))
self.pool.append(elem.text())
self.append(elem.text())

class QuoteMiner(Miner):
def run(self, dom):
elems = dom.find('div', ('class', 'quote'))
self.pool.extend(map(self.extract_quote, elems))
self.extend(map(self.extract_quote, elems))

elem = dom.fst('li', ('class', 'next'))
if elem: self.next(elem.fst('a').attr['href'])
Expand All @@ -29,5 +29,6 @@ def extract_quote(self, elem):
quotes = QuoteMiner(URL)
core.gear.mainloop()

print repr(quotes.pool)
print quotes


3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from distutils.core import setup
setup(name="sukhoi",
version="0.0.2",
version="0.0.3",
py_modules=["sukhoi"],
author="Iury O. G. Figueiredo",
author_email="[email protected]",
Expand Down Expand Up @@ -36,5 +36,6 @@






23 changes: 15 additions & 8 deletions sukhoi.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ def install_handles(self, con):
self.miner.task.add(con, LOST)

def on_success(self, con, response):

self.miner.build_dom(response)

def on_redirect(self, con, response):
Expand All @@ -50,16 +49,15 @@ def on_redirect(self, con, response):

self.install_handles(con)

class Miner(object):
class Miner(list):
html = Html()
task = Task()
task.add_map(DONE, lambda task: die())
task.start()

def __init__(self, url, pool=None, max_depth=10,
headers=HEADERS, method='get', payload={}, auth=()):

self.pool = pool if pool != None else []
self.pool = pool
self.url = url
self.urlparser = urlparse(url)
self.max_depth = max_depth
Expand All @@ -70,6 +68,13 @@ def __init__(self, url, pool=None, max_depth=10,
self.encoding = 'utf-8'
self.response = None

super(list, self).__init__()
self.expand()

def expand(self):
"""
No exception being raised.
"""
try:
self.create_connection()
except Exception as excpt:
Expand Down Expand Up @@ -115,11 +120,12 @@ def geturl(self, reference):
return url

def next(self, reference):
url = self.geturl(reference)
self.__class__(url, self.pool, self.max_depth)
self.url = self.geturl(reference)
self.expand()
# self.__class__(url, self.pool, self.max_depth)

def __repr__(self):
return str(self.pool)
# def __repr__(self):
# return str(self.pool)

def run(self, dom):
"""
Expand All @@ -130,3 +136,4 @@ def run(self, dom):




0 comments on commit 6943202

Please sign in to comment.