diff --git a/README.md b/README.md
index 46b95cb..848163c 100644
--- a/README.md
+++ b/README.md
@@ -26,19 +26,21 @@ for the data thats extracted from the pages.
 The basic example below is equivalent to scrapy's main example although
 it not only scrapes the author's name but its complete description that stays
 a layer down from the quotes's pages.
 
-~~~python
+Miners inherit from Python's list class, so they can be used to accumulate data from the pages. They can also be placed anywhere,
+which makes it highly flexible to build JSON structures out of the fetched data.
+~~~python
 from sukhoi import Miner, core
 
 class AuthorMiner(Miner):
     def run(self, dom):
         elem = dom.fst('div', ('class', 'author-description'))
-        self.pool.append(elem.text())
+        self.append(elem.text())
 
 class QuoteMiner(Miner):
     def run(self, dom):
         elems = dom.find('div', ('class', 'quote'))
-        self.pool.extend(map(self.extract_quote, elems))
+        self.extend(map(self.extract_quote, elems))
 
         elem = dom.fst('li', ('class', 'next'))
         if elem: self.next(elem.fst('a').attr['href'])
@@ -55,7 +57,7 @@ if __name__ == '__main__':
     quotes = QuoteMiner(URL)
     core.gear.mainloop()
 
-    print repr(quotes.pool)
+    print quotes
 
 ~~~
 
@@ -86,12 +88,12 @@ from sukhoi import Miner, core
 class AuthorMiner(Miner):
     def run(self, dom):
         elem = dom.fst('div', ('class', 'author-description'))
-        self.pool.append(elem.text())
+        self.append(elem.text())
 
 class QuoteMiner(Miner):
     def run(self, dom):
         elems = dom.find('div', ('class', 'quote'))
-        self.pool.extend(map(self.extract_quote, elems))
+        self.extend(map(self.extract_quote, elems))
 
         elem = dom.fst('li', ('class', 'next'))
         if elem: self.next(elem.fst('a').attr['href'])
@@ -120,7 +122,7 @@ class TagMiner(Miner):
         self.extract_quotes()
 
     def extract_quotes(self):
-        self.pool.extend(map(lambda ind: (ind[0],
+        self.extend(map(lambda ind: (ind[0],
             QuoteMiner(self.geturl(ind[1]))), self.acc))
 
 if __name__ == '__main__':
@@ -128,8 +130,7 @@ if __name__ == '__main__':
     tags = TagMiner(URL)
     core.gear.mainloop()
 
-    print repr(tags.pool)
-
+    print tags
 ~~~
 
 The structure would look like:
@@ -153,3 +154,4 @@ pip2 install sukhoi
 
 
 
+
diff --git a/demo/by_tags.py b/demo/by_tags.py
index 43e5337..cc46856 100644
--- a/demo/by_tags.py
+++ b/demo/by_tags.py
@@ -7,12 +7,12 @@ class AuthorMiner(Miner):
 
     def run(self, dom):
         elem = dom.fst('div', ('class', 'author-description'))
-        self.pool.append(elem.text())
+        self.append(elem.text())
 
 class QuoteMiner(Miner):
     def run(self, dom):
         elems = dom.find('div', ('class', 'quote'))
-        self.pool.extend(map(self.extract_quote, elems))
+        self.extend(map(self.extract_quote, elems))
 
         elem = dom.fst('li', ('class', 'next'))
         if elem: self.next(elem.fst('a').attr['href'])
@@ -41,7 +41,7 @@ def run(self, dom):
         self.extract_quotes()
 
     def extract_quotes(self):
-        self.pool.extend(map(lambda ind: (ind[0],
+        self.extend(map(lambda ind: (ind[0],
             QuoteMiner(self.geturl(ind[1]))), self.acc))
 
 if __name__ == '__main__':
@@ -49,7 +49,8 @@ def extract_quotes(self):
     tags = TagMiner(URL)
     core.gear.mainloop()
 
-    print repr(tags.pool)
+    print tags
 
 
 
+
diff --git a/demo/quotes.py b/demo/quotes.py
index b20f8d8..236aba9 100644
--- a/demo/quotes.py
+++ b/demo/quotes.py
@@ -7,12 +7,12 @@ class AuthorMiner(Miner):
 
     def run(self, dom):
         elem = dom.fst('div', ('class', 'author-description'))
-        self.pool.append(elem.text())
+        self.append(elem.text())
 
 class QuoteMiner(Miner):
     def run(self, dom):
         elems = dom.find('div', ('class', 'quote'))
-        self.pool.extend(map(self.extract_quote, elems))
+        self.extend(map(self.extract_quote, elems))
 
         elem = dom.fst('li', ('class', 'next'))
         if elem: self.next(elem.fst('a').attr['href'])
@@ -29,5 +29,6 @@ def extract_quote(self, elem):
     quotes = QuoteMiner(URL)
     core.gear.mainloop()
 
-    print repr(quotes.pool)
+    print quotes
 
+
diff --git a/setup.py b/setup.py
index 3740788..2338f1b 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 from distutils.core import setup
 
 setup(name="sukhoi",
-      version="0.0.2",
+      version="0.0.3",
       py_modules=["sukhoi"],
       author="Iury O. G. Figueiredo",
       author_email="ioliveira@id.uff.br",
@@ -36,5 +36,6 @@
 
 
 
 
 
+
diff --git a/sukhoi.py b/sukhoi.py
index a4166f5..4557fc5 100644
--- a/sukhoi.py
+++ b/sukhoi.py
@@ -26,7 +26,6 @@ def install_handles(self, con):
         self.miner.task.add(con, LOST)
 
     def on_success(self, con, response):
-
         self.miner.build_dom(response)
 
     def on_redirect(self, con, response):
@@ -50,7 +49,7 @@ def on_redirect(self, con, response):
         self.install_handles(con)
 
 
-class Miner(object):
+class Miner(list):
     html = Html()
     task = Task()
     task.add_map(DONE, lambda task: die())
@@ -58,8 +57,7 @@ class Miner(object):
 
     def __init__(self, url, pool=None, max_depth=10,
     headers=HEADERS, method='get', payload={}, auth=()):
-
-        self.pool = pool if pool != None else []
+        self.pool = pool
         self.url = url
         self.urlparser = urlparse(url)
         self.max_depth = max_depth
@@ -70,6 +68,13 @@ def __init__(self, url, pool=None, max_depth=10,
         self.encoding = 'utf-8'
         self.response = None
 
+        super(list, self).__init__()
+        self.expand()
+
+    def expand(self):
+        """
+        No exception being raised.
+        """
         try:
             self.create_connection()
         except Exception as excpt:
@@ -115,11 +120,12 @@ def geturl(self, reference):
         return url
 
     def next(self, reference):
-        url = self.geturl(reference)
-        self.__class__(url, self.pool, self.max_depth)
+        self.url = self.geturl(reference)
+        self.expand()
+        # self.__class__(url, self.pool, self.max_depth)
 
-    def __repr__(self):
-        return str(self.pool)
+    # def __repr__(self):
+    #     return str(self.pool)
 
     def run(self, dom):
         """
@@ -130,3 +136,4 @@
 
 
 
+
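
Below is a minimal sketch (not part of the diff) of how the list-based Miner introduced by this change is meant to be used to build nested, JSON-like structures. It only relies on the API visible in the hunks above (Miner, core.gear.mainloop, dom.fst/dom.find, geturl, append/extend, next); the start URL, the CSS-class selectors and the body of extract_quote are assumptions modelled on scrapy's quotes example, and the print statement assumes Python 2 as in the rest of the repository.

~~~python
# Sketch only: the selectors and the extract_quote body below are assumptions
# modelled on scrapy's quotes example (quotes.toscrape.com), not code taken
# from this repository. Python 2 syntax, matching the rest of the project.
from sukhoi import Miner, core

class AuthorMiner(Miner):
    def run(self, dom):
        # The miner is itself a list, so scraped text is appended directly to it.
        elem = dom.fst('div', ('class', 'author-description'))
        self.append(elem.text())

class QuoteMiner(Miner):
    def run(self, dom):
        elems = dom.find('div', ('class', 'quote'))
        # Each item pairs a quote with a nested AuthorMiner; being a list
        # itself, the nested miner fills up once the mainloop fetches its page.
        self.extend(map(self.extract_quote, elems))

        elem = dom.fst('li', ('class', 'next'))
        if elem: self.next(elem.fst('a').attr['href'])

    def extract_quote(self, elem):
        # Assumed markup: a span holding the quote text and an anchor
        # pointing to the author's page.
        quote = elem.fst('span', ('class', 'text'))
        author_ref = elem.fst('a').attr['href']
        return {'quote': quote.text(),
                'author': AuthorMiner(self.geturl(author_ref))}

if __name__ == '__main__':
    URL = 'http://quotes.toscrape.com/'  # assumed start page
    quotes = QuoteMiner(URL)
    core.gear.mainloop()

    # quotes is now a plain list of dicts whose 'author' values are
    # AuthorMiner lists, i.e. a nested, JSON-like structure.
    print quotes
~~~

Because AuthorMiner instances are ordinary list subclasses, the finished structure can be passed as-is to json.dumps when JSON output is wanted.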