From 70a7e967b711a89326e7fb46cb7339a13d3bfba5 Mon Sep 17 00:00:00 2001 From: Bhavesh Anand Date: Mon, 29 Jan 2018 21:23:59 +0530 Subject: [PATCH] Fixes #455 Add Image/Video search support for Parsijoo (#449) * Addresses #320 and #321 Add Image/Video search support for Parsijoo * Add staticmethod decorator * Add staticmethod decorator * Add staticmethod decorator * Add staticmethod decorator * Add staticmethod decorator * Add staticmethod decorator * Add staticmethod decorator * Add staticmethod decorator * Add staticmethod decorator * Add staticmethod decorator * Add tests --- .travis.yml | 2 +- app/scrapers/__init__.py | 4 ++-- app/scrapers/generalized.py | 15 ++++++++++--- app/scrapers/parsijoo.py | 42 +++++++++++++++++++++++++++++++++++++ test/test_parsijoo.py | 25 ++++++++++++++++++++++ 5 files changed, 82 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 45250049..3772ca0c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ install: - pip install -r requirements.txt before_script: - - flake8 . --count --max-complexity=16 --show-source --statistics + - flake8 . 
--count --max-complexity=16 --show-source --statistics --max-line-length=100 script: - python -m app.server > /dev/null & diff --git a/app/scrapers/__init__.py b/app/scrapers/__init__.py index 18e09569..190bf86a 100644 --- a/app/scrapers/__init__.py +++ b/app/scrapers/__init__.py @@ -42,9 +42,9 @@ def feed_gen(query, engine, count=10, qtype=''): engine = old_names.get(engine, engine) if engine in ('quora', 'youtube'): urls = scrapers[engine].search_without_count(query) - elif engine in ('bing',) and qtype == 'vid': + elif engine in ('bing', 'parsijoo') and qtype == 'vid': urls = scrapers[engine].video_search_without_count(query) - elif engine in ('bing',) and qtype == 'isch': + elif engine in ('bing', 'parsijoo') and qtype == 'isch': urls = scrapers[engine].image_search_without_count(query) elif engine in ('ask',) and qtype == 'vid': urls = scrapers[engine].video_search(query, count, qtype) diff --git a/app/scrapers/generalized.py b/app/scrapers/generalized.py index a51a1546..204533a0 100644 --- a/app/scrapers/generalized.py +++ b/app/scrapers/generalized.py @@ -133,13 +133,19 @@ def video_search_without_count(self, query): Returns: list """ urls = [] - if self.name in ['bing']: + if self.name in ['parsijoo']: + url = self.videoURL + payload = {self.queryKey: query} + elif self.name in ['bing']: url = self.videoURL payload = {self.queryKey: query, self.videoKey: 'HDRSC3'} response = requests.get(url, headers=self.headers, params=payload) soup = BeautifulSoup(response.text, 'html.parser') urls = self.parse_video_response(soup) - return urls + if len(urls) == 0: + return "No video with this Keyword" + else: + return urls def image_search_without_count(self, query): """ @@ -147,7 +153,10 @@ def image_search_without_count(self, query): Returns: list """ urls = [] - if self.name in ['bing']: + if self.name in ['parsijoo']: + url = self.imageURL + payload = {self.queryKey: query} + elif self.name in ['bing']: url = self.imageURL payload = {self.queryKey: query, 
self.imageKey: 'HDRSC2'} response = requests.get(url, headers=self.headers, params=payload) diff --git a/app/scrapers/parsijoo.py b/app/scrapers/parsijoo.py index 54d916b3..69e5e40a 100644 --- a/app/scrapers/parsijoo.py +++ b/app/scrapers/parsijoo.py @@ -8,6 +8,8 @@ class Parsijoo(Scraper): def __init__(self): Scraper.__init__(self) self.url = 'https://parsijoo.ir/web' + self.imageURL = 'https://image.parsijoo.ir/image' + self.videoURL = 'https://video.parsijoo.ir/video' self.defaultStart = 0 self.startKey = 'co' self.name = 'parsijoo' @@ -29,3 +31,43 @@ def parse_response(soup): print('Parsijoo parsed: ' + str(urls)) return urls + + @staticmethod + def parse_video_response(soup): + """ Parse response and returns the urls + + Returns: urls (list) + [[Tile1, url1], [Title2, url2], ...] + """ + urls = [] + for a in soup.findAll('a', attrs={'class': 'over-page'}): + title = a.get('title') + url = 'https://video.parsijoo.ir' + a.get('href') + urls.append({ + 'title': title, + 'link': url + }) + + print('Parsijoo parsed: ' + str(urls)) + + return urls + + @staticmethod + def parse_image_response(soup): + """ Parse response and returns the urls + + Returns: urls (list) + [[url1], [url2], ...] 
+ """ + urls = [] + for div in \ + soup.findAll('div', attrs={'class': 'image-container overflow'}): + a = div.find('a') + url = 'https://image.parsijoo.ir' + a.get('href') + urls.append({ + 'link': url + }) + + print('Parsijoo parsed: ' + str(urls)) + + return urls diff --git a/test/test_parsijoo.py b/test/test_parsijoo.py index e89b5387..8682aedf 100644 --- a/test/test_parsijoo.py +++ b/test/test_parsijoo.py @@ -20,3 +20,28 @@ def test_parse_response(): }] resp = Parsijoo().parse_response(dummy_soup) assert resp == expected_resp + + +def test_parse_video_response(): + html_text = """<a class="over-page" title="mock_title" href="mock_url">mock_title</a>""" + dummy_soup = BeautifulSoup(html_text, 'html.parser') + url = 'https://video.parsijoo.ir' + "mock_url" + expected_resp = [{ + 'title': u'mock_title', + 'link': url, + }] + resp = Parsijoo().parse_video_response(dummy_soup) + assert resp == expected_resp + + +def test_parse_image_response(): + html_text = """
<div class="image-container overflow"><a href="mock_url">mock_title</a></div>
""" + dummy_soup = BeautifulSoup(html_text, 'html.parser') + image_url = 'https://image.parsijoo.ir' + 'mock_url' + expected_resp = [{ + 'link': image_url, + }] + resp = Parsijoo().parse_image_response(dummy_soup) + assert resp == expected_resp