Skip to content

Commit

Permalink
Fixes #455 Add Image/Video search support for Parsijoo (#449)
Browse files Browse the repository at this point in the history
* Addresses #320 and #321 Add Image/Video search support for Parsijoo

* Add staticmethod decorator

* Add staticmethod decorator

* Add staticmethod decorator

* Add staticmethod decorator

* Add staticmethod decorator

* Add staticmethod decorator

* Add staticmethod decorator

* Add staticmethod decorator

* Add staticmethod decorator

* Add staticmethod decorator

* Add tests
  • Loading branch information
bhaveshAn authored and mariobehling committed Jan 29, 2018
1 parent 659dee4 commit 70a7e96
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ install:
- pip install -r requirements.txt

before_script:
- flake8 . --count --max-complexity=16 --show-source --statistics
- flake8 . --count --max-complexity=16 --show-source --statistics --max-line-length=100

script:
- python -m app.server > /dev/null &
Expand Down
4 changes: 2 additions & 2 deletions app/scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ def feed_gen(query, engine, count=10, qtype=''):
engine = old_names.get(engine, engine)
if engine in ('quora', 'youtube'):
urls = scrapers[engine].search_without_count(query)
elif engine in ('bing',) and qtype == 'vid':
elif engine in ('bing', 'parsijoo') and qtype == 'vid':
urls = scrapers[engine].video_search_without_count(query)
elif engine in ('bing',) and qtype == 'isch':
elif engine in ('bing', 'parsijoo') and qtype == 'isch':
urls = scrapers[engine].image_search_without_count(query)
elif engine in ('ask',) and qtype == 'vid':
urls = scrapers[engine].video_search(query, count, qtype)
Expand Down
15 changes: 12 additions & 3 deletions app/scrapers/generalized.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,21 +133,30 @@ def video_search_without_count(self, query):
Returns: list
"""
urls = []
if self.name in ['bing']:
if self.name in ['parsijoo']:
url = self.videoURL
payload = {self.queryKey: query}
elif self.name in ['bing']:
url = self.videoURL
payload = {self.queryKey: query, self.videoKey: 'HDRSC3'}
response = requests.get(url, headers=self.headers, params=payload)
soup = BeautifulSoup(response.text, 'html.parser')
urls = self.parse_video_response(soup)
return urls
if len(urls) == 0:
return "No video with this Keyword"
else:
return urls

def image_search_without_count(self, query):
"""
Search for the query and return set of urls
Returns: list
"""
urls = []
if self.name in ['bing']:
if self.name in ['parsijoo']:
url = self.imageURL
payload = {self.queryKey: query}
elif self.name in ['bing']:
url = self.imageURL
payload = {self.queryKey: query, self.imageKey: 'HDRSC2'}
response = requests.get(url, headers=self.headers, params=payload)
Expand Down
42 changes: 42 additions & 0 deletions app/scrapers/parsijoo.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ class Parsijoo(Scraper):
def __init__(self):
Scraper.__init__(self)
self.url = 'https://parsijoo.ir/web'
self.imageURL = 'https://image.parsijoo.ir/image'
self.videoURL = 'https://video.parsijoo.ir/video'
self.defaultStart = 0
self.startKey = 'co'
self.name = 'parsijoo'
Expand All @@ -29,3 +31,43 @@ def parse_response(soup):
print('Parsijoo parsed: ' + str(urls))

return urls

@staticmethod
def parse_video_response(soup):
    """ Parse a Parsijoo video results page and return the videos found.

    Every ``<a>`` tag matched by class ``over-page`` is treated as one
    result. Its ``title`` attribute becomes the title and its ``href`` is
    appended to the video host to build the link.

    Args:
        soup: parsed results page; must expose ``findAll``.

    Returns: urls (list)
        [{'title': title1, 'link': url1}, {'title': title2, 'link': url2}, ...]
    """
    urls = []
    for a in soup.findAll('a', attrs={'class': 'over-page'}):
        href = a.get('href')
        if href is None:
            # No target to link to; concatenating None onto the host below
            # would raise TypeError and abort the whole parse.
            continue
        urls.append({
            'title': a.get('title'),
            'link': 'https://video.parsijoo.ir' + href
        })

    print('Parsijoo parsed: ' + str(urls))

    return urls

@staticmethod
def parse_image_response(soup):
    """ Parse a Parsijoo image results page and return the image links.

    Each ``<div>`` matched by class ``image-container overflow`` is treated
    as one result. The ``href`` of the first ``<a>`` inside it is appended
    to the image host to build the link.

    Args:
        soup: parsed results page; must expose ``findAll``.

    Returns: urls (list)
        [{'link': url1}, {'link': url2}, ...]
    """
    urls = []
    for div in \
            soup.findAll('div', attrs={'class': 'image-container overflow'}):
        a = div.find('a')
        # find() yields None when the container holds no anchor, and get()
        # yields None when the anchor has no href; either case would raise
        # below, so such containers are skipped.
        href = a.get('href') if a is not None else None
        if href is None:
            continue
        urls.append({
            'link': 'https://image.parsijoo.ir' + href
        })

    print('Parsijoo parsed: ' + str(urls))

    return urls
25 changes: 25 additions & 0 deletions test/test_parsijoo.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,28 @@ def test_parse_response():
}]
resp = Parsijoo().parse_response(dummy_soup)
assert resp == expected_resp


def test_parse_video_response():
    """A single ``over-page`` anchor yields one title/link dict."""
    markup = """<a href="mock_url" class="over-page"
title="mock_title">mock_title</a>"""
    parsed_page = BeautifulSoup(markup, 'html.parser')
    # The parser is expected to prefix the href with the video host.
    wanted = [{
        'title': u'mock_title',
        'link': 'https://video.parsijoo.ir' + "mock_url",
    }]
    assert Parsijoo().parse_video_response(parsed_page) == wanted


def test_parse_image_response():
    """A single image container with an anchor yields one link dict."""
    markup = """<div class="image-container overflow"><a href="mock_url"
title="mock_title">mock_title</a></div>"""
    parsed_page = BeautifulSoup(markup, 'html.parser')
    # The parser is expected to prefix the href with the image host.
    wanted = [{
        'link': 'https://image.parsijoo.ir' + 'mock_url',
    }]
    assert Parsijoo().parse_image_response(parsed_page) == wanted

0 comments on commit 70a7e96

Please sign in to comment.