Commit

Addresses fossasia#320 and fossasia#321: Add Image/Video search support for Parsijoo
bhaveshAn committed Jan 20, 2018
1 parent 24f17b1 commit 62e7735
Showing 5 changed files with 78 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -9,7 +9,7 @@ install:
   - pip install -r requirements.txt

 before_script:
-  - flake8 . --count --max-complexity=15 --show-source --statistics
+  - flake8 . --count --max-complexity=16 --show-source --statistics --max-line-length=100
 script:
   - python -m app.server > /dev/null &
   - pytest --cov=./
4 changes: 4 additions & 0 deletions app/scrapers/__init__.py
@@ -42,6 +42,10 @@ def feed_gen(query, engine, count=10, qtype=''):
     engine = old_names.get(engine, engine)
     if engine in ('quora', 'youtube'):
         urls = scrapers[engine].search_without_count(query)
+    elif engine == 'parsijoo' and qtype == 'isch':
+        urls = scrapers[engine].image_search_without_count(query)
+    elif engine == 'parsijoo' and qtype == 'vid':
+        urls = scrapers[engine].video_search_without_count(query)
     else:
         urls = scrapers[engine].search(query, count, qtype)
     return urls
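
A quick sketch of how this routing is exercised (hypothetical usage, not part of the commit; it assumes the package is importable from the repository root, and the query string is arbitrary):

# Hypothetical usage of the new qtype routing in feed_gen.
# 'isch' selects image search, 'vid' selects video search;
# any other qtype falls through to the regular search() path.
from app.scrapers import feed_gen

images = feed_gen('berlin', 'parsijoo', qtype='isch')
videos = feed_gen('berlin', 'parsijoo', qtype='vid')
web = feed_gen('berlin', 'parsijoo', count=5)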
31 changes: 31 additions & 0 deletions app/scrapers/generalized.py
@@ -81,3 +81,34 @@ def search_without_count(self, query):
         soup = BeautifulSoup(response.text, 'html.parser')
         urls = self.parse_response(soup)
         return urls
+
+    def video_search_without_count(self, query):
+        """
+        Search for the query and return a list of video result urls.
+        Returns: a list of dicts, or a message string if no videos are found
+        """
+        urls = []
+        if self.name in ['parsijoo']:
+            url = self.videoURL
+            payload = {self.queryKey: query}
+            response = requests.get(url, headers=self.headers, params=payload)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            urls = self.parse_video_response(soup)
+        if urls == []:
+            return "No video with this Keyword"
+        else:
+            return urls
+
+    def image_search_without_count(self, query):
+        """
+        Search for the query and return a list of image result urls.
+        Returns: a list of dicts
+        """
+        urls = []
+        if self.name in ['parsijoo']:
+            url = self.imageURL
+            payload = {self.queryKey: query}
+            response = requests.get(url, headers=self.headers, params=payload)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            urls = self.parse_image_response(soup)
+        return urls
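
For illustration, a minimal sketch of calling the two new methods directly (hypothetical usage; it assumes the scrapers registry exported by app/scrapers/__init__.py and live network access to Parsijoo):

# Sketch: direct calls to the new methods on the registered Parsijoo scraper.
from app.scrapers import scrapers

parsijoo = scrapers['parsijoo']

videos = parsijoo.video_search_without_count('music')
# Either a list of {'title': ..., 'link': ...} dicts, or the string
# "No video with this Keyword" when nothing was found.

images = parsijoo.image_search_without_count('music')
# A list of {'link': ...} dicts; image results carry no 'title' key.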
39 changes: 39 additions & 0 deletions app/scrapers/parsijoo.py
@@ -8,6 +8,8 @@ class Parsijoo(Scraper):
     def __init__(self):
         Scraper.__init__(self)
         self.url = 'https://parsijoo.ir/web'
+        self.imageURL = 'https://image.parsijoo.ir/image'
+        self.videoURL = 'https://video.parsijoo.ir/video'
         self.defaultStart = 0
         self.startKey = 'co'
         self.name = 'parsijoo'
@@ -28,3 +30,40 @@ def parse_response(self, soup):
         print('Parsijoo parsed: ' + str(urls))

         return urls
+
+    def parse_video_response(self, soup):
+        """ Parse response and return the urls
+        Returns: urls (list)
+            [{'title': title1, 'link': url1}, {'title': title2, 'link': url2}, ...]
+        """
+        urls = []
+        for a in soup.findAll('a', attrs={'class': 'over-page'}):
+            title = a.get('title')
+            url = self.videoURL + a.get('href')
+            urls.append({
+                'title': title,
+                'link': url
+            })
+
+        print('Parsijoo parsed: ' + str(urls))
+
+        return urls
+
+    def parse_image_response(self, soup):
+        """ Parse response and return the urls
+        Returns: urls (list)
+            [{'link': url1}, {'link': url2}, ...]
+        """
+        urls = []
+        for div in soup.findAll('div', attrs={'class': 'image-container overflow'}):
+            a = div.find('a')
+            url = 'https://image.parsijoo.ir' + a.get('href')
+            urls.append({
+                'link': url
+            })
+
+        print('Parsijoo parsed: ' + str(urls))
+
+        return urls
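
To make the parser's output shape concrete, a self-contained sketch against invented markup (only the CSS class and the URL-joining logic come from this diff; the sample HTML is hypothetical):

# Sketch: parse_image_response on hand-written HTML that mimics
# Parsijoo's result markup. The sample markup itself is invented.
from bs4 import BeautifulSoup

from app.scrapers.parsijoo import Parsijoo

sample = '''
<div class="image-container overflow"><a href="/image?id=1"></a></div>
<div class="image-container overflow"><a href="/image?id=2"></a></div>
'''

soup = BeautifulSoup(sample, 'html.parser')
print(Parsijoo().parse_image_response(soup))
# -> [{'link': 'https://image.parsijoo.ir/image?id=1'},
#     {'link': 'https://image.parsijoo.ir/image?id=2'}]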
5 changes: 3 additions & 2 deletions app/server.py
@@ -7,7 +7,7 @@
 from flask import (Flask, Response, abort, jsonify, make_response,
                    render_template, request)

-from app.scrapers import feed_gen, scrapers
+from scrapers import feed_gen, scrapers

 DISABLE_CACHE = True # Temporarily disable the MongoDB cache
 if DISABLE_CACHE:
@@ -77,7 +77,8 @@ def search(search_engine):
         unicode # unicode is undefined in Python 3 so NameError is raised
         for line in result:
             line['link'] = line['link'].encode('utf-8')
-            line['title'] = line['title'].encode('utf-8')
+            if 'title' in line:
+                line['title'] = line['title'].encode('utf-8')
             if 'desc' in line:
                 line['desc'] = line['desc'].encode('utf-8')
     except NameError:
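
The new 'title' guard is needed because image results from parse_image_response carry only a 'link' key. A short sketch of the failure it prevents (the sample dicts are illustrative, Python 2 semantics):

# Illustrative: under Python 2, the unguarded line['title'].encode('utf-8')
# raised KeyError for image results, which have no 'title' key.
result = [
    {'title': u'a video', 'link': u'https://video.parsijoo.ir/v?id=1'},
    {'link': u'https://image.parsijoo.ir/image?id=2'},  # image hit: no 'title'
]
for line in result:
    line['link'] = line['link'].encode('utf-8')
    if 'title' in line:  # the guard added in this commit
        line['title'] = line['title'].encode('utf-8')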
