diff --git a/.ci/build.sh b/.ci/build.sh
index 2a47cf2f..01a111fb 100755
--- a/.ci/build.sh
+++ b/.ci/build.sh
@@ -4,6 +4,7 @@ set -e -x
 
 mkdir private _site public
 
+scrapy crawl gsoc
 if [[ -n "$GCI_TOKEN" ]]; then
   python manage.py fetch_gci_task_data private
   python manage.py cleanse_gci_task_data private _site
diff --git a/community/urls.py b/community/urls.py
index ba58235c..e9513fbb 100644
--- a/community/urls.py
+++ b/community/urls.py
@@ -7,6 +7,8 @@
 from django.conf import settings
 from django.views.generic import TemplateView
 
+from gsoc.views import index as gsoc_index
+from gsoc.views import projects as gsoc_projects
 from gci.views import index as gci_index
 from gci.feeds import LatestTasksFeed as gci_tasks_rss
 from activity.scraper import activity_json
@@ -51,6 +53,18 @@ def get_index():
         distill_func=get_index,
         distill_file='gci/index.html',
     ),
+    distill_url(
+        r'^gsoc/$', gsoc_index,
+        name='community-gsoc',
+        distill_func=get_index,
+        distill_file='gsoc/index.html',
+    ),
+    distill_url(
+        r'^gsoc/projects/$', gsoc_projects,
+        name='community-gsoc-projects',
+        distill_func=get_index,
+        distill_file='gsoc/projects.html',
+    ),
     distill_url(
         r'twitter/', twitter_index,
         name='twitter',
diff --git a/gsoc/config.py b/gsoc/config.py
new file mode 100644
index 00000000..aa12b99c
--- /dev/null
+++ b/gsoc/config.py
@@ -0,0 +1,29 @@
+import ruamel.yaml
+import os
+from django.conf import settings
+
+os.environ['DJANGO_SETTINGS_MODULE'] = 'community.settings'
+
+DATA_DIR = settings.STATIC_ROOT
+
+
+def load_cache(filename):
+    """
+    Load a YAML cache file from ``DATA_DIR``.
+
+    :param filename: Name of the cache file, relative to ``DATA_DIR``.
+    :return: The parsed YAML content; ``None`` for an empty file.
+    :raises FileNotFoundError: If the cache file does not exist.
+    """
+    # The cache holds text scraped from a third-party website, so it is
+    # parsed with the safe loader, which only builds plain Python types
+    # and cannot instantiate arbitrary objects.
+    with open(os.path.join(DATA_DIR, filename), 'r') as f:
+        return ruamel.yaml.YAML(typ='safe').load(f)
+
+
+def get_year():
+    """
+    Return the GSoC year whose archive is scraped and displayed.
+    """
+    return 2018 - 1
diff --git a/gsoc/data.py b/gsoc/data.py
new file mode 100644
index 00000000..b991f9a0
--- /dev/null
+++ b/gsoc/data.py
@@ -0,0 +1,38 @@
+from .config import load_cache
+
+
+_org = {}
+_projects = {}
+
+
+def get_org_data():
+    """
+    Return the cached GSoC organisation data, loading it on first use.
+
+    :return: Mapping loaded from ``gsoc_org_info.yaml``; an empty dict
+             if the file has no content.
+    :raises FileNotFoundError: If the cache file does not exist.
+    """
+    global _org
+    if not _org:
+        # An empty YAML file parses to None; normalise it to an empty
+        # dict so that callers can always treat the result as a mapping.
+        _org = load_cache('gsoc_org_info.yaml') or {}
+
+    return _org
+
+
+def get_projects_data():
+    """
+    Return the cached GSoC project data, loading it on first use.
+
+    :return: Mapping loaded from ``gsoc_project_info.yaml``; an empty
+             dict if the file has no content.
+    :raises FileNotFoundError: If the cache file does not exist.
+    """
+    global _projects
+    if not _projects:
+        # An empty YAML file parses to None; normalise it to a dict.
+        _projects = load_cache('gsoc_project_info.yaml') or {}
+
+    return _projects
diff --git a/gsoc/urls.py b/gsoc/urls.py
new file mode 100644
index 00000000..d66ea97f
--- /dev/null
+++ b/gsoc/urls.py
@@ -0,0 +1,8 @@
+from django.conf.urls import url
+
+from . import views
+
+urlpatterns = [
+    url(r'^$', views.index, name='index'),
+    url(r'^projects/$', views.projects, name='projects'),
+]
diff --git a/gsoc/views.py b/gsoc/views.py
new file mode 100644
index 00000000..f6e27606
--- /dev/null
+++ b/gsoc/views.py
@@ -0,0 +1,83 @@
+from django.shortcuts import render
+from django.http import Http404
+import logging
+
+from .data import get_org_data
+from .data import get_projects_data
+from community.git import get_owner
+from gsoc.config import get_year
+
+logger = logging.getLogger(__name__ + '.index')
+org_name = get_owner()
+year = get_year()
+
+
+def _get_org():
+    """
+    Return the scraped organisation record.
+
+    :return: The last entry of the cached organisation mapping, which is
+             the entry the views render.
+    :raises Http404: If the GSoC cache is missing or holds no data.
+    """
+    try:
+        org = get_org_data()
+    except FileNotFoundError:
+        org = None
+    if not org:
+        logger.info('GSoC data not available')
+        raise Http404
+    return list(org.values())[-1]
+
+
+def index(request):
+    """
+    Render the GSoC organisation overview page.
+
+    :param request: The incoming HTTP request.
+    :return: The rendered ``gsoc.html`` response.
+    :raises Http404: If no organisation data is available.
+    """
+    org = _get_org()
+    return render(request, 'gsoc.html', {
+        'id': org.get('id'),
+        'name': org.get('name'),
+        'tagline': org.get('tagline'),
+        'description': org.get('description'),
+        'tech': list((org.get('technologies') or {}).values()),
+    })
+
+
+def projects(request):
+    """
+    Render the list of GSoC projects of the organisation.
+
+    :param request: The incoming HTTP request.
+    :return: The rendered ``gsoc_projects.html`` response.
+    :raises Http404: If no organisation data is available.
+    """
+    org = _get_org()
+    try:
+        projects_data = get_projects_data() or {}
+    except FileNotFoundError:
+        # The organisation was scraped but no project was; show an empty
+        # list instead of failing the whole page.
+        logger.info('GSoC project data not available')
+        projects_data = {}
+
+    projects_list = [
+        {
+            'id': project.get('id'),
+            'summary': project.get('summary'),
+            'title': project.get('title'),
+            'student': project.get('student'),
+            'code': project.get('project_code'),
+            'link': project.get('project_link'),
+            'mentors': list((project.get('mentors') or {}).values()),
+        }
+        for project in projects_data.values()
+    ]
+    return render(request, 'gsoc_projects.html', {
+        'project_list': projects_list,
+        'org_name': org.get('name'),
+    })
diff --git a/gsocscrape/__init__.py b/gsocscrape/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/gsocscrape/items.py b/gsocscrape/items.py
new file mode 100644
index 00000000..ddd5df65
--- /dev/null
+++ b/gsocscrape/items.py
@@ -0,0 +1,5 @@
+import scrapy
+
+
+class GsocscrapeItem(scrapy.Item):
+    pass
diff --git a/gsocscrape/middlewares.py b/gsocscrape/middlewares.py
new file mode 100644
index 00000000..d55ab00a
--- /dev/null
+++ b/gsocscrape/middlewares.py
@@ -0,0 +1,27 @@
+from scrapy import signals
+
+
+class GsocscrapeSpiderMiddleware(object):
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/gsocscrape/pipelines.py b/gsocscrape/pipelines.py
new file mode 100644
index 00000000..c2c5a97e
--- /dev/null
+++ b/gsocscrape/pipelines.py
@@ -0,0 +1,3 @@
+class GsocscrapePipeline(object):
+    def process_item(self, item, spider):
+        return item
diff --git a/gsocscrape/settings.py b/gsocscrape/settings.py
new file mode 100644
index 00000000..a10c27c0
--- /dev/null
+++ b/gsocscrape/settings.py
@@ -0,0 +1,6 @@
+BOT_NAME = 'gsocscrape'
+
+SPIDER_MODULES = ['gsocscrape.spiders']
+NEWSPIDER_MODULE = 'gsocscrape.spiders'
+
+ROBOTSTXT_OBEY = True
diff --git a/gsocscrape/spiders/__init__.py b/gsocscrape/spiders/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/gsocscrape/spiders/gsoc.py b/gsocscrape/spiders/gsoc.py
new file mode 100644
index 00000000..60c7f0b6
--- /dev/null
+++ b/gsocscrape/spiders/gsoc.py
@@ -0,0 +1,139 @@
+import scrapy
+import string
+import json
+import logging
+
+from ruamel.yaml import YAML
+import os.path
+from collections import OrderedDict
+from community.git import get_owner
+from gsoc.config import get_year
+
+
+logger = logging.getLogger(__name__ + '.index')
+org_name = get_owner()
+year = get_year()
+yaml = YAML()
+
+
+class GsocSpider(scrapy.Spider):
+    """
+    Scrape the GSoC archive for the organisation and its projects.
+
+    The results are written as YAML files into the ``_site`` directory.
+    """
+
+    name = 'gsoc'
+    start_urls = [
+        'https://summerofcode.withgoogle.com/archive/{}/organizations'
+        .format(year),
+    ]
+
+    def parse(self, response):
+        """
+        Find the organisation in the archive listing and follow its link.
+        """
+        home_url = 'https://summerofcode.withgoogle.com/'\
+            'archive/{}/organizations/'.format(year)
+        selector = "//li[contains(.,'{org_name}')]/a/@href".format(
+            org_name=org_name)
+        organization_link = response.xpath(selector)
+
+        if not organization_link:
+            logger.info('Organisation {} does not exist in GSoC for {}'.format(
+                org_name, year
+            ))
+            return
+
+        organization_link = organization_link[0].extract().split('/')[4]
+        follow_link = home_url + organization_link
+        yield response.follow(follow_link, self.parse_org)
+
+    def parse_org(self, response):
+        """
+        Store the organisation details and follow every project link.
+        """
+        project_url = 'https://summerofcode.withgoogle.com/'\
+            'archive/{}/projects/'.format(year)
+
+        org_id = response.url.split('/')[-2]
+        org_name = response.css('h3.banner__title::text')[0].extract()
+        org_tagline = response.css('h4.org__tagline::text')[0].extract()
+        org_long_description = response.xpath(
+            "//div[@class='org__long-description']")[0].extract()
+        org_technologies = response.xpath(".//div[@class='org__meta']/div"
+                                          "[contains(.,'Technologies')]/ul/"
+                                          'li/text()').extract()
+
+        item = {
+            'id': org_id,
+            'name': org_name,
+            'tagline': org_tagline,
+            'description': org_long_description,
+            # Keyed by position, as the views only read the values.
+            'technologies': {
+                str(position): tech
+                for position, tech in enumerate(org_technologies)
+            },
+        }
+        org_data = {int(org_id): item}
+
+        with open(os.path.join('_site', 'gsoc_org_info.yaml'), 'w') as f:
+            yaml.dump(org_data, f)
+
+        # Overwrite any previous data with empty dataset
+        open(os.path.join('_site', 'gsoc_project_info.yaml'), 'w').close()
+
+        for res in response.css('a.archive-project-card__link'):
+            link = res.xpath('@href').extract()[0]
+            link = link.split('/')[4]
+            url_project = project_url + link
+            yield response.follow(url_project, self.parse_project)
+
+    def parse_project(self, response):
+        """
+        Append the details of a single project to the project data file.
+        """
+        org_url = 'https://summerofcode.withgoogle.com/'\
+            'archive/{}/organizations/'.format(year)
+        project_id = response.url.split('/')[-2]
+
+        project_title = response.css('h3.banner__title::text')[0].extract()
+        project_summary = response.xpath(
+            "//div[@class='org__long-description']")[0].extract()
+        project_organization_code = response.css(
+            'md-card.org__info-card a::attr(href)')[0].extract().split('/')[4]
+        project_link = response.url
+        project_organization_url = org_url + project_organization_code
+        project_organization_name = response.css('md-card.org__info-card '
+                                                 'a::text')[3].extract()
+        project_code = response.css('md-card.org__info-card '
+                                    'a::attr(href)')[1].extract()
+        project_student = response.xpath(
+            ".//div[@class='org__meta']/div[contains(.,'Student')]/"
+            'div/text()')[0].extract()
+        project_mentors = response.xpath(
+            ".//div[@class='org__meta']/div[contains(.,'Mentors')]"
+            '/ul/li/text()').extract()
+
+        item = {
+            'id': project_id,
+            'title': project_title,
+            'summary': project_summary,
+            'student': project_student,
+            'mentors': {
+                str(position): mentor
+                for position, mentor in enumerate(project_mentors)
+            },
+            'organization_name': project_organization_name,
+            'organization_code': project_organization_code,
+            'organization_url': project_organization_url,
+            'project_link': project_link,
+            'project_code': project_code,
+        }
+        project_data = {int(project_id): item}
+
+        # Each dump holds a single top-level key, so the appended dumps
+        # are expected to read back as one mapping with a key per project.
+        with open(os.path.join('_site', 'gsoc_project_info.yaml'), 'a') as f:
+            yaml.dump(project_data, f)
diff --git a/requirements.txt b/requirements.txt
index 78ad6893..91c088c5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,6 +4,7 @@ django-distill
 django-eventtools
 IGitt==0.4.1.dev20180111025558
 requests
+scrapy
 python-dateutil
 pillow
 ruamel.yaml
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 00000000..465badd4
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+
+[settings]
+default = gsocscrape.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = gsocscrape
diff --git a/templates/gsoc.html b/templates/gsoc.html
new file mode 100644
index 00000000..4e984bc1
--- /dev/null
+++ b/templates/gsoc.html
@@ -0,0 +1,27 @@
+
+
+
+{{id}}
+{{name}}
+{{tagline}}
+{% autoescape off %}{{description}}{% endautoescape %}
+Title: {{project.title}}
+Student: {{project.student}}
+Summary: {% autoescape off %} {{project.summary}} {% endautoescape %}
+Project Code: Code
+Mentors:
+