From 42d3534c8717524958c76e3aab2d3cab33ae3dba Mon Sep 17 00:00:00 2001
From: verena91
Date: Fri, 12 Sep 2014 09:33:16 -0400
Subject: [PATCH] Launch splash and other changes.

---
 bin/DataCrawler.py             | 17 ++++++-
 bin/run_splash.sh              |  5 ++
 crawler/data_json.py           | 24 ++++-----
 crawler/settings.py            |  2 +-
 crawler/spiders/data_spider.py | 92 ++++++++++++++++++++++++----------
 5 files changed, 100 insertions(+), 40 deletions(-)
 create mode 100755 bin/run_splash.sh

diff --git a/bin/DataCrawler.py b/bin/DataCrawler.py
index 1354d62..7c3bcd9 100644
--- a/bin/DataCrawler.py
+++ b/bin/DataCrawler.py
@@ -2,6 +2,9 @@
 
 import requests
 import click
+import os
+import time
+from multiprocessing import Process
 from twisted.internet import reactor
 from scrapy.crawler import Crawler
 from scrapy import log, signals
@@ -17,8 +20,14 @@
              default="/home/desa2/PycharmProjects/DataCrawler/crawler/domains.txt",
              help='The list of domains to crawl.')
 def main(file):
+    # Start splash
+    # p = Process(target=start_splash_server)
+    # p.start()
+    # time.sleep(10)
     click.echo('File path: %s' % file)
     call_spider(file)
+    # Stop splash
+    # p.terminate()
 
 
 def call_spider(file):
@@ -58,13 +67,19 @@ def call_spider(file):
 
     data_spider.copy_items_to_files()
 
     """ Delete temporary files """
-    #FileController.FileController().clean_tmp_files()
+    FileController.FileController().clean_tmp_files()
 
     """ Convert the .json files to data.json (POD format) """
     for domain in domains:
         DataJson.DataJson().convert(domain)
 
 
+def start_splash_server():
+    # Start splash
+    os.system("chmod +x run_splash.sh")
+    os.system("./run_splash.sh /home/desa2/datos")
+
+
 results = []
 
diff --git a/bin/run_splash.sh b/bin/run_splash.sh
new file mode 100755
index 0000000..9d59390
--- /dev/null
+++ b/bin/run_splash.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+VENV=$1
+echo "virtualenv -->" $VENV
+source $VENV/bin/activate
+python -m splash.server
\ No newline at end of file
diff --git a/crawler/data_json.py b/crawler/data_json.py
index b6f26d4..fa175ca 100644
--- a/crawler/data_json.py
+++ b/crawler/data_json.py
@@ -150,18 +150,18 @@ def convert(self, domain):
         spatial is missing
         Provider name changes to publisher
         """
-        response.append({'title': dataproperty["name"][0],
-                         'landingPage': url,
-                         'description': dataproperty["description"][0],
-                         'contactName': creatorproperty["name"][0],
-                         'mbox': creatorproperty["email"][0],
-                         'keyword': keywords,
-                         'accessLevel': "public",
-                         'version': dataproperty["version"][0],
-                         'license': dataproperty["license"][0],
-                         'temporal': dataproperty["temporal"][0],
-                         'publisher': providerproperty["name"][0],
-                         'distribution': distributionlist})
+        #response.append({'title': dataproperty["name"][0],
+        #                 'landingPage': url,
+        #                 'description': dataproperty["description"][0],
+        #                 'contactName': creatorproperty["name"][0],
+        #                 'mbox': creatorproperty["email"][0],
+        #                 'keyword': keywords,
+        #                 'accessLevel': "public",
+        #                 'version': dataproperty["version"][0],
+        #                 'license': dataproperty["license"][0],
+        #                 'temporal': dataproperty["temporal"][0],
+        #                 'publisher': providerproperty["name"][0],
+        #                 'distribution': distributionlist})
         """ Write to the final file """
         FileController.FileController().save_existing_data_json(response, domain, False)
         """ Delete the temporary items file """
diff --git a/crawler/settings.py b/crawler/settings.py
index f33cf3d..e044073 100644
--- a/crawler/settings.py
+++ b/crawler/settings.py
@@ -10,7 +10,7 @@
 LOG_LEVEL = 'INFO'
 COOKIES_ENABLED = False
 LOG_FILE = 'datacrowler.log'
-SPLASH_URL = 'http://192.168.0.21:8050/render.html?url='
+SPLASH_URL = 'http://192.168.43.151:8050/render.html?url='
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
diff --git a/crawler/spiders/data_spider.py b/crawler/spiders/data_spider.py
index ac01621..9dd20a6 100644
--- a/crawler/spiders/data_spider.py
+++ b/crawler/spiders/data_spider.py
@@ -11,6 +11,7 @@
 from scrapy.utils.project import get_project_settings
 from microdata import get_items
 from rdflib.serializer import Serializer
+from scrapy import log
 
 try:
     import json
@@ -88,22 +89,23 @@ def parse_item(self, response):
         """
         Gets the current search domain and calls the transformar method.
         """
-        self.log('A response from %s just arrived.' % response.url)
-        time.sleep(3)
-
-        """ Get the current domain """
-        https = response.url.find("https")
-        if https == -1:
-            pos_second_bar = 7
-        else:
-            pos_second_bar = 8
-        pos_third_bar = response.url.find("/", pos_second_bar + 1)
-        domain = response.url[pos_second_bar:pos_third_bar]
-        if domain not in items_list.keys():
-            items_list[domain] = []
-        self.log('Domain: %s' % domain)
+        if response.status != 404:
+            self.log('A response from %s just arrived.' % response.url)
+            time.sleep(3)
+
+            """ Get the current domain """
+            https = response.url.find("https")
+            if https == -1:
+                pos_second_bar = 7
+            else:
+                pos_second_bar = 8
+            pos_third_bar = response.url.find("/", pos_second_bar + 1)
+            domain = response.url[pos_second_bar:pos_third_bar]
+            if domain not in items_list.keys():
+                items_list[domain] = []
+            self.log('Domain: %s' % domain)
 
-        transformar(response.url, domain)
+            transformar(response.url, domain)
 
 
 def transformar(url, domain):
@@ -135,9 +137,10 @@
     # If each page contains exactly one item
     if len(items) == 1:
         # If the item has attributes, add it to or update it in the list
-        if items:
-            if items[indice].props:
-                refresh_items_list(items[indice], domain)
+        if items[indice].props:
+            #add_item_to_file(items[indice], "items")
+            #add_item_to_file_2(items[indice].props['url'][0], "urls")
+            refresh_items_list(items[indice], domain)
 
 
 def refresh_items_list(item_nuevo, domain):
@@ -148,15 +151,15 @@
     # Iterate over the list of existing items
     for item in items_list[domain]:
-        #add_item = True
+        # add_item = True
 
         # If the item being compared is a DataCatalog
         if item.itemtype == "[http://schema.org/Datacatalog]":
 
             # If the new item is a DataCatalog, compare directly
-            if item.itemtype == item_nuevo.itemtype:
+            if item_nuevo.itemtype == "[http://schema.org/Datacatalog]":
 
                 # If it already exists, update it
-                if item.props['url'] == item_nuevo.props['url']:
+                if unicode(item.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
                     add_item = False
 
                     # Add the new item's attributes
@@ -166,12 +169,12 @@
                     for name, values in item_nuevo.props.items():
                         for v in values:
                             item.props[name].append(v)
 
             # If the new item is a Dataset, search among its datasets
-            else:
-                for datasets in item.get_all('datasets'):
+            elif item_nuevo.itemtype == "[http://schema.org/Dataset]":
+                for datasets in item.get_all('dataset'):
                     for dataset in datasets:
 
                         # If the item already exists, update it
-                        if dataset.props['url'] == item_nuevo.props['url']:
+                        if unicode(dataset.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
                             add_item = False
@@ -180,7 +183,6 @@
                             # Add the new item's attributes
                             for name, values in item_nuevo.props.items():
                                 for v in values:
                                     dataset.props[name].append(v)
-                        # TODO: this comparison cannot be done yet because the url is not properly annotated
         # If the item being compared is a Dataset
         else:
@@ -188,7 +190,7 @@
             if item_nuevo.itemtype == "[http://schema.org/Dataset]":
 
                 # If the item already exists, update it
-                if item.props['url'] == item_nuevo.props['url'] and item.props['name'] == item_nuevo.props['name']:
+                if unicode(item.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
                     add_item = False
 
                     # Add the new item's attributes
@@ -201,6 +203,44 @@
     if add_item:
         items_list[domain].append(item_nuevo)
 
+# New method for adding items to the list
+# def refresh_items_list_2(item_nuevo, domain):
+#     """
+#     Updates the per-domain item list for each new item.
+#     """
+#     add_item = True
+#     if item_nuevo.itemtype == "[http://schema.org/Datacatalog]":
+#         existe = buscar_datacatalog()
+#         if not existe:
+#             agregar_datacatalog(item_nuevo)
+#
+#         datasets = extraer_datasets_from_datacatalog()
+#         for dataset in datasets:
+#             existe = buscar_dataset()
+#             if not existe:
+#                 agregar_dataset()
+#
+#     if item_nuevo.itemtype == "[http://schema.org/Dataset]":
+#         existe = buscar_dataset()
+#         if not existe:
+#             agregar_dataset()
+#         else:
+#             agregar_atributos_nuevo()
+#     # If it is a new item, add it to the list
+#     if add_item:
+#         items_list[domain].append(item_nuevo)
+
+def add_item_to_file(item, file):
+    file_name = file + ".json"
+    filee = open(file_name, 'ab+')
+    filee.write(item.json())
+    filee.close()
+
+def add_item_to_file_2(item, file):
+    file_name = file + ".json"
+    filee = open(file_name, 'ab+')
+    filee.write(item + " ")
+    filee.close()
 
 def copy_items_to_files():
     """