diff --git a/bin/DataCrawler.py b/bin/DataCrawler.py
index e8aac27..1354d62 100644
--- a/bin/DataCrawler.py
+++ b/bin/DataCrawler.py
@@ -58,7 +58,7 @@ def call_spider(file):
     data_spider.copy_items_to_files()
 
     """ Remove temporary files """
-    FileController.FileController().clean_tmp_files()
+    #FileController.FileController().clean_tmp_files()
 
     """ Convert the .json files to data.json (POD format) """
     for domain in domains:
diff --git a/crawler/data_json.py b/crawler/data_json.py
index 1394f99..b6f26d4 100644
--- a/crawler/data_json.py
+++ b/crawler/data_json.py
@@ -34,7 +34,7 @@ def convert(self, domain):
             if property:
                 url = ""
                 if url_aux in property.keys():
-                    url = dataproperty["url"][0].encode('utf-8')
+                    url = property["url"][0]
 
                 """ Iterate over creator (publisher) """
                 for creator in property["creator"]:
@@ -168,4 +168,4 @@ def convert(self, domain):
 
     FileController.FileController().clean_item_tmp_file(domain)
 
-#DataJson().convert("192.168.200.102")
+#DataJson().convert("datos.mec.gov.py")
diff --git a/crawler/settings.py b/crawler/settings.py
index 2c2872b..f33cf3d 100644
--- a/crawler/settings.py
+++ b/crawler/settings.py
@@ -10,6 +10,7 @@
 LOG_LEVEL = 'INFO'
 COOKIES_ENABLED = False
 LOG_FILE = 'datacrowler.log'
+SPLASH_URL = 'http://192.168.0.21:8050/render.html?url='
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
diff --git a/crawler/spiders/data_spider.py b/crawler/spiders/data_spider.py
index c16d129..ac01621 100644
--- a/crawler/spiders/data_spider.py
+++ b/crawler/spiders/data_spider.py
@@ -116,7 +116,8 @@ def transformar(url, domain):
     microdata = {}
     microdata['items'] = items = []
 
-    url_splash = "http://192.168.0.21:8050/render.html?url=" + url + "&timeout=20&wait=2.5"
+    settings = get_project_settings()
+    url_splash = settings['SPLASH_URL'] + url + "&timeout=20&wait=2.5"
     file_splash = open('splash.html', 'w')
     html = urllib.urlopen(url_splash)
     file_splash.write(str(html.read()))
@@ -147,7 +148,7 @@ def refresh_items_list(item_nuevo, domain):
 
     # Iterate over the list of existing items
     for item in items_list[domain]:
-        add_item = True
+        #add_item = True
 
         # If the item being compared is a DataCatalog
        if item.itemtype == "[http://schema.org/Datacatalog]":
@@ -182,16 +183,19 @@ def refresh_items_list(item_nuevo, domain):
         # TODO: this comparison cannot be done yet because the url is not properly annotated
         # If the item being compared is a DataSet
         else:
-            add_item = True
-            # If the item already exists, modify it
-            if item.props['url'] == item_nuevo.props['url']:
-                addItem = False
-
-                # Add the item's new attributes
-                for name, values in item_nuevo.props.items():
-                    if not item.props[name]:
-                        for v in values:
-                            item.props[name].append(v)
+
+            # If the new item is a Dataset
+            if item_nuevo.itemtype == "[http://schema.org/Dataset]":
+
+                # If the item already exists, modify it
+                if item.props['url'] == item_nuevo.props['url'] and item.props['name'] == item_nuevo.props['name']:
+                    add_item = False
+
+                    # Add the item's new attributes
+                    for name, values in item_nuevo.props.items():
+                        if not item.props[name]:
+                            for v in values:
+                                item.props[name].append(v)
 
     # If it is a new item, add it to the list
     if add_item:
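
Note on the data_spider.py hunk: reading the endpoint via get_project_settings() assumes the module imports it (from scrapy.utils.project import get_project_settings, a real Scrapy API); the hunk above does not show that import being added. A minimal standalone sketch of the resulting fetch path, assuming Python 2 (the surrounding code uses urllib.urlopen) and a hypothetical helper name fetch_rendered_html:

    import urllib
    from scrapy.utils.project import get_project_settings

    def fetch_rendered_html(url):
        """Fetch a JavaScript-rendered page through the Splash endpoint
        configured as SPLASH_URL in settings.py."""
        settings = get_project_settings()
        url_splash = settings['SPLASH_URL'] + url + "&timeout=20&wait=2.5"
        # urlopen returns a file-like object; read() yields the rendered HTML
        return urllib.urlopen(url_splash).read()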
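
The Dataset deduplication rule added to refresh_items_list can be read as the following standalone sketch (merge_or_append is a hypothetical name; items carry itemtype and props as in the spider): a new Dataset is merged into an existing Dataset only when both url and name match, otherwise it is appended.

    def merge_or_append(items, item_nuevo):
        """Merge item_nuevo into a matching Dataset item, or append it."""
        for item in items:
            if (item_nuevo.itemtype == "[http://schema.org/Dataset]"
                    and item.itemtype == item_nuevo.itemtype
                    and item.props['url'] == item_nuevo.props['url']
                    and item.props['name'] == item_nuevo.props['name']):
                # Copy only the attributes the existing item is still missing
                for name, values in item_nuevo.props.items():
                    if not item.props[name]:
                        for v in values:
                            item.props[name].append(v)
                return
        items.append(item_nuevo)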