From 99cd2734a677254fba2fa8f32e072c551480cfee Mon Sep 17 00:00:00 2001 From: verena91 Date: Mon, 15 Sep 2014 15:25:20 -0400 Subject: [PATCH] Cambios en settings-example.py --- bin/DataCrawler.py | 2 +- crawler/data_json.py | 2 +- crawler/file_controller.py | 2 +- crawler/{settings.py => settings-example.py} | 6 ++++-- importer/rest.py | 6 ++++-- 5 files changed, 11 insertions(+), 7 deletions(-) rename crawler/{settings.py => settings-example.py} (68%) diff --git a/bin/DataCrawler.py b/bin/DataCrawler.py index c2ad27a..9c676c9 100644 --- a/bin/DataCrawler.py +++ b/bin/DataCrawler.py @@ -31,7 +31,7 @@ def main(file): created_files = call_spider(file) # Finalizar splash # p.terminate() - import_to_ckan(created_files) + # import_to_ckan(created_files) def call_spider(file): diff --git a/crawler/data_json.py b/crawler/data_json.py index 850b3c3..51560ff 100644 --- a/crawler/data_json.py +++ b/crawler/data_json.py @@ -103,7 +103,7 @@ def convert(self, domain): 'description': property["description"][0], 'contactName': "", 'mbox': "", - 'keywords': keywords_catalog, + 'keyword': keywords_catalog, 'accessLevel': "public", 'publisher': ""}) diff --git a/crawler/file_controller.py b/crawler/file_controller.py index b6316e1..0f0c60f 100644 --- a/crawler/file_controller.py +++ b/crawler/file_controller.py @@ -40,7 +40,7 @@ def save_existing_data_json(self, response, domain, to_json): if not os.path.exists(subprincipal): os.makedirs(subprincipal) filename = subprincipal + "/" + "data.json" - file_response = codecs.open(filename, 'w+', 'utf-8-sig') + file_response = codecs.open(filename, 'wb', 'utf-8-sig') if to_json == True: file_response.write(json.dumps(response.json(), indent=2, ensure_ascii=False)) else: diff --git a/crawler/settings.py b/crawler/settings-example.py similarity index 68% rename from crawler/settings.py rename to crawler/settings-example.py index 81cc21b..4cfd7b8 100644 --- a/crawler/settings.py +++ b/crawler/settings-example.py @@ -10,8 +10,10 @@ LOG_LEVEL = 
'INFO' COOKIES_ENABLED = False LOG_FILE = 'datacrowler.log' -SPLASH_URL = 'http://192.168.200.3:8050/render.html?url=' -API_KEY = "" +# Especificar aqui la ubicacion donde se levanta el servidor splash +SPLASH_URL = 'http://localhost:8050/render.html?url=' +# Especificar aqui la API Key del catalogo +API_KEY = "xxxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" #DEPTH_LIMIT = 1 # Crawl responsibly by identifying yourself (and your website) on the user-agent diff --git a/importer/rest.py b/importer/rest.py index 2ad0318..b3d800a 100644 --- a/importer/rest.py +++ b/importer/rest.py @@ -4,13 +4,15 @@ import os import sys from zipfile import ZipFile +from scrapy.utils.project import get_project_settings from model import DataEntry, CkanDataset class CKANImporter(object): def __init__(self): - self.headers = {'Authorization': 'xxxxx', 'Content-type':'application/json'} + settings = get_project_settings() + self.headers = {'Authorization': settings['API_KEY'], 'Content-type':'application/json'} self.base_url = 'http://www.datos.gov.py/api/3/action/' def import_package(self, filename, modalidad): @@ -100,4 +102,4 @@ def get_organization_id(self, org_name): sys.setdefaultencoding("utf-8") importer = CKANImporter() #Para pruebas sin ejecutar el crawler - importer.import_package('/home/desa4/workspace/DataCrawler/results_12_09_14/datos.mec.gov.py/data.json', 'data-hunting') + importer.import_package('data.json', 'data-hunting')