Commit
Fix when saving the data.json files.
verena91 committed Sep 16, 2014
1 parent c6d5701 commit 19eefa2
Showing 7 changed files with 94 additions and 83 deletions.
7 changes: 5 additions & 2 deletions bin/DataCrawler.py
@@ -22,7 +22,10 @@
@click.option('--file', # prompt='Path to your file with domains to crawl',
default="/home/desa2/PycharmProjects/DataCrawler/crawler/domains.txt",
help='The list of domains to crawl.')
def main(file):
@click.option('--virtualenv', # prompt='Path to your virtual environment',
default="/home/desa2/datos",
help='The path of the virtual environment.')
def main(file, virtualenv):
# Start splash
# p = Process(target=start_splash_server)
# p.start()
@@ -86,7 +89,7 @@ def call_spider(file):
def start_splash_server():
# Start splash
os.system("chmod +x run_splash.sh")
os.system("./run_splash.sh /home/desa2/datos")
os.system("./run_splash.sh " + virtualenv)


results = []
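In the hunk above, start_splash_server() now builds the shell command from virtualenv, but the function as shown takes no arguments, so the value has to reach it from main(). A minimal sketch of one way to thread the new --virtualenv option through, assuming an explicit parameter (shortened defaults and the __main__ guard are illustrative, not taken from the repository):

import os

import click


@click.command()
@click.option('--file', default="domains.txt",
              help='The list of domains to crawl.')
@click.option('--virtualenv', default="/home/desa2/datos",
              help='The path of the virtual environment.')
def main(file, virtualenv):
    # Forward the CLI option instead of relying on a hard-coded path.
    start_splash_server(virtualenv)


def start_splash_server(virtualenv):
    # Start the splash server using the given virtualenv location.
    os.system("chmod +x run_splash.sh")
    os.system("./run_splash.sh " + virtualenv)


if __name__ == '__main__':
    main()
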
2 changes: 1 addition & 1 deletion crawler/data_json.py
@@ -156,4 +156,4 @@ def convert(self, domain):

return filename

#DataJson().convert("datos.mec.gov.py")
# DataJson().convert("datos.mec.gov.py")
10 changes: 7 additions & 3 deletions crawler/file_controller.py
@@ -9,6 +9,7 @@
except ImportError:
import simplejson as json


class FileController:
def clean_tmp_files(self):
"""
@@ -24,9 +25,9 @@ def clean_item_tmp_file(self, domain):
"""
Deletes the temporary files used by the spider.
"""
file = domain + ".json"
file = domain + ".json"
if os.path.exists(file):
os.remove(file)
os.remove(file)


def save_existing_data_json(self, response, domain, to_json):
@@ -40,9 +41,12 @@ def save_existing_data_json(self, response, domain, to_json):
if not os.path.exists(subprincipal):
os.makedirs(subprincipal)
filename = subprincipal + "/" + "data.json"
file_response = codecs.open(filename, 'wb', 'utf-8-sig')
# file_response = codecs.open(filename, 'w+', 'utf-8-sig')
file_response = open(filename, 'w+')
if to_json == True:
file_response.write(json.dumps(response.json(), indent=2, ensure_ascii=False))
file_response.close()
else:
file_response.write(json.dumps(response, indent=2, ensure_ascii=False))
file_response.close()
return filename
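The change above replaces codecs.open(filename, 'wb', 'utf-8-sig') with a plain open(filename, 'w+'). A plausible reason is that 'utf-8-sig' prepends a byte-order mark, which some consumers of data.json reject. A minimal Python 2 sketch of a BOM-free alternative with an explicit encoding (write_data_json is a hypothetical helper, not part of the repository):

import io
import json


def write_data_json(filename, data):
    # Serialize with indentation and without escaping non-ASCII characters;
    # plain utf-8 (not utf-8-sig) avoids writing a BOM at the start of the file.
    with io.open(filename, 'w', encoding='utf-8') as out:
        out.write(unicode(json.dumps(data, indent=2, ensure_ascii=False)))
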
11 changes: 7 additions & 4 deletions crawler/settings-example.py
@@ -11,10 +11,13 @@
COOKIES_ENABLED = False
LOG_FILE = 'datacrowler.log'
# Specify here the location where the splash server is started
SPLASH_URL = 'http://localhost:8050/render.html?url='
# Specify here the API key of the catalog
SPLASH_URL = 'http://your_splash_location:8050/render.html?url='
# Specify here the URL of the catalog
CATALOG_URL = 'http://your_catalog_site/api/3/action/'
# Specify here the API key of the catalog
API_KEY = "xxxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
#DEPTH_LIMIT = 1

# DEPTH_LIMIT = 1

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
# USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
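The new CATALOG_URL setting (together with SPLASH_URL and API_KEY) is read elsewhere through Scrapy's project settings, as importer/rest.py below does. A short sketch of how these values might be consumed; the package_list action and the example domain are only illustrative:

from scrapy.utils.project import get_project_settings

settings = get_project_settings()

# Build endpoints from configuration instead of hard-coding hosts.
catalog_action_url = settings['CATALOG_URL'] + 'package_list'
splash_render_url = settings['SPLASH_URL'] + 'http://datos.mec.gov.py'
headers = {'Authorization': settings['API_KEY'],
           'Content-type': 'application/json'}
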
125 changes: 63 additions & 62 deletions crawler/spiders/data_spider.py
@@ -141,67 +141,6 @@ def transformar(url, domain):
refresh_items_list(items[indice], domain)


def refresh_items_list_old(item_nuevo, domain):
"""
Updates the list of items per domain for each new item.
"""
add_item = True

# Iterate over the list of existing items
for item in items_list[domain]:
# add_item = True
# If the item being compared is a DataCatalog
if item.itemtype == "[http://schema.org/Datacatalog]":

# If the new item is a DataCatalog, compare directly
if item_nuevo.itemtype == "[http://schema.org/Datacatalog]":

# If it already exists, modify it
if unicode(item.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new attributes of the item
for name, values in item_nuevo.props.items():
if not item.props[name]:
for v in values:
item.props[name].append(v)

# If the new item is a Dataset, search among its datasets
elif item_nuevo.itemtype == "[http://schema.org/Dataset]":
for datasets in item.get_all('dataset'):
for dataset in datasets:

# If the item already exists, modify it
if unicode(dataset.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new attributes of the item
for name, values in item_nuevo.props.items():
if not dataset.props[name]:
for v in values:
dataset.props[name].append(v)

# If the item being compared is a Dataset
else:

# If the new item is a Dataset
if item_nuevo.itemtype == "[http://schema.org/Dataset]":

# If the item already exists, modify it
if unicode(item.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new attributes of the item
for name, values in item_nuevo.props.items():
if not item.props[name]:
for v in values:
item.props[name].append(v)

# If it is a new item, add it to the list
if add_item:
items_list[domain].append(item_nuevo)


# New method for adding items to the list
def refresh_items_list(item_nuevo, domain):
"""
@@ -275,6 +214,7 @@ def add_new_att(item_nuevo, domain):
item.props[name].append(v)
first = False


def log_to_file(data):
file_name = "log.txt"
filee = open(file_name, 'ab+')
@@ -322,4 +262,65 @@ def rdfa_to_microdata(url):
serialization = g.serialize(format=target_format).decode("UTF-8")
return serialization
else:
return ""
return ""


def refresh_items_list_old(item_nuevo, domain):
"""
Updates the list of items per domain for each new item.
"""
add_item = True

# Iterate over the list of existing items
for item in items_list[domain]:
# add_item = True
# If the item being compared is a DataCatalog
if item.itemtype == "[http://schema.org/Datacatalog]":

# If the new item is a DataCatalog, compare directly
if item_nuevo.itemtype == "[http://schema.org/Datacatalog]":

# If it already exists, modify it
if unicode(item.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new attributes of the item
for name, values in item_nuevo.props.items():
if not item.props[name]:
for v in values:
item.props[name].append(v)

# If the new item is a Dataset, search among its datasets
elif item_nuevo.itemtype == "[http://schema.org/Dataset]":
for datasets in item.get_all('dataset'):
for dataset in datasets:

# If the item already exists, modify it
if unicode(dataset.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new attributes of the item
for name, values in item_nuevo.props.items():
if not dataset.props[name]:
for v in values:
dataset.props[name].append(v)

# If the item being compared is a Dataset
else:

# If the new item is a Dataset
if item_nuevo.itemtype == "[http://schema.org/Dataset]":

# If the item already exists, modify it
if unicode(item.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new attributes of the item
for name, values in item_nuevo.props.items():
if not item.props[name]:
for v in values:
item.props[name].append(v)

# If it is a new item, add it to the list
if add_item:
items_list[domain].append(item_nuevo)
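Both the removed and the re-added copy of refresh_items_list_old implement the same rule: match the incoming microdata item against the stored ones by its url property, copy over only the properties that are still missing, and append it as a new item when no match is found. A stripped-down, hypothetical sketch of that merge rule using plain dicts (the real code works on microdata items with a props mapping and distinguishes DataCatalog from Dataset):

def merge_by_url(existing_items, new_item):
    # Merge new_item into existing_items, matching on the 'url' property.
    for item in existing_items:
        if item.get('url') == new_item.get('url'):
            # Only fill in properties the stored item does not have yet.
            for name, values in new_item.items():
                if not item.get(name):
                    item[name] = values
            return
    # No item with this url yet: keep it as a new entry.
    existing_items.append(new_item)
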
4 changes: 2 additions & 2 deletions importer/rest.py
@@ -13,7 +13,7 @@ class CKANImporter(object):
def __init__(self):
settings = get_project_settings()
self.headers = {'Authorization': settings['API_KEY'], 'Content-type':'application/json'}
self.base_url = 'http://www.datos.gov.py/api/3/action/'
self.base_url = settings['CATALOG_URL']

def import_package(self, filename, modalidad):
with open(filename) as file: # Use file to refer to the file object
@@ -102,4 +102,4 @@ def get_organization_id(self, org_name):
sys.setdefaultencoding("utf-8")
importer = CKANImporter()
# For testing without running the crawler
importer.import_package('data.json', 'data-hunting')
importer.import_package('/home/desa2/PycharmProjects/DataCrawler/bin/results_16_09_14/datos.mec.gov.py/data.json', 'data-hunting')
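With base_url now read from CATALOG_URL, the importer can target any CKAN instance instead of only www.datos.gov.py. The request code itself is not part of this diff; a hedged sketch of what a package_create call against the CKAN action API could look like with these headers (the payload fields are illustrative):

import json

import requests


def create_package(base_url, api_key, package):
    # POST to CKAN's package_create action; the API key goes in the
    # Authorization header and the package dict in the JSON body.
    headers = {'Authorization': api_key,
               'Content-type': 'application/json'}
    response = requests.post(base_url + 'package_create',
                             data=json.dumps(package),
                             headers=headers)
    return response.json()

# Hypothetical usage:
# create_package('http://your_catalog_site/api/3/action/', API_KEY,
#                {'name': 'example-dataset', 'title': 'Example dataset'})
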
18 changes: 9 additions & 9 deletions install.sh
@@ -9,10 +9,10 @@ PATH_PYTHON_VENV=$1
echo $1

# Scrapy dependencies
#sudo apt-get install libffi-dev libxslt1-dev libxslt1.1 libxml2-dev libxml2 libssl-dev -y
sudo apt-get install libffi-dev libxslt1-dev libxslt1.1 libxml2-dev libxml2 libssl-dev -y

# PyQt4 dependencies
#sudo apt-get install python-dev python-qt4 python-qt4-dev python-sip python-sip-dev build-essential gfortran libqt4-dev qt4-qmake libpq-dev libsqlite3-dev qt4-dev-#tools qt4-doc unixodbc-dev pyqt4-dev-tools -y
sudo apt-get install python-dev python-qt4 python-qt4-dev python-sip python-sip-dev build-essential gfortran libqt4-dev qt4-qmake libpq-dev libsqlite3-dev qt4-dev-#tools qt4-doc unixodbc-dev pyqt4-dev-tools -y

# RDFLib
echo "--------- Installing RDFlib"
@@ -30,24 +30,24 @@ cd ..
cd lib
cd sip
echo "--------- Installing SIP 4.16.2"
#python configure.py
#make
#sudo make install
python configure.py
make
sudo make install

# re2
cd ../re2
echo "--------- Installing re2"
#make test
#sudo make install
sudo make install
#sudo make testinstall
pip install re2

# PyQt4
cd ../pyqt
echo "--------- Installing PyQt 4.10.04"
#python configure-ng.py
#make
#sudo make install
python configure-ng.py
make
sudo make install

cd ../..
