Commit
Fix when saving the data.json files.
verena91 committed Sep 16, 2014
1 parent c6d5701 commit 19eefa2
Showing 7 changed files with 94 additions and 83 deletions.
7 changes: 5 additions & 2 deletions bin/DataCrawler.py
@@ -22,7 +22,10 @@
@click.option('--file', # prompt='Path to your file with domains to crawl',
default="/home/desa2/PycharmProjects/DataCrawler/crawler/domains.txt",
help='The list of domains to crawl.')
def main(file):
@click.option('--virtualenv', # prompt='Path to your virtual environment',
default="/home/desa2/datos",
help='The path of the virtual environment.')
def main(file, virtualenv):
# Start splash
# p = Process(target=start_splash_server)
# p.start()
@@ -86,7 +89,7 @@ def call_spider(file):
def start_splash_server():
# Start splash
os.system("chmod +x run_splash.sh")
os.system("./run_splash.sh /home/desa2/datos")
os.system("./run_splash.sh " + virtualenv)


results = []
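In the hunk above, start_splash_server() now builds the shell command from virtualenv, but the function as shown takes no arguments, so the value has to reach it from main(). A minimal sketch of one way to thread the new --virtualenv option through, assuming an explicit parameter (shortened defaults and the __main__ guard are illustrative, not taken from the repository):

import os

import click


@click.command()
@click.option('--file', default="domains.txt",
              help='The list of domains to crawl.')
@click.option('--virtualenv', default="/home/desa2/datos",
              help='The path of the virtual environment.')
def main(file, virtualenv):
    # Forward the CLI option instead of relying on a hard-coded path.
    start_splash_server(virtualenv)


def start_splash_server(virtualenv):
    # Start the splash server using the given virtualenv location.
    os.system("chmod +x run_splash.sh")
    os.system("./run_splash.sh " + virtualenv)


if __name__ == '__main__':
    main()
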
2 changes: 1 addition & 1 deletion crawler/data_json.py
@@ -156,4 +156,4 @@ def convert(self, domain):

return filename

#DataJson().convert("datos.mec.gov.py")
# DataJson().convert("datos.mec.gov.py")
10 changes: 7 additions & 3 deletions crawler/file_controller.py
@@ -9,6 +9,7 @@
except ImportError:
import simplejson as json


class FileController:
def clean_tmp_files(self):
"""
@@ -24,9 +25,9 @@ def clean_item_tmp_file(self, domain):
"""
Deletes the temporary files used by the spider.
"""
file = domain + ".json"
file = domain + ".json"
if os.path.exists(file):
os.remove(file)
os.remove(file)


def save_existing_data_json(self, response, domain, to_json):
@@ -40,9 +41,12 @@ def save_existing_data_json(self, response, domain, to_json):
if not os.path.exists(subprincipal):
os.makedirs(subprincipal)
filename = subprincipal + "/" + "data.json"
file_response = codecs.open(filename, 'wb', 'utf-8-sig')
# file_response = codecs.open(filename, 'w+', 'utf-8-sig')
file_response = open(filename, 'w+')
if to_json == True:
file_response.write(json.dumps(response.json(), indent=2, ensure_ascii=False))
file_response.close()
else:
file_response.write(json.dumps(response, indent=2, ensure_ascii=False))
file_response.close()
return filename
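The change above replaces codecs.open(filename, 'wb', 'utf-8-sig') with a plain open(filename, 'w+'). A plausible reason is that 'utf-8-sig' prepends a byte-order mark, which some consumers of data.json reject. A minimal Python 2 sketch of a BOM-free alternative with an explicit encoding (write_data_json is a hypothetical helper, not part of the repository):

import io
import json


def write_data_json(filename, data):
    # Serialize with indentation and without escaping non-ASCII characters;
    # plain utf-8 (not utf-8-sig) avoids writing a BOM at the start of the file.
    with io.open(filename, 'w', encoding='utf-8') as out:
        out.write(unicode(json.dumps(data, indent=2, ensure_ascii=False)))
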
11 changes: 7 additions & 4 deletions crawler/settings-example.py
@@ -11,10 +11,13 @@
COOKIES_ENABLED = False
LOG_FILE = 'datacrowler.log'
# Specify here the location where the splash server is started
SPLASH_URL = 'http://localhost:8050/render.html?url='
# Specify here the API key of the catalog
SPLASH_URL = 'http://your_splash_location:8050/render.html?url='
# Specify here the URL of the catalog
CATALOG_URL = 'http://your_catalog_site/api/3/action/'
# Specify here the API key of the catalog
API_KEY = "xxxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
#DEPTH_LIMIT = 1

# DEPTH_LIMIT = 1

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
# USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
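The new CATALOG_URL setting (together with SPLASH_URL and API_KEY) is read elsewhere through Scrapy's project settings, as importer/rest.py below does. A short sketch of how these values might be consumed; the package_list action and the example domain are only illustrative:

from scrapy.utils.project import get_project_settings

settings = get_project_settings()

# Build endpoints from configuration instead of hard-coding hosts.
catalog_action_url = settings['CATALOG_URL'] + 'package_list'
splash_render_url = settings['SPLASH_URL'] + 'http://datos.mec.gov.py'
headers = {'Authorization': settings['API_KEY'],
           'Content-type': 'application/json'}
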
125 changes: 63 additions & 62 deletions crawler/spiders/data_spider.py
@@ -141,67 +141,6 @@ def transformar(url, domain):
refresh_items_list(items[indice], domain)


def refresh_items_list_old(item_nuevo, domain):
"""
Updates the list of items per domain for each new item.
"""
add_item = True

# Iterate over the list of existing items
for item in items_list[domain]:
# add_item = True
# If the item being compared is a DataCatalog
if item.itemtype == "[http://schema.org/Datacatalog]":

# If the new item is a DataCatalog, compare directly
if item_nuevo.itemtype == "[http://schema.org/Datacatalog]":

# If it already exists, modify it
if unicode(item.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new attributes of the item
for name, values in item_nuevo.props.items():
if not item.props[name]:
for v in values:
item.props[name].append(v)

# If the new item is a Dataset, search among its datasets
elif item_nuevo.itemtype == "[http://schema.org/Dataset]":
for datasets in item.get_all('dataset'):
for dataset in datasets:

# If the item already exists, modify it
if unicode(dataset.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new attributes of the item
for name, values in item_nuevo.props.items():
if not dataset.props[name]:
for v in values:
dataset.props[name].append(v)

# If the item being compared is a Dataset
else:

# If the new item is a Dataset
if item_nuevo.itemtype == "[http://schema.org/Dataset]":

# If the item already exists, modify it
if unicode(item.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new attributes of the item
for name, values in item_nuevo.props.items():
if not item.props[name]:
for v in values:
item.props[name].append(v)

# If it is a new item, add it to the list
if add_item:
items_list[domain].append(item_nuevo)


# New method for adding items to the list
def refresh_items_list(item_nuevo, domain):
"""
@@ -275,6 +214,7 @@ def add_new_att(item_nuevo, domain):
item.props[name].append(v)
first = False


def log_to_file(data):
file_name = "log.txt"
filee = open(file_name, 'ab+')
@@ -322,4 +262,65 @@ def rdfa_to_microdata(url):
serialization = g.serialize(format=target_format).decode("UTF-8")
return serialization
else:
return ""
return ""


def refresh_items_list_old(item_nuevo, domain):
"""
Updates the list of items per domain for each new item.
"""
add_item = True

# Iterate over the list of existing items
for item in items_list[domain]:
# add_item = True
# If the item being compared is a DataCatalog
if item.itemtype == "[http://schema.org/Datacatalog]":

# If the new item is a DataCatalog, compare directly
if item_nuevo.itemtype == "[http://schema.org/Datacatalog]":

# If it already exists, modify it
if unicode(item.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new attributes of the item
for name, values in item_nuevo.props.items():
if not item.props[name]:
for v in values:
item.props[name].append(v)

# If the new item is a Dataset, search among its datasets
elif item_nuevo.itemtype == "[http://schema.org/Dataset]":
for datasets in item.get_all('dataset'):
for dataset in datasets:

# If the item already exists, modify it
if unicode(dataset.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new attributes of the item
for name, values in item_nuevo.props.items():
if not dataset.props[name]:
for v in values:
dataset.props[name].append(v)

# If the item being compared is a Dataset
else:

# If the new item is a Dataset
if item_nuevo.itemtype == "[http://schema.org/Dataset]":

# If the item already exists, modify it
if unicode(item.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new attributes of the item
for name, values in item_nuevo.props.items():
if not item.props[name]:
for v in values:
item.props[name].append(v)

# If it is a new item, add it to the list
if add_item:
items_list[domain].append(item_nuevo)
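Both the removed and the re-added copy of refresh_items_list_old implement the same rule: match the incoming microdata item against the stored ones by its url property, copy over only the properties that are still missing, and append it as a new item when no match is found. A stripped-down, hypothetical sketch of that merge rule using plain dicts (the real code works on microdata items with a props mapping and distinguishes DataCatalog from Dataset):

def merge_by_url(existing_items, new_item):
    # Merge new_item into existing_items, matching on the 'url' property.
    for item in existing_items:
        if item.get('url') == new_item.get('url'):
            # Only fill in properties the stored item does not have yet.
            for name, values in new_item.items():
                if not item.get(name):
                    item[name] = values
            return
    # No item with this url yet: keep it as a new entry.
    existing_items.append(new_item)
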
4 changes: 2 additions & 2 deletions importer/rest.py
@@ -13,7 +13,7 @@ class CKANImporter(object):
def __init__(self):
settings = get_project_settings()
self.headers = {'Authorization': settings['API_KEY'], 'Content-type':'application/json'}
self.base_url = 'http://www.datos.gov.py/api/3/action/'
self.base_url = settings['CATALOG_URL']

def import_package(self, filename, modalidad):
with open(filename) as file: # Use file to refer to the file object
@@ -102,4 +102,4 @@ def get_organization_id(self, org_name):
sys.setdefaultencoding("utf-8")
importer = CKANImporter()
# For testing without running the crawler
importer.import_package('data.json', 'data-hunting')
importer.import_package('/home/desa2/PycharmProjects/DataCrawler/bin/results_16_09_14/datos.mec.gov.py/data.json', 'data-hunting')
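With base_url now read from CATALOG_URL, the importer can target any CKAN instance instead of only www.datos.gov.py. The request code itself is not part of this diff; a hedged sketch of what a package_create call against the CKAN action API could look like with these headers (the payload fields are illustrative):

import json

import requests


def create_package(base_url, api_key, package):
    # POST to CKAN's package_create action; the API key goes in the
    # Authorization header and the package dict in the JSON body.
    headers = {'Authorization': api_key,
               'Content-type': 'application/json'}
    response = requests.post(base_url + 'package_create',
                             data=json.dumps(package),
                             headers=headers)
    return response.json()

# Hypothetical usage:
# create_package('http://your_catalog_site/api/3/action/', API_KEY,
#                {'name': 'example-dataset', 'title': 'Example dataset'})
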
18 changes: 9 additions & 9 deletions install.sh
@@ -9,10 +9,10 @@ PATH_PYTHON_VENV=$1
echo $1

# Scrapy dependencies
#sudo apt-get install libffi-dev libxslt1-dev libxslt1.1 libxml2-dev libxml2 libssl-dev -y
sudo apt-get install libffi-dev libxslt1-dev libxslt1.1 libxml2-dev libxml2 libssl-dev -y

# PyQt4 dependencies
#sudo apt-get install python-dev python-qt4 python-qt4-dev python-sip python-sip-dev build-essential gfortran libqt4-dev qt4-qmake libpq-dev libsqlite3-dev qt4-dev-#tools qt4-doc unixodbc-dev pyqt4-dev-tools -y
sudo apt-get install python-dev python-qt4 python-qt4-dev python-sip python-sip-dev build-essential gfortran libqt4-dev qt4-qmake libpq-dev libsqlite3-dev qt4-dev-#tools qt4-doc unixodbc-dev pyqt4-dev-tools -y

# RDFLib
echo "--------- Installing RDFlib"
@@ -30,24 +30,24 @@ cd ..
cd lib
cd sip
echo "--------- Installing SIP 4.16.2"
#python configure.py
#make
#sudo make install
python configure.py
make
sudo make install

# re2
cd ../re2
echo "--------- Installing re2"
#make test
#sudo make install
sudo make install
#sudo make testinstall
pip install re2

# PyQt4
cd ../pyqt
echo "--------- Installing PyQt 4.10.04"
#python configure-ng.py
#make
#sudo make install
python configure-ng.py
make
sudo make install

cd ../..
