Merge
rparrapy committed Sep 12, 2014
2 parents 318bdcc + 42d3534 commit 858e4a0
Showing 6 changed files with 113 additions and 52 deletions.
23 changes: 18 additions & 5 deletions bin/DataCrawler.py
@@ -3,6 +3,9 @@
import requests
import click
import sys
import os
import time
from multiprocessing import Process
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
@@ -17,12 +20,17 @@

@click.command()
@click.option('--file', # prompt='Path to your file with domains to crawl',
default="./crawler/domains.txt",
default="/home/desa2/PycharmProjects/DataCrawler/crawler/domains.txt",
help='The list of domains to crawl.')
def main(file):
# Start splash
# p = Process(target=start_splash_server)
# p.start()
# time.sleep(10)
click.echo('File path: %s' % file)
created_files = call_spider(file)
log.msg("continua la ejecucion", level=log.DEBUG)
# Stop splash
# p.terminate()
import_to_ckan(created_files)


@@ -59,14 +67,13 @@ def call_spider(file):
crawler.crawl(spider)
crawler.start()
log.start(loglevel=log.DEBUG)
log.msg("after log", level=log.DEBUG)
reactor.run() # the script will block here

""" Copiar los datos a los archivos .json """
data_spider.copy_items_to_files()

""" Eliminar archivos temporales """
#FileController.FileController().clean_tmp_files()
FileController.FileController().clean_tmp_files()

""" Convertir los archivos .json a data.json (formato POD) """
for domain in domains:
@@ -75,6 +82,12 @@ def call_spider(file):

return created_files

def start_splash_server():
# Start splash
os.system("chmod +x run_splash.sh")
os.system("./run_splash.sh /home/desa2/datos")


results = []


@@ -92,4 +105,4 @@ def import_to_ckan(created_files):
if __name__ == '__main__':
reload(sys)
sys.setdefaultencoding("utf-8")
main()
main()
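For context, the commented-out lines in main() sketch the intended flow: start Splash in a separate process, give it a moment to come up, crawl, then terminate it. The following is only a minimal standalone sketch of that pattern; the sleep duration and virtualenv path are illustrative, not taken from the repository.

import os
import time
from multiprocessing import Process

def start_splash_server():
    # Stand-in for start_splash_server in DataCrawler.py, which shells out
    # to run_splash.sh; the path argument here is illustrative.
    os.system("./run_splash.sh /path/to/splash/virtualenv")

if __name__ == '__main__':
    p = Process(target=start_splash_server)
    p.start()
    time.sleep(10)     # give Splash time to start listening
    try:
        pass           # the crawl (call_spider / import_to_ckan) would run here
    finally:
        p.terminate()  # shut Splash down once crawling ends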
5 changes: 5 additions & 0 deletions bin/run_splash.sh
@@ -0,0 +1,5 @@
#!/bin/bash
PATH=$1
echo "virtualenv -->" $PATH
source $PATH/bin/activate
python -m splash.server
24 changes: 12 additions & 12 deletions crawler/data_json.py
@@ -150,18 +150,18 @@ def convert(self, domain):
Spatial is missing
Provider name is replaced by publisher
"""
response.append({'title': dataproperty["name"][0],
'landingPage': url,
'description': dataproperty["description"][0],
'contactName': creatorproperty["name"][0],
'mbox': creatorproperty["email"][0],
'keyword': keywords,
'accessLevel': "public",
'version': dataproperty["version"][0],
'license': dataproperty["license"][0],
'temporal': dataproperty["temporal"][0],
'publisher': providerproperty["name"][0],
'distribution': distributionlist})
#response.append({'title': dataproperty["name"][0],
# 'landingPage': url,
# 'description': dataproperty["description"][0],
# 'contactName': creatorproperty["name"][0],
# 'mbox': creatorproperty["email"][0],
# 'keyword': keywords,
# 'accessLevel': "public",
# 'version': dataproperty["version"][0],
# 'license': dataproperty["license"][0],
# 'temporal': dataproperty["temporal"][0],
# 'publisher': providerproperty["name"][0],
# 'distribution': distributionlist})
""" Escribe en el archivo final """
filename = FileController.FileController().save_existing_data_json(response, domain, False)
""" Elimina el archivo temporal de items """
2 changes: 1 addition & 1 deletion crawler/settings.py
@@ -10,7 +10,7 @@
LOG_LEVEL = 'INFO'
COOKIES_ENABLED = False
LOG_FILE = 'datacrowler.log'
SPLASH_URL = 'http://192.168.0.21:8050/render.html?url='
SPLASH_URL = 'http://192.168.43.151:8050/render.html?url='
#DEPTH_LIMIT = 1

# Crawl responsibly by identifying yourself (and your website) on the user-agent
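The updated SPLASH_URL points at Splash's render.html endpoint and ends in '?url=', which suggests the spider requests rendered pages by simple concatenation. Below is a minimal sketch of that usage, assuming Python 2 to match the codebase; the helper name and target URL are illustrative, not part of the project.

from urllib import quote  # Python 2 stdlib

SPLASH_URL = 'http://192.168.43.151:8050/render.html?url='

def to_splash_url(target_url):
    # Ask Splash to fetch and render target_url, returning the resulting HTML.
    return SPLASH_URL + quote(target_url, safe='')

print to_splash_url('http://www.example.gov.py/datos')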
109 changes: 76 additions & 33 deletions crawler/spiders/data_spider.py
@@ -88,22 +88,23 @@ def parse_item(self, response):
"""
Gets the current search domain and calls the transformar method.
"""
self.log('A response from %s just arrived.' % response.url)
time.sleep(3)

""" Obtiene el domain actual """
https = response.url.find("https")
if https == -1:
pos_second_bar = 7
else:
pos_second_bar = 8
pos_third_bar = response.url.find("/", pos_second_bar + 1)
domain = response.url[pos_second_bar:pos_third_bar]
if domain not in items_list.keys():
items_list[domain] = []
self.log('Domain: %s' % domain)
if response.status != 404:
self.log('A response from %s just arrived.' % response.url)
time.sleep(3)

""" Obtiene el domain actual """
https = response.url.find("https")
if https == -1:
pos_second_bar = 7
else:
pos_second_bar = 8
pos_third_bar = response.url.find("/", pos_second_bar + 1)
domain = response.url[pos_second_bar:pos_third_bar]
if domain not in items_list.keys():
items_list[domain] = []
self.log('Domain: %s' % domain)

transformar(response.url, domain)
transformar(response.url, domain)


def transformar(url, domain):
@@ -135,9 +136,10 @@ def transformar(url, domain):
# If each page contains exactly one item
if len(items) == 1:
# If the item has attributes, add it to or update it in the list
if items:
if items[indice].props:
refresh_items_list(items[indice], domain)
if items[indice].props:
#add_item_to_file(items[indice], "items")
#add_item_to_file_2(items[indice].props['url'][0], "urls")
refresh_items_list(items[indice], domain)


def refresh_items_list(item_nuevo, domain):
@@ -148,15 +150,15 @@ def refresh_items_list(item_nuevo, domain):

# Iterate over the list of existing items
for item in items_list[domain]:
add_item = True
# add_item = True
# If the item being compared is a DataCatalog
if item.itemtype == "[http://schema.org/Datacatalog]":

# If the new item is a DataCatalog, compare directly
if item.itemtype == item_nuevo.itemtype:
if item_nuevo.itemtype == "[http://schema.org/Datacatalog]":

# If it already exists, update it
if item.props['url'] == item_nuevo.props['url']:
if unicode(item.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new item's attributes
Expand All @@ -166,12 +168,12 @@ def refresh_items_list(item_nuevo, domain):
item.props[name].append(v)

# If the new item is a Dataset, search among its datasets
else:
for datasets in item.get_all('datasets'):
elif item_nuevo.itemtype == "[http://schema.org/Dataset]":
for datasets in item.get_all('dataset'):
for dataset in datasets:

# If the item already exists, update it
if dataset.props['url'] == item_nuevo.props['url']:
if unicode(dataset.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new item's attributes
@@ -183,21 +185,62 @@
# TODO: this comparison cannot be done yet because the url is not properly annotated
# If the item being compared is a Dataset
else:
add_item = True
# If the item already exists, update it
if item.props['url'] == item_nuevo.props['url']:
addItem = False

# Add the new item's attributes
for name, values in item_nuevo.props.items():
if not item.props[name]:
for v in values:
item.props[name].append(v)
# If the new item is a Dataset
if item_nuevo.itemtype == "[http://schema.org/Dataset]":

# If the item already exists, update it
if unicode(item.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new item's attributes
for name, values in item_nuevo.props.items():
if not item.props[name]:
for v in values:
item.props[name].append(v)

# If it is a new item, add it to the list
if add_item:
items_list[domain].append(item_nuevo)

# New method for adding items to the list
# def refresh_items_list_2(item_nuevo, domain):
# """
# Updates the list of items per domain for each new item.
# """
# add_item = True
# if item_nuevo.itemtype == "[http://schema.org/Datacatalog]":
# existe = buscar_datacatalog()
# if not existe:
# agregar_datacatalog(item_nuevo)
#
# datasets = extraer_datasets_from_datacatalog()
# for dataset in datasets:
# existe = buscar_dataset()
# if not existe:
# agregar_dataset()
#
# if item_nuevo.itemtype == "[http://schema.org/Dataset]":
# existe = buscar_dataset()
# if not existe:
# agregar_dataset()
# else:
# agregar_atributos_nuevo()
# # If it is a new item, add it to the list
# if add_item:
# items_list[domain].append(item_nuevo)

def add_item_to_file(item, file):
file_name = file + ".json"
filee = open(file_name, 'ab+')
filee.write(item.json())
filee.close()

def add_item_to_file_2(item, file):
file_name = file + ".json"
filee = open(file_name, 'ab+')
filee.write(item + " ")
filee.close()

def copy_items_to_files():
"""
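Most of the refresh_items_list changes above replace whole-property comparisons with a unicode comparison of the first url value, so items crawled from different pages that describe the same resource are merged rather than duplicated. Below is a simplified, standalone sketch of that merge-by-url idea, with plain dicts of lists standing in for the microdata items (the function and variable names are illustrative):

def merge_by_url(existing_items, new_item):
    # Look for an existing item with the same url; if found, copy over any
    # properties it is missing, otherwise append the new item to the list.
    for item in existing_items:
        if unicode(item['url'][0]) == unicode(new_item['url'][0]):
            for name, values in new_item.items():
                if not item.get(name):
                    item[name] = list(values)
            return
    existing_items.append(new_item)

items = [{'url': [u'http://example.org/ds1'], 'name': [u'Dataset 1']}]
merge_by_url(items, {'url': [u'http://example.org/ds1'], 'license': [u'CC-BY']})
merge_by_url(items, {'url': [u'http://example.org/ds2'], 'name': [u'Dataset 2']})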
2 changes: 1 addition & 1 deletion importer/rest.py
@@ -13,7 +13,7 @@ class CKANImporter(object):

def import_package(self, filename, modalidad):
with open(filename) as file: # Use file to refer to the file object
base_dir = '/'.join(a.split('/')[0:-1]) + '/'
base_dir = '/'.join(filename.split('/')[0:-1]) + '/'
data = file.read()
entries = [DataEntry(**j) for j in json.loads(data)]
datasets = [CkanDataset(e, modalidad) for e in entries]
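The one-line change in importer/rest.py fixes a NameError: base_dir was built from an undefined name a instead of the filename argument. For comparison only, the same directory can be derived with os.path.dirname; the path below is illustrative.

import os

filename = '/tmp/crawler-output/example.gov.py/data.json'  # illustrative
base_dir = '/'.join(filename.split('/')[0:-1]) + '/'       # what rest.py now does
assert base_dir == os.path.dirname(filename) + '/'         # equivalent stdlib form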
