Launch splash and other changes.
verena91 committed Sep 12, 2014
1 parent 0319738 commit 42d3534
Showing 5 changed files with 100 additions and 40 deletions.
17 changes: 16 additions & 1 deletion bin/DataCrawler.py
@@ -2,6 +2,9 @@

import requests
import click
import os
import time
from multiprocessing import Process
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
@@ -17,8 +20,14 @@
default="/home/desa2/PycharmProjects/DataCrawler/crawler/domains.txt",
help='The list of domains to crawl.')
def main(file):
# Start splash
# p = Process(target=start_splash_server)
# p.start()
# time.sleep(10)
click.echo('File path: %s' % file)
call_spider(file)
# Shut down splash
# p.terminate()


def call_spider(file):
@@ -58,13 +67,19 @@ def call_spider(file):
data_spider.copy_items_to_files()

""" Eliminar archivos temporales """
#FileController.FileController().clean_tmp_files()
FileController.FileController().clean_tmp_files()

""" Convertir los archivos .json a data.json (formato POD) """
for domain in domains:
DataJson.DataJson().convert(domain)


def start_splash_server():
# Start splash
os.system("chmod +x run_splash.sh")
os.system("./run_splash.sh /home/desa2/datos")


results = []


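As an aside, start_splash_server above shells out twice via os.system with no error checking. A minimal alternative sketch using the standard subprocess module (an editor's suggestion, not part of this commit; the script name and virtualenv path are the ones hard-coded above):

import subprocess

def start_splash_server():
    # Make the launcher executable, then run it with the virtualenv path.
    subprocess.check_call(["chmod", "+x", "run_splash.sh"])
    subprocess.check_call(["./run_splash.sh", "/home/desa2/datos"])

Note that splash.server runs in the foreground, which is presumably why main() carries (commented-out) code that wraps this call in a multiprocessing.Process, sleeps while Splash boots, and terminates the process once the crawl ends.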
5 changes: 5 additions & 0 deletions bin/run_splash.sh
@@ -0,0 +1,5 @@
#!/bin/bash
# Activate the virtualenv passed as $1, then launch the Splash server.
# Use a private variable name: assigning to PATH would clobber the shell's command lookup.
VENV_PATH=$1
echo "virtualenv --> $VENV_PATH"
source "$VENV_PATH/bin/activate"
python -m splash.server
24 changes: 12 additions & 12 deletions crawler/data_json.py
@@ -150,18 +150,18 @@ def convert(self, domain):
spatial is still missing
Provider name is replaced by publisher
"""
response.append({'title': dataproperty["name"][0],
'landingPage': url,
'description': dataproperty["description"][0],
'contactName': creatorproperty["name"][0],
'mbox': creatorproperty["email"][0],
'keyword': keywords,
'accessLevel': "public",
'version': dataproperty["version"][0],
'license': dataproperty["license"][0],
'temporal': dataproperty["temporal"][0],
'publisher': providerproperty["name"][0],
'distribution': distributionlist})
#response.append({'title': dataproperty["name"][0],
# 'landingPage': url,
# 'description': dataproperty["description"][0],
# 'contactName': creatorproperty["name"][0],
# 'mbox': creatorproperty["email"][0],
# 'keyword': keywords,
# 'accessLevel': "public",
# 'version': dataproperty["version"][0],
# 'license': dataproperty["license"][0],
# 'temporal': dataproperty["temporal"][0],
# 'publisher': providerproperty["name"][0],
# 'distribution': distributionlist})
""" Escribe en el archivo final """
FileController.FileController().save_existing_data_json(response, domain, False)
""" Elimina el archivo temporal de items """
2 changes: 1 addition & 1 deletion crawler/settings.py
@@ -10,7 +10,7 @@
LOG_LEVEL = 'INFO'
COOKIES_ENABLED = False
LOG_FILE = 'datacrowler.log'
SPLASH_URL = 'http://192.168.0.21:8050/render.html?url='
SPLASH_URL = 'http://192.168.43.151:8050/render.html?url='

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
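For context, SPLASH_URL already embeds Splash's render.html endpoint and its url query parameter, so a spider only needs to append the target address. A hypothetical sketch (get_project_settings is standard Scrapy; the target URL is illustrative):

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
splash_url = settings.get('SPLASH_URL')  # 'http://192.168.43.151:8050/render.html?url='
# Splash fetches and renders the page before returning HTML to the crawler.
render_url = splash_url + 'http://datos.example.org/catalog'

Strictly, the appended URL ought to be percent-encoded first (urllib.quote in Python 2) so its own query string does not bleed into Splash's.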
92 changes: 66 additions & 26 deletions crawler/spiders/data_spider.py
@@ -11,6 +11,7 @@
from scrapy.utils.project import get_project_settings
from microdata import get_items
from rdflib.serializer import Serializer
from scrapy import log

try:
import json
@@ -88,22 +89,23 @@ def parse_item(self, response):
"""
Gets the current search domain and calls the transformar method.
"""
self.log('A response from %s just arrived.' % response.url)
time.sleep(3)

""" Obtiene el domain actual """
https = response.url.find("https")
if https == -1:
pos_second_bar = 7
else:
pos_second_bar = 8
pos_third_bar = response.url.find("/", pos_second_bar + 1)
domain = response.url[pos_second_bar:pos_third_bar]
if domain not in items_list.keys():
items_list[domain] = []
self.log('Domain: %s' % domain)
if response.status != 404:
self.log('A response from %s just arrived.' % response.url)
time.sleep(3)

""" Obtiene el domain actual """
https = response.url.find("https")
if https == -1:
pos_second_bar = 7
else:
pos_second_bar = 8
pos_third_bar = response.url.find("/", pos_second_bar + 1)
domain = response.url[pos_second_bar:pos_third_bar]
if domain not in items_list.keys():
items_list[domain] = []
self.log('Domain: %s' % domain)

transformar(response.url, domain)
transformar(response.url, domain)


def transformar(url, domain):
@@ -135,9 +137,10 @@ def transformar(url, domain):
# If each page contains exactly one item
if len(items) == 1:
# If the item has attributes, add it to or update it in the list
if items:
if items[indice].props:
refresh_items_list(items[indice], domain)
if items[indice].props:
#add_item_to_file(items[indice], "items")
#add_item_to_file_2(items[indice].props['url'][0], "urls")
refresh_items_list(items[indice], domain)


def refresh_items_list(item_nuevo, domain):
@@ -148,15 +151,15 @@ def refresh_items_list(item_nuevo, domain):

# Iterate over the list of existing items
for item in items_list[domain]:
#add_item = True
# add_item = True
# If the item being compared is a DataCatalog
if item.itemtype == "[http://schema.org/Datacatalog]":

# If the new item is a DataCatalog, compare directly
if item.itemtype == item_nuevo.itemtype:
if item_nuevo.itemtype == "[http://schema.org/Datacatalog]":

# If it already exists, update it
if item.props['url'] == item_nuevo.props['url']:
if unicode(item.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new item's attributes
@@ -166,12 +169,12 @@
item.props[name].append(v)

# If the new item is a Dataset, search among its datasets
else:
for datasets in item.get_all('datasets'):
elif item_nuevo.itemtype == "[http://schema.org/Dataset]":
for datasets in item.get_all('dataset'):
for dataset in datasets:

# If the item already exists, update it
if dataset.props['url'] == item_nuevo.props['url']:
if unicode(dataset.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new item's attributes
@@ -180,15 +183,14 @@
for v in values:
dataset.props[name].append(v)

# TODO: this comparison cannot be made yet because the url is not properly annotated
# If the item being compared is a Dataset
else:

# If the new item is a Dataset
if item_nuevo.itemtype == "[http://schema.org/Dataset]":

# If the item already exists, update it
if item.props['url'] == item_nuevo.props['url'] and item.props['name'] == item_nuevo.props['name']:
if unicode(item.props['url'][0]) == unicode(item_nuevo.props['url'][0]):
add_item = False

# Add the new item's attributes
@@ -201,6 +203,44 @@
if add_item:
items_list[domain].append(item_nuevo)
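A note on the unicode() coercion introduced in the comparisons above: in Python 2, a str and a unicode string holding the same text compare unequal (with a UnicodeWarning) once non-ASCII bytes are involved, so normalizing both sides avoids silently missed matches. A minimal illustration with hypothetical values:

a = 'http://datos.example.org/a\xc3\xb1o'  # str holding UTF-8 bytes
b = u'http://datos.example.org/a\xf1o'     # unicode for the same address
print a == b                    # False (plus a UnicodeWarning)
print unicode(a, 'utf-8') == b  # True once both sides are unicode

(The code here calls unicode(x) with no encoding argument, which is safe while the values are already unicode or pure ASCII.)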

# New method for adding items to the list
# def refresh_items_list_2(item_nuevo, domain):
# """
# Updates the per-domain item list for each new item.
# """
# add_item = True
# if item_nuevo.itemtype == "[http://schema.org/Datacatalog]":
# existe = buscar_datacatalog()
# if not existe:
# agregar_datacatalog(item_nuevo)
#
# datasets = extraer_datasets_from_datacatalog()
# for dataset in datasets:
# existe = buscar_dataset()
# if not existe:
# agregar_dataset()
#
# if item_nuevo.itemtype == "[http://schema.org/Dataset]":
# existe = buscar_dataset()
# if not existe:
# agregar_dataset()
# else:
# agregar_atributos_nuevo()
# # If it is a new item, add it to the list
# if add_item:
# items_list[domain].append(item_nuevo)

# Debug helper: append the item's JSON representation to <file>.json
# (referenced only from the commented-out calls in transformar).
def add_item_to_file(item, file):
file_name = file + ".json"
filee = open(file_name, 'ab+')
filee.write(item.json())
filee.close()

# Debug helper: append a raw string plus a separator space to <file>.json.
def add_item_to_file_2(item, file):
file_name = file + ".json"
filee = open(file_name, 'ab+')
filee.write(item + " ")
filee.close()
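Because these helpers open with mode 'ab+' and append on every call, the resulting file holds back-to-back JSON objects rather than one valid JSON document. A hypothetical reader for output produced by add_item_to_file (assuming only optional whitespace between objects):

import json

def read_concatenated_json(path):
    # Decode consecutive JSON objects from a single file.
    decoder = json.JSONDecoder()
    with open(path) as f:
        text = f.read()
    items, pos = [], 0
    while pos < len(text):
        obj, end = decoder.raw_decode(text, pos)
        items.append(obj)
        pos = end
        while pos < len(text) and text[pos].isspace():
            pos += 1  # skip whitespace between objects
    return items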

def copy_items_to_files():
"""
