Skip to content

Commit

Permalink
Cambios mínimos.
Browse files — browse the repository at this point in the history
  • Loading branch information
verena91 committed Sep 11, 2014
1 parent 57753f2 commit 0319738
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 15 deletions.
2 changes: 1 addition & 1 deletion bin/DataCrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def call_spider(file):
data_spider.copy_items_to_files()

""" Eliminar archivos temporales """
FileController.FileController().clean_tmp_files()
#FileController.FileController().clean_tmp_files()

""" Convertir los archivos .json a data.json (formato POD) """
for domain in domains:
Expand Down
4 changes: 2 additions & 2 deletions crawler/data_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def convert(self, domain):
if property:
url = ""
if url_aux in property.keys():
url = dataproperty["url"][0].encode('utf-8')
url = property["url"][0]

""" Iterar sobre creator (publicador) """
for creator in property["creator"]:
Expand Down Expand Up @@ -168,4 +168,4 @@ def convert(self, domain):
FileController.FileController().clean_item_tmp_file(domain)


#DataJson().convert("192.168.200.102")
#DataJson().convert("datos.mec.gov.py")
1 change: 1 addition & 0 deletions crawler/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
LOG_LEVEL = 'INFO'
COOKIES_ENABLED = False
LOG_FILE = 'datacrowler.log'
SPLASH_URL = 'http://192.168.0.21:8050/render.html?url='

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
28 changes: 16 additions & 12 deletions crawler/spiders/data_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,8 @@ def transformar(url, domain):
microdata = {}
microdata['items'] = items = []

url_splash = "http://192.168.0.21:8050/render.html?url=" + url + "&timeout=20&wait=2.5"
settings = get_project_settings()
url_splash = settings['SPLASH_URL'] + url + "&timeout=20&wait=2.5"
file_splash = open('splash.html', 'w')
html = urllib.urlopen(url_splash)
file_splash.write(str(html.read()))
Expand Down Expand Up @@ -147,7 +148,7 @@ def refresh_items_list(item_nuevo, domain):

# Itera sobre la lista de items existentes
for item in items_list[domain]:
add_item = True
#add_item = True
# Si el item a comparar es DataCatalog
if item.itemtype == "[http://schema.org/Datacatalog]":

Expand Down Expand Up @@ -182,16 +183,19 @@ def refresh_items_list(item_nuevo, domain):
# TODO: todavia no se puede hacer esta comparacion porque no esta bien anotada la url
# Si el item a comparar es DataSet
else:
add_item = True
# Si el item ya existe modifica
if item.props['url'] == item_nuevo.props['url']:
addItem = False

# Agrega los nuevos atributos del item
for name, values in item_nuevo.props.items():
if not item.props[name]:
for v in values:
item.props[name].append(v)

# Si el item nuevo es Dataset
if item_nuevo.itemtype == "[http://schema.org/Dataset]":

# Si el item ya existe modifica
if item.props['url'] == item_nuevo.props['url'] and item.props['name'] == item_nuevo.props['name']:
add_item = False

# Agrega los nuevos atributos del item
for name, values in item_nuevo.props.items():
if not item.props[name]:
for v in values:
item.props[name].append(v)

# Si es un nuevo item agrega a la lista
if add_item:
Expand Down

0 comments on commit 0319738

Please sign in to comment.