Commit

Integrates with the CKAN importer
rparrapy committed Sep 11, 2014
1 parent 57753f2 commit 5012738
Showing 11 changed files with 246 additions and 28 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -13,3 +13,4 @@
/dist
\#*\#
.\#*
results*
27 changes: 23 additions & 4 deletions bin/DataCrawler.py
@@ -2,6 +2,7 @@

import requests
import click
import sys
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
@@ -11,14 +12,18 @@
from crawler import data_json as DataJson
from crawler import file_controller as FileController

from importer.rest import CKANImporter


@click.command()
@click.option('--file', # prompt='Path to your file with domains to crawl',
default="/home/desa2/PycharmProjects/DataCrawler/crawler/domains.txt",
default="./crawler/domains.txt",
help='The list of domains to crawl.')
def main(file):
click.echo('File path: %s' % file)
call_spider(file)
created_files = call_spider(file)
log.msg("continua la ejecucion", level=log.DEBUG)
import_to_ckan(created_files)


def call_spider(file):
@@ -30,14 +35,16 @@ def call_spider(file):
list_url = f.readlines()
domains = []
urls = []
created_files = []
for u in list_url:
domain = u.strip('\n')
url = "http://" + u.strip('\n') + "" + "/"
print "============= Domain " + domain
print "============= Start url " + url
response = requests.get(url + "/data.json")
if response.status_code == 200:
FileController.FileController().save_existing_data_json(response, domain, True)
filename = FileController.FileController().save_existing_data_json(response, domain, True)
created_files.append({'modalidad': 'recolecta', 'archivo': filename})
else:
domains.append(domain)
urls.append(url)
@@ -52,6 +59,7 @@ def call_spider(file):
crawler.crawl(spider)
crawler.start()
log.start(loglevel=log.DEBUG)
log.msg("after log", level=log.DEBUG)
reactor.run() # the script will block here

""" Copiar los datos a los archivos .json """
@@ -62,15 +70,26 @@ def call_spider(file):

""" Convertir los archivos .json a data.json (formato POD) """
for domain in domains:
DataJson.DataJson().convert(domain)
filename = DataJson.DataJson().convert(domain)
created_files.append({'modalidad': 'data-hunting', 'archivo': filename})

return created_files

results = []


def spider_closed(spider):
print results

def import_to_ckan(created_files):
importer = CKANImporter()
for f in created_files:
m = 'Importing %s' % str(f)
log.msg(m, level=log.DEBUG)
importer.import_package(f['archivo'], f['modalidad'])


if __name__ == '__main__':
reload(sys)
sys.setdefaultencoding("utf-8")
main()
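
For clarity, a small sketch (Python 2, like the rest of the project, with invented paths) of the hand-off this commit introduces: call_spider() now returns a list describing every data.json it produced, and import_to_ckan() forwards each entry to CKANImporter.import_package(archivo, modalidad).

# Illustrative only: the shape of the list returned by call_spider() and
# consumed by import_to_ckan(); the file paths below are invented.
created_files = [
    {'modalidad': 'recolecta', 'archivo': 'results/datos.example.gov.py/data.json'},
    {'modalidad': 'data-hunting', 'archivo': 'results/otro.example.gov.py/data.json'},
]
for f in created_files:
    print 'Importing %s' % str(f)  # mirrors the logging done in import_to_ckan()
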
3 changes: 2 additions & 1 deletion crawler/data_json.py
@@ -163,9 +163,10 @@ def convert(self, domain):
'publisher': providerproperty["name"][0],
'distribution': distributionlist})
""" Escribe en el archivo final """
FileController.FileController().save_existing_data_json(response, domain, False)
filename = FileController.FileController().save_existing_data_json(response, domain, False)
""" Elimina el archivo temporal de items """
FileController.FileController().clean_item_tmp_file(domain)

return filename

#DataJson().convert("192.168.200.102")
7 changes: 4 additions & 3 deletions crawler/file_controller.py
@@ -39,9 +39,10 @@ def save_existing_data_json(self, response, domain, to_json):
subprincipal = principal + "/" + domain
if not os.path.exists(subprincipal):
os.makedirs(subprincipal)
final = subprincipal + "/" + "data.json"
file_response = codecs.open(final, 'w+', 'utf-8-sig')
filename = subprincipal + "/" + "data.json"
file_response = codecs.open(filename, 'w+', 'utf-8-sig')
if to_json == True:
file_response.write(json.dumps(response.json(), indent=2, ensure_ascii=False))
else:
file_response.write(json.dumps(response, indent=2, ensure_ascii=False))
file_response.write(json.dumps(response, indent=2, ensure_ascii=False))
return filename
1 change: 1 addition & 0 deletions crawler/settings.py
@@ -10,6 +10,7 @@
LOG_LEVEL = 'INFO'
COOKIES_ENABLED = False
LOG_FILE = 'datacrowler.log'
#DEPTH_LIMIT = 1

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
20 changes: 11 additions & 9 deletions crawler/spiders/data_spider.py
@@ -183,15 +183,17 @@ def refresh_items_list(item_nuevo, domain):
# If the item being compared is a DataSet
else:
add_item = True
# If the item already exists, modify it
if item.props['url'] == item_nuevo.props['url']:
addItem = False

# Add the new attributes of the item
for name, values in item_nuevo.props.items():
if not item.props[name]:
for v in values:
item.props[name].append(v)
# If the new item is a Dataset
if item_nuevo.itemtype == "[http://schema.org/Dataset]":
# If the item already exists, modify it
if item.props['url'] == item_nuevo.props['url']:
add_item = False

# Add the new attributes of the item
for name, values in item_nuevo.props.items():
if not item.props[name]:
for v in values:
item.props[name].append(v)

# If it is a new item, add it to the list
if add_item:
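
To make the intent of the reworked block above easier to follow, here is a self-contained sketch of the merge rule: a new item is folded into an existing one only when it is a schema.org/Dataset with the same url, and only its missing properties are copied over. The Item class below is a hypothetical stand-in for the scraped microdata items, not the project's real class.

# Hypothetical stand-in for a scraped microdata item (only the fields used here).
class Item(object):
    def __init__(self, itemtype, props):
        self.itemtype = itemtype
        self.props = props

existing = Item("[http://schema.org/Dataset]",
                {'url': ['http://datos.example.gov.py/ds1'], 'title': []})
new_item = Item("[http://schema.org/Dataset]",
                {'url': ['http://datos.example.gov.py/ds1'], 'title': ['Dataset 1']})

add_item = True
# Same URL and the new item is a Dataset: fill in the missing properties
# of the existing item instead of appending a duplicate.
if new_item.itemtype == "[http://schema.org/Dataset]":
    if existing.props['url'] == new_item.props['url']:
        add_item = False
        for name, values in new_item.props.items():
            if not existing.props[name]:
                for v in values:
                    existing.props[name].append(v)

print existing.props['title']  # ['Dataset 1']
print add_item                 # False
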
Empty file added importer/__init__.py
Empty file.
94 changes: 94 additions & 0 deletions importer/model.py
@@ -0,0 +1,94 @@
#!/usr/bin/env python
import unicodedata
import string
import copy

class DataEntry(object):
def __init__(self, *a, **kw):
self.description = kw.get('description')
self.contact_name = kw.get('contactName')
self.keywords = kw.get('keywords')
self.access_level = kw.get('accessLevel')
self.publisher = kw.get('publisher')
self.landing_page = kw.get('landingPage')
self.license = kw.get('license')
self.title = kw.get('title')
self.temporal = kw.get('temporal')
self.mbox = kw.get('mbox')
self.version = kw.get('version')
self.distribution = []
for d in kw.get('distribution', []):
self.distribution.append(DataDistribution(d))

class DataDistribution(object):
def __init__(self, d):
self.accessURL = d.get('accessURL')
self.format = d.get('format')

class CkanDataset(object):

license_dict = {
'https://creativecommons.org/licenses/by/4.0/legalcode' : 'cc-by'
}

def __init__(self, entry, modalidad):
self.notes = unicode(entry.description)
self.title = unicode(entry.title)
self.name = unicode('-'.join(remove_accents(entry.title).lower().split()))
self.tags = [{'name': unicode(k)} for k in entry.keywords] + [{'name': unicode(modalidad)}]
self.author = unicode(entry.contact_name)
self.author_email = unicode(entry.mbox)
self.maintainer = unicode(entry.contact_name)
self.maintainer_email = unicode(entry.mbox)
self.version = entry.version
if entry.temporal:
self.valid_from = entry.temporal.split('/')[0]
self.valid_until = entry.temporal.split('/')[1]
self.license = self.license_dict.get(entry.license)
self.owner_org = unicode('-'.join(remove_accents(entry.publisher).lower().split()))
self.resources = []
for d in entry.distribution:
self.resources.append(CkanResource(d, self.title))
self.modalidad = u'recolecta'
self.private = True

def as_dict(self):
d = self.__dict__.copy()
d['resources'] = [r.__dict__ for r in self.resources]
#print d['resources']
return d

def __str__(self):
d = self.as_dict()
c = 1
rep = 'Dataset: %s\n\n' % self.title
for k in d:
if k == 'resources':
res = d[k]
for r in res:
rep += 'Recurso Nro. %s\n' % str(c)
for i in r:
rep += i + ': ' + unicode(r[i]) + '\n'
c += 1
elif k == 'tags':
tags = ''
for t in d[k]:
tags += t['name'] + ', '
tags = tags[0:-2]
rep += 'tags' + ': ' + tags + '\n'
else:
rep += k + ': ' + unicode(d[k]) + '\n'
return rep




class CkanResource(object):
def __init__(self, distribution, title):
self.name = title + ' ' + distribution.format.upper()
self.format = distribution.format
self.url = unicode(distribution.accessURL)


def remove_accents(data):
return ''.join(x for x in unicodedata.normalize('NFKD', data) if x in string.ascii_letters + ' ').lower()
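
A hedged sketch of how one entry of a POD-style data.json would flow through these classes; the values below are invented and the import assumes the snippet is run from the project root.

# Invented entry, only to show the DataEntry -> CkanDataset mapping (Python 2).
from importer.model import DataEntry, CkanDataset

entry_json = {
    'title': u'Presupuesto General de la Nacion',
    'description': u'Datos del presupuesto nacional',
    'contactName': u'Juan Perez',
    'mbox': u'juan@example.gov.py',
    'keywords': [u'presupuesto', u'hacienda'],
    'accessLevel': u'public',
    'publisher': u'Ministerio de Hacienda',
    'license': u'https://creativecommons.org/licenses/by/4.0/legalcode',
    'version': u'1.0',
    'distribution': [{u'accessURL': u'http://datos.example.gov.py/presupuesto.csv',
                      u'format': u'csv'}],
}

entry = DataEntry(**entry_json)
dataset = CkanDataset(entry, 'recolecta')
print dataset.name     # presupuesto-general-de-la-nacion
print dataset.license  # cc-by
print dataset.as_dict()['resources'][0]['url']
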
99 changes: 99 additions & 0 deletions importer/rest.py
@@ -0,0 +1,99 @@
import json
import requests
import datetime
import os
import sys
from zipfile import ZipFile
from model import DataEntry, CkanDataset


class CKANImporter(object):
headers = {'Authorization': 'b90c3e69-fae7-45d8-aade-1fffd94b92e4', 'Content-type':'application/json'}
base_url = 'http://localhost:8080/api/3/action/'

def import_package(self, filename, modalidad):
with open(filename) as file: # Use file to refer to the file object
base_dir = '/'.join(filename.split('/')[0:-1]) + '/'  # directory of the imported file
data = file.read()
entries = [DataEntry(**j) for j in json.loads(data)]
datasets = [CkanDataset(e, modalidad) for e in entries]
valid = [d for d in datasets if d.resources]

old_datasets = []
try:
for d in valid:
r = self.create_or_update_dataset(d)
if r:
old_datasets.append(r)
finally:
if old_datasets:
dt = str(datetime.datetime.now())
filename = 'backup ' + dt
with open(base_dir + filename + '.txt', 'a') as f:
f.write(json.dumps(old_datasets))
with ZipFile(base_dir + filename + '.zip', 'w') as myzip:
myzip.write(base_dir + filename + '.txt')

if os.path.isfile(base_dir + filename + '.txt'):
os.remove(base_dir + filename + '.txt')


def create_or_update_dataset(self, dataset):
dataset.owner_org = self.get_organization_id(dataset.owner_org)
r = None
exists, old = self.dataset_exists(dataset.name)
confirm = ''
if exists:
print old
while not confirm in ['s', 'n']:
confirm = raw_input("El dataset ya existe. Desea actualizarlo con los valores anteriores? (s/n) ")
if confirm == 's':
r = self.update_dataset(dataset, old)
print 'Se ha actualizado el dataset %s' % dataset.name
return old
else:
print dataset
while not confirm in ['s', 'n']:
confirm = raw_input("Desea crear un nuevo dataset con los valores anteriores? (s/n) ")
if confirm == 's':
r = self.create_dataset(dataset)
print 'Se ha creado el dataset %s' % dataset.name

def create_dataset(self, dataset):
url = self.base_url + 'package_create'
dataset_dict = dataset.as_dict()
r = requests.post(url, data=json.dumps(dataset_dict), headers=self.headers)
if r.status_code == 200:
return r


def update_dataset(self, dataset, current):
url = self.base_url + 'package_update'
dataset_dict = dataset.as_dict()
merge_dict = self.merge_datasets(dataset_dict, current)
r = requests.post(url, data=json.dumps(merge_dict), headers=self.headers)
if r.status_code == 200:
return r

def merge_datasets(self, a, b):
for key in a.keys():
b[key] = a[key]
return b

def dataset_exists(self, name):
url = self.base_url + 'package_show'
params = {'id': name}
r = requests.get(url, headers=self.headers, params=params)
return (r.status_code == 200, r.json().get('result'))

def get_organization_id(self, org_name):
url = self.base_url + 'organization_show'
params = {'id': org_name}
r = requests.get(url, params=params)
return r.json()['result']['id']


if __name__ == '__main__':
reload(sys)
sys.setdefaultencoding("utf-8")
main()
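
A minimal usage sketch of the new importer, assuming a CKAN instance is reachable at the base_url configured in the class attributes above and that the data.json path below (invented) exists.

# Hypothetical invocation (Python 2); the path is made up and the API key
# and base_url come from the CKANImporter class attributes.
from importer.rest import CKANImporter

importer = CKANImporter()
importer.import_package('results/datos.example.gov.py/data.json', 'recolecta')
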
20 changes: 10 additions & 10 deletions install.sh
@@ -3,7 +3,7 @@
# Notes: not finished or tested yet.
# IMPORTANT: This script must be run within the virtual environment

PATH_PYTHON_VENV_SITE_PACKAGES=$1
PATH_PYTHON_VENV=$1

echo $1

@@ -18,7 +18,7 @@ echo "--------- Installing RDFlib"
pip install rdflib
cd lib/
# Copy the microdata.py file into rdflib/plugins/serializers
cp microdata.py $PATH_PYTHON_VENV_SITE_PACKAGES/rdflib/plugins/serializers
cp microdata.py $PATH_PYTHON_VENV/lib/python2.7/site-packages/rdflib/plugins/serializers

# DataCrawler Project dependencies
cd ..
@@ -29,24 +29,24 @@ python setup.py develop
cd lib
cd sip
echo "--------- Installing SIP 4.16.2"
python configure.py
make
sudo make install
#python configure.py
#make
#sudo make install

# re2
cd ../re2
echo "--------- Installing re2"
#make test
sudo make install
#sudo make install
#sudo make testinstall
pip install re2
#pip install re2

# PyQt4
cd ../pyqt
echo "--------- Installing PyQt 4.10.04"
python configure-ng.py
make
sudo make install
#python configure-ng.py
#make
#sudo make install

cd ../..
