forked from verena91/DataCrawler
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
246 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,3 +13,4 @@ | |
/dist | ||
\#*\# | ||
.\#* | ||
results* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
#!/usr/bin/env python | ||
import unicodedata | ||
import string | ||
import copy | ||
|
||
class DataEntry(object):
    """One dataset record parsed from a data.json-style catalog entry.

    Keyword arguments use the source's camelCase field names and are
    mapped onto snake_case attributes.  Missing fields default to None,
    except ``keywords`` and ``distribution`` which default to empty
    lists so downstream code (CkanDataset) can iterate them safely.
    """

    def __init__(self, *a, **kw):
        self.description = kw.get('description')
        self.contact_name = kw.get('contactName')
        # BUG FIX: default to [] -- CkanDataset iterates keywords
        # unconditionally, which crashed when the field was absent.
        self.keywords = kw.get('keywords') or []
        self.access_level = kw.get('accessLevel')
        self.publisher = kw.get('publisher')
        self.landing_page = kw.get('landingPage')
        self.license = kw.get('license')
        self.title = kw.get('title')
        self.temporal = kw.get('temporal')
        self.mbox = kw.get('mbox')
        self.version = kw.get('version')
        # Wrap each raw distribution dict in a DataDistribution object;
        # tolerate both a missing key and an explicit null.
        self.distribution = [DataDistribution(d)
                             for d in (kw.get('distribution') or [])]
|
||
class DataDistribution(object):
    """A single downloadable resource (access URL + file format) of a
    dataset entry.  Any other keys in the raw dict are ignored.
    """

    def __init__(self, d):
        # Copy only the two fields this importer cares about.
        for attr in ('accessURL', 'format'):
            setattr(self, attr, d.get(attr))
|
||
class CkanDataset(object):
    """CKAN package built from a DataEntry.

    Instances carry exactly the attributes CKAN's package_create /
    package_update actions accept; as_dict() flattens them (including
    the nested CkanResource objects) into a JSON-serializable dict.
    """

    # Known license URL -> CKAN license id translations.
    license_dict = {
        'https://creativecommons.org/licenses/by/4.0/legalcode' : 'cc-by'
    }

    def __init__(self, entry, modalidad):
        self.notes = unicode(entry.description)
        self.title = unicode(entry.title)
        # CKAN "name" is a URL slug: accents stripped, lower-cased,
        # words joined with dashes.
        self.name = self._slugify(entry.title)
        self.tags = ([{'name': unicode(kw)} for kw in entry.keywords]
                     + [{'name': unicode(modalidad)}])
        self.author = unicode(entry.contact_name)
        self.author_email = unicode(entry.mbox)
        self.maintainer = unicode(entry.contact_name)
        self.maintainer_email = unicode(entry.mbox)
        self.version = entry.version
        if entry.temporal:
            # "start/end" interval string; keep both halves.
            parts = entry.temporal.split('/')
            self.valid_from = parts[0]
            self.valid_until = parts[1]
        self.license = self.license_dict.get(entry.license)
        self.owner_org = self._slugify(entry.publisher)
        self.resources = [CkanResource(d, self.title)
                          for d in entry.distribution]
        self.modalidad = u'recolecta'
        self.private = True

    @staticmethod
    def _slugify(text):
        # Same transformation the original applied inline to both the
        # title and the publisher name.
        return unicode('-'.join(remove_accents(text).lower().split()))

    def as_dict(self):
        """Return a plain-dict copy with resources flattened for JSON."""
        data = dict(self.__dict__)
        data['resources'] = [res.__dict__ for res in self.resources]
        return data

    def __str__(self):
        data = self.as_dict()
        pieces = ['Dataset: %s\n' % self.title]
        resource_no = 1
        for key in data:
            if key == 'resources':
                for resource in data[key]:
                    pieces.append('Recurso Nro. %s' % str(resource_no))
                    for field in resource:
                        pieces.append(field + ': ' + unicode(resource[field]))
                    resource_no += 1
            elif key == 'tags':
                pieces.append('tags' + ': '
                              + ', '.join(t['name'] for t in data[key]))
            else:
                pieces.append(key + ': ' + unicode(data[key]))
        return '\n'.join(pieces) + '\n'
|
||
|
||
|
||
|
||
class CkanResource(object):
    """CKAN resource record for one distribution of a dataset.

    The resource name is "<dataset title> <FORMAT>", e.g. "Escuelas CSV".
    """

    def __init__(self, distribution, title):
        fmt = distribution.format
        self.name = title + ' ' + fmt.upper()
        self.format = fmt
        self.url = unicode(distribution.accessURL)
|
||
|
||
def remove_accents(data):
    """ASCII-fold *data*: NFKD-decompose, keep only ASCII letters and
    spaces (digits, punctuation and combining marks are all dropped),
    then lower-case the result.
    """
    allowed = string.ascii_letters + ' '
    decomposed = unicodedata.normalize('NFKD', data)
    return ''.join(ch for ch in decomposed if ch in allowed).lower()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
import json | ||
import requests | ||
import datetime | ||
import os | ||
import sys | ||
from zipfile import ZipFile | ||
from model import DataEntry, CkanDataset | ||
|
||
|
||
class CKANImporter(object): | ||
headers = {'Authorization': 'b90c3e69-fae7-45d8-aade-1fffd94b92e4', 'Content-type':'application/json'} | ||
base_url = 'http://localhost:8080/api/3/action/' | ||
|
||
def import_package(self, filename, modalidad): | ||
with open(filename) as file: # Use file to refer to the file object | ||
base_dir = '/'.join(a.split('/')[0:-1]) + '/' | ||
data = file.read() | ||
entries = [DataEntry(**j) for j in json.loads(data)] | ||
datasets = [CkanDataset(e, modalidad) for e in entries] | ||
valid = [d for d in datasets if d.resources] | ||
|
||
old_datasets = [] | ||
try: | ||
for d in valid: | ||
r = self.create_or_update_dataset(d) | ||
if r: | ||
old_datasets.append(r) | ||
finally: | ||
if old_datasets: | ||
dt = str(datetime.datetime.now()) | ||
filename = 'backup ' + dt | ||
with open(base_dir + filename + '.txt', 'a') as f: | ||
f.write(json.dumps(old_datasets)) | ||
with ZipFile(base_dir + filename + '.zip', 'w') as myzip: | ||
myzip.write(base_dir + filename + '.txt') | ||
|
||
if os.path.isfile(base_dir + filename + '.txt'): | ||
os.remove(base_dir + filename + '.txt') | ||
|
||
|
||
def create_or_update_dataset(self, dataset): | ||
dataset.owner_org = self.get_organization_id(dataset.owner_org) | ||
r = None | ||
exists, old = self.dataset_exists(dataset.name) | ||
confirm = '' | ||
if exists: | ||
print old | ||
while not confirm in ['s', 'n']: | ||
confirm = raw_input("El dataset ya existe. Desea actualizarlo con los valores anteriores? (s/n) ") | ||
if confirm == 's': | ||
r = self.update_dataset(dataset, old) | ||
print 'Se ha actualizado el dataset %s' % dataset.name | ||
return old | ||
else: | ||
print dataset | ||
while not confirm in ['s', 'n']: | ||
confirm = raw_input("Desea crear un nuevo dataset con los valores anteriores? (s/n) ") | ||
if confirm == 's': | ||
r = self.create_dataset(dataset) | ||
print 'Se ha creado el dataset %s' % dataset.name | ||
|
||
def create_dataset(self, dataset): | ||
url = base_url + 'package_create' | ||
dataset_dict = dataset.as_dict() | ||
r = requests.post(url, data=json.dumps(dataset_dict), headers=headers) | ||
if r.status_code == 200: | ||
return r | ||
|
||
|
||
def update_dataset(self, dataset, current): | ||
url = base_url + 'package_update' | ||
dataset_dict = dataset.as_dict() | ||
merge_dict = self.merge_datasets(dataset_dict, current) | ||
r = requests.post(url, data=json.dumps(merge_dict), headers=headers) | ||
if r.status_code == 200: | ||
return r | ||
|
||
def merge_datasets(self, a, b): | ||
for key in a.keys(): | ||
b[key] = a[key] | ||
return b | ||
|
||
def dataset_exists(self, name): | ||
url = base_url + 'package_show' | ||
params = {'id': name} | ||
r = requests.get(url, headers=headers, params=params) | ||
return (r.status_code == 200, r.json()['result']) | ||
|
||
def get_organization_id(self, org_name): | ||
url = base_url + 'organization_show' | ||
params = {'id': org_name} | ||
r = requests.get(url, params=params) | ||
return r.json()['result']['id'] | ||
|
||
|
||
if __name__ == '__main__': | ||
reload(sys) | ||
sys.setdefaultencoding("utf-8") | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.