-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathinsert-only-doi-ddf.py
111 lines (80 loc) · 3.23 KB
/
insert-only-doi-ddf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os
import yaml
import json
import py2neo
import pandas
import pymongo
import logging
import requests
import datetime
import argparse
from sys import exit
from pprint import pprint
from urllib.parse import quote
class InsertOnlyDoiDDF():
    """Export selected DDF records from MongoDB and merge them into Neo4j.

    The rows picked are the ones in the ``dimension_all_flags`` collection
    whose ``dim_grid`` and ``dim_ddf`` flags are both ``False``.  For each of
    them the matching document of the ``ddf`` collection is looked up by DOI,
    its grid is resolved through the ``organisations`` collection, and the
    result is written to a ``|``-separated CSV that Neo4j loads with
    ``LOAD CSV`` into ``Document`` nodes.
    """

    def __init__(self, config):
        """Set up logging and open the MongoDB and Neo4j connections.

        Args:
            config: Path to a YAML file.  It must provide an ``env`` key and,
                under ``auth[env]``, the keys ``mongo-user``, ``mongo-pass``,
                ``mongo-host``, ``mongo-db``, ``neo4j-pass`` and
                ``neo4j-host``.

        Raises:
            OSError: If the configuration file cannot be opened.
            KeyError: If one of the expected configuration keys is missing.
        """
        logging.basicConfig(
            level=logging.INFO,
            filename='pipeline.log',
            datefmt='%Y-%m-%d %H:%M:%S',
            format='%(asctime)s %(levelname)-8s %(message)s')
        self.logger = logging.getLogger('pipeline')

        name_yml = os.path.abspath(config)
        with open(name_yml, 'r') as ymlfile:
            # BaseLoader only builds plain str/list/dict values, so every
            # setting read below is a string.
            cfg = yaml.load(ymlfile, Loader=yaml.BaseLoader)
        auth = cfg['auth'][cfg['env']]

        # safe='' also percent-escapes '/', which quote() leaves untouched by
        # default; an unescaped '/' in the credentials would corrupt the URI.
        client = pymongo.MongoClient('mongodb://{0}:{1}@{2}/{3}'.format(
            quote(auth['mongo-user'], safe=''),
            quote(auth['mongo-pass'], safe=''),
            auth['mongo-host'],
            auth['mongo-db'])
        )
        self.graph = py2neo.Graph(
            password=auth['neo4j-pass'],
            host=auth['neo4j-host']
        )
        self.db = client[auth['mongo-db']]

        # Names of the MongoDB collections used by process().
        self.dimension_all_flags = 'dimension_all_flags'
        self.organisations = 'organisations'
        self.ddf = 'ddf'
        # Rows collected by process() for the CSV export.
        self.data = []

    def process(self):
        """Build the CSV of selected DDF records and load it into Neo4j.

        Rows whose DDF document or organisation cannot be found are skipped
        with a warning instead of aborting the whole run.  The number of
        nodes created by the import is written to the pipeline log.
        """
        query = {'dim_grid': False, 'dim_ddf': False}
        flagged = self.db[self.dimension_all_flags].find(query, {'_id': 0})
        for row in flagged:
            doi = row['doi']

            # Fetch the record downloaded from the DDF API; it carries the
            # grid.  find_one() returns None when nothing matches.
            only_ddf = self.db[self.ddf].find_one({'meta.doi': doi})
            if only_ddf is None:
                self.logger.warning(
                    'Only DDF: no ddf record for doi %s, row skipped', doi)
                continue
            meta = only_ddf['meta']

            # Resolve the grid through organisations in case it has changed.
            organisation = self.db[self.organisations].find_one(
                {'id_original': meta['grid']})
            if organisation is None:
                self.logger.warning(
                    'Only DDF: no organisation for grid %s (doi %s), '
                    'row skipped', meta['grid'], doi)
                continue

            # Already fixed upstream in getddf.py; kept for records that
            # still carry the key with a trailing space.
            if 'url ' in meta:
                meta['url'] = meta.pop('url ')

            self.data.append({
                'defaultid': 'ddf-{doi}'.format(doi=meta['doi']),
                'doi': meta['doi'],
                'grid': organisation['id'],
                'url': meta['url'],
                'year': meta['year'],
            })

        # The file name must match the one referenced by LOAD CSV below.
        pandas.DataFrame(self.data).to_csv(
            '/var/lib/neo4j/import/only-ddf.csv',
            encoding='utf-8', index=False, sep='|')

        cyphers = """
        USING PERIODIC COMMIT
        LOAD CSV WITH HEADERS FROM "file:///only-ddf.csv" AS row FIELDTERMINATOR '|'
        WITH row
        MERGE (d:Document { defaultid: row.defaultid })
        SET
        d.doi = row.doi,
        d.grid = row.grid,
        d.url = row.url,
        d.year = row.year
        """
        nodes_created = self.graph.run(cyphers).stats().nodes_created
        self.logger.info('Only DDF %s', nodes_created)
if __name__ == '__main__':
    # Command-line entry point: read the optional config path and run the
    # pipeline once with it.
    cli = argparse.ArgumentParser()
    cli.add_argument('-cfg', '--config', default='./config.yml')
    options = cli.parse_args()
    pipeline = InsertOnlyDoiDDF(options.config)
    pipeline.process()