-
Notifications
You must be signed in to change notification settings - Fork 0
/
findExistingTaxTermsAndTermsToCreate.py
73 lines (62 loc) · 2.46 KB
/
findExistingTaxTermsAndTermsToCreate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pandas as pd
from datetime import datetime
import os
# All folders are resolved relative to the directory the script is run from.
path = os.getcwd()

# Input folders: only their paths are built here; they are not created.
# The trailing slashes are kept because the paths are used as string prefixes.
aggregated = os.path.join(path, 'aggregated-taxonomies/')
taxonomies = os.path.join(path, 'existing-taxonomies/')

# Output folders: created on demand. exist_ok=True replaces the racy
# "check, then mkdir" pattern and makes re-running the script harmless.
directory = os.path.join(path, 'items-matched/')
os.makedirs(directory, exist_ok=True)
termsDone = os.path.join(path, 'termsDone/')
os.makedirs(termsDone, exist_ok=True)
termsToCreate = os.path.join(path, 'termsToCreate/')
os.makedirs(termsToCreate, exist_ok=True)

# Aggregated CSV file name -> taxonomy CSV file name it is paired with.
matchDictionary = {
    'AggregatedByfield_publisher.csv': 'publishers.csv',
    'AggregatedByfield_subjects.csv': 'subjects.csv',
    'AggregatedByfield_instrumentation_metadata.csv':
        'instrumentation_metadata.csv',
}

# Field name -> taxonomy name.
matchFieldsAndTax = {
    'field_publisher': 'publishers',
    'field_subjects': 'subjects',
    'field_instrumentation_metadata': 'instrumentation_metadata',
}
# For each (aggregated file, taxonomy file) pair, left-join the two CSVs on
# their 'name' column and write the result as 'matched_<field>.csv'.
for key, value in matchDictionary.items():
    df_1 = pd.read_csv(os.path.join(aggregated, key), header=0)
    df_2 = pd.read_csv(os.path.join(taxonomies, value), header=0)

    # how='left' keeps every row of the aggregated file; columns that exist
    # in both inputs (other than 'name') get the '_1' / '_2' suffixes.
    frame = pd.merge(df_1, df_2, how='left', on=['name'], suffixes=('_1', '_2'))
    frame.drop_duplicates(inplace=True)

    renamedKey = key.replace('AggregatedBy', 'matched_')
    fullname = os.path.join(directory, renamedKey)
    frame.to_csv(fullname, index=False)

    # Report only after the write succeeded.
    print("Created new file: {}".format(renamedKey))
    print('')
# Load every CSV in the matched-items folder and stack them into one frame
# (newDF), tagging each row with a 'type' derived from its file name.
matchedFrames = []
for filename in os.listdir(directory):
    if not filename.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(directory, filename))

    # 'matched_field_subjects.csv' -> 'field_subjects'
    field = filename[:-4]
    if field.startswith('matched_'):
        field = field[len('matched_'):]
    df['type'] = field
    matchedFrames.append(df)

if matchedFrames:
    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing the frame inside the loop. Columns are sorted explicitly so the
    # column order matches what append(..., sort=True) used to produce.
    newDF = pd.concat(matchedFrames, ignore_index=True, sort=False)
    newDF = newDF.sort_index(axis=1)
else:
    # No CSV files found: keep the empty-frame result.
    newDF = pd.DataFrame()
# Split the combined rows on whether they carry an 'id':
#   - no id  -> written to taxonomyTermsToCreate.csv, with the 'id' column
#               removed and a 'taxonomy' column looked up from the row's 'type'
#   - has id -> written unchanged to taxonomyTermsDone.csv
if newDF.empty:
    # Nothing was loaded (the 'id' column may not even exist).
    toCreate = newDF.copy()
    done = newDF.copy()
else:
    missingId = newDF['id'].isna()

    toCreate = newDF.loc[missingId].drop(columns='id')
    # Types with no entry in matchFieldsAndTax get an empty taxonomy value.
    toCreate = toCreate.assign(
        taxonomy=toCreate['type'].map(matchFieldsAndTax))

    done = newDF.loc[~missingId].copy()

# Selecting with a mask keeps the column headers even when a category has no
# rows, so each output file always has a header row.
filename = 'taxonomyTermsToCreate.csv'
fullname = os.path.join(termsToCreate, filename)
toCreate.to_csv(fullname, index=False)

filename2 = 'taxonomyTermsDone.csv'
fullname2 = os.path.join(termsDone, filename2)
done.to_csv(fullname2, index=False)