-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathqueries.py
150 lines (116 loc) · 4.66 KB
/
queries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
# TODO cache ?
PLOSH_URL = "http://graphdb.dev.innovation.insee.eu/repositories/plosh"
SCOTT_URL = "https://statistics.gov.scot/sparql"
def get_endpoints_list():
return [
{"label": "PLOSH data sets", "value": PLOSH_URL},
{"label": "Scotland's official statistics", "value": SCOTT_URL}
]
def query_datasets(target_url):
QUERY = """
PREFIX qb: <http://purl.org/linked-data/cube#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?label ?comment ?dataset_uri where {
?dataset_uri a qb:DataSet ; rdfs:label ?label .
OPTIONAL {
?dataset_uri rdfs:comment ?comment.
}
}
"""
wrapper = SPARQLWrapper(target_url)
wrapper.setQuery(QUERY)
wrapper.setReturnFormat(JSON)
json = wrapper.query().convert()
def label_modif(row):
if "label" in row.keys():
if 'comment' in row.keys():
return row["label"]["value"]+ ' - '+ row["comment"]["value"]
else :
return row["label"]["value"]
else:
if 'comment' in row.keys():
return row["dataset_uri"]["value"]+ ' - '+ row["comment"]["value"]
else:
return row["dataset_uri"]["value"]
results=json['results']['bindings']
keys=list(results[0].keys())
return list({v['value']:v for v in [{'label': label_modif(result), 'value': result["dataset_uri"]['value']} for result in results]}.values())
def queryToDataFrame(results):
results_value=results['results']['bindings']
table=pd.DataFrame([[x[name]['value'] for x in results_value] for name in list(results_value[0].keys())]).T
table.columns=list(results_value[0].keys())
return table
def query_dimensions(target_url, dataset_uri):
sparql = SPARQLWrapper(target_url)
sparql.setReturnFormat(JSON)
query = f"""
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX qb: <http://purl.org/linked-data/cube#>
SELECT ?label ?value where {{
<{dataset_uri}> qb:structure ?dsd .
?dsd qb:component/qb:dimension ?value .
?value rdfs:label ?label .
#filter(langMatches(lang(?label), "en"))
}}
"""
sparql.setQuery(query)
results = sparql.query().convert()
results=results["results"]["bindings"]
keys=list(results[0].keys())
return [{keys[0]: result[keys[0]]['value'],keys[1]: result[keys[1]]['value']} for result in results]
def query_measures(target_url, dataset_uri):
sparql = SPARQLWrapper(target_url)
sparql.setReturnFormat(JSON)
query = f"""
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX qb: <http://purl.org/linked-data/cube#>
SELECT ?measure ?label where {{
<{dataset_uri}> qb:structure ?dsd .
?dsd qb:component/qb:measure ?measure .
?measure rdfs:label ?label .
FILTER (STRLEN(?label) > 0)
}}
"""
sparql.setQuery(query)
results = sparql.query().convert()
results=results["results"]["bindings"]
def label_modif(row):
if row["label"]["value"]!="":
return row["label"]["value"]
else:
return row["measure"]["value"]
def index(row):
if row["label"]["value"]!="":
return 1
else:
return 2
df=pd.DataFrame([{'label': label_modif(result), 'value': result['measure']['value'], 'index':index(result)}
for result in results]).sort_values(['value','index'])
df=df.groupby('value').first().reset_index()[['value','label']]
return [{'label': result[1], 'value': result[0]} for result in df.to_dict('split')['data']]
def query_data(target_url, dataset_uri, dimension1, dimension2, measures_info):
sparql = SPARQLWrapper(target_url)
sparql.setReturnFormat(JSON)
query = f"""
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX qb: <http://purl.org/linked-data/cube#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
SELECT ?concept1 ?concept2 (SUM(?value) as ?total) where {{
?obs qb:dataSet <{dataset_uri}> .
?obs <{dimension1}> ?dim1 .
?obs <{dimension2}> ?dim2 .
?obs <{measures_info}> ?value .
?dim1 rdfs:label|skos:notation ?concept1 .
?dim2 rdfs:label|skos:notation ?concept2 .
FILTER (STRLEN(?concept1) > 0)
FILTER (STRLEN(?concept2) > 0)
}}
GROUP BY ?concept1 ?concept2
"""
sparql.setQuery(query)
results = sparql.query().convert()
#print(results)
df=queryToDataFrame(results)
return df.pivot(index='concept1', columns='concept2', values='total').reset_index()