-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaws_stac_catalogs.py
96 lines (62 loc) · 2.41 KB
/
aws_stac_catalogs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import json
import os
import shutil
import yaml
import leafmap
import pandas as pd
# Snapshot of the AWS Open Data Registry repository and the local paths used
# for the downloaded archive and its extracted folder.
url = "https://github.com/awslabs/open-data-registry/archive/refs/heads/main.zip"
out_dir = "open-data-registry-main"
zip_file = "open-data-registry-main.zip"

# Clear anything left behind by a previous run before downloading again.
for stale_path, remove in ((out_dir, shutil.rmtree), (zip_file, os.remove)):
    if os.path.exists(stale_path):
        remove(stale_path)

leafmap.download_file(url, output=zip_file, unzip=True)

# Every .yaml file found under "<out_dir>/datasets" is counted as one dataset.
in_dir = os.path.join(out_dir, "datasets")
files = leafmap.find_files(in_dir, ext=".yaml")
print(f"Total number of AWS open datasets: {len(files)}")
def _endpoint_from_explore(entry):
    """Return the URL contained in a registry ``Explore`` entry.

    The entry is presumably a Markdown link such as
    ``[STAC Browser](https://example.com/catalog.json)``: the URL is taken
    from the first ``http`` onwards, without the link's closing parenthesis.
    An empty string is returned when the entry contains no ``http``.
    """
    text = entry.strip()
    start = text.find("http")
    if start == -1:
        return ""
    link = text[start:]
    # Drop the last character only when it really is the Markdown ")", so a
    # bare URL is not truncated.
    return link[:-1] if link.endswith(")") else link


# One flat record per STAC endpoint; used later to build the catalog tables.
datasets = []
# Dataset name -> number of resources with an "Explore" entry seen so far.
names = {}

# Datasets tagged "stac" are copied into this folder. Create it up front
# because shutil.copy does not create missing directories.
os.makedirs("datasets", exist_ok=True)

for file in files:
    # Explicit UTF-8 so parsing does not depend on the platform locale.
    with open(file, "r", encoding="utf-8") as f:
        dataset = yaml.safe_load(f)

    # An empty YAML document loads as None; skip anything that is not a
    # mapping, as well as datasets flagged as deprecated.
    if not isinstance(dataset, dict) or "Deprecated" in dataset:
        continue

    # "or []" also covers keys that are present but empty (loaded as None).
    tags = dataset.get("Tags") or []
    if "stac" not in tags:
        continue

    name = dataset.get("Name", "")
    shutil.copy(file, os.path.join("datasets", os.path.basename(file)))

    resources = dataset.get("Resources") or []
    explorable = [
        resource
        for resource in resources
        if isinstance(resource, dict) and resource.get("Explore")
    ]

    # Count every explorable resource of this dataset before naming any of
    # them: the count decides whether names need a distinguishing suffix.
    if explorable:
        names[name] = names.get(name, 0) + len(explorable)

    for resource in explorable:
        # Only the first Explore entry is used; the key itself is removed so
        # it does not end up as a column of the record.
        endpoint = _endpoint_from_explore(resource.pop("Explore")[0])

        description = (resource.get("Description") or "").replace(
            "Water Observations from Space ", ""
        )
        resource["Description"] = description

        if names[name] > 1:
            # Several endpoints share this dataset name: append the
            # description (minus the repeated name) to tell them apart.
            title = f"{name} - {description.replace(name, '')}"
        else:
            title = name
        title = title.replace("/", "-").replace("- -", "-")

        # Name and Endpoint come first, followed by the remaining resource
        # fields in their original order.
        item = {"Name": title, "Endpoint": endpoint}
        item.update(resource)
        datasets.append(item)
print(f"Total number of STAC datasets: {len(datasets)}")

# Build the table from the collected records, sorted by the "Name" column,
# and write it out as a tab-separated file.
df = pd.DataFrame(datasets).sort_values(by="Name")
df.to_csv("aws_stac_catalogs.tsv", sep="\t", index=False)

# Serialise through pandas first and parse the result back, so the records
# written below are the values produced by pandas' JSON encoder.
data = json.loads(df.to_json(orient="records"))
with open("aws_stac_catalogs.json", "w") as f:
    f.write(json.dumps(data, indent=4))