-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
It only took about a minute on my computer on crappy wifi so this might be good enough for now?
- Loading branch information
Showing
8 changed files
with
1,216 additions
and
94 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
use anyhow::{Error, Result}; | ||
use serde::Deserialize; | ||
use stac::{Catalog, Link}; | ||
use std::{ | ||
collections::HashMap, | ||
fs::File, | ||
io::{BufReader, Read}, | ||
path::Path, | ||
}; | ||
|
||
#[derive(Debug, Deserialize)] | ||
pub struct Config { | ||
catalog: Catalog, | ||
catalogs: HashMap<String, CatalogConfig>, | ||
} | ||
|
||
#[derive(Debug, Deserialize)] | ||
struct CatalogConfig { | ||
href: String, | ||
title: String, | ||
index: usize, | ||
} | ||
|
||
impl Config { | ||
pub fn from_path(path: impl AsRef<Path>) -> Result<Config> { | ||
let mut file = BufReader::new(File::open(path)?); | ||
let mut s = String::new(); | ||
file.read_to_string(&mut s)?; | ||
toml::from_str(&s).map_err(Error::from) | ||
} | ||
|
||
pub async fn crawl(self) -> Result<Catalog> { | ||
crate::crawl(self.into_catalog()?).await | ||
} | ||
|
||
pub fn into_catalog(mut self) -> Result<Catalog> { | ||
for (id, catalog_config) in &self.catalogs { | ||
let mut link = | ||
Link::child(&catalog_config.href).title(Some(catalog_config.title.clone())); | ||
// Once https://github.com/stac-utils/stac-rs/issues/501 lands this should be cleaner | ||
link.additional_fields | ||
.insert("heystac:id".into(), id.as_str().into()); | ||
link.additional_fields | ||
.insert("heystac:index".into(), catalog_config.index.into()); | ||
self.catalog.links.push(link); | ||
} | ||
self.catalog.links.sort_by_key(|c| { | ||
c.additional_fields | ||
.get("heystac:index") | ||
.unwrap() | ||
.as_i64() | ||
.unwrap() | ||
}); | ||
Ok(self.catalog) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
use anyhow::{Error, Result}; | ||
use reqwest::{Client, Url}; | ||
use serde::{Deserialize, Serialize}; | ||
use serde_json::Value; | ||
use stac::Catalog; | ||
use std::{collections::HashMap, future::Future, pin::Pin}; | ||
use tokio::task::JoinSet; | ||
|
||
pub async fn crawl(catalog: Catalog) -> Result<Catalog> { | ||
let client = Client::new(); | ||
crawl_value(catalog.try_into()?, client).await?.try_into() | ||
} | ||
|
||
fn crawl_value( | ||
mut value: CrawlValue, | ||
client: Client, | ||
) -> Pin<Box<impl Future<Output = Result<CrawlValue>>>> { | ||
Box::pin(async move { | ||
let mut join_set: JoinSet<Result<CrawlValue>> = JoinSet::new(); | ||
match value.r#type.as_str() { | ||
"Catalog" => { | ||
for link in value.links.iter().filter(|link| link.rel == "child") { | ||
let href = link.href.clone(); | ||
let client = client.clone(); | ||
tracing::info!("getting child: {href}"); | ||
let _ = join_set.spawn(async move { | ||
client | ||
.get(href) | ||
.send() | ||
.await? | ||
.error_for_status()? | ||
.json() | ||
.await | ||
.map_err(Error::from) | ||
}); | ||
} | ||
} | ||
"Collection" => { | ||
if let Some(link) = value.links.iter().find(|link| link.rel == "item") { | ||
let url = Url::parse_with_params( | ||
&link.href, | ||
[("limit", "1"), ("sortby", "-properties.datetime")], | ||
)?; | ||
tracing::info!("getting item: {}", url); | ||
value.item = client | ||
.get(url) | ||
.send() | ||
.await? | ||
.error_for_status()? | ||
.json() | ||
.await?; | ||
} | ||
if value.item.is_none() { | ||
if let Some(link) = value.links.iter().find(|link| link.rel == "items") { | ||
// TODO sort items, maybe limit? | ||
tracing::info!("getting items: {}", link.href); | ||
let mut items: CrawlValue = reqwest::get(&link.href) | ||
.await? | ||
.error_for_status()? | ||
.json() | ||
.await?; | ||
if !items.features.is_empty() { | ||
value.item = Some(items.features.remove(0)); | ||
} | ||
} | ||
} | ||
} | ||
_ => {} | ||
} | ||
while let Some(result) = join_set.join_next().await { | ||
let child = result??; | ||
let client = client.clone(); | ||
let child = crawl_value(child, client).await?; | ||
value.children.push(Box::new(child)); | ||
} | ||
Ok(value) | ||
}) | ||
} | ||
|
||
// We use a very limited STAC value representation to parse as permissively as possible. | ||
#[derive(Debug, Deserialize, Serialize)] | ||
struct CrawlValue { | ||
r#type: String, | ||
#[serde(default)] | ||
links: Vec<CrawlLink>, | ||
#[serde(default)] | ||
children: Vec<Box<CrawlValue>>, | ||
#[serde(default)] | ||
item: Option<Box<CrawlValue>>, | ||
#[serde(default, skip_serializing_if = "Vec::is_empty")] | ||
features: Vec<Box<CrawlValue>>, | ||
#[serde(flatten)] | ||
additional_fields: HashMap<String, Value>, | ||
} | ||
|
||
#[derive(Debug, Deserialize, Serialize)] | ||
struct CrawlLink { | ||
href: String, | ||
rel: String, | ||
#[serde(flatten)] | ||
additional_fields: HashMap<String, Value>, | ||
} | ||
|
||
impl TryFrom<Catalog> for CrawlValue { | ||
type Error = Error; | ||
|
||
fn try_from(value: Catalog) -> Result<Self> { | ||
serde_json::from_value(serde_json::to_value(value)?).map_err(Error::from) | ||
} | ||
} | ||
|
||
impl TryFrom<CrawlValue> for Catalog { | ||
type Error = Error; | ||
|
||
fn try_from(value: CrawlValue) -> Result<Self> { | ||
serde_json::from_value(serde_json::to_value(value)?).map_err(Error::from) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,54 +1,4 @@ | ||
use anyhow::{Error, Result}; | ||
use serde::Deserialize; | ||
use stac::{Catalog, Link}; | ||
use std::{ | ||
collections::HashMap, | ||
fs::File, | ||
io::{BufReader, Read}, | ||
path::Path, | ||
}; | ||
mod config; | ||
mod crawl; | ||
|
||
#[derive(Debug, Deserialize)] | ||
pub struct Config { | ||
catalog: Catalog, | ||
catalogs: HashMap<String, CatalogConfig>, | ||
} | ||
|
||
#[derive(Debug, Deserialize)] | ||
struct CatalogConfig { | ||
href: String, | ||
title: String, | ||
index: usize, | ||
} | ||
|
||
impl Config { | ||
pub fn from_path(path: impl AsRef<Path>) -> Result<Config> { | ||
let mut file = BufReader::new(File::open(path)?); | ||
let mut s = String::new(); | ||
file.read_to_string(&mut s)?; | ||
toml::from_str(&s).map_err(Error::from) | ||
} | ||
|
||
pub fn write_catalog(&self, path: impl AsRef<Path>) -> Result<()> { | ||
let mut catalog = self.catalog.clone(); | ||
for (id, catalog_config) in &self.catalogs { | ||
let mut link = | ||
Link::child(&catalog_config.href).title(Some(catalog_config.title.clone())); | ||
// Once https://github.com/stac-utils/stac-rs/issues/501 lands this should be cleaner | ||
link.additional_fields | ||
.insert("heystac:id".into(), id.as_str().into()); | ||
link.additional_fields | ||
.insert("heystac:index".into(), catalog_config.index.into()); | ||
catalog.links.push(link); | ||
} | ||
catalog.links.sort_by_key(|c| { | ||
c.additional_fields | ||
.get("heystac:index") | ||
.unwrap() | ||
.as_i64() | ||
.unwrap() | ||
}); | ||
let file = File::create(path)?; | ||
serde_json::to_writer_pretty(file, &catalog).map_err(Error::from) | ||
} | ||
} | ||
pub use {config::Config, crawl::crawl}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters