Skip to content

Commit

Permalink
feat: add rudimentary crawling
Browse files Browse the repository at this point in the history
It only took about a minute on my computer on crappy wifi so this might be good
enough for now?
  • Loading branch information
gadomski committed Oct 28, 2024
1 parent 9b25570 commit 0ef3144
Show file tree
Hide file tree
Showing 8 changed files with 1,216 additions and 94 deletions.
1,045 changes: 1,011 additions & 34 deletions Cargo.lock

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@ edition = "2021"
[dependencies]
anyhow = "1.0.91"
clap = { version = "4.5.20", features = ["derive"] }
reqwest = { version = "0.12.9", features = ["json"] }
serde = { version = "1.0.214", features = ["derive"] }
serde_json = "1.0.132"
stac = "0.10.2"
tokio = { version = "1.41.0", features = ["full"] }
toml = "0.8.19"
tracing = "0.1.40"
tracing-subscriber = "0.3.18"
Empty file added crawl/.gitignore
Empty file.
1 change: 1 addition & 0 deletions crawl/crawl.json

Large diffs are not rendered by default.

56 changes: 56 additions & 0 deletions src/config.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
use anyhow::{Error, Result};
use serde::Deserialize;
use stac::{Catalog, Link};
use std::{
collections::HashMap,
fs::File,
io::{BufReader, Read},
path::Path,
};

/// Top-level configuration, deserialized from `config.toml`.
#[derive(Debug, Deserialize)]
pub struct Config {
    /// The root STAC catalog that configured child links are attached to.
    catalog: Catalog,
    /// Per-catalog settings, keyed by a short identifier (used as `heystac:id`).
    catalogs: HashMap<String, CatalogConfig>,
}

/// Configuration for a single remote STAC catalog.
#[derive(Debug, Deserialize)]
struct CatalogConfig {
    /// The URL of the catalog's root endpoint.
    href: String,
    /// Human-readable title, copied onto the generated child link.
    title: String,
    /// Sort position among the root catalog's child links (`heystac:index`).
    index: usize,
}

impl Config {
    /// Reads a [Config] from a TOML file at the given path.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be opened or read, or if it is
    /// not valid TOML matching this structure.
    pub fn from_path(path: impl AsRef<Path>) -> Result<Config> {
        // Read the whole file into a string, since `toml` parses from &str.
        let mut file = BufReader::new(File::open(path)?);
        let mut s = String::new();
        file.read_to_string(&mut s)?;
        toml::from_str(&s).map_err(Error::from)
    }

    /// Crawls every configured catalog, returning the root catalog with
    /// crawled children attached.
    pub async fn crawl(self) -> Result<Catalog> {
        crate::crawl(self.into_catalog()?).await
    }

    /// Converts this configuration into the root [Catalog], adding one child
    /// link per configured catalog, sorted by `heystac:index`.
    pub fn into_catalog(mut self) -> Result<Catalog> {
        for (id, catalog_config) in &self.catalogs {
            let mut link =
                Link::child(&catalog_config.href).title(Some(catalog_config.title.clone()));
            // Once https://github.com/stac-utils/stac-rs/issues/501 lands this should be cleaner
            link.additional_fields
                .insert("heystac:id".into(), id.as_str().into());
            link.additional_fields
                .insert("heystac:index".into(), catalog_config.index.into());
            self.catalog.links.push(link);
        }
        // Sort by the configured index. Links without a numeric
        // `heystac:index` (e.g. self/root links already present on the
        // configured catalog) sort last instead of panicking, which the
        // previous double-unwrap would have done.
        self.catalog.links.sort_by_key(|c| {
            c.additional_fields
                .get("heystac:index")
                .and_then(|index| index.as_i64())
                .unwrap_or(i64::MAX)
        });
        Ok(self.catalog)
    }
}
118 changes: 118 additions & 0 deletions src/crawl.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
use anyhow::{Error, Result};
use reqwest::{Client, Url};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use stac::Catalog;
use std::{collections::HashMap, future::Future, pin::Pin};
use tokio::task::JoinSet;

pub async fn crawl(catalog: Catalog) -> Result<Catalog> {
let client = Client::new();
crawl_value(catalog.try_into()?, client).await?.try_into()
}

fn crawl_value(
mut value: CrawlValue,
client: Client,
) -> Pin<Box<impl Future<Output = Result<CrawlValue>>>> {
Box::pin(async move {
let mut join_set: JoinSet<Result<CrawlValue>> = JoinSet::new();
match value.r#type.as_str() {
"Catalog" => {
for link in value.links.iter().filter(|link| link.rel == "child") {
let href = link.href.clone();
let client = client.clone();
tracing::info!("getting child: {href}");
let _ = join_set.spawn(async move {
client
.get(href)
.send()
.await?
.error_for_status()?
.json()
.await
.map_err(Error::from)
});
}
}
"Collection" => {
if let Some(link) = value.links.iter().find(|link| link.rel == "item") {
let url = Url::parse_with_params(
&link.href,
[("limit", "1"), ("sortby", "-properties.datetime")],
)?;
tracing::info!("getting item: {}", url);
value.item = client
.get(url)
.send()
.await?
.error_for_status()?
.json()
.await?;
}
if value.item.is_none() {
if let Some(link) = value.links.iter().find(|link| link.rel == "items") {
// TODO sort items, maybe limit?
tracing::info!("getting items: {}", link.href);
let mut items: CrawlValue = reqwest::get(&link.href)
.await?
.error_for_status()?
.json()
.await?;
if !items.features.is_empty() {
value.item = Some(items.features.remove(0));
}
}
}
}
_ => {}
}
while let Some(result) = join_set.join_next().await {
let child = result??;
let client = client.clone();
let child = crawl_value(child, client).await?;
value.children.push(Box::new(child));
}
Ok(value)
})
}

// We use a very limited STAC value representation to parse as permissively as possible.
#[derive(Debug, Deserialize, Serialize)]
struct CrawlValue {
    /// The STAC type, e.g. "Catalog" or "Collection" (matched in `crawl_value`).
    r#type: String,
    /// The value's links; empty when the JSON has none.
    #[serde(default)]
    links: Vec<CrawlLink>,
    /// Crawled children, populated by `crawl_value` rather than parsed from STAC.
    #[serde(default)]
    children: Vec<Box<CrawlValue>>,
    /// A representative item for a collection, populated by `crawl_value`.
    #[serde(default)]
    item: Option<Box<CrawlValue>>,
    /// Features from an item-collection response; omitted from output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    features: Vec<Box<CrawlValue>>,
    /// Everything else in the JSON object, preserved for round-tripping.
    #[serde(flatten)]
    additional_fields: HashMap<String, Value>,
}

/// A minimal STAC link: only the fields the crawler needs, everything else flattened.
#[derive(Debug, Deserialize, Serialize)]
struct CrawlLink {
    /// The link target URL.
    href: String,
    /// The link relation, e.g. "child", "item", or "items".
    rel: String,
    /// Any other link attributes, preserved for round-tripping.
    #[serde(flatten)]
    additional_fields: HashMap<String, Value>,
}

impl TryFrom<Catalog> for CrawlValue {
    type Error = Error;

    /// Converts a typed [Catalog] into the permissive crawl representation
    /// by round-tripping through a JSON value.
    fn try_from(value: Catalog) -> Result<Self> {
        let json = serde_json::to_value(value)?;
        let crawl_value = serde_json::from_value(json)?;
        Ok(crawl_value)
    }
}

impl TryFrom<CrawlValue> for Catalog {
    type Error = Error;

    /// Converts a crawled value back into a typed [Catalog] by
    /// round-tripping through a JSON value.
    fn try_from(value: CrawlValue) -> Result<Self> {
        let json = serde_json::to_value(value)?;
        let catalog = serde_json::from_value(json)?;
        Ok(catalog)
    }
}
56 changes: 3 additions & 53 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,54 +1,4 @@
use anyhow::{Error, Result};
use serde::Deserialize;
use stac::{Catalog, Link};
use std::{
collections::HashMap,
fs::File,
io::{BufReader, Read},
path::Path,
};
mod config;
mod crawl;

#[derive(Debug, Deserialize)]
pub struct Config {
    // The root STAC catalog that configured child links are attached to.
    catalog: Catalog,
    // Per-catalog settings, keyed by a short identifier.
    catalogs: HashMap<String, CatalogConfig>,
}

/// Configuration for a single remote STAC catalog.
#[derive(Debug, Deserialize)]
struct CatalogConfig {
    /// The URL of the catalog's root endpoint.
    href: String,
    /// Human-readable title, copied onto the generated child link.
    title: String,
    /// Sort position among the root catalog's child links.
    index: usize,
}

impl Config {
    /// Reads a [Config] from a TOML file at the given path.
    pub fn from_path(path: impl AsRef<Path>) -> Result<Config> {
        // Read the whole file into a string, since `toml` parses from &str.
        let mut file = BufReader::new(File::open(path)?);
        let mut s = String::new();
        file.read_to_string(&mut s)?;
        toml::from_str(&s).map_err(Error::from)
    }

    /// Builds the root catalog with one child link per configured catalog,
    /// sorted by `heystac:index`, and writes it as pretty-printed JSON to
    /// the given path.
    pub fn write_catalog(&self, path: impl AsRef<Path>) -> Result<()> {
        let mut catalog = self.catalog.clone();
        for (id, catalog_config) in &self.catalogs {
            let mut link =
                Link::child(&catalog_config.href).title(Some(catalog_config.title.clone()));
            // Once https://github.com/stac-utils/stac-rs/issues/501 lands this should be cleaner
            link.additional_fields
                .insert("heystac:id".into(), id.as_str().into());
            link.additional_fields
                .insert("heystac:index".into(), catalog_config.index.into());
            catalog.links.push(link);
        }
        // NOTE(review): these unwraps panic if any link lacks a numeric
        // `heystac:index` (e.g. links already present on the configured
        // catalog) — confirm all links carry the field, or sort defensively.
        catalog.links.sort_by_key(|c| {
            c.additional_fields
                .get("heystac:index")
                .unwrap()
                .as_i64()
                .unwrap()
        });
        let file = File::create(path)?;
        serde_json::to_writer_pretty(file, &catalog).map_err(Error::from)
    }
}
pub use {config::Config, crawl::crawl};
30 changes: 23 additions & 7 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,34 @@ struct Args {

/// The CLI subcommands dispatched by `main`.
#[derive(Debug, clap::Subcommand)]
enum Subcommand {
    /// Crawl all catalogs
    Crawl,

    /// Run the prebuild actions
    Prebuild,
}

fn main() {
#[tokio::main]
async fn main() {
tracing_subscriber::fmt::init();
let args = Args::parse();
let config = Config::from_path("config.toml").unwrap();
match args.subcommand {
Subcommand::Prebuild => prebuild(),
Subcommand::Crawl => {
let catalogs = config.crawl().await.unwrap();
std::fs::write(
"crawl/crawl.json",
serde_json::to_string(&catalogs).unwrap(),
)
.unwrap();
}
Subcommand::Prebuild => {
let catalog = config.into_catalog().unwrap();
std::fs::write(
"app/catalog.json",
serde_json::to_string_pretty(&catalog).unwrap(),
)
.unwrap();
}
}
}

/// Runs the prebuild step: writes the configured catalog to the app directory.
/// NOTE(review): appears superseded by the inline `Prebuild` match arm in
/// `main` in this commit — confirm this leftover definition was removed.
fn prebuild() {
    let config = Config::from_path("config.toml").unwrap();
    config.write_catalog("app/catalog.json").unwrap();
}

0 comments on commit 0ef3144

Please sign in to comment.