Commit 3a847a7

feat: crawl to their own files

gadomski committed Oct 29, 2024 · 1 parent 44d8341
Showing 9 changed files with 139,585 additions and 32 deletions.
- crawl/.gitignore: empty file removed
- crawl/crawl.json: file deleted (0 additions, 1 deletion)
- crawl/earth-search-aws.json: 11,699 additions, 0 deletions (large diff not rendered)
- crawl/microsoft-pc.json: 115,294 additions, 0 deletions (large diff not rendered)
- crawl/usgs-landsat.json: 12,512 additions, 0 deletions (large diff not rendered)
src/config.rs (26 changes: 19 additions & 7 deletions)

```diff
@@ -1,4 +1,6 @@
-use anyhow::{Error, Result};
+use crate::Crawl;
+use anyhow::{anyhow, Error, Result};
+use reqwest::Url;
 use serde::Deserialize;
 use stac::{Catalog, Link};
 use std::{
@@ -8,14 +10,14 @@ use std::{
     path::Path,
 };
 
-#[derive(Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize)]
 pub struct Config {
     catalog: Catalog,
-    catalogs: HashMap<String, CatalogConfig>,
+    pub catalogs: HashMap<String, CatalogConfig>,
 }
 
-#[derive(Debug, Deserialize)]
-struct CatalogConfig {
+#[derive(Clone, Debug, Deserialize)]
+pub struct CatalogConfig {
     href: String,
     title: String,
     index: usize,
@@ -29,8 +31,18 @@ impl Config {
         toml::from_str(&s).map_err(Error::from)
     }
 
-    pub async fn crawl(self) -> Result<Catalog> {
-        crate::crawl(self.into_catalog()?).await
+    pub async fn crawl_url(self, url: Url) -> Result<Crawl> {
+        let catalog: Catalog = reqwest::get(url).await?.error_for_status()?.json().await?;
+        crate::crawl(catalog).await
     }
+
+    pub async fn crawl_id(self, id: &str) -> Result<Crawl> {
+        let catalog_config = self
+            .catalogs
+            .get(id)
+            .ok_or_else(|| anyhow!("invalid id: {id}"))?;
+        let url = catalog_config.href.parse()?;
+        self.crawl_url(url).await
+    }
 
     pub fn into_catalog(mut self) -> Result<Catalog> {
```
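
Together, `crawl_url` and `crawl_id` replace the old one-shot `crawl` method: the id variant resolves an href from the config's catalog map, then delegates to the url variant. A minimal sketch of calling the new API from library code, assuming the `heystac` crate name seen in `src/main.rs` and using `usgs-landsat` as a stand-in for any configured catalog id:

```rust
use heystac::Config;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Load the same config.toml the CLI uses.
    let config = Config::from_path("config.toml")?;

    // Look the catalog up by its key in the config's catalog map;
    // an unknown id surfaces as the anyhow!("invalid id: {id}") error.
    let crawl = config.crawl_id("usgs-landsat").await?;

    // Same output convention as the CLI: crawl/<id>.json.
    std::fs::write(
        "crawl/usgs-landsat.json",
        serde_json::to_string_pretty(&crawl)?,
    )?;
    Ok(())
}
```

Both methods take `self` by value, which is why the diff also derives `Clone` on `Config` and `CatalogConfig`: the CLI's `all` loop below clones the config once per catalog.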
src/crawl.rs (36 changes: 21 additions & 15 deletions)

```diff
@@ -6,7 +6,17 @@ use stac::Catalog;
 use std::{collections::HashMap, future::Future, pin::Pin};
 use tokio::task::JoinSet;
 
-pub async fn crawl(catalog: Catalog) -> Result<Catalog> {
+#[derive(Debug, Serialize, Deserialize)]
+pub struct Crawl {
+    children: Vec<Value>,
+
+    item: Option<Value>,
+
+    #[serde(flatten)]
+    additional_fields: HashMap<String, Value>,
+}
+
+pub async fn crawl(catalog: Catalog) -> Result<Crawl> {
     let client = Client::new();
     crawl_value(catalog.try_into()?, client).await?.try_into()
 }
```
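
The `#[serde(flatten)]` field is what keeps a `Crawl` lossless: any top-level STAC catalog key that is not `children` or `item` lands in `additional_fields` and round-trips back out on serialization. A self-contained sketch of the same pattern (the struct mirrors the diff; the input JSON is invented for illustration):

```rust
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::collections::HashMap;

#[derive(Debug, Serialize, Deserialize)]
struct Crawl {
    children: Vec<Value>,
    item: Option<Value>,
    // Any other top-level keys ("id", "stac_version", "links", ...)
    // are collected here instead of being dropped.
    #[serde(flatten)]
    additional_fields: HashMap<String, Value>,
}

fn main() -> serde_json::Result<()> {
    let json = r#"{"children": [], "item": null, "id": "example", "stac_version": "1.0.0"}"#;
    let crawl: Crawl = serde_json::from_str(json)?;
    assert_eq!(crawl.additional_fields["id"], "example");
    // Serializing flattens the extra fields back to the top level.
    println!("{}", serde_json::to_string_pretty(&crawl)?);
    Ok(())
}
```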
```diff
@@ -37,13 +47,9 @@ fn crawl_value(
             }
             "Collection" => {
                 if let Some(link) = value.links.iter().find(|link| link.rel == "item") {
-                    let url = Url::parse_with_params(
-                        &link.href,
-                        [("limit", "1"), ("sortby", "-properties.datetime")],
-                    )?;
-                    tracing::info!("getting item: {}", url);
+                    tracing::info!("getting item: {}", link.href);
                     value.item = client
-                        .get(url)
+                        .get(&link.href)
                         .send()
                         .await?
                         .error_for_status()?
@@ -52,13 +58,13 @@
                 }
                 if value.item.is_none() {
                     if let Some(link) = value.links.iter().find(|link| link.rel == "items") {
-                        // TODO sort items, maybe limit?
-                        tracing::info!("getting items: {}", link.href);
-                        let mut items: CrawlValue = reqwest::get(&link.href)
-                            .await?
-                            .error_for_status()?
-                            .json()
-                            .await?;
+                        let url = Url::parse_with_params(
+                            &link.href,
+                            [("limit", "1"), ("sortby", "-properties.datetime")],
+                        )?;
+                        tracing::info!("getting items: {}", url);
+                        let mut items: CrawlValue =
+                            reqwest::get(url).await?.error_for_status()?.json().await?;
                         if !items.features.is_empty() {
                             value.item = Some(Box::new(items.features.remove(0)));
                         }
```
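
These two hunks move the `limit=1` / `sortby=-properties.datetime` query parameters from the single-`item` link to the `items` link, so each collection is asked for just its newest item. `Url::parse_with_params` handles the query-string encoding; a standalone sketch with a placeholder href:

```rust
use reqwest::Url;

fn main() -> anyhow::Result<()> {
    // Placeholder endpoint; real hrefs come from a collection's "items" link.
    let url = Url::parse_with_params(
        "https://example.com/collections/demo/items",
        [("limit", "1"), ("sortby", "-properties.datetime")],
    )?;
    // The pairs are encoded into the query string in order.
    assert_eq!(
        url.as_str(),
        "https://example.com/collections/demo/items?limit=1&sortby=-properties.datetime"
    );
    Ok(())
}
```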
```diff
@@ -109,7 +115,7 @@ impl TryFrom<Catalog> for CrawlValue {
     }
 }
 
-impl TryFrom<CrawlValue> for Catalog {
+impl TryFrom<CrawlValue> for Crawl {
     type Error = Error;
 
     fn try_from(value: CrawlValue) -> Result<Self> {
```
src/lib.rs (5 changes: 4 additions & 1 deletion)

```diff
@@ -1,4 +1,7 @@
 mod config;
 mod crawl;
 
-pub use {config::Config, crawl::crawl};
+pub use {
+    config::Config,
+    crawl::{crawl, Crawl},
+};
```
src/main.rs (44 changes: 36 additions & 8 deletions)

```diff
@@ -1,5 +1,6 @@
 use clap::Parser;
 use heystac::Config;
+use reqwest::Url;
 
 #[derive(Debug, Parser)]
 struct Args {
@@ -10,7 +11,17 @@ struct Args {
 #[derive(Debug, clap::Subcommand)]
 enum Subcommand {
     /// Crawl all catalogs
-    Crawl,
+    Crawl {
+        /// The id or the href of the STAC catalog to crawl
+        ///
+        /// If the string "all" is provided, all configured catalogs will be crawled.
+        id_or_href: String,
+
+        /// The output file.
+        ///
+        /// Required if an href is provided, otherwise will be `crawl/<id>.json`.
+        outfile: Option<String>,
+    },
 
     /// Run the prebuild actions
     Prebuild,
@@ -22,13 +33,30 @@ async fn main() {
     let args = Args::parse();
     let config = Config::from_path("config.toml").unwrap();
     match args.subcommand {
-        Subcommand::Crawl => {
-            let catalogs = config.crawl().await.unwrap();
-            std::fs::write(
-                "crawl/crawl.json",
-                serde_json::to_string(&catalogs).unwrap(),
-            )
-            .unwrap();
+        Subcommand::Crawl {
+            id_or_href,
+            outfile,
+        } => {
+            if let Ok(url) = Url::parse(&id_or_href) {
+                if let Some(outfile) = outfile {
+                    let crawl = config.crawl_url(url).await.unwrap();
+                    std::fs::write(outfile, serde_json::to_string_pretty(&crawl).unwrap()).unwrap()
+                } else {
+                    eprint!("ERROR: outfile must be provided when crawling an href");
+                    std::process::exit(1);
+                }
+            } else if id_or_href == "all" {
+                for id in config.catalogs.keys() {
+                    let config = config.clone();
+                    let crawl = config.crawl_id(id).await.unwrap();
+                    let outfile = format!("crawl/{id}.json");
+                    std::fs::write(outfile, serde_json::to_string_pretty(&crawl).unwrap()).unwrap();
+                }
+            } else {
+                let crawl = config.crawl_id(&id_or_href).await.unwrap();
+                let outfile = format!("crawl/{id_or_href}.json");
+                std::fs::write(outfile, serde_json::to_string_pretty(&crawl).unwrap()).unwrap();
+            }
+        }
         Subcommand::Prebuild => {
             let catalog = config.into_catalog().unwrap();
```
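
Assuming the binary is run through cargo (invocation not shown in the commit), the reworked subcommand supports three shapes: `cargo run -- crawl all` to regenerate every `crawl/<id>.json`, `cargo run -- crawl microsoft-pc` to refresh a single configured catalog, and `cargo run -- crawl <href> <outfile>` to crawl an arbitrary URL, exiting with an error if the outfile is omitted.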
