diff --git a/Cargo.lock b/Cargo.lock
index 7e6b31e3..0d7ebc63 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1721,6 +1721,27 @@ version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f67ad224767faa3c7d8b6d91985b78e70a1324408abcb1cfcc2be4c06bc06043"
 
+[[package]]
+name = "snafu"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "418b8136fec49956eba89be7da2847ec1909df92a9ae4178b5ff0ff092c8d95e"
+dependencies = [
+ "snafu-derive",
+]
+
+[[package]]
+name = "snafu-derive"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a4812a669da00d17d8266a0439eddcacbc88b17f732f927e52eeb9d196f7fb5"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.30",
+]
+
 [[package]]
 name = "socket2"
 version = "0.4.9"
@@ -2184,6 +2205,12 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "urlencoding"
+version = "2.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
+
 [[package]]
 name = "utf-8"
 version = "0.7.6"
@@ -2327,8 +2354,10 @@ dependencies = [
  "serde",
  "serde_json",
  "serde_repr",
+ "snafu",
  "tracing",
  "url",
+ "urlencoding",
  "wiki-api-macros",
 ]
 
diff --git a/src/components/page.rs b/src/components/page.rs
index bb744329..f3978b10 100644
--- a/src/components/page.rs
+++ b/src/components/page.rs
@@ -12,8 +12,8 @@ use ratatui::{
 };
 use tracing::{debug, info, warn};
 use wiki_api::{
-    document::Data,
-    page::{Page, Section},
+    document::{Data, Node},
+    page::{Link, Page, Section},
 };
 
 use crate::{
@@ -218,7 +218,7 @@ impl PageComponent {
             .nth(0)
             .unwrap()
             .descendants()
-            .find(|node| matches!(node.data(), &Data::WikiLink { .. }));
+            .find(|node| matches!(node.data(), &Data::Link(_)));
 
         if let Some(selectable_node) = selectable_node {
             let first_index = selectable_node.index();
@@ -241,9 +241,7 @@ impl PageComponent {
             .nth(0)
             .unwrap()
             .descendants()
-            .filter(|node| {
-                matches!(node.data(), &Data::WikiLink { .. }) && node.index() < self.selected.0
-            })
+            .filter(|node| matches!(node.data(), &Data::Link(_)) && node.index() < self.selected.0)
             .last();
 
         if let Some(selectable_node) = selectable_node {
@@ -267,9 +265,7 @@ impl PageComponent {
             .nth(0)
             .unwrap()
             .descendants()
-            .find(|node| {
-                matches!(node.data(), &Data::WikiLink { .. }) && self.selected.1 < node.index()
-            });
+            .find(|node| matches!(node.data(), &Data::Link(_)) && self.selected.1 < node.index());
 
         if let Some(selectable_node) = selectable_node {
             let first_index = selectable_node.index();
@@ -292,9 +288,7 @@ impl PageComponent {
             .nth(0)
             .unwrap()
             .descendants()
-            .filter(|node| {
-                matches!(node.data(), &Data::WikiLink { .. }) && node.index() > self.selected.1
-            })
+            .filter(|node| matches!(node.data(), &Data::Link(_)) && node.index() > self.selected.1)
             .last();
 
         if let Some(selectable_node) = selectable_node {
@@ -307,6 +301,17 @@ impl PageComponent {
         }
     }
 
+    fn open_link(&self) -> ActionResult {
+        let index = self.selected.0;
+        let node = Node::new(&self.page.content, index).unwrap();
+        let data = node.data().to_owned();
+
+        match data {
+            Data::Link(Link::Internal(link_data)) => Action::LoadPage(link_data.page).into(),
+            _ => ActionResult::consumed(),
+        }
+    }
+
     fn resize(&mut self, width: u16, height: u16) {
         self.viewport.width = width;
         self.viewport.height = height;
@@ -405,6 +410,7 @@ impl Component for PageComponent {
             }
             KeyCode::Left => Action::Page(PageAction::SelectPrevLink).into(),
             KeyCode::Right => Action::Page(PageAction::SelectNextLink).into(),
+            KeyCode::Enter => self.open_link(),
             _ => ActionResult::Ignored,
         }
     }
diff --git a/src/page_loader.rs b/src/page_loader.rs
index 0ba491a8..8f8bc316 100644
--- a/src/page_loader.rs
+++ b/src/page_loader.rs
@@ -28,7 +28,11 @@ impl PageLoader {
     pub fn load_page(&self, title: String) {
         let page_request = Page::builder()
             .page(title)
-            .properties(vec![Property::Text, Property::Sections, Property::LangLinks])
+            .properties(vec![
+                Property::Text,
+                Property::Sections,
+                Property::LangLinks,
+            ])
             .endpoint(self.endpoint.clone())
             .language(self.language.clone());
 
diff --git a/src/renderer/default_renderer.rs b/src/renderer/default_renderer.rs
index 9156a221..6bfe678e 100644
--- a/src/renderer/default_renderer.rs
+++ b/src/renderer/default_renderer.rs
@@ -1,7 +1,10 @@
 use ratatui::style::{Color, Modifier, Style};
 use textwrap::wrap_algorithms::{wrap_optimal_fit, Penalties};
 use tracing::warn;
-use wiki_api::document::{Data, Document, HeaderKind, Node};
+use wiki_api::{
+    document::{Data, Document, HeaderKind, Node},
+    page::Link,
+};
 
 use crate::renderer::Word;
 
@@ -447,6 +450,16 @@ impl<'a> Renderer {
         self.add_whitespace();
     }
 
+    fn render_link(&mut self, node: Node<'a>, link: Link) {
+        match link {
+            Link::Internal(_) => self.render_wiki_link(node),
+            Link::Anchor(_) => self.render_wiki_link(node),
+            Link::RedLink(_) => self.render_red_link(node),
+            Link::External(_) => self.render_external_link(node),
+            Link::ExternalToInternal(_) => self.render_external_link(node),
+        }
+    }
+
     fn render_wiki_link(&mut self, node: Node<'a>) {
         self.set_text_fg(Color::Blue);
         self.render_children(node);
@@ -508,6 +521,7 @@ impl<'a> Renderer {
             Data::DerscriptionListDescription => self.render_description_list_description(node),
             Data::Bold => self.render_bold(node),
             Data::Italic => self.render_italic(node),
+            Data::Link(link) => self.render_link(node, link.clone()),
             Data::WikiLink { href: _, title: _ } => self.render_wiki_link(node),
             Data::RedLink { title: _ } => self.render_red_link(node),
             Data::MediaLink { href: _, title: _ } => self.render_media_link(node),
diff --git a/wiki-api/Cargo.toml b/wiki-api/Cargo.toml
index 2acfcaee..a6310c7a 100644
--- a/wiki-api/Cargo.toml
+++ b/wiki-api/Cargo.toml
@@ -21,8 +21,10 @@ scraper = "0.17.1"
 serde = "1.0.188"
 serde_json = "1.0.105"
 serde_repr = "0.1.16"
+snafu = "0.8.3"
 tracing = "0.1.37"
 url = { version = "2.4.1", features = ["serde"] }
+urlencoding = "2.1.3"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
diff --git a/wiki-api/src/document.rs b/wiki-api/src/document.rs
index b84c54a7..65bc0f43 100644
--- a/wiki-api/src/document.rs
+++ b/wiki-api/src/document.rs
@@ -1,5 +1,7 @@
 use serde_repr::Deserialize_repr;
 
+use crate::page::Link;
+
 #[derive(Clone, PartialEq, Eq)]
 pub struct Document {
     pub nodes: Vec<Raw>,
@@ -59,6 +61,7 @@ pub enum Data {
     Bold,
     Italic,
 
+    Link(Link),
     WikiLink {
         href: String,
         title: Option<String>,
diff --git a/wiki-api/src/page.rs b/wiki-api/src/page.rs
index ffaac6f4..2a4818b4 100644
--- a/wiki-api/src/page.rs
+++ b/wiki-api/src/page.rs
@@ -395,12 +395,14 @@ impl PageBuilder {
             .map(|x| x as usize)
             .ok_or_else(|| anyhow!("missing the pageid"))?;
 
+        let endpoint = self.endpoint.0;
+        let language = self.language.0;
         let content = res_json
             .get("parse")
             .and_then(|x| x.get("text"))
             .and_then(|x| x.as_str())
             .map(|x| {
-                let parser = WikipediaParser::parse_document(x);
+                let parser = WikipediaParser::parse_document(x, endpoint, language.clone());
                 Document {
                     nodes: parser.nodes(),
                 }
@@ -470,7 +472,7 @@ impl PageBuilder {
             title,
             pageid,
             content,
-            language: self.language.0,
+            language,
             language_links,
             sections,
             revision_id,
diff --git a/wiki-api/src/parser.rs b/wiki-api/src/parser.rs
index d263b7aa..c4d6a133 100644
--- a/wiki-api/src/parser.rs
+++ b/wiki-api/src/parser.rs
@@ -1,17 +1,29 @@
 use html5ever::{parse_document, tendril::TendrilSink};
 use markup5ever_rcdom::{Handle, NodeData, RcDom};
+use snafu::Snafu;
 use std::str::FromStr;
-use tracing::{trace, warn};
-
-use crate::document::{Data, HeaderKind, Raw};
+use tracing::{debug, trace, warn};
+
+use crate::{
+    document::{Data, HeaderKind, Raw},
+    languages::Language,
+    page::{
+        link_data::{AnchorData, ExternalData, InternalData, RedLinkData},
+        Link,
+    },
+    search::Namespace,
+    Endpoint,
+};
 
 // TODO: remove Parser and replace it with normal functions and helper functions
 pub trait Parser {
-    fn parse_document(docuemnt: &str) -> Self;
+    fn parse_document(document: &str, endpoint: Endpoint, language: Language) -> Self;
     fn nodes(self) -> Vec<Raw>;
 }
 
 pub struct WikipediaParser {
+    endpoint: Endpoint,
+    language: Language,
     nodes: Vec<Raw>,
 }
 
@@ -177,6 +189,9 @@ impl WikipediaParser {
                 Data::Disambiguation
             }
 
+            "a" => self.parse_link(attrs.iter()).unwrap_or_default(),
+
+            /*
            "a" if attrs.iter().any(|(name, value)| {
                 name.as_str() == "rel" && value.as_str() == "mw:WikiLink"
             }) =>
@@ -197,7 +212,7 @@
             {
                 self.parse_external_link(attrs.iter()).unwrap_or_default()
             }
-
+            */
             "div" => Data::Division,
             _ => {
                 warn!("unknown node '{name}'");
@@ -275,7 +290,7 @@
         })
     }
 
-    fn parse_wiki_link<'a>(
+    fn parse_link<'a>(
         &mut self,
         mut attrs: impl Iterator<Item = &'a (String, String)>,
     ) -> Option<Data> {
@@ -287,66 +302,403 @@
         let href = attrs
             .find(|(name, _)| name.as_str() == "href")
             .map(|(_, value)| value.to_owned())?;
 
         let title = attrs
             .find(|(name, _)| name.as_str() == "title")
             .map(|(_, value)| value.to_owned());
 
-        if attrs.any(|(name, value)| name.as_str() == "class" && value.contains("new")) {
-            return Some(Data::RedLink { title });
-        }
+        let link = match parse_href_to_link(
+            self.endpoint.clone(),
+            href.strip_prefix("//en.wikipedia.org").unwrap_or_default(),
+            title,
+            self.language.clone(),
+        ) {
+            Ok(link) => link,
+            Err(error) => {
+                warn!("{:?}", error);
+                return None;
+            }
+        };
 
-        Some(Data::WikiLink { href, title })
+        Some(Data::Link(link))
     }
+}
 
-    fn parse_media_link<'a>(
-        &mut self,
-        mut attrs: impl Iterator<Item = &'a (String, String)>,
-    ) -> Option<Data> {
-        let href = attrs
-            .find(|(name, _)| name.as_str() == "href")
-            .map(|(_, value)| value.to_owned())?;
+impl Parser for WikipediaParser {
+    fn parse_document(document: &str, endpoint: Endpoint, language: Language) -> Self {
+        let mut parser = WikipediaParser {
+            nodes: Vec::new(),
+            endpoint,
+            language,
+        };
 
-        let title = attrs
-            .find(|(name, _)| name.as_str() == "title")
-            .map(|(_, value)| value.to_owned());
+        let rc_dom = parse_document(RcDom::default(), Default::default()).one(document);
+        parser.parse_node(&rc_dom.document, None, None);
 
-        if attrs.any(|(name, value)| name.as_str() == "class" && value.contains("new")) {
-            return Some(Data::RedLink { title });
-        }
+        parser
+    }
 
-        Some(Data::MediaLink { href, title })
+    fn nodes(self) -> Vec<Raw> {
+        self.nodes
     }
+}
 
-    fn parse_external_link<'a>(
-        &self,
-        mut attrs: impl Iterator<Item = &'a (String, String)>,
-    ) -> Option<Data> {
-        let href = attrs
-            .find(|(name, _)| name.as_str() == "href")
-            .map(|(_, value)| value.to_owned())?;
+#[derive(Debug, Clone, PartialEq, Eq, Snafu)]
+enum ParsingError {
+    #[snafu(display("The link leads to an invalid namespace: '{namespace}'"))]
+    InvalidNamespace { namespace: String },
+    #[snafu(display("The link is missing data: '{data}'"))]
+    MissingData { data: String },
+    #[snafu(display("Error while processing the link: '{process}'"))]
+    ProcessingFailure { process: String },
+    #[snafu(display("Link is not UTF-8 encoded"))]
+    InvalidEncoding,
+}
 
-        let title = attrs
-            .find(|(name, _)| name.as_str() == "title")
-            .map(|(_, value)| value.to_owned());
+fn parse_href_to_link(
+    endpoint: Endpoint,
+    href: impl Into<String>,
+    title: Option<impl Into<String>>,
+    language: Language,
+) -> Result<Link, ParsingError> {
+    let href: String = match urlencoding::decode(&href.into()) {
+        Ok(href) => href.into_owned(),
+        Err(_) => return Err(ParsingError::InvalidEncoding),
+    };
+
+    let title: Option<String> = title.map(|title| title.into());
+
+    debug!("parsing the link '{}'", href);
+    debug!("link title: '{:?}'", title);
+    debug!("link endpoint: '{}'", endpoint.as_str());
+
+    // the prefix /wiki/ indicates that the link is an internal link
+    const INTERNAL_LINK_PREFIX: &str = "/wiki/";
+    // the character used to separate the namespace and the page
+    const NAMESPACE_DELIMITER: char = ':';
+    // the character used to separate the page and the anchor
+    const ANCHOR_DELIMITER: char = '#';
+    // the parameter indicating a redlink (non existent link)
+    const REDLINK_PARAM: &str = "redlink=1";
+
+    if href.starts_with(INTERNAL_LINK_PREFIX) {
+        let title = title.ok_or(ParsingError::MissingData {
+            data: "title".to_string(),
+        })?;
+        return parse_internal_link(href, title, endpoint, language);
+    }
+
+    if href.starts_with(ANCHOR_DELIMITER) {
+        let anchor_str =
+            href.strip_prefix(ANCHOR_DELIMITER)
+                .ok_or(ParsingError::ProcessingFailure {
+                    process: "removing ANCHOR_DELIMITER prefix".to_string(),
+                })?;
+        return Ok(Link::Anchor(AnchorData {
+            anchor: anchor_str.to_string(),
+            title: anchor_str.replace('_', " "),
+        }));
+    }
 
-        let autonumber =
-            attrs.any(|(name, value)| name.as_str() == "class" && value.contains("autonumber"));
+    if href.contains(REDLINK_PARAM) {
+        let url = endpoint
+            .join(&href)
+            .map_err(|_| ParsingError::ProcessingFailure {
+                process: "joining endpoint and href for REDLINK".to_string(),
+            })?;
+        let title = title.ok_or(ParsingError::MissingData {
+            data: "title".to_string(),
+        })?;
+        return Ok(Link::RedLink(RedLinkData { url, title }));
+    }
 
-        Some(Data::ExternalLink {
-            href,
+    if let Ok(url) = Endpoint::parse(&href) {
+        return Ok(Link::External(ExternalData { url }));
+    }
+
+    fn parse_internal_link(
+        href: String,
+        title: String,
+        endpoint: Endpoint,
+        language: Language,
+    ) -> Result<Link, ParsingError> {
+        let mut href =
+            href.strip_prefix(INTERNAL_LINK_PREFIX)
+                .ok_or(ParsingError::ProcessingFailure {
+                    process: "removing INTERNAL_LINK_PREFIX".to_string(),
+                })?;
+        let mut namespace = Namespace::Main;
+        let mut anchor: Option<AnchorData> = None;
+
+        if href.contains(NAMESPACE_DELIMITER) {
+            debug!("link contains a namespace");
+            let (namespace_str, href_split) =
+                href.split_once(NAMESPACE_DELIMITER)
+                    .ok_or(ParsingError::ProcessingFailure {
+                        process: "splitting at NAMESPACE_DELIMITER".to_string(),
+                    })?;
+
+            href = href_split;
+            namespace =
+                Namespace::from_string(namespace_str).ok_or(ParsingError::InvalidNamespace {
+                    namespace: namespace_str.to_string(),
+                })?;
+
+            debug!("link namespace: '{}'", namespace);
+        }
+
+        if href.contains(ANCHOR_DELIMITER) {
+            debug!("link contains an anchor");
+            let (page_ref, anchor_str) =
+                href.split_once(ANCHOR_DELIMITER)
+                    .ok_or(ParsingError::ProcessingFailure {
+                        process: "splitting at ANCHOR_DELIMITER".to_string(),
+                    })?;
+
+            href = page_ref;
+            anchor = Some(AnchorData {
+                anchor: anchor_str.to_string(),
+                title: anchor_str.replace('_', " "),
+            });
+
+            debug!("link anchor: '{}'", anchor_str);
+        }
+
+        Ok(Link::Internal(InternalData {
+            namespace,
+            page: href.to_string(),
             title,
-            autonumber,
-        })
+            language,
+            endpoint,
+            anchor,
+        }))
     }
+
+    Err(ParsingError::ProcessingFailure {
+        process: "invalid link".to_string(),
+    })
 }
 
-impl Parser for WikipediaParser {
-    fn parse_document(document: &str) -> Self {
-        let mut parser = WikipediaParser { nodes: Vec::new() };
+#[cfg(test)]
+mod tests {
+    use url::Url;
+
+    use crate::{
+        languages::Language,
+        page::{
+            link_data::{AnchorData, ExternalData, InternalData, RedLinkData},
+            Link,
+        },
+        parser::ParsingError,
+        search::Namespace,
+    };
+
+    use super::parse_href_to_link;
+
+    const ENDPOINT: &str = "https://en.wikipedia.org/w/api.php";
+    const LANGUAGE: Language = Language::English;
+
+    fn internal_link(
+        namespace: Namespace,
+        page: impl Into<String>,
+        title: impl Into<String>,
+        endpoint: Url,
+        anchor: Option<AnchorData>,
+    ) -> Link {
+        Link::Internal(InternalData {
+            namespace,
+            page: page.into(),
+            title: title.into(),
+            endpoint,
+            anchor,
+            language: LANGUAGE.clone(),
+        })
+    }
 
-        let rc_dom = parse_document(RcDom::default(), Default::default()).one(document);
-        parser.parse_node(&rc_dom.document, None, None);
+    fn anchor_data(anchor: impl Into<String>, title: impl Into<String>) -> AnchorData {
+        AnchorData {
+            anchor: anchor.into(),
+            title: title.into(),
+        }
+    }
 
-        parser
+    fn endpoint() -> Url {
+        Url::parse(ENDPOINT).expect("hard-coded endpoint should be valid")
     }
 
-    fn nodes(self) -> Vec<Raw> {
-        self.nodes
+    #[test]
+    fn test_parse_link_unknown_namespace() {
+        assert!(matches!(
+            parse_href_to_link(
+                endpoint(),
+                "/wiki/UnknownNamespace:Main_Page",
+                Some("Main Page"),
+                LANGUAGE
+            ),
+            Err(ParsingError::InvalidNamespace { .. })
+        ))
     }
+
+    #[test]
+    fn test_parse_link_invalid_link() {
+        assert!(matches!(
+            parse_href_to_link(endpoint(), "/invalid/hello", Some("hello"), LANGUAGE),
+            Err(ParsingError::ProcessingFailure { .. })
+        ))
+    }
+
+    #[test]
+    fn test_parse_internal_link_no_namespace() {
+        assert_eq!(
+            parse_href_to_link(endpoint(), "/wiki/Main_Page", Some("Main Page"), LANGUAGE),
+            Ok(internal_link(
+                Namespace::Main,
+                "Main_Page",
+                "Main Page",
+                endpoint(),
+                None
+            ))
+        )
+    }
+
+    #[test]
+    fn test_parse_internal_link_with_namespace() {
+        assert_eq!(
+            parse_href_to_link(
+                endpoint(),
+                "/wiki/Help:Contents",
+                Some("Help:Contents"),
+                LANGUAGE
+            ),
+            Ok(internal_link(
+                Namespace::Help,
+                "Contents",
+                "Help:Contents",
+                endpoint(),
+                None
+            ))
+        );
+
+        assert_eq!(
+            parse_href_to_link(
+                endpoint(),
+                "/wiki/Help:Editing_pages",
+                Some("Help:Editing pages"),
+                LANGUAGE
+            ),
+            Ok(internal_link(
+                Namespace::Help,
+                "Editing_pages",
+                "Help:Editing pages",
+                endpoint(),
+                None
+            ))
+        );
+    }
+
+    #[test]
+    fn test_parse_internal_link_with_anchor() {
+        assert_eq!(
+            parse_href_to_link(
+                endpoint(),
+                "/wiki/Help:Editing_pages#Preview",
+                Some("Help:Editing pages"),
+                LANGUAGE
+            ),
+            Ok(internal_link(
+                Namespace::Help,
+                "Editing_pages",
+                "Help:Editing pages",
+                endpoint(),
+                Some(anchor_data("Preview", "Preview"))
+            ))
+        );
+    }
+
+    #[test]
+    fn test_parse_internal_link_with_anchor_whitespace() {
+        assert_eq!(
+            parse_href_to_link(
+                endpoint(),
+                "/wiki/Help:Editing_pages#See_also",
+                Some("Help:Editing pages"),
+                LANGUAGE
+            ),
+            Ok(internal_link(
+                Namespace::Help,
+                "Editing_pages",
+                "Help:Editing pages",
+                endpoint(),
+                Some(anchor_data("See_also", "See also"))
+            ))
+        );
+    }
+
+    #[test]
+    fn test_parse_internal_link_with_subpage() {
+        assert_eq!(
+            parse_href_to_link(
+                endpoint(),
+                "/wiki/Help:Links/example",
+                Some("Help:Links/example"),
+                LANGUAGE
+            ),
+            Ok(internal_link(
+                Namespace::Help,
+                "Links/example",
+                "Help:Links/example",
+                endpoint(),
+                None,
+            ))
+        )
+    }
+
+    #[test]
+    fn test_parse_anchor_link() {
+        assert_eq!(
+            parse_href_to_link(endpoint(), "#See_also", None::<String>, LANGUAGE),
+            Ok(Link::Anchor(anchor_data("See_also", "See also")))
+        )
+    }
+
+    #[test]
+    fn test_parse_redlink() {
+        let link = "/w/index.php?title=Help:Links/example2&action=edit&redlink=1";
+        let title = "Help:Links/example2 (page does not exist)";
+        assert_eq!(
+            parse_href_to_link(endpoint(), link, Some(title), LANGUAGE),
+            Ok(Link::RedLink(RedLinkData {
+                url: endpoint().join(link).unwrap(),
+                title: title.to_string(),
+            }))
+        )
+    }
+
+    #[test]
+    fn test_parse_external_link() {
+        let link = "https://mediawiki.org/";
+        assert_eq!(
+            parse_href_to_link(endpoint(), link, None::<String>, LANGUAGE),
+            Ok(Link::External(ExternalData {
+                url: Url::parse(link).expect("hard-coded url should be valid")
+            }))
+        );
+    }
+
+    #[test]
+    fn test_parse_external_link_with_params() {
+        let link = "https://google.com/search?q=link";
+        assert_eq!(
+            parse_href_to_link(endpoint(), link, None::<String>, LANGUAGE),
+            Ok(Link::External(ExternalData {
+                url: Url::parse(link).expect("hard-coded url should be valid")
+            }))
+        )
+    }
+
+    #[test]
+    fn test_parse_external_link_with_mailto() {
+        let link = "mailto:info@example.org";
+        assert_eq!(
+            parse_href_to_link(endpoint(), link, None::<String>, LANGUAGE),
+            Ok(Link::External(ExternalData {
+                url: Url::parse(link).expect("hard-coded url should be valid")
+            }))
+        )
+    }
 }
diff --git a/wiki-api/src/search.rs b/wiki-api/src/search.rs
index cc000cb6..06c5c051 100644
--- a/wiki-api/src/search.rs
+++ b/wiki-api/src/search.rs
@@ -4,11 +4,10 @@ use reqwest::{Client, Response};
 use serde_repr::Deserialize_repr;
 use std::fmt::Debug;
 use std::fmt::Display;
-use std::str::FromStr;
 
 use crate::Endpoint;
 
-use super::languages::Language;
+use crate::languages::Language;
 
 /// A finished search containing the found results and additional optional information regarding
 /// the search
@@ -196,27 +195,26 @@
     }
 }
 
-impl FromStr for Namespace {
-    type Err = ();
-    fn from_str(namespace: &str) -> Result<Self, Self::Err> {
+impl Namespace {
+    pub fn from_string(namespace: &str) -> Option<Namespace> {
         match namespace.to_lowercase().as_str() {
-            "main" => Ok(Namespace::Main),
-            "main_talk" => Ok(Namespace::MainTalk),
-            "user" => Ok(Namespace::User),
-            "user_talk" => Ok(Namespace::UserTalk),
-            "project" => Ok(Namespace::Project),
-            "project_talk" => Ok(Namespace::ProjectTalk),
-            "file" => Ok(Namespace::File),
-            "file_talk" => Ok(Namespace::FileTalk),
-            "mediawiki" => Ok(Namespace::MediaWiki),
-            "mediawiki_talk" => Ok(Namespace::MediaWikiTalk),
-            "template" => Ok(Namespace::Template),
-            "template_talk" => Ok(Namespace::TemplateTalk),
-            "help" => Ok(Namespace::Help),
-            "help_talk" => Ok(Namespace::HelpTalk),
-            "category" => Ok(Namespace::Category),
-            "category_talk" => Ok(Namespace::CategoryTalk),
-            _ => Err(()),
+            "main" => Some(Namespace::Main),
+            "main_talk" => Some(Namespace::MainTalk),
+            "user" => Some(Namespace::User),
+            "user_talk" => Some(Namespace::UserTalk),
+            "project" => Some(Namespace::Project),
+            "project_talk" => Some(Namespace::ProjectTalk),
+            "file" => Some(Namespace::File),
+            "file_talk" => Some(Namespace::FileTalk),
+            "mediawiki" => Some(Namespace::MediaWiki),
+            "mediawiki_talk" => Some(Namespace::MediaWikiTalk),
+            "template" => Some(Namespace::Template),
+            "template_talk" => Some(Namespace::TemplateTalk),
+            "help" => Some(Namespace::Help),
+            "help_talk" => Some(Namespace::HelpTalk),
+            "category" => Some(Namespace::Category),
+            "category_talk" => Some(Namespace::CategoryTalk),
+            _ => None,
         }
     }
 }