webapp: Implement client addresses extractor, use for report spam pro…

…tection and logging
repology · Mar 2, 2025 · 9e2e33d · 9e2e33d
1 parent dc6e98c
commit 9e2e33d
Show file tree

Hide file tree

Showing 4 changed files with 147 additions and 18 deletions.
diff --git a/repology-webapp/src/extractors.rs b/repology-webapp/src/extractors.rs
@@ -0,0 +1,108 @@
+// SPDX-FileCopyrightText: Copyright 2025 Dmitry Marakasov <[email protected]>
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+use std::convert::Infallible;
+use std::net::IpAddr;
+
+use anyhow::Result;
+use axum::extract::FromRequestParts;
+use axum::http::request::Parts;
+use tracing::{debug, error};
+
+/// axum extractor for all possible client IP addresses
+///
+/// Depending on how the [repology] application is deployed, client address
+/// needs to be extracted from different places. If application is directly
+/// accessible to clients, client IP comes from tokio connection. If an application
+/// is behind a reverse proxy, client address is usually passed in some HTTP header
+/// such as X-Real-IP. There may be multiple headers, and a header may contain
+/// multiple addresses, and some headers may also be spoofed.
+///
+/// For the sake of client identification or load balancing, this may require
+/// application to know how it is deployed, to extract the address specifically
+/// from the proper place. However, for the sake of client blacklising, we can
+/// just extract all addresses from all possible locations, and avoid configuration.
+///
+/// Therefore, we use custom extractor and not https://docs.rs/axum-client-ip.
+///
+/// The current implementation does not really support ALL possible locations,
+/// good enough job for how repology is currently deployed. If someone deploys
+/// repology another way, and suffers from spam problems, this implementation
+/// may be extended.
+pub struct PossibleClientAddresses(pub Vec<IpAddr>);
+
+impl<S> FromRequestParts<S> for PossibleClientAddresses
+where
+    S: Send + Sync,
+{
+    type Rejection = Infallible;
+
+    async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result<Self, Self::Rejection> {
+        const SOURCE_HEADERS: &[&str] = &["x-real-ip", "x-forwarded-for"];
+
+        fn parse_address(data: &[u8]) -> Result<IpAddr> {
+            Ok(std::str::from_utf8(data)?.trim().parse()?)
+        }
+
+        let mut addresses: Vec<IpAddr> = vec![];
+
+        for header in SOURCE_HEADERS {
+            let Some(value) = parts.headers.get(*header) else {
+                continue;
+            };
+            // we iterate byte slice to avoid the possibility of malicious user poisoning
+            // the whole header by injecting invalid utf sequence in previous header content
+            for part in value.as_bytes().split(|b| *b == b',') {
+                match parse_address(part) {
+                    Ok(address) => addresses.push(address),
+                    Err(err) => {
+                        let part = String::from_utf8_lossy(part);
+                        error!(%part, %err, "unable to parse client address");
+                    }
+                }
+            }
+        }
+        debug!(?parts.headers, "headers dump in PossibleClientAddresses extractor");
+        Ok(Self(addresses))
+    }
+}
+
+#[cfg(test)]
+#[coverage(off)]
+mod tests {
+    use std::net::Ipv4Addr;
+
+    use http::HeaderValue;
+
+    use super::*;
+
+    #[tokio::test]
+    async fn test_simple() {
+        let mut parts = http::Request::builder()
+            .header(
+                "X-Real-IP",
+                HeaderValue::from_bytes(b" 10.0.0.1 , 10.0.0.2 , garbage, \xff").unwrap(),
+            )
+            .header(
+                "X-Forwarded-For",
+                HeaderValue::from_bytes(b" 10.1.0.1 , 10.1.0.2 , garbage, \xff").unwrap(),
+            )
+            .body(())
+            .unwrap()
+            .into_parts()
+            .0;
+
+        assert_eq!(
+            PossibleClientAddresses::from_request_parts(&mut parts, &())
+                .await
+                .unwrap()
+                .0,
+            vec![
+                IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)),
+                IpAddr::V4(Ipv4Addr::new(10, 0, 0, 2)),
+                IpAddr::V4(Ipv4Addr::new(10, 1, 0, 1)),
+                IpAddr::V4(Ipv4Addr::new(10, 1, 0, 2)),
+            ]
+        );
+    }
+}
diff --git a/repology-webapp/src/lib.rs b/repology-webapp/src/lib.rs
@@ -13,6 +13,7 @@ mod badges;
 pub mod config;
 mod constants;
 mod endpoints;
+mod extractors;
 mod feeds;
 mod font;
 mod graphs;

diff --git a/repology-webapp/src/views/project/report.rs b/repology-webapp/src/views/project/report.rs
@@ -17,6 +17,7 @@ use tracing::error;
 
 use crate::config::AppConfig;
 use crate::endpoints::Endpoint;
+use crate::extractors::PossibleClientAddresses;
 use crate::result::EndpointResult;
 use crate::state::AppState;
 use crate::template_context::TemplateContext;
@@ -67,7 +68,7 @@ fn check_new_report(
     project_name: &str,
     project_is_alive: bool,
     too_many_reports: bool,
-    client_address: &IpAddr,
+    client_addresses: &[IpAddr],
     form: &ReportForm,
     config: &AppConfig,
 ) -> std::result::Result<(), Vec<&'static str>> {
@@ -80,20 +81,21 @@ fn check_new_report(
     if !project_is_alive {
         error!(
             project_name,
-            %client_address, "report to gone or nonexisting project"
+            ?client_addresses,
+            "report to gone or nonexisting project"
         );
         errors.push("project does not exist or is gone");
     }
 
     if too_many_reports {
-        error!(project_name, %client_address, "too many reports");
+        error!(project_name, ?client_addresses, "too many reports");
         errors.push("too many reports for this project");
     }
 
     if form.comment.len() > MAX_REPORT_COMMENT_LENGTH {
         error!(
             project_name,
-            %client_address,
+            ?client_addresses,
             length = form.comment.len(),
             "report comment too long"
         );
@@ -106,27 +108,33 @@ fn check_new_report(
         && !form.need_vuln
         && form.comment.is_empty()
     {
-        error!(project_name, %client_address, "report form is not filled");
+        error!(project_name, ?client_addresses, "report form is not filled");
         errors.push("please fill out the form");
     }
 
     if form.comment.contains("<a href") {
-        error!(project_name, %client_address, "report comment contains HTML");
+        error!(
+            project_name,
+            ?client_addresses,
+            "report comment contains HTML"
+        );
         errors.push("HTML not allowed");
     }
 
     if form.need_vuln && !form.comment.contains("nvd.nist.gov/vuln/detail/CVE-") {
         error!(
             project_name,
-            %client_address, "missing vulnerability report does not contain NVD link"
+            ?client_addresses,
+            "missing vulnerability report does not contain NVD link"
         );
         errors.push("link to NVD entry (e.g. https://nvd.nist.gov/vuln/detail/CVE-*) for missing CVE is required; note that CVE must already have CPE(s) assigned");
     }
 
     if config.disabled_reports.contains(project_name) {
         error!(
             project_name,
-            %client_address, "report attempt to disabled project"
+            ?client_addresses,
+            "report attempt to disabled project"
         );
         errors.push("new reports to this project are disabled, probably due to a big number of incorrect reports or spam");
     }
@@ -136,18 +144,23 @@ fn check_new_report(
         if form.comment.contains(keyword) {
             error!(
                 project_name,
-                %client_address, keyword, "report comment contains spam keyword"
+                ?client_addresses,
+                keyword,
+                "report comment contains spam keyword"
             );
             is_spam = true;
             break;
         }
     }
 
     for spam_network in &config.spam_networks {
-        if spam_network.contains(client_address.clone()) {
+        if client_addresses
+            .iter()
+            .any(|address| spam_network.contains(*address))
+        {
             error!(
                 project_name,
-                %client_address, %spam_network, "report submitter is blacklisted"
+                ?client_addresses, %spam_network, "report submitter is blacklisted"
             );
             is_spam = true;
             break;
@@ -162,7 +175,8 @@ fn check_new_report(
     {
         error!(
             project_name,
-            %client_address, "report form filled in meaningless pattern"
+            ?client_addresses,
+            "report form filled in meaningless pattern"
         );
         is_spam = true;
     }
@@ -181,7 +195,7 @@ fn check_new_report(
 async fn project_report_generic(
     project_name: String,
     state: &AppState,
-    form: Option<ReportForm>,
+    input: Option<(&[std::net::IpAddr], ReportForm)>,
 ) -> EndpointResult {
     let ctx = TemplateContext::new(
         Endpoint::ProjectReport,
@@ -226,12 +240,12 @@ async fn project_report_generic(
 
     let too_many_reports = reports.len() >= crate::constants::MAX_REPORTS;
 
-    let errors = if let Some(form) = &form {
+    let errors = if let Some((client_addresses, form)) = &input {
         if let Err(errors) = check_new_report(
             &project_name,
             !project.is_orphaned(),
             too_many_reports,
-            &"127.0.0.1".parse().unwrap(), // XXX: stub, use real client address
+            client_addresses,
             form,
             &state.config,
         ) {
@@ -295,7 +309,7 @@ async fn project_report_generic(
             reports_disabled: false,
             too_many_reports,
             afk_till: None,
-            form: form.unwrap_or(ReportForm::default()),
+            form: input.map(|(_, form)| form).unwrap_or(ReportForm::default()),
             errors,
         }
         .render()?,
@@ -307,9 +321,10 @@ async fn project_report_generic(
 pub async fn project_report_post(
     Path(project_name): Path<String>,
     State(state): State<Arc<AppState>>,
+    PossibleClientAddresses(addresses): PossibleClientAddresses,
     Form(form): Form<ReportForm>,
 ) -> EndpointResult {
-    project_report_generic(project_name, &state, Some(form)).await
+    project_report_generic(project_name, &state, Some((&addresses, form))).await
 }
 
 #[cfg_attr(not(feature = "coverage"), tracing::instrument(skip(state)))]

diff --git a/repology-webapp/tests/integration_tests/project_report_submission_failures.rs b/repology-webapp/tests/integration_tests/project_report_submission_failures.rs
@@ -150,7 +150,12 @@ async fn test_spam_network(pool: PgPool) {
         need_verignore: true,
         ..Default::default()
     };
-    let response = Request::new(pool, "/project/zsh/report").with_form(form).with_spam_network(&"0.0.0.0/0".parse().unwrap()).perform().await;
+    let response = Request::new(pool, "/project/zsh/report")
+        .with_form(form)
+        .with_spam_network(&"10.0.0.1/32".parse().unwrap())
+        .with_header("x-real-ip", "10.0.0.0, 10.0.0.1, 10.0.0.2")
+        .perform()
+        .await;
     assert_eq!(response.status(), http::StatusCode::OK);
     assert_eq!(response.header_value_str("content-type").unwrap(), Some("text/html"));
     assert!(response.is_html_valid(HtmlValidationFlags::ALLOW_EMPTY_TAGS | HtmlValidationFlags::WARNINGS_ARE_FATAL));