Adding a scaled sort, to boost smaller communities. #3907
@@ -100,6 +100,8 @@ pub struct PostAggregates {
  pub community_id: CommunityId,
  pub creator_id: PersonId,
  pub controversy_rank: f64,
  /// A rank that amplifies smaller communities
  pub scaled_rank: i32,
I think this value should be a float? hot_rank, for example, really quickly goes to zero (after just ~2 days mostly), and all information of values between 0 and 1, and between 1 and 2, is lost. Really, hot_rank should also be a float imo, but that's maybe out of scope here.

That's true, these should all probably be made into floats (not sure why I didn't want to use floats originally), could you open an issue for that? Let's do that as a separate issue. Also, being a "news" type sort, the time decay is supposed to go to zero after ~2 days, after which published descending takes over.

Mh, that's interesting. Then maybe we can just keep hot_rank as is, but make the new scaled_rank a float? I think it would make sense to make the new scaled_rank function and column a float from the start, instead of migrating later. There's no code change required for making this a float while keeping hot_rank an int.

Maybe I should just change hot_rank to a float then as a part of this, since scaled_rank depends on it.

Sounds good. Note it will reduce the performance of the scheduled task a fair bit, since afterwards almost no post will have a hot rank of 0 and be filtered out (ideas @sunaurus?).

That's def a concern... We could either use the published timestamp as a filter, or a minimum hot_rank threshold.

I haven't had a chance to look at the implementation yet, so sorry if this is a dumb idea, but can we not just update the rank functions to set the rank to 0 after it decays to some threshold, to guarantee that the majority of posts will still have a 0 value for rank and thus get filtered out during updates?

I mean something like: if the rank is below 1.0 (or perhaps even higher!), then just set it to 0.

Yes, that should work. But it would need careful consideration of what the threshold should be, both for hot_rank and scaled_rank. According to dess above, the published sort is supposed to take over after a few days. So maybe it would be better to make the function return zero if published is more than 7 days in the past? The hot_rank function gets ..., so that change would just be to replace ...
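For illustration only, here is a minimal sketch of the cutoff idea floated at the end of this thread, assuming the existing hot_rank(score, published) function; the function name, return type, and 7-day window below are hypothetical, not part of this PR:

```sql
-- Hypothetical sketch: return zero once a post is older than some cutoff, so the
-- vast majority of rows keep a rank of 0 and stay filtered out of the scheduled
-- batch updates, even if the rank columns become floats.
-- Note: because it calls now(), this variant can only be STABLE, not IMMUTABLE
-- like the existing rank functions.
CREATE OR REPLACE FUNCTION hot_rank_with_cutoff (score numeric, published timestamp without time zone)
    RETURNS double precision
    AS $$
BEGIN
    IF published < now()::timestamp - interval '7 days' THEN
        RETURN 0;
    END IF;
    RETURN hot_rank (score, published);
END;
$$
LANGUAGE plpgsql STABLE PARALLEL SAFE;
```

The simpler alternative from the thread, clamping any rank below some threshold like 1.0 to 0 inside the existing functions, would keep them IMMUTABLE.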
}

#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]
@@ -0,0 +1,52 @@
DROP FUNCTION scaled_rank;

ALTER TABLE post_aggregates
    DROP COLUMN scaled_rank;

-- The following code is necessary because postgres can't remove
-- a single enum value.
ALTER TABLE local_user
    ALTER default_sort_type DROP DEFAULT;

UPDATE
    local_user
SET
    default_sort_type = 'Hot'
WHERE
    default_sort_type = 'Scaled';

-- rename the old enum
ALTER TYPE sort_type_enum RENAME TO sort_type_enum__;

-- create the new enum
CREATE TYPE sort_type_enum AS ENUM (
    'Active',
    'Hot',
    'New',
    'Old',
    'TopDay',
    'TopWeek',
    'TopMonth',
    'TopYear',
    'TopAll',
    'MostComments',
    'NewComments',
    'TopHour',
    'TopSixHour',
    'TopTwelveHour',
    'TopThreeMonths',
    'TopSixMonths',
    'TopNineMonths'
);

-- alter all your enum columns
ALTER TABLE local_user
    ALTER COLUMN default_sort_type TYPE sort_type_enum
    USING default_sort_type::text::sort_type_enum;

ALTER TABLE local_user
    ALTER default_sort_type SET DEFAULT 'Active';

-- drop the old enum
DROP TYPE sort_type_enum__;
@@ -0,0 +1,28 @@
CREATE OR REPLACE FUNCTION scaled_rank (score numeric, published timestamp without time zone, users_active_month numeric)
    RETURNS integer
    AS $$
BEGIN
    -- Add 2 to avoid divide by zero errors
    -- Use 0.1 to lessen the initial sharp decline at a hot_rank ~ 300
I tried factors of 1, 0.1, and 0.01 using a graphing calculator and some regular hot ranks, and compared them across various community sizes. Using smaller numbers lessens the initial sharp decline sensitivity.
    -- Default for score = 1, active users = 1, and now, is 742
    RETURN (hot_rank (score, published) / log(2 + 0.1 * users_active_month))::integer;
END;
$$
LANGUAGE plpgsql
IMMUTABLE PARALLEL SAFE;

ALTER TABLE post_aggregates
    ADD COLUMN scaled_rank integer NOT NULL DEFAULT 742;

CREATE INDEX idx_post_aggregates_featured_community_scaled ON post_aggregates (featured_community DESC, scaled_rank DESC, published DESC);

CREATE INDEX idx_post_aggregates_featured_local_scaled ON post_aggregates (featured_local DESC, scaled_rank DESC, published DESC);

-- We forgot to add the controversial sort type
ALTER TYPE sort_type_enum
    ADD VALUE 'Controversial';

-- Add the Scaled enum
ALTER TYPE sort_type_enum
    ADD VALUE 'Scaled';
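To get a rough feel for the 0.1 factor discussed in the review comment above, here are a few hypothetical spot checks (the community sizes are made up; Postgres's log() is base 10):

```sql
-- The denominator grows very slowly with community size, which is what gives
-- small communities their boost.
SELECT log(2 + 0.1 * 1);      -- ~0.32
SELECT log(2 + 0.1 * 1000);   -- ~2.01
SELECT log(2 + 0.1 * 50000);  -- ~3.70

-- The rank itself for a brand-new post with score = 1; the first case is the
-- score = 1, users = 1 scenario behind the 742 column default above.
SELECT scaled_rank (1, now()::timestamp, 1);
SELECT scaled_rank (1, now()::timestamp, 50000);
```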
@@ -154,22 +154,16 @@ fn startup_jobs(db_url: &str) {
fn update_hot_ranks(conn: &mut PgConnection) {
  info!("Updating hot ranks for all history...");

  process_hot_ranks_in_batches(
    conn,
    "post_aggregates",
    "a.hot_rank != 0 OR a.hot_rank_active != 0",
    "SET hot_rank = hot_rank(a.score, a.published),
     hot_rank_active = hot_rank(a.score, a.newest_comment_time_necro)",
  );
  process_post_aggregates_ranks_in_batches(conn);

  process_hot_ranks_in_batches(
  process_ranks_in_batches(
    conn,
    "comment_aggregates",
    "a.hot_rank != 0",
    "SET hot_rank = hot_rank(a.score, a.published)",
  );

  process_hot_ranks_in_batches(
  process_ranks_in_batches(
    conn,
    "community_aggregates",
    "a.hot_rank != 0",
@@ -189,7 +183,7 @@ struct HotRanksUpdateResult {
/// In `where_clause` and `set_clause`, "a" will refer to the current aggregates table.
/// Locked rows are skipped in order to prevent deadlocks (they will likely get updated on the next
/// run)
fn process_hot_ranks_in_batches(
fn process_ranks_in_batches(
  conn: &mut PgConnection,
  table_name: &str,
  where_clause: &str,
@@ -238,6 +232,52 @@ fn process_hot_ranks_in_batches(
  );
}

/// Post aggregates is a special case, since it needs to join to the community_aggregates
/// table, to get the active monthly user counts.
fn process_post_aggregates_ranks_in_batches(conn: &mut PgConnection) {
  let process_start_time = NaiveDateTime::from_timestamp_opt(0, 0).expect("0 timestamp creation");

  let update_batch_size = 1000; // Bigger batches than this tend to cause seq scans
  let mut processed_rows_count = 0;
  let mut previous_batch_result = Some(process_start_time);
  while let Some(previous_batch_last_published) = previous_batch_result {
    let result = sql_query(
      r#"WITH batch AS (SELECT pa.id
             FROM post_aggregates pa
             WHERE pa.published > $1
             AND (pa.hot_rank != 0 OR pa.hot_rank_active != 0 OR pa.scaled_rank != 0)
For performance, either the idx_post_aggregates_nonzero_hotrank index should be replaced and conditioned on this condition, or the scaled_rank != 0 check should be removed. Removing it should not impact the output, I think, because with these being floats, scaled_rank will only be 0 if hot_rank is also 0.

That sounds right, and you're correct that it does seem pointless to change that index.
             ORDER BY pa.published
             LIMIT $2
             FOR UPDATE SKIP LOCKED)
         UPDATE post_aggregates pa
         SET hot_rank = hot_rank(pa.score, pa.published),
             hot_rank_active = hot_rank(pa.score, pa.newest_comment_time_necro),
             scaled_rank = scaled_rank(pa.score, pa.published, ca.users_active_month)
         FROM batch, community_aggregates ca
         WHERE pa.id = batch.id and pa.community_id = ca.community_id RETURNING pa.published;
Just as a note: with this change it might make sense to do the post ordering / batch selection with ORDER BY (community_id, published), so that the join on community_aggregates is less expensive. But it's probably not worth the extra pagination complexity if we assume that the whole communities table and its indexes will be in memory in any case.
      "#,
    )
    .bind::<Timestamp, _>(previous_batch_last_published)
    .bind::<Integer, _>(update_batch_size)
    .get_results::<HotRanksUpdateResult>(conn);

    match result {
      Ok(updated_rows) => {
        processed_rows_count += updated_rows.len();
        previous_batch_result = updated_rows.last().map(|row| row.published);
      }
      Err(e) => {
        error!("Failed to update {} hot_ranks: {}", "post_aggregates", e);
        break;
      }
    }
  }
  info!(
    "Finished process_hot_ranks_in_batches execution for {} (processed {} rows)",
    "post_aggregates", processed_rows_count
  );
}
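On the index question raised in the comment above, one option is a partial index whose predicate mirrors the batch query's filter. A rough sketch, assuming idx_post_aggregates_nonzero_hotrank is currently a partial index on published over the non-zero hot ranks (its existing definition is not shown in this diff):

```sql
-- Hypothetical replacement for idx_post_aggregates_nonzero_hotrank, conditioned
-- on the same predicate the batch query filters on, so the planner can keep
-- using an index scan for the batch selection.
DROP INDEX IF EXISTS idx_post_aggregates_nonzero_hotrank;

CREATE INDEX idx_post_aggregates_nonzero_hotrank ON post_aggregates (published)
WHERE
    hot_rank != 0 OR hot_rank_active != 0 OR scaled_rank != 0;
```

The other option from the comment, dropping the scaled_rank != 0 check from the query, avoids touching the index at all.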
fn delete_expired_captcha_answers(conn: &mut PgConnection) {
  diesel::delete(
    captcha_answer::table.filter(captcha_answer::published.lt(now - IntervalDsl::minutes(10))),
I can probably move this query down into the update statement, to avoid the round-trip cost.
edit: tried and failed at this.