target_sda() fails if id_loan is duplicated (#164)

* target_sda failes if id_loan is duplicated * don't weight emission_factor if by_company = TRUE Co-authored-by: Mauro Lepore <[email protected]>
RMI-PACTA · Aug 6, 2020 · 555b12e · 555b12e
1 parent 5509ac2
commit 555b12e
Show file tree

Hide file tree

Showing 3 changed files with 77 additions and 23 deletions.
diff --git a/R/target_sda.R b/R/target_sda.R
@@ -106,6 +106,7 @@ target_sda <- function(data,
   )
 
   check_crucial_names(data, crucial_portfolio)
+  check_unique_id(data, "id_loan")
   walk(crucial_portfolio, ~ check_no_value_is_missing(data, .x))
 
   check_crucial_names(ald, crucial_ald)
@@ -120,11 +121,13 @@ target_sda <- function(data,
     by_company
   )
 
-  loanbook_with_weighted_emission_factors <- data %>%
-    calculate_weighted_emission_factor(ald,
-      !!!rlang::syms(loanbook_summary_groups),
-      use_credit_limit = use_credit_limit
-    )
+  loanbook_with_weighted_emission_factors <- calculate_weighted_emission_factor(
+    data,
+    ald,
+    !!!rlang::syms(loanbook_summary_groups),
+    use_credit_limit = use_credit_limit,
+    by_company = by_company
+  )
 
   if (identical(nrow(loanbook_with_weighted_emission_factors), 0L)) {
     rlang::warn("Found no match between loanbook and ald.")
@@ -183,10 +186,14 @@ maybe_add_name_ald <- function(data, by_company = FALSE) {
 calculate_weighted_emission_factor <- function(data,
                                                ald,
                                                ...,
-                                               use_credit_limit = FALSE) {
+                                               use_credit_limit = FALSE,
+                                               by_company = FALSE) {
   data %>%
     inner_join(ald, by = ald_columns()) %>%
-    add_loan_weighted_emission_factor(use_credit_limit = use_credit_limit) %>%
+    add_loan_weighted_emission_factor(
+      use_credit_limit = use_credit_limit,
+      by_company = by_company
+    ) %>%
     group_by(...) %>%
     summarize(
       emission_factor_projected = sum(.data$weighted_loan_emission_factor)
@@ -195,25 +202,30 @@ calculate_weighted_emission_factor <- function(data,
     rename(sector = .data$sector_ald)
 }
 
-add_loan_weighted_emission_factor <- function(data, use_credit_limit) {
-  loan_size <- paste0(
-    "loan_size_", ifelse(use_credit_limit, "credit_limit", "outstanding")
-  )
+add_loan_weighted_emission_factor <- function(data, use_credit_limit, by_company = FALSE) {
+  if (by_company) {
+    data %>%
+      mutate(weighted_loan_emission_factor = .data$emission_factor)
+  } else {
+    loan_size <- paste0(
+      "loan_size_", ifelse(use_credit_limit, "credit_limit", "outstanding")
+    )
 
-  distinct_loans_by_sector <- data %>%
-    ungroup() %>%
-    group_by(.data$sector_ald) %>%
-    distinct(.data$id_loan, .data[[loan_size]])
+    distinct_loans_by_sector <- data %>%
+      ungroup() %>%
+      group_by(.data$sector_ald) %>%
+      distinct(.data$id_loan, .data[[loan_size]])
 
-  total_size_by_sector <- distinct_loans_by_sector %>%
-    summarize(total_size = sum(.data[[loan_size]]))
+    total_size_by_sector <- distinct_loans_by_sector %>%
+      summarize(total_size = sum(.data[[loan_size]]))
 
-  data %>%
-    left_join(total_size_by_sector, by = "sector_ald") %>%
-    mutate(
-      loan_weight = .data[[loan_size]] / .data$total_size,
-      weighted_loan_emission_factor = .data$emission_factor * .data$loan_weight
-    )
+    data %>%
+      left_join(total_size_by_sector, by = "sector_ald") %>%
+      mutate(
+        loan_weight = .data[[loan_size]] / .data$total_size,
+        weighted_loan_emission_factor = .data$emission_factor * .data$loan_weight
+      )
+  }
 }
 
 calculate_market_average <- function(data) {

diff --git a/R/utils.R b/R/utils.R
@@ -9,6 +9,17 @@ check_no_value_is_missing <- function(data, column) {
   invisible(data)
 }
 
+check_unique_id <- function(data, column) {
+  if (sum(duplicated(data[[column]]))) {
+    abort(
+      class = "unique_ids",
+      sprintf("Column `%s` must not contain any duplicates.", column)
+    )
+  }
+
+  invisible(data)
+}
+
 warn_grouped <- function(data, message) {
   if (dplyr::is_grouped_df(data)) warn(message)
 

diff --git a/tests/testthat/test-target_sda.R b/tests/testthat/test-target_sda.R
@@ -208,6 +208,7 @@ test_that("with known input outputs as expected", {
 
 test_that("with known input outputs as expected, at company level (#155)", {
   matched <- fake_matched(
+    id_loan = c(1, 2),
     name_ald = c("shaanxi auto", "company 2"),
     sector_ald = "cement"
   )
@@ -261,6 +262,36 @@ test_that("with no matching data warns", {
   )
 })
 
+test_that("with duplicated id_loan weights emission_factor as expected (#160)", {
+  match_result <- fake_matched(
+    id_loan = c(1, 1),
+    name_ald = rep("large company", 2),
+    sector_ald = "cement"
+  )
+
+  ald <- fake_ald(
+    sector = "cement",
+    name_company = "large company",
+    emission_factor = 2,
+    year = c(2020, 2025)
+  )
+
+  scen <- fake_co2_scenario(
+    year = c(2020, 2025),
+    emission_factor = c(1, 0.5)
+  )
+
+  expect_error(
+    target_sda(
+      match_result,
+      ald,
+      scen
+    ) %>%
+      filter(year == min(year)),
+    class = "unique_ids"
+  )
+})
+
 test_that("with NAs in crucial columns errors with informative message (#146)", {
   expect_error_crucial_NAs_portfolio <- function(name) {
     data <- fake_matched(sector_ald = "cement")