From 00af7798b01e3f8721c4cc99cc27737331f6f50d Mon Sep 17 00:00:00 2001 From: PatrickHSI Date: Wed, 12 Jun 2024 12:46:18 -0400 Subject: [PATCH 1/3] updates to de-duplicate email domain lists --- .../hmda/model/institution/Institution.scala | 5 +-- .../institution/InstitutionCsvParser.scala | 5 +-- .../InstitutionProtobufConverter.scala | 5 +-- .../scala/hmda/util/CSVConsolidator.scala | 23 +++++++++++++ .../institution/InstitutionGenerators.scala | 5 +-- .../publisher/scheduler/PanelScheduler.scala | 34 +++---------------- .../src/main/resources/application.conf | 4 +-- .../api/http/InstitutionConverter.scala | 10 ++++-- 8 files changed, 48 insertions(+), 43 deletions(-) create mode 100644 common/src/main/scala/hmda/util/CSVConsolidator.scala diff --git a/common/src/main/scala/hmda/model/institution/Institution.scala b/common/src/main/scala/hmda/model/institution/Institution.scala index 7ac5e77968..963160daca 100644 --- a/common/src/main/scala/hmda/model/institution/Institution.scala +++ b/common/src/main/scala/hmda/model/institution/Institution.scala @@ -2,6 +2,7 @@ package hmda.model.institution import cats.data.NonEmptyList import hmda.model.filing.Institution.InstitutionFieldMapping +import hmda.util.CSVConsolidator.{listDeDupeToList, listDeDupeToString} import io.chrisdavenport.cormorant.CSV import io.circe._ import io.circe.syntax._ @@ -42,7 +43,7 @@ object Institution { ("institutionId2017", Json.fromString(i.institutionId_2017.getOrElse(""))), ("taxId", Json.fromString(i.taxId.getOrElse(""))), ("rssd", Json.fromInt(i.rssd)), - ("emailDomains", i.emailDomains.asJson), + ("emailDomains", listDeDupeToList(i.emailDomains).asJson), ("respondent", i.respondent.asJson), ("parent", i.parent.asJson), ("assets", Json.fromLong(i.assets)), @@ -148,7 +149,7 @@ case class Institution( institutionId_2017.getOrElse(""), taxId.getOrElse(""), rssd.toString, - emailDomains.mkString(","), + listDeDupeToString(emailDomains), respondent.name.getOrElse(""), respondent.state.getOrElse(""), respondent.city.getOrElse(""), diff --git a/common/src/main/scala/hmda/parser/institution/InstitutionCsvParser.scala b/common/src/main/scala/hmda/parser/institution/InstitutionCsvParser.scala index b48ba8cce0..e179ee2f0f 100644 --- a/common/src/main/scala/hmda/parser/institution/InstitutionCsvParser.scala +++ b/common/src/main/scala/hmda/parser/institution/InstitutionCsvParser.scala @@ -1,7 +1,8 @@ package hmda.parser.institution import hmda.model.institution._ -import io.chrisdavenport.cormorant.parser.{CSVLikeParser} +import hmda.util.CSVConsolidator.{listDeDupeToList, stringDeDupeToList} +import io.chrisdavenport.cormorant.parser.CSVLikeParser import io.chrisdavenport.cormorant object InstitutionCsvParser { @@ -34,7 +35,7 @@ object InstitutionCsvParser { val notes = values.lift.apply(22).getOrElse("") //TODO consider default value from env val emails = - if (emailDomains.isEmpty) List() else emailDomains.split(',').toList + if (emailDomains.isEmpty) List() else stringDeDupeToList(emailDomains) Institution( activityYear, diff --git a/common/src/main/scala/hmda/serialization/institution/InstitutionProtobufConverter.scala b/common/src/main/scala/hmda/serialization/institution/InstitutionProtobufConverter.scala index ef72759bad..810177bd84 100644 --- a/common/src/main/scala/hmda/serialization/institution/InstitutionProtobufConverter.scala +++ b/common/src/main/scala/hmda/serialization/institution/InstitutionProtobufConverter.scala @@ -1,7 +1,8 @@ package hmda.serialization.institution import hmda.model.institution._ -import hmda.persistence.serialization.institution.{ InstitutionMessage, ParentMessage, RespondentMessage, TopHolderMessage } +import hmda.persistence.serialization.institution.{InstitutionMessage, ParentMessage, RespondentMessage, TopHolderMessage} +import hmda.util.CSVConsolidator.listDeDupeToList object InstitutionProtobufConverter { @@ -53,7 +54,7 @@ object InstitutionProtobufConverter { id2017 = i.institutionId_2017.getOrElse(""), taxId = i.taxId.getOrElse(""), rssd = i.rssd, - emailDomains = i.emailDomains, + emailDomains = listDeDupeToList(i.emailDomains), respondent = Some(respondentToProtobuf(i.respondent)), parent = Some(parentToProtobuf(i.parent)), assets = i.assets, diff --git a/common/src/main/scala/hmda/util/CSVConsolidator.scala b/common/src/main/scala/hmda/util/CSVConsolidator.scala new file mode 100644 index 0000000000..327e9969b0 --- /dev/null +++ b/common/src/main/scala/hmda/util/CSVConsolidator.scala @@ -0,0 +1,23 @@ +package hmda.util + +import cats.implicits._ +import com.typesafe.config.ConfigFactory +import hmda.model.filing.submission.SubmissionId + +object CSVConsolidator { + + def listDeDupeToString(seqToDeDupe: Seq[String]) = { + seqToDeDupe.mkString(",").toLowerCase().trim.split("\\s*,\\s*").distinct.mkString(",") + } + def listDeDupeToList(seqToDeDupe: Seq[String]) = { + seqToDeDupe.mkString(",").toLowerCase().trim.split("\\s*,\\s*").distinct.toList + } + + def stringDeDupeToList(stringToDeDupe: String) = { + stringToDeDupe.toLowerCase().trim.split("\\s*,\\s*").distinct.toList + } + + def stringDeDupeToString(stringToDeDupe: String) = { + stringToDeDupe.toLowerCase().trim.split("\\s*,\\s*").distinct.mkString(",") + } +} diff --git a/common/src/test/scala/hmda/model/institution/InstitutionGenerators.scala b/common/src/test/scala/hmda/model/institution/InstitutionGenerators.scala index d6cd556af0..78569628d8 100644 --- a/common/src/test/scala/hmda/model/institution/InstitutionGenerators.scala +++ b/common/src/test/scala/hmda/model/institution/InstitutionGenerators.scala @@ -1,6 +1,7 @@ package hmda.model.institution -import hmda.generators.CommonGenerators.{ activityYearGen, emailListGen, leiGen, stateGen } +import hmda.generators.CommonGenerators.{activityYearGen, emailListGen, leiGen, stateGen} +import hmda.util.CSVConsolidator.listDeDupeToList import org.scalacheck.Gen object InstitutionGenerators { @@ -36,7 +37,7 @@ object InstitutionGenerators { if (id2017 == "") None else Some(id2017), if (taxId == "") None else Some(taxId), rssd, - email, + listDeDupeToList(email), respondent, parent, assets, diff --git a/hmda-data-publisher/src/main/scala/hmda/publisher/scheduler/PanelScheduler.scala b/hmda-data-publisher/src/main/scala/hmda/publisher/scheduler/PanelScheduler.scala index d46e5f9898..040dabe06c 100644 --- a/hmda-data-publisher/src/main/scala/hmda/publisher/scheduler/PanelScheduler.scala +++ b/hmda-data-publisher/src/main/scala/hmda/publisher/scheduler/PanelScheduler.scala @@ -114,11 +114,11 @@ class PanelScheduler(publishingReporter: ActorRef[PublishingReporter.Command], s } } - def appendEmailDomains2018(institution: InstitutionEntity): Future[InstitutionAltEntity] = { - + def appendEmailDomains(institution: InstitutionEntity): Future[InstitutionAltEntity] = { val emails: Future[Seq[InstitutionEmailEntity]] = emailRepository.findByLei(institution.lei) - + var seqTest:Seq[String] = Seq("1","2,3","4","5","6") + val dedupe= csvDeDupe(seqTest) emails.map(emailList => InstitutionAltEntity( lei = institution.lei, @@ -138,38 +138,12 @@ class PanelScheduler(publishingReporter: ActorRef[PublishingReporter.Command], s topHolderIdRssd = institution.topHolderIdRssd, topHolderName = institution.topHolderName, hmdaFiler = institution.hmdaFiler, - emailDomains = emailList.map(email => email.emailDomain).mkString(",") + emailDomains = listDeDupeToString(emailList.map(email => email.emailDomain)) ) ) } - def appendEmailDomains(institution: InstitutionEntity): Future[InstitutionAltEntity] = { - val emails: Future[Seq[InstitutionEmailEntity]] = - emailRepository.findByLei(institution.lei) - emails.map(emailList => - InstitutionAltEntity( - lei = institution.lei, - activityYear = institution.activityYear, - agency = institution.agency, - institutionType = institution.institutionType, - id2017 = institution.id2017, - taxId = institution.taxId, - rssd = institution.rssd, - respondentName = institution.respondentName, - respondentState = institution.respondentState, - respondentCity = institution.respondentCity, - parentIdRssd = institution.parentIdRssd, - parentName = institution.parentName, - assets = institution.assets, - otherLenderCode = institution.otherLenderCode, - topHolderIdRssd = institution.topHolderIdRssd, - topHolderName = institution.topHolderName, - hmdaFiler = institution.hmdaFiler, - emailDomains = emailList.map(email => email.emailDomain).mkString(",") - ) - ) - } protected def reportPublishingComplete(result: Try[Any], schedule: Schedule, fullFilePath: String): Unit = result match { diff --git a/hmda-quarterly-data-service/src/main/resources/application.conf b/hmda-quarterly-data-service/src/main/resources/application.conf index 486be341db..bce152c48b 100644 --- a/hmda-quarterly-data-service/src/main/resources/application.conf +++ b/hmda-quarterly-data-service/src/main/resources/application.conf @@ -13,9 +13,9 @@ server { db { mv { - periods_annual: "2018,2019,2020,2021,2022" + periods_annual: "2018,2019,2020,2021,2022,2023" periods_annual: ${?PERIODS_ANNUAL} - periods_with_quarter: "2018,2019,2020,2021,2022,2023_q1,2023_q2,2023_q3" + periods_with_quarter: "2018,2019,2020,2021,2022,2023" periods_with_quarter: ${?PERIODS_QUARTER} suffix_home_purchase: "purpose_p" suffix_home_purchase: ${?SUFFIX_HOME_PURCHASE} diff --git a/institutions-api/src/main/scala/hmda/institution/api/http/InstitutionConverter.scala b/institutions-api/src/main/scala/hmda/institution/api/http/InstitutionConverter.scala index bcf75c2c3b..0f2315f28e 100644 --- a/institutions-api/src/main/scala/hmda/institution/api/http/InstitutionConverter.scala +++ b/institutions-api/src/main/scala/hmda/institution/api/http/InstitutionConverter.scala @@ -1,7 +1,8 @@ package hmda.institution.api.http -import hmda.institution.query.{ InstitutionEmailEntity, InstitutionEntity } +import hmda.institution.query.{InstitutionEmailEntity, InstitutionEntity} import hmda.model.institution._ +import hmda.util.CSVConsolidator.listDeDupeToList object InstitutionConverter { @@ -66,6 +67,9 @@ object InstitutionConverter { notes = institution.notes ) - def emailsFromInstitution(institution: Institution): Seq[InstitutionEmailEntity] = - institution.emailDomains.map(email => InstitutionEmailEntity(lei = institution.LEI, emailDomain = email.trim.toLowerCase())) + def emailsFromInstitution(institution: Institution): Seq[InstitutionEmailEntity] = { + + val uniqueEmailDomainList= listDeDupeToList(institution.emailDomains) + uniqueEmailDomainList.map(email => InstitutionEmailEntity(lei = institution.LEI, emailDomain = email)) + } } From f2cb15d90dbfccb163daf5cc15b5c193657bd6f0 Mon Sep 17 00:00:00 2001 From: PatrickHSI Date: Wed, 12 Jun 2024 12:48:46 -0400 Subject: [PATCH 2/3] updates to de-duplicate email domain lists --- .../main/scala/hmda/publisher/scheduler/PanelScheduler.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hmda-data-publisher/src/main/scala/hmda/publisher/scheduler/PanelScheduler.scala b/hmda-data-publisher/src/main/scala/hmda/publisher/scheduler/PanelScheduler.scala index 040dabe06c..acdbd15fb2 100644 --- a/hmda-data-publisher/src/main/scala/hmda/publisher/scheduler/PanelScheduler.scala +++ b/hmda-data-publisher/src/main/scala/hmda/publisher/scheduler/PanelScheduler.scala @@ -17,12 +17,13 @@ import hmda.publisher.helper.{PrivateAWSConfigLoader, S3Utils, SnapshotCheck} import hmda.publisher.query.component.{InstitutionEmailComponent, InstitutionRepository, PublisherComponent, PublisherComponent2018, PublisherComponent2019, PublisherComponent2020, PublisherComponent2021, PublisherComponent2022, PublisherComponent2023} import hmda.publisher.query.panel.{InstitutionAltEntity, InstitutionEmailEntity, InstitutionEntity} import hmda.publisher.scheduler.schedules.{Schedule, ScheduleWithYear} -import hmda.publisher.scheduler.schedules.Schedules.{PanelSchedule} +import hmda.publisher.scheduler.schedules.Schedules.PanelSchedule import hmda.publisher.util.{PublishingReporter, ScheduleCoordinator} import hmda.publisher.util.PublishingReporter.Command.FilePublishingCompleted import hmda.publisher.util.ScheduleCoordinator.Command._ import hmda.query.DbConfiguration.dbConfig import hmda.util.BankFilterUtils._ +import hmda.util.CSVConsolidator.listDeDupeToString import scala.concurrent.duration.HOURS import scala.concurrent.{ExecutionContext, Future} @@ -118,7 +119,6 @@ class PanelScheduler(publishingReporter: ActorRef[PublishingReporter.Command], s val emails: Future[Seq[InstitutionEmailEntity]] = emailRepository.findByLei(institution.lei) var seqTest:Seq[String] = Seq("1","2,3","4","5","6") - val dedupe= csvDeDupe(seqTest) emails.map(emailList => InstitutionAltEntity( lei = institution.lei, From 74ced5b1f7d3809989d4cc123c1ad6dcf45ca13a Mon Sep 17 00:00:00 2001 From: PatrickHSI Date: Mon, 17 Jun 2024 13:50:48 -0400 Subject: [PATCH 3/3] removing test code --- .../src/main/scala/hmda/publisher/scheduler/PanelScheduler.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/hmda-data-publisher/src/main/scala/hmda/publisher/scheduler/PanelScheduler.scala b/hmda-data-publisher/src/main/scala/hmda/publisher/scheduler/PanelScheduler.scala index acdbd15fb2..b845772f43 100644 --- a/hmda-data-publisher/src/main/scala/hmda/publisher/scheduler/PanelScheduler.scala +++ b/hmda-data-publisher/src/main/scala/hmda/publisher/scheduler/PanelScheduler.scala @@ -118,7 +118,6 @@ class PanelScheduler(publishingReporter: ActorRef[PublishingReporter.Command], s def appendEmailDomains(institution: InstitutionEntity): Future[InstitutionAltEntity] = { val emails: Future[Seq[InstitutionEmailEntity]] = emailRepository.findByLei(institution.lei) - var seqTest:Seq[String] = Seq("1","2,3","4","5","6") emails.map(emailList => InstitutionAltEntity( lei = institution.lei,