Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updates to de-duplicate email domain lists #4824

Merged
merged 3 commits into from
Jun 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package hmda.model.institution

import cats.data.NonEmptyList
import hmda.model.filing.Institution.InstitutionFieldMapping
import hmda.util.CSVConsolidator.{listDeDupeToList, listDeDupeToString}
import io.chrisdavenport.cormorant.CSV
import io.circe._
import io.circe.syntax._
Expand Down Expand Up @@ -42,7 +43,7 @@ object Institution {
("institutionId2017", Json.fromString(i.institutionId_2017.getOrElse(""))),
("taxId", Json.fromString(i.taxId.getOrElse(""))),
("rssd", Json.fromInt(i.rssd)),
("emailDomains", i.emailDomains.asJson),
("emailDomains", listDeDupeToList(i.emailDomains).asJson),
("respondent", i.respondent.asJson),
("parent", i.parent.asJson),
("assets", Json.fromLong(i.assets)),
Expand Down Expand Up @@ -148,7 +149,7 @@ case class Institution(
institutionId_2017.getOrElse(""),
taxId.getOrElse(""),
rssd.toString,
emailDomains.mkString(","),
listDeDupeToString(emailDomains),
respondent.name.getOrElse(""),
respondent.state.getOrElse(""),
respondent.city.getOrElse(""),
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
package hmda.parser.institution

import hmda.model.institution._
import io.chrisdavenport.cormorant.parser.{CSVLikeParser}
import hmda.util.CSVConsolidator.{listDeDupeToList, stringDeDupeToList}
import io.chrisdavenport.cormorant.parser.CSVLikeParser
import io.chrisdavenport.cormorant

object InstitutionCsvParser {
Expand Down Expand Up @@ -34,7 +35,7 @@ object InstitutionCsvParser {
val notes = values.lift.apply(22).getOrElse("") //TODO consider default value from env

val emails =
if (emailDomains.isEmpty) List() else emailDomains.split(',').toList
if (emailDomains.isEmpty) List() else stringDeDupeToList(emailDomains)

Institution(
activityYear,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
package hmda.serialization.institution

import hmda.model.institution._
import hmda.persistence.serialization.institution.{ InstitutionMessage, ParentMessage, RespondentMessage, TopHolderMessage }
import hmda.persistence.serialization.institution.{InstitutionMessage, ParentMessage, RespondentMessage, TopHolderMessage}
import hmda.util.CSVConsolidator.listDeDupeToList

object InstitutionProtobufConverter {

Expand Down Expand Up @@ -53,7 +54,7 @@ object InstitutionProtobufConverter {
id2017 = i.institutionId_2017.getOrElse(""),
taxId = i.taxId.getOrElse(""),
rssd = i.rssd,
emailDomains = i.emailDomains,
emailDomains = listDeDupeToList(i.emailDomains),
respondent = Some(respondentToProtobuf(i.respondent)),
parent = Some(parentToProtobuf(i.parent)),
assets = i.assets,
Expand Down
23 changes: 23 additions & 0 deletions common/src/main/scala/hmda/util/CSVConsolidator.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package hmda.util

import cats.implicits._
import com.typesafe.config.ConfigFactory
import hmda.model.filing.submission.SubmissionId

/**
 * Helpers that normalise and de-duplicate comma-separated values.
 *
 * Every helper applies the same normalisation: values are split on commas,
 * surrounding whitespace is stripped from each value, values are lower-cased,
 * blank values are discarded, and duplicates are removed while keeping the
 * order of first occurrence.
 */
object CSVConsolidator {

  import java.util.Locale

  private val Separator = ","

  /**
   * Shared normalisation pipeline used by all public helpers.
   *
   * Blank tokens are dropped so that an empty input produces an empty result
   * (splitting "" would otherwise yield a single empty-string entry), and
   * `Locale.ROOT` keeps lower-casing independent of the JVM default locale.
   */
  private def normalise(raw: String): List[String] =
    raw
      .split(Separator)
      .iterator
      .map(_.trim.toLowerCase(Locale.ROOT))
      .filter(_.nonEmpty)
      .toList
      .distinct

  /**
   * De-duplicates a sequence of values and renders it as one comma-separated string.
   *
   * @param seqToDeDupe values to normalise; an element may itself contain commas,
   *                    in which case it is treated as several values
   * @return the unique normalised values joined with ",", or "" when there are none
   */
  def listDeDupeToString(seqToDeDupe: Seq[String]): String =
    normalise(seqToDeDupe.mkString(Separator)).mkString(Separator)

  /**
   * De-duplicates a sequence of values.
   *
   * @param seqToDeDupe values to normalise; an element may itself contain commas,
   *                    in which case it is treated as several values
   * @return the unique normalised values in order of first occurrence
   */
  def listDeDupeToList(seqToDeDupe: Seq[String]): List[String] =
    normalise(seqToDeDupe.mkString(Separator))

  /**
   * Splits a comma-separated string into its unique normalised values.
   *
   * @param stringToDeDupe comma-separated values
   * @return the unique normalised values in order of first occurrence
   */
  def stringDeDupeToList(stringToDeDupe: String): List[String] =
    normalise(stringToDeDupe)

  /**
   * De-duplicates a comma-separated string.
   *
   * @param stringToDeDupe comma-separated values
   * @return the unique normalised values joined with ",", or "" when there are none
   */
  def stringDeDupeToString(stringToDeDupe: String): String =
    normalise(stringToDeDupe).mkString(Separator)
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package hmda.model.institution

import hmda.generators.CommonGenerators.{ activityYearGen, emailListGen, leiGen, stateGen }
import hmda.generators.CommonGenerators.{activityYearGen, emailListGen, leiGen, stateGen}
import hmda.util.CSVConsolidator.listDeDupeToList
import org.scalacheck.Gen

object InstitutionGenerators {
Expand Down Expand Up @@ -36,7 +37,7 @@ object InstitutionGenerators {
if (id2017 == "") None else Some(id2017),
if (taxId == "") None else Some(taxId),
rssd,
email,
listDeDupeToList(email),
respondent,
parent,
assets,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,13 @@ import hmda.publisher.helper.{PrivateAWSConfigLoader, S3Utils, SnapshotCheck}
import hmda.publisher.query.component.{InstitutionEmailComponent, InstitutionRepository, PublisherComponent, PublisherComponent2018, PublisherComponent2019, PublisherComponent2020, PublisherComponent2021, PublisherComponent2022, PublisherComponent2023}
import hmda.publisher.query.panel.{InstitutionAltEntity, InstitutionEmailEntity, InstitutionEntity}
import hmda.publisher.scheduler.schedules.{Schedule, ScheduleWithYear}
import hmda.publisher.scheduler.schedules.Schedules.{PanelSchedule}
import hmda.publisher.scheduler.schedules.Schedules.PanelSchedule
import hmda.publisher.util.{PublishingReporter, ScheduleCoordinator}
import hmda.publisher.util.PublishingReporter.Command.FilePublishingCompleted
import hmda.publisher.util.ScheduleCoordinator.Command._
import hmda.query.DbConfiguration.dbConfig
import hmda.util.BankFilterUtils._
import hmda.util.CSVConsolidator.listDeDupeToString

import scala.concurrent.duration.HOURS
import scala.concurrent.{ExecutionContext, Future}
Expand Down Expand Up @@ -114,11 +115,9 @@ class PanelScheduler(publishingReporter: ActorRef[PublishingReporter.Command], s
}
}

def appendEmailDomains2018(institution: InstitutionEntity): Future[InstitutionAltEntity] = {

def appendEmailDomains(institution: InstitutionEntity): Future[InstitutionAltEntity] = {
val emails: Future[Seq[InstitutionEmailEntity]] =
emailRepository.findByLei(institution.lei)

emails.map(emailList =>
InstitutionAltEntity(
lei = institution.lei,
Expand All @@ -138,38 +137,12 @@ class PanelScheduler(publishingReporter: ActorRef[PublishingReporter.Command], s
topHolderIdRssd = institution.topHolderIdRssd,
topHolderName = institution.topHolderName,
hmdaFiler = institution.hmdaFiler,
emailDomains = emailList.map(email => email.emailDomain).mkString(",")
emailDomains = listDeDupeToString(emailList.map(email => email.emailDomain))
)
)
}

def appendEmailDomains(institution: InstitutionEntity): Future[InstitutionAltEntity] = {
val emails: Future[Seq[InstitutionEmailEntity]] =
emailRepository.findByLei(institution.lei)

emails.map(emailList =>
InstitutionAltEntity(
lei = institution.lei,
activityYear = institution.activityYear,
agency = institution.agency,
institutionType = institution.institutionType,
id2017 = institution.id2017,
taxId = institution.taxId,
rssd = institution.rssd,
respondentName = institution.respondentName,
respondentState = institution.respondentState,
respondentCity = institution.respondentCity,
parentIdRssd = institution.parentIdRssd,
parentName = institution.parentName,
assets = institution.assets,
otherLenderCode = institution.otherLenderCode,
topHolderIdRssd = institution.topHolderIdRssd,
topHolderName = institution.topHolderName,
hmdaFiler = institution.hmdaFiler,
emailDomains = emailList.map(email => email.emailDomain).mkString(",")
)
)
}

protected def reportPublishingComplete(result: Try[Any], schedule: Schedule, fullFilePath: String): Unit =
result match {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ server {

db {
mv {
periods_annual: "2018,2019,2020,2021,2022"
periods_annual: "2018,2019,2020,2021,2022,2023"
periods_annual: ${?PERIODS_ANNUAL}
periods_with_quarter: "2018,2019,2020,2021,2022,2023_q1,2023_q2,2023_q3"
periods_with_quarter: "2018,2019,2020,2021,2022,2023"
periods_with_quarter: ${?PERIODS_QUARTER}
suffix_home_purchase: "purpose_p"
suffix_home_purchase: ${?SUFFIX_HOME_PURCHASE}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
package hmda.institution.api.http

import hmda.institution.query.{ InstitutionEmailEntity, InstitutionEntity }
import hmda.institution.query.{InstitutionEmailEntity, InstitutionEntity}
import hmda.model.institution._
import hmda.util.CSVConsolidator.listDeDupeToList

object InstitutionConverter {

Expand Down Expand Up @@ -66,6 +67,9 @@ object InstitutionConverter {
notes = institution.notes
)

def emailsFromInstitution(institution: Institution): Seq[InstitutionEmailEntity] =
institution.emailDomains.map(email => InstitutionEmailEntity(lei = institution.LEI, emailDomain = email.trim.toLowerCase()))
/**
 * Builds one email entity per unique email domain of the given institution.
 *
 * The domains are passed through `listDeDupeToList` before being mapped, and
 * every resulting entity carries the institution's LEI.
 */
def emailsFromInstitution(institution: Institution): Seq[InstitutionEmailEntity] =
  listDeDupeToList(institution.emailDomains).map { domain =>
    InstitutionEmailEntity(lei = institution.LEI, emailDomain = domain)
  }
}