Skip to content

Commit

Permalink
Adds exclusion for words that contain green (#5)
Browse files Browse the repository at this point in the history
Signed-off-by: William Rizzo <[email protected]>
  • Loading branch information
wrkode authored Nov 27, 2023
1 parent 536e4c4 commit a8fc999
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 9 deletions.
10 changes: 5 additions & 5 deletions greenscraper/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,16 @@ func ReadLinesFromFile(filename string) ([]string, error) {
return lines, scanner.Err()
}

func containsAnyOfKeywords(title string, keywords []string) bool {
for _, keyword := range keywords {
if contains := strings.Contains(title, keyword); contains {
// containsAnyOfExclusions reports whether title contains at least one of the
// given exclusion substrings. Matching is a plain, case-sensitive substring
// test.
//
// Entries that are empty or consist only of whitespace are ignored: the empty
// string is a substring of every string, so a single blank entry (for example
// a blank line in the exclusions list) would otherwise cause every title to be
// reported as excluded.
//
// It returns false when exclusions is nil or empty.
func containsAnyOfExclusions(title string, exclusions []string) bool {
	for _, exclusion := range exclusions {
		// Skip blank entries; strings.Contains(title, "") is always true.
		if strings.TrimSpace(exclusion) == "" {
			continue
		}
		if strings.Contains(title, exclusion) {
			return true
		}
	}
	return false
}

func ProcessURL(url string, keywordRegexes []*regexp.Regexp, titleRegex *regexp.Regexp, wg *sync.WaitGroup) {
func ProcessURL(url string, keywordRegexes []*regexp.Regexp, exclusions []string, titleRegex *regexp.Regexp, wg *sync.WaitGroup) {
defer wg.Done()

resp, err := http.Get(url)
Expand Down Expand Up @@ -66,7 +66,7 @@ func ProcessURL(url string, keywordRegexes []*regexp.Regexp, titleRegex *regexp.
if len(titleMatch) > 1 {
title := titleMatch[1]

if len(title) >= 40 && !encounteredTitles[title] && !containsBadgePickUp(title) {
if len(title) >= 40 && !encounteredTitles[title] && !containsAnyOfExclusions(title, exclusions) && !containsBadgePickUp(title) {
encounteredTitles[title] = true
talks = append(talks, "- "+title)
}
Expand Down
41 changes: 41 additions & 0 deletions greenscraper/exclusions.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
Greene
Greenley
Greenwald
Greenberg
Blue/Green
blue/green
Green Light
Blue-green
Blue green
COVIDGreen
Greenwood
Greentree
Evergreen
Greenfield
Greenleaf
Greenhill
Greenway
Greenland
Greenlee
Greenville
Greenwood
Greenberg
Greenfield
Greenbaum
Greenberg
Greenwald
Greenspan
Greene
Greenberg
Greenman
Greenland
Greenleaf
Greenlee
Greenspan
Greenstein
Greenwald
Greenwell
Greenblatt
Greenberg
Greenfield
Greenbaum
13 changes: 9 additions & 4 deletions greenscraper/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ func main() {
if err != nil {
panic(fmt.Sprintf("Failed to read URLs from file: %v", err))
}
// Read exclusions from exclusions.txt
exclusions, err := cmd.ReadLinesFromFile("exclusions.txt")
if err != nil {
panic(fmt.Sprintf("Failed to read exclusions from file: %v", err))
}

titleRegex := regexp.MustCompile(".*'>(.*?)<span class=\"vs\">.*")
keywordRegexes := make([]*regexp.Regexp, len(keywords))
Expand All @@ -25,15 +30,15 @@ func main() {
}

const concurrentLimit = 5
sem := make(chan struct{}, concurrentLimit) // semaphore pattern for limiting concurrency
sem := make(chan struct{}, concurrentLimit)
var wg sync.WaitGroup

for _, url := range urls {
wg.Add(1)
sem <- struct{}{} // acquire a token
sem <- struct{}{}
go func(u string) {
cmd.ProcessURL(u, keywordRegexes, titleRegex, &wg)
<-sem // release a token
cmd.ProcessURL(u, keywordRegexes, exclusions, titleRegex, &wg)
<-sem
}(url)
}

Expand Down

0 comments on commit a8fc999

Please sign in to comment.