From d8b7e23330f47eeb52eb2d0697f91a45ad5de8dd Mon Sep 17 00:00:00 2001 From: "daniel_fillol@hotmail.com" <55287657+DanielFillol@users.noreply.github.com> Date: Mon, 3 Jun 2024 19:23:18 -0300 Subject: [PATCH 1/4] feat: reorganize code for better readability and debugging --- goSpider.go | 78 ++++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 43 deletions(-) diff --git a/goSpider.go b/goSpider.go index a75eb0c..66711b0 100644 --- a/goSpider.go +++ b/goSpider.go @@ -21,19 +21,6 @@ type Navigator struct { Logger *log.Logger } -// Requests structure to hold user data -type Requests struct { - ProcessNumber string -} - -// ResponseBody structure to hold response data -type ResponseBody struct { - Cover map[string]string - Movements []map[int]map[string]interface{} - People []map[int]map[string]interface{} - Error error -} - // NewNavigator creates a new Navigator instance. // Example: // @@ -554,13 +541,26 @@ func (nav *Navigator) SelectDropdown(selector, value string) error { return nil } +// Requests structure to hold user data +type Requests struct { + ProcessNumber string +} + +// ResponseBody structure to hold response data +type ResponseBody struct { + Cover map[string]string + Movements []map[int]map[string]interface{} + People []map[int]map[string]interface{} + Error error +} + // ParallelRequests performs web scraping tasks concurrently with a specified number of workers and a delay between requests. // The crawlerFunc parameter allows for flexibility in defining the web scraping logic. // // Parameters: // - requests: A slice of Requests structures containing the data needed for each request. // - numberOfWorkers: The number of concurrent workers to process the requests. -// - duration: The delay duration between each request to avoid overwhelming the target server. +// - delay: The delay duration between each request to avoid overwhelming the target server. // - crawlerFunc: A user-defined function that takes a process number as input and returns cover data, movements, people, and an error.
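// For illustration only, a minimal crawlerFunc compatible with this signature could look like the
// sketch below (parseCover, parseMovements and parsePeople are hypothetical helpers, not part of
// this library):
//
//	crawlerFunc := func(processNumber string) (map[string]string, []map[int]map[string]interface{}, []map[int]map[string]interface{}, error) {
//		nav := NewNavigator()
//		defer nav.Close()
//		if err := nav.OpenURL("https://example.com/search?q=" + processNumber); err != nil {
//			return nil, nil, nil, err
//		}
//		return parseCover(nav), parseMovements(nav), parsePeople(nav), nil
//	}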
// // Returns: @@ -569,39 +569,33 @@ func (nav *Navigator) SelectDropdown(selector, value string) error { // // Example Usage: // -// results, err := asyncRequest(requests, numberOfWorkers, duration, crawlerFunc) -func ParallelRequests(requests []Requests, numberOfWorkers int, duration time.Duration, crawlerFunc func(string) (map[string]string, []map[int]map[string]interface{}, []map[int]map[string]interface{}, error)) ([]ResponseBody, error) { +// results, err := ParallelRequests(requests, numberOfWorkers, delay, crawlerFunc) +func ParallelRequests(requests []Requests, numberOfWorkers int, delay time.Duration, crawlerFunc func(string) (map[string]string, []map[int]map[string]interface{}, []map[int]map[string]interface{}, error)) ([]ResponseBody, error) { done := make(chan struct{}) defer close(done) inputCh := streamInputs(done, requests) - var wg sync.WaitGroup resultCh := make(chan ResponseBody, len(requests)) // Buffered channel to hold all results - k := 0 + var wg sync.WaitGroup + + // Start workers for i := 0; i < numberOfWorkers; i++ { wg.Add(1) - go func() { + go func(workerID int) { defer wg.Done() - for input := range inputCh { - k++ - time.Sleep(duration) - cover, movements, people, err := crawlerFunc(input.ProcessNumber) + for req := range inputCh { + log.Printf("Worker %d processing request: %s", workerID, req.ProcessNumber) + time.Sleep(delay) + cover, movements, people, err := crawlerFunc(req.ProcessNumber) resultCh <- ResponseBody{ Cover: cover, Movements: movements, People: people, Error: err, } - if err != nil { - log.Println(err) - continue - } - if k == len(requests)-1 { - break - } } - }() + }(i) } // Close the result channel once all workers are done @@ -610,10 +604,10 @@ func ParallelRequests(requests []Requests, numberOfWorkers int, duration time.Du close(resultCh) }() + // Collect results from the result channel var results []ResponseBody var errorOnApiRequests error - // Collect results from the result channel for result := range resultCh { if result.Error != nil { errorOnApiRequests = result.Error @@ -621,12 +615,6 @@ func ParallelRequests(requests []Requests, numberOfWorkers int, duration time.Du results = append(results, result) } - if k == len(requests)-1 { - l := log.New(os.Stdout, "goSpider: ", log.LstdFlags) - l.Printf("Finished processing %d requests\n", len(requests)) - return results, errorOnApiRequests - } - return results, errorOnApiRequests } @@ -634,21 +622,21 @@ func ParallelRequests(requests []Requests, numberOfWorkers int, duration time.Du // // Parameters: // - done: A channel to signal when to stop processing inputs. -// - inputs: A slice of Requests structures containing the data needed for each request. +// - requests: A slice of Requests structures containing the data needed for each request. // // Returns: // - A channel that streams the input requests. 
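// streamInputs follows the generator pattern: each request is pushed into the returned channel
// from a dedicated goroutine, and the select on done lets the caller abandon the stream early
// without leaking that goroutine. A hedged usage sketch (the loop body is illustrative):
//
//	done := make(chan struct{})
//	defer close(done)
//	for req := range streamInputs(done, requests) {
//		_ = req // consume each request as it arrives
//	}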
// // Example Usage: // -// inputCh := streamInputs(done, inputs) -func streamInputs(done <-chan struct{}, inputs []Requests) <-chan Requests { +// inputCh := streamInputs(done, requests) +func streamInputs(done <-chan struct{}, requests []Requests) <-chan Requests { inputCh := make(chan Requests) go func() { defer close(inputCh) - for _, input := range inputs { + for _, req := range requests { select { - case inputCh <- input: + case inputCh <- req: case <-done: return } @@ -656,3 +644,7 @@ func streamInputs(done <-chan struct{}, inputs []Requests) <-chan Requests { }() return inputCh } + +func main() { + // Example usage +} From 71529e4cd2900ac1514bdb234c7cae5e9016bce5 Mon Sep 17 00:00:00 2001 From: "daniel_fillol@hotmail.com" <55287657+DanielFillol@users.noreply.github.com> Date: Mon, 3 Jun 2024 19:25:20 -0300 Subject: [PATCH 2/4] feat: create new tests to ensure that TestParallelRequests works; it does, but when the crawler hits any issue the function gets stuck --- goSpider_test.go | 329 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 287 insertions(+), 42 deletions(-) diff --git a/goSpider_test.go b/goSpider_test.go index bde42b4..d875c0f 100644 --- a/goSpider_test.go +++ b/goSpider_test.go @@ -155,7 +155,7 @@ func TestGetCurrentURL(t *testing.T) { } } -func TestParallelRequests(t *testing.T) { +func TestParallelRequests1(t *testing.T) { users := []Requests{ {ProcessNumber: "1017927-35.2023.8.26.0008"}, {ProcessNumber: "0002396-75.2013.8.26.0201"}, @@ -166,88 +166,333 @@ func TestParallelRequests(t *testing.T) { {ProcessNumber: "1557599-09.2021.8.26.0090"}, {ProcessNumber: "1045142-72.2021.8.26.0002"}, {ProcessNumber: "0208591-43.2009.8.26.0004"}, - {ProcessNumber: "1017927-35.2023.8.26.0008"}, + {ProcessNumber: "1024511-70.2022.8.26.0003"}, } - numberOfWorkers := 3 - duration := 2 * time.Second + numberOfWorkers := 1 + duration := 0 * time.Millisecond results, err := ParallelRequests(users, numberOfWorkers, duration, Crawler) if err != nil { - log.Printf("GetCurrentURL error: %v", err) + t.Errorf("Expected %d results, but got %d, List results: %v", len(users), 0, len(results)) } log.Println("Finish Parallel Requests!") - var found []string - for _, u := range users { - for _, result := range results { - for _, value := range result.Cover { - if value == u.ProcessNumber { - found = append(found, value) - } - } - } + //var found []string + //for _, u := range users { + // for _, result := range results { + // for _, value := range result.Cover { + // if value == u.ProcessNumber { + // found = append(found, value) + // } + // } + // } + //} + // + //if len(found) != len(users) { + // + //} + +} + +func TestParallelRequests2(t *testing.T) { + users := []Requests{ + {ProcessNumber: "1017927-35.2023.8.26.0008"}, + {ProcessNumber: "0002396-75.2013.8.26.0201"}, + {ProcessNumber: "1551285-50.2021.8.26.0477"}, + {ProcessNumber: "0015386-82.2013.8.26.0562"}, + {ProcessNumber: "0007324-95.2015.8.26.0590"}, + {ProcessNumber: "1545639-85.2023.8.26.0090"}, + {ProcessNumber: "1557599-09.2021.8.26.0090"}, + {ProcessNumber: "1045142-72.2021.8.26.0002"}, + {ProcessNumber: "0208591-43.2009.8.26.0004"}, + {ProcessNumber: "1024511-70.2022.8.26.0003"}, } - if len(found) != len(users) { - t.Errorf("Expected %d results, but got %d, List results: %v", len(users), len(found), found) + numberOfWorkers := 2 + duration := 0 * time.Millisecond + + results, err := ParallelRequests(users, numberOfWorkers, duration, Crawler) + if err != nil { + t.Errorf("Expected %d results, but got %d, List
results: %v", len(users), 0, len(results)) } + log.Println("Finish Parallel Requests!") + + //var found []string + //for _, u := range users { + // for _, result := range results { + // for _, value := range result.Cover { + // if value == u.ProcessNumber { + // found = append(found, value) + // } + // } + // } + //} + // + //if len(found) != len(users) { + // + //} + } -func Crawler(d string) (map[string]string, []map[int]map[string]interface{}, []map[int]map[string]interface{}, error) { - url := "https://esaj.tjsp.jus.br/cpopg/open.do" - nav := NewNavigator() - defer nav.Close() +func TestParallelRequests4(t *testing.T) { + users := []Requests{ + {ProcessNumber: "1017927-35.2023.8.26.0008"}, + {ProcessNumber: "0002396-75.2013.8.26.0201"}, + {ProcessNumber: "1551285-50.2021.8.26.0477"}, + {ProcessNumber: "0015386-82.2013.8.26.0562"}, + {ProcessNumber: "0007324-95.2015.8.26.0590"}, + {ProcessNumber: "1545639-85.2023.8.26.0090"}, + {ProcessNumber: "1557599-09.2021.8.26.0090"}, + {ProcessNumber: "1045142-72.2021.8.26.0002"}, + {ProcessNumber: "0208591-43.2009.8.26.0004"}, + {ProcessNumber: "1024511-70.2022.8.26.0003"}, + } - err := nav.OpenURL(url) + numberOfWorkers := 4 + duration := 0 * time.Millisecond + + results, err := ParallelRequests(users, numberOfWorkers, duration, Crawler) if err != nil { - log.Printf("OpenURL error: %v", err) - return nil, nil, nil, err + t.Errorf("Expected %d results, but got %d, List results: %v", len(users), 0, len(results)) } - err = nav.CheckRadioButton("#interna_NUMPROC > div > fieldset > label:nth-child(5)") + log.Println("Finish Parallel Requests!") + + //var found []string + //for _, u := range users { + // for _, result := range results { + // for _, value := range result.Cover { + // if value == u.ProcessNumber { + // found = append(found, value) + // } + // } + // } + //} + // + //if len(found) != len(users) { + // + //} + +} + +func TestParallelRequests5(t *testing.T) { + users := []Requests{ + {ProcessNumber: "1017927-35.2023.8.26.0008"}, + {ProcessNumber: "0002396-75.2013.8.26.0201"}, + {ProcessNumber: "1551285-50.2021.8.26.0477"}, + {ProcessNumber: "0015386-82.2013.8.26.0562"}, + {ProcessNumber: "0007324-95.2015.8.26.0590"}, + {ProcessNumber: "1545639-85.2023.8.26.0090"}, + {ProcessNumber: "1557599-09.2021.8.26.0090"}, + {ProcessNumber: "1045142-72.2021.8.26.0002"}, + {ProcessNumber: "0208591-43.2009.8.26.0004"}, + {ProcessNumber: "1024511-70.2022.8.26.0003"}, + } + + numberOfWorkers := 5 + duration := 0 * time.Millisecond + + results, err := ParallelRequests(users, numberOfWorkers, duration, Crawler) if err != nil { - log.Printf("CheckRadioButton error: %v", err) - return nil, nil, nil, err + t.Errorf("Expected %d results, but got %d, List results: %v", len(users), 0, len(results)) } - err = nav.FillField("#nuProcessoAntigoFormatado", d) + log.Println("Finish Parallel Requests!") + + //var found []string + //for _, u := range users { + // for _, result := range results { + // for _, value := range result.Cover { + // if value == u.ProcessNumber { + // found = append(found, value) + // } + // } + // } + //} + // + //if len(found) != len(users) { + // + //} + +} + +func TestParallelRequests8(t *testing.T) { + users := []Requests{ + {ProcessNumber: "1017927-35.2023.8.26.0008"}, + {ProcessNumber: "0002396-75.2013.8.26.0201"}, + {ProcessNumber: "1551285-50.2021.8.26.0477"}, + {ProcessNumber: "0015386-82.2013.8.26.0562"}, + {ProcessNumber: "0007324-95.2015.8.26.0590"}, + {ProcessNumber: "1545639-85.2023.8.26.0090"}, + {ProcessNumber: 
"1557599-09.2021.8.26.0090"}, + {ProcessNumber: "1045142-72.2021.8.26.0002"}, + {ProcessNumber: "0208591-43.2009.8.26.0004"}, + {ProcessNumber: "1024511-70.2022.8.26.0003"}, + } + + numberOfWorkers := 8 + duration := 0 * time.Millisecond + + results, err := ParallelRequests(users, numberOfWorkers, duration, Crawler) if err != nil { - log.Printf("filling field error: %v", err) - return nil, nil, nil, err + t.Errorf("Expected %d results, but got %d, List results: %v", len(users), 0, len(results)) } - err = nav.ClickButton("#botaoConsultarProcessos") + log.Println("Finish Parallel Requests!") + + //var found []string + //for _, u := range users { + // for _, result := range results { + // for _, value := range result.Cover { + // if value == u.ProcessNumber { + // found = append(found, value) + // } + // } + // } + //} + // + //if len(found) != len(users) { + // + //} + +} + +func TestParallelRequests10(t *testing.T) { + users := []Requests{ + {ProcessNumber: "1017927-35.2023.8.26.0008"}, + {ProcessNumber: "0002396-75.2013.8.26.0201"}, + {ProcessNumber: "1551285-50.2021.8.26.0477"}, + {ProcessNumber: "0015386-82.2013.8.26.0562"}, + {ProcessNumber: "0007324-95.2015.8.26.0590"}, + {ProcessNumber: "1545639-85.2023.8.26.0090"}, + {ProcessNumber: "1557599-09.2021.8.26.0090"}, + {ProcessNumber: "1045142-72.2021.8.26.0002"}, + {ProcessNumber: "0208591-43.2009.8.26.0004"}, + {ProcessNumber: "1024511-70.2022.8.26.0003"}, + } + + numberOfWorkers := 10 + duration := 0 * time.Millisecond + + results, err := ParallelRequests(users, numberOfWorkers, duration, Crawler) if err != nil { - log.Printf("ClickButton error: %v", err) - return nil, nil, nil, err + t.Errorf("Expected %d results, but got %d, List results: %v", len(users), 0, len(results)) } - err = nav.ClickElement("#linkmovimentacoes") + log.Println("Finish Parallel Requests!") + + //var found []string + //for _, u := range users { + // for _, result := range results { + // for _, value := range result.Cover { + // if value == u.ProcessNumber { + // found = append(found, value) + // } + // } + // } + //} + // + //if len(found) != len(users) { + // + //} + +} + +func TestParallelRequests11(t *testing.T) { + users := []Requests{ + {ProcessNumber: "1017927-35.2023.8.26.0008"}, + {ProcessNumber: "0002396-75.2013.8.26.0201"}, + {ProcessNumber: "1551285-50.2021.8.26.0477"}, + {ProcessNumber: "0015386-82.2013.8.26.0562"}, + {ProcessNumber: "0007324-95.2015.8.26.0590"}, + {ProcessNumber: "1545639-85.2023.8.26.0090"}, + {ProcessNumber: "1557599-09.2021.8.26.0090"}, + {ProcessNumber: "1045142-72.2021.8.26.0002"}, + {ProcessNumber: "0208591-43.2009.8.26.0004"}, + {ProcessNumber: "1024511-70.2022.8.26.0003"}, + } + + numberOfWorkers := 11 + duration := 0 * time.Millisecond + + results, err := ParallelRequests(users, numberOfWorkers, duration, Crawler) if err != nil { - log.Printf("ClickElement error: %v", err) - return nil, nil, nil, err + t.Errorf("Expected %d results, but got %d, List results: %v", len(users), 0, len(results)) } - people, err := nav.ExtractTableData("#tablePartesPrincipais") + log.Println("Finish Parallel Requests!") + + //var found []string + //for _, u := range users { + // for _, result := range results { + // for _, value := range result.Cover { + // if value == u.ProcessNumber { + // found = append(found, value) + // } + // } + // } + //} + // + //if len(found) != len(users) { + // + //} + +} + +func Crawler(d string) (map[string]string, []map[int]map[string]interface{}, []map[int]map[string]interface{}, error) { + url := 
"https://esaj.tjsp.jus.br/cpopg/open.do" + nav := NewNavigator() + + err := nav.OpenURL(url) if err != nil { - log.Printf("ExtractTableData error: %v", err) + log.Printf("OpenURL error: %v", err) return nil, nil, nil, err } - movements, err := nav.ExtractTableData("#tabelaTodasMovimentacoes") + err = nav.CheckRadioButton("#interna_NUMPROC > div > fieldset > label:nth-child(5)") if err != nil { - log.Printf("ExtractTableData error: %v", err) + log.Printf("CheckRadioButton error: %v", err) return nil, nil, nil, err } - cover, err := nav.ExtractDivText("#containerDadosPrincipaisProcesso", "#maisDetalhes") + err = nav.FillField("#nuProcessoAntigoFormatado", d) if err != nil { - log.Printf("ExtractDivText error: %v", err) + log.Printf("filling field error: %v", err) return nil, nil, nil, err } - return cover, movements, people, nil + //err = nav.ClickButton("#botaoConsultarProcessos") + //if err != nil { + // log.Printf("ClickButton error: %v", err) + // return nil, nil, nil, err + //} + // + //err = nav.ClickElement("#linkmovimentacoes") + //if err != nil { + // log.Printf("ClickElement error: %v", err) + // return nil, nil, nil, err + //} + + //cover, err := nav.ExtractDivText("#containerDadosPrincipaisProcesso", "#maisDetalhes") + //if err != nil { + // log.Printf("ExtractDivText error: %v", err) + // return nil, nil, nil, err + //} + // + //people, err := nav.ExtractTableData("#tablePartesPrincipais") + //if err != nil { + // log.Printf("ExtractTableData error: %v", err) + // return nil, nil, nil, err + //} + // + //movements, err := nav.ExtractTableData("#tabelaTodasMovimentacoes") + //if err != nil { + // log.Printf("ExtractTableData error: %v", err) + // return nil, nil, nil, err + //} + + return nil, nil, nil, nil } From 2575f3cca4525d32b9049977167f2f517f4b6f66 Mon Sep 17 00:00:00 2001 From: "daniel_fillol@hotmail.com" <55287657+DanielFillol@users.noreply.github.com> Date: Tue, 4 Jun 2024 17:05:18 -0300 Subject: [PATCH 3/4] feat: add htmlquery library to improve on html extraction feat: add GetPageSource to extract html source feat: remove ExtractTableData and ExtractDivText at this stage they are less useful than they appear feat: now the crawlerFunc must be a function that returns the html and error feat: add FindNodes, ExtractText and ExtractTable --- go.mod | 12 ++- go.sum | 37 +++++++ goSpider.go | 293 ++++++++++++++++++++-------------------------------- 3 files changed, 158 insertions(+), 184 deletions(-) diff --git a/go.mod b/go.mod index 371ee94..67d04ed 100644 --- a/go.mod +++ b/go.mod @@ -3,13 +3,21 @@ module github.com/DanielFillol/goSpider go 1.20 require ( - github.com/chromedp/cdproto v0.0.0-20240524221637-55927c2a4565 // indirect - github.com/chromedp/chromedp v0.9.5 // indirect + github.com/antchfx/htmlquery v1.3.1 + github.com/chromedp/cdproto v0.0.0-20240524221637-55927c2a4565 + github.com/chromedp/chromedp v0.9.5 + golang.org/x/net v0.7.0 +) + +require ( + github.com/antchfx/xpath v1.3.0 // indirect github.com/chromedp/sysutil v1.0.0 // indirect github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect github.com/gobwas/ws v1.4.0 // indirect + github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/josharian/intern v1.0.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect golang.org/x/sys v0.20.0 // indirect + golang.org/x/text v0.7.0 // indirect ) diff --git a/go.sum b/go.sum index 
07f8b84..ff03886 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,7 @@ +github.com/antchfx/htmlquery v1.3.1 h1:wm0LxjLMsZhRHfQKKZscDf2COyH4vDYA3wyH+qZ+Ylc= +github.com/antchfx/htmlquery v1.3.1/go.mod h1:PTj+f1V2zksPlwNt7uVvZPsxpKNa7mlVliCRxLX6Nx8= +github.com/antchfx/xpath v1.3.0 h1:nTMlzGAK3IJ0bPpME2urTuFL76o4A96iYvoKFHRXJgc= +github.com/antchfx/xpath v1.3.0/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/chromedp/cdproto v0.0.0-20240202021202-6d0b6a386732/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= github.com/chromedp/cdproto v0.0.0-20240524221637-55927c2a4565 h1:sa5vT8UuQvHPysVV0o3sGJBIYp3sBZCFOowqGDE8Qwo= github.com/chromedp/cdproto v0.0.0-20240524221637-55927c2a4565/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= @@ -12,13 +16,46 @@ github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6Wezm github.com/gobwas/ws v1.3.2/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY= github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs= github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= +github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo= github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw= github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0 h1:4BRB4x83lYWy72KwLD/qYDuTu7q9PjSagHvijDw7cLo= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/goSpider.go b/goSpider.go index 66711b0..23b6f37 100644 --- a/goSpider.go +++ b/goSpider.go @@ -2,10 +2,12 @@ package goSpider import ( "context" + "errors" "fmt" - "github.com/chromedp/cdproto/cdp" + "github.com/antchfx/htmlquery" "github.com/chromedp/cdproto/page" "github.com/chromedp/chromedp" + "golang.org/x/net/html" "io/ioutil" "log" "os" @@ -146,30 +148,29 @@ func (nav *Navigator) CaptureScreenshot() error { return nil } -// GetElement retrieves the text content of an element specified by the selector. 
+// GetPageSource captures the HTML of the current page. +// Returns the parsed page as an *html.Node and an error, if any. // Example: // -// text, err := nav.GetElement("#elementID") -func (nav *Navigator) GetElement(selector string) (string, error) { - var content string - - err := nav.WaitForElement(selector, 3*time.Second) +// pageSource, err := nav.GetPageSource() +func (nav *Navigator) GetPageSource() (*html.Node, error) { + nav.Logger.Println("Getting the HTML content of the page") + var pageHTML string + err := chromedp.Run(nav.Ctx, + chromedp.OuterHTML("html", &pageHTML), + ) if err != nil { - nav.Logger.Printf("Failed waiting for element: %v\n", err) - return "", fmt.Errorf("failed waiting for element: %v", err) + nav.Logger.Printf("Failed to get page HTML: %v\n", err) + return nil, fmt.Errorf("failed to get page HTML: %v", err) } - err = chromedp.Run(nav.Ctx, - chromedp.Text(selector, &content, chromedp.ByQuery, chromedp.NodeVisible), - ) - if err != nil && err.Error() != "could not find node" { - nav.Logger.Printf("Failed to get element: %v\n", err) - return "", fmt.Errorf("failed to get element: %v", err) - } - if content == "" { - return "", nil // Element not found or empty + htmlPgSrc, err := htmlquery.Parse(strings.NewReader(pageHTML)) + if err != nil { + return nil, fmt.Errorf("failed to convert page HTML: %v", err) } - return content, nil + + //nav.Logger.Println("Page HTML retrieved successfully") + return htmlPgSrc, nil } // WaitForElement waits for an element specified by the selector to be visible within the given timeout. @@ -302,149 +303,6 @@ func (nav *Navigator) FillField(selector string, value string) error { return nil } -// ExtractTableData extracts data from a table specified by the selector. -// Example: -// -// tableData, err := nav.ExtractTableData("#tableID") -func (nav *Navigator) ExtractTableData(selector string) ([]map[int]map[string]interface{}, error) { - nav.Logger.Printf("Extracting table data with selector: %s\n", selector) - var rows []*cdp.Node - err := chromedp.Run(nav.Ctx, - chromedp.Nodes(selector+" tr", &rows, chromedp.ByQueryAll), - ) - if err != nil { - nav.Logger.Printf("Failed to extract table rows: %v\n", err) - return nil, fmt.Errorf("failed to extract table rows: %v", err) - } - - var tableData []map[int]map[string]interface{} - for _, row := range rows { - // nav.Logger.Printf("Processing row %d", rowIndex) - var cells []*cdp.Node - err = chromedp.Run(nav.Ctx, - chromedp.Nodes("td, th", &cells, chromedp.ByQueryAll, chromedp.FromNode(row)), - ) - if err != nil { - nav.Logger.Printf("Failed to extract table cells: %v\n", err) - return nil, fmt.Errorf("failed to extract table cells: %v", err) - } - - rowData := make(map[int]map[string]interface{}) - for cellIndex, cell := range cells { - // nav.Logger.Printf("Processing cell %d in row %d", cellIndex, rowIndex) - cellData := make(map[string]interface{}) - - var cellText string - err = chromedp.Run(nav.Ctx, - chromedp.Text(cell.FullXPath(), &cellText, chromedp.NodeVisible), - ) - if err != nil { - nav.Logger.Printf("Failed to get cell text: %v\n", err) - return nil, fmt.Errorf("failed to get cell text: %v", err) - } - cellData["text"] = cellText - - // Check for any nested spans within the cell - var nestedSpans []*cdp.Node - nestedSpansErr := chromedp.Run(nav.Ctx, - chromedp.Nodes(cell.FullXPath()+"//span", &nestedSpans, chromedp.ByQueryAll), - ) - if nestedSpansErr != nil { - // nav.Logger.Printf("No nested spans found in cell %d of row %d: %v\n", cellIndex, rowIndex, nestedSpansErr) - // No
nested spans found, continue processing - nestedSpans = []*cdp.Node{} - } - - spanData := make(map[int]string) - for spanIndex, span := range nestedSpans { - // nav.Logger.Printf("Processing span %d in cell %d of row %d", spanIndex, cellIndex, rowIndex) - var spanText string - err = chromedp.Run(nav.Ctx, - chromedp.Text(span.FullXPath(), &spanText, chromedp.NodeVisible), - ) - if err != nil { - nav.Logger.Printf("Failed to get span text: %v\n", err) - return nil, fmt.Errorf("failed to get span text: %v", err) - } - spanData[spanIndex] = spanText - } - - if len(spanData) > 0 { - cellData["spans"] = spanData - } - - rowData[cellIndex] = cellData - } - tableData = append(tableData, rowData) - } - // nav.Logger.Println("Table data extracted successfully") - return tableData, nil -} - -// ExtractDivText extracts text content from divs specified by the parent selectors. -// Example: -// -// textData, err := nav.ExtractDivText("#parent1", "#parent2") -func (nav *Navigator) ExtractDivText(parentSelectors ...string) (map[string]string, error) { - nav.Logger.Println("Extracting text from divs") - data := make(map[string]string) - for _, parentSelector := range parentSelectors { - var nodes []*cdp.Node - err := chromedp.Run(nav.Ctx, - chromedp.Nodes(parentSelector+" span, "+parentSelector+" div", &nodes, chromedp.ByQueryAll), - ) - if err != nil { - nav.Logger.Printf("Failed to extract nodes from %s: %v\n", parentSelector, err) - return nil, fmt.Errorf("failed to extract nodes from %s: %v", parentSelector, err) - } - for _, node := range nodes { - if node.NodeType == cdp.NodeTypeText { - continue - } - var text string - err = chromedp.Run(nav.Ctx, - chromedp.TextContent(node.FullXPath(), &text), - ) - if err != nil { - nav.Logger.Printf("Failed to extract text content from %s: %v\n", node.FullXPath(), err) - return nil, fmt.Errorf("failed to extract text content from %s: %v", node.FullXPath(), err) - } - data[node.AttributeValue("id")] = strings.TrimSpace(text) - } - } - // nav.Logger.Println("Text extracted successfully from divs") - return data, nil -} - -// Close closes the Navigator instance and releases resources. -// Example: -// -// nav.Close() -func (nav *Navigator) Close() { - // nav.Logger.Println("Closing the Navigator instance") - nav.Cancel() - nav.Logger.Println("Navigator instance closed successfully") -} - -// FetchHTML fetches the HTML content of the specified URL. -// Example: -// -// htmlContent, err := nav.FetchHTML("https://www.example.com") -func (nav *Navigator) FetchHTML(url string) (string, error) { - nav.Logger.Printf("Fetching HTML content from URL: %s\n", url) - var htmlContent string - err := chromedp.Run(nav.Ctx, - chromedp.Navigate(url), - chromedp.OuterHTML("html", &htmlContent), - ) - if err != nil { - nav.Logger.Printf("Failed to fetch URL: %v\n", err) - return "", fmt.Errorf("failed to fetch URL: %v", err) - } - nav.Logger.Println("HTML content fetched successfully") - return htmlContent, nil -} - // ExtractLinks extracts all links from the current page. // Example: // @@ -541,17 +399,51 @@ func (nav *Navigator) SelectDropdown(selector, value string) error { return nil } +// Close closes the Navigator instance and releases resources. +// Example: +// +// nav.Close() +func (nav *Navigator) Close() { + // nav.Logger.Println("Closing the Navigator instance") + nav.Cancel() + nav.Logger.Println("Navigator instance closed successfully") +} + +// GetElement retrieves the text content of an element specified by the selector. 
+// Example: +// +// text, err := nav.GetElement("#elementID") +func (nav *Navigator) GetElement(selector string) (string, error) { + var content string + + err := nav.WaitForElement(selector, 3*time.Second) + if err != nil { + nav.Logger.Printf("Failed waiting for element: %v\n", err) + return "", fmt.Errorf("failed waiting for element: %v", err) + } + + err = chromedp.Run(nav.Ctx, + chromedp.Text(selector, &content, chromedp.ByQuery, chromedp.NodeVisible), + ) + if err != nil && err.Error() != "could not find node" { + nav.Logger.Printf("Failed to get element: %v\n", err) + return "", fmt.Errorf("failed to get element: %v", err) + } + if content == "" { + return "", nil // Element not found or empty + } + return content, nil +} + // Requests structure to hold user data type Requests struct { - ProcessNumber string + SearchString string } -// ResponseBody structure to hold response data -type ResponseBody struct { - Cover map[string]string - Movements []map[int]map[string]interface{} - People []map[int]map[string]interface{} - Error error +// PageSource structure to hold the HTML data +type PageSource struct { + Page *html.Node + Error error } // ParallelRequests performs web scraping tasks concurrently with a specified number of workers and a delay between requests. @@ -561,7 +453,7 @@ type ResponseBody struct { // - requests: A slice of Requests structures containing the data needed for each request. // - numberOfWorkers: The number of concurrent workers to process the requests. // - delay: The delay duration between each request to avoid overwhelming the target server. -// - crawlerFunc: A user-defined function that takes a process number as input and returns cover data, movements, people, and an error. +// - crawlerFunc: A user-defined function that takes a search string as input and returns the page HTML as an *html.Node, and an error. // // Returns: // - A slice of PageSource structures containing the results of the web scraping tasks.
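// Under the new signature, a crawlerFunc only has to fetch a page and hand back its parsed source.
// A minimal sketch (illustrative only; the URL is a placeholder, not part of this patch):
//
//	crawlerFunc := func(searchString string) (*html.Node, error) {
//		nav := NewNavigator()
//		defer nav.Close()
//		if err := nav.OpenURL("https://example.com/?q=" + searchString); err != nil {
//			return nil, err
//		}
//		return nav.GetPageSource()
//	}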
@@ -570,12 +462,12 @@ type ResponseBody struct { // Example Usage: // // results, err := ParallelRequests(requests, numberOfWorkers, delay, crawlerFunc) -func ParallelRequests(requests []Requests, numberOfWorkers int, delay time.Duration, crawlerFunc func(string) (map[string]string, []map[int]map[string]interface{}, []map[int]map[string]interface{}, error)) ([]ResponseBody, error) { +func ParallelRequests(requests []Requests, numberOfWorkers int, delay time.Duration, crawlerFunc func(string) (*html.Node, error)) ([]PageSource, error) { done := make(chan struct{}) defer close(done) inputCh := streamInputs(done, requests) - resultCh := make(chan ResponseBody, len(requests)) // Buffered channel to hold all results + resultCh := make(chan PageSource, len(requests)) // Buffered channel to hold all results var wg sync.WaitGroup @@ -585,14 +477,12 @@ func ParallelRequests(requests []Requests, numberOfWorkers int, delay time.Durat go func(workerID int) { defer wg.Done() for req := range inputCh { - log.Printf("Worker %d processing request: %s", workerID, req.ProcessNumber) + log.Printf("Worker %d processing request: %s", workerID, req.SearchString) time.Sleep(delay) - cover, movements, people, err := crawlerFunc(req.ProcessNumber) - resultCh <- ResponseBody{ - Cover: cover, - Movements: movements, - People: people, - Error: err, + pageSource, err := crawlerFunc(req.SearchString) + resultCh <- PageSource{ + Page: pageSource, + Error: err, } } }(i) @@ -605,7 +495,7 @@ func ParallelRequests(requests []Requests, numberOfWorkers int, delay time.Durat }() // Collect results from the result channel - var results []ResponseBody + var results []PageSource var errorOnApiRequests error for result := range resultCh { @@ -645,6 +535,45 @@ func streamInputs(done <-chan struct{}, requests []Requests) <-chan Requests { return inputCh } -func main() { - // Example usage +// ExtractTable extracts the row nodes of a table specified by the XPath rows expression. +// Example: +// +// tableRows, err := goSpider.ExtractTable(pageSource, "//*[@id='tableID']/tbody/tr") +func ExtractTable(pageSource *html.Node, tableRowsExpression string) ([]*html.Node, error) { + log.Printf("Extracting table data with selector: %s\n", tableRowsExpression) + rows := htmlquery.Find(pageSource, tableRowsExpression) + if len(rows) > 0 { + return rows, nil + } + // log.Printf("Table data extracted successfully") + return nil, errors.New("could not find any table rows") +} + +// ExtractText extracts the text content of the first node matching the XPath expression, stripping the given dirt string. +// Example: +// +// textData, err := goSpider.ExtractText(pageSource, "//*[@id='parent1']", "\n") +func ExtractText(node *html.Node, nodeExpression string, dirt string) (string, error) { + //log.Print("Extracting text from node") + var text string + tt := htmlquery.Find(node, nodeExpression) + if len(tt) > 0 { + text = strings.TrimSpace(strings.Replace(htmlquery.InnerText(htmlquery.FindOne(node, nodeExpression)), dirt, "", -1)) + return text, nil + } + + //log.Printf("Text %v extracted successfully from node", nodeExpression) + return "", errors.New("could not find specified text") +} + +// FindNodes returns all nodes matching the given XPath expression.
+// Example: +// +// nodes, err := goSpider.FindNodes(pageSource, "//*[@id='parent1']") +func FindNodes(node *html.Node, nodeExpression string) ([]*html.Node, error) { + n := htmlquery.Find(node, nodeExpression) + if len(n) > 0 { + return n, nil + } + return nil, errors.New("could not find specified node") } From 5424196139ed78e3ab2605d60e0d4c1b6e0f3437 Mon Sep 17 00:00:00 2001 From: "daniel_fillol@hotmail.com" <55287657+DanielFillol@users.noreply.github.com> Date: Tue, 4 Jun 2024 17:06:03 -0300 Subject: [PATCH 4/4] feat: update tests to use GetPageSource, FindNodes, ExtractText and ExtractTable feat: add a full crawler to be tested --- goSpider_test.go | 571 ++++++++++++++++++++++++----------------------- 1 file changed, 289 insertions(+), 282 deletions(-) diff --git a/goSpider_test.go b/goSpider_test.go index d875c0f..77295ed 100644 --- a/goSpider_test.go +++ b/goSpider_test.go @@ -1,9 +1,14 @@ package goSpider import ( + "errors" + "fmt" + "golang.org/x/net/html" "log" "net/http" "os" + "strconv" + "strings" "testing" "time" ) @@ -37,11 +42,12 @@ func TestMain(m *testing.M) { // TestFetchHTML tests fetching the HTML content from a URL func TestFetchHTML(t *testing.T) { - htmlContent, err := nav.FetchHTML("http://localhost:8080") + err := nav.OpenURL("https://www.google.com") + if err != nil { + t.Errorf("OpenURL error: %v", err) + } + htmlContent, err := nav.GetPageSource() if err != nil { t.Errorf("FetchHTML error: %v", err) } - if htmlContent == "" { + if htmlContent == nil { t.Error("FetchHTML returned empty content") } } @@ -116,12 +122,9 @@ func TestWaitForElement(t *testing.T) { // TestGetCurrentURL tests extracting the current URL from the browser func TestGetCurrentURL(t *testing.T) { // Navigate to the main page - htmlContent, err := nav.FetchHTML("http://localhost:8080") + err := nav.OpenURL("http://localhost:8080") if err != nil { - t.Errorf("FetchHTML error: %v", err) - } - if htmlContent == "" { - t.Error("FetchHTML returned empty content") + t.Errorf("OpenURL error: %v", err) } // Extract and verify the current URL @@ -155,62 +158,51 @@ func TestGetCurrentURL(t *testing.T) { } } -func TestParallelRequests1(t *testing.T) { +func TestParallelRequests(t *testing.T) { users := []Requests{ - {ProcessNumber: "1017927-35.2023.8.26.0008"}, - {ProcessNumber: "0002396-75.2013.8.26.0201"}, - {ProcessNumber: "1551285-50.2021.8.26.0477"}, - {ProcessNumber: "0015386-82.2013.8.26.0562"}, - {ProcessNumber: "0007324-95.2015.8.26.0590"}, - {ProcessNumber: "1545639-85.2023.8.26.0090"}, - {ProcessNumber: "1557599-09.2021.8.26.0090"}, - {ProcessNumber: "1045142-72.2021.8.26.0002"}, - {ProcessNumber: "0208591-43.2009.8.26.0004"}, - {ProcessNumber: "1024511-70.2022.8.26.0003"}, + {SearchString: "1017927-35.2023.8.26.0008"}, + {SearchString: "0002396-75.2013.8.26.0201"}, + {SearchString: "1551285-50.2021.8.26.0477"}, + {SearchString: "0015386-82.2013.8.26.0562"}, + {SearchString: "0007324-95.2015.8.26.0590"}, + {SearchString: "1545639-85.2023.8.26.0090"}, + {SearchString: "1557599-09.2021.8.26.0090"}, + {SearchString: "1045142-72.2021.8.26.0002"}, + {SearchString: "0208591-43.2009.8.26.0004"}, + {SearchString: "1024511-70.2022.8.26.0003"}, } - numberOfWorkers := 1 + numberOfWorkers := 10 duration := 0 * time.Millisecond results, err := ParallelRequests(users, numberOfWorkers, duration, Crawler) if err != nil { - t.Errorf("Expected %d results, but got %d, List results: %v", len(users), 0, len(results)) + log.Printf("ParallelRequests error: %v", err) } - log.Println("Finish Parallel Requests!") + if len(results) != len(users) { + t.Errorf("Expected %d
results, but got %d, List results: %v, error: %v", len(users), 0, len(results), err) + } - //var found []string - //for _, u := range users { - // for _, result := range results { - // for _, value := range result.Cover { - // if value == u.ProcessNumber { - // found = append(found, value) - // } - // } - // } - //} - // - //if len(found) != len(users) { - // - //} + log.Println("Finish Parallel Requests!") } -func TestParallelRequests2(t *testing.T) { +func TestRequestsDataStruct(t *testing.T) { users := []Requests{ - {ProcessNumber: "1017927-35.2023.8.26.0008"}, - {ProcessNumber: "0002396-75.2013.8.26.0201"}, - {ProcessNumber: "1551285-50.2021.8.26.0477"}, - {ProcessNumber: "0015386-82.2013.8.26.0562"}, - {ProcessNumber: "0007324-95.2015.8.26.0590"}, - {ProcessNumber: "1545639-85.2023.8.26.0090"}, - {ProcessNumber: "1557599-09.2021.8.26.0090"}, - {ProcessNumber: "1045142-72.2021.8.26.0002"}, - {ProcessNumber: "0208591-43.2009.8.26.0004"}, - {ProcessNumber: "1024511-70.2022.8.26.0003"}, - } - - numberOfWorkers := 2 + {SearchString: "1017927-35.2023.8.26.0008"}, + {SearchString: "0002396-75.2013.8.26.0201"}, + {SearchString: "1551285-50.2021.8.26.0477"}, + {SearchString: "0015386-82.2013.8.26.0562"}, + {SearchString: "0007324-95.2015.8.26.0590"}, + {SearchString: "1545639-85.2023.8.26.0090"}, + {SearchString: "1557599-09.2021.8.26.0090"}, + {SearchString: "1045142-72.2021.8.26.0002"}, + {SearchString: "0208591-43.2009.8.26.0004"}, + {SearchString: "1024511-70.2022.8.26.0003"}, + } + + numberOfWorkers := 1 duration := 0 * time.Millisecond results, err := ParallelRequests(users, numberOfWorkers, duration, Crawler) @@ -220,279 +212,294 @@ func TestParallelRequests2(t *testing.T) { log.Println("Finish Parallel Requests!") - //var found []string - //for _, u := range users { - // for _, result := range results { - // for _, value := range result.Cover { - // if value == u.ProcessNumber { - // found = append(found, value) - // } - // } - // } - //} - // - //if len(found) != len(users) { - // - //} + type Lawsuit struct { + Cover Cover + Persons []Person + Movements []Movement + } + var lawsuits []Lawsuit + for _, result := range results { + // Cover + c, err := extractDataCover(result.Page, "//*[@id=\"numeroProcesso\"]", "//*[@id=\"labelSituacaoProcesso\"]", "//*[@id=\"classeProcesso\"]", "//*[@id=\"assuntoProcesso\"]", "//*[@id=\"foroProcesso\"]", "//*[@id=\"varaProcesso\"]", "//*[@id=\"juizProcesso\"]", "//*[@id=\"dataHoraDistribuicaoProcesso\"]", "//*[@id=\"numeroControleProcesso\"]", "//*[@id=\"areaProcesso\"]/span", "//*[@id=\"valorAcaoProcesso\"]") + if err != nil { + t.Errorf("ExtractDataCover error: %v", err) + } + // Persons + p, err := extractDataPerson(result.Page, "//*[@id=\"tableTodasPartes\"]/tbody/tr", "td[1]/span", "td[2]/text()", "\n") + if err != nil { + p, err = extractDataPerson(result.Page, "//*[@id=\"tablePartesPrincipais\"]/tbody/tr", "td[1]/text()", "td[2]/text()", "\n") + if err != nil { + t.Errorf("Expected some person but got none: %v", err.Error()) + } + } + // Movements + m, err := extractDataMovement(result.Page, "//*[@id=\"tabelaTodasMovimentacoes\"]/tr", "\n") + if err != nil { + t.Errorf("Expected some movement but got none: %v", err.Error()) + } + + lawsuits = append(lawsuits, Lawsuit{ + Cover: c, + Persons: p, + Movements: m, + }) + } + + if len(lawsuits) != len(users) { + t.Errorf("Expected %d lawsuits, but got %d", len(users), len(lawsuits)) + } + + fmt.Println(lawsuits) } -func TestParallelRequests4(t *testing.T) { - users := []Requests{ - {ProcessNumber: 
"1017927-35.2023.8.26.0008"}, - {ProcessNumber: "0002396-75.2013.8.26.0201"}, - {ProcessNumber: "1551285-50.2021.8.26.0477"}, - {ProcessNumber: "0015386-82.2013.8.26.0562"}, - {ProcessNumber: "0007324-95.2015.8.26.0590"}, - {ProcessNumber: "1545639-85.2023.8.26.0090"}, - {ProcessNumber: "1557599-09.2021.8.26.0090"}, - {ProcessNumber: "1045142-72.2021.8.26.0002"}, - {ProcessNumber: "0208591-43.2009.8.26.0004"}, - {ProcessNumber: "1024511-70.2022.8.26.0003"}, - } - - numberOfWorkers := 4 - duration := 0 * time.Millisecond +func Crawler(d string) (*html.Node, error) { + url := "https://esaj.tjsp.jus.br/cpopg/open.do" + nav := NewNavigator() - results, err := ParallelRequests(users, numberOfWorkers, duration, Crawler) + err := nav.OpenURL(url) if err != nil { - t.Errorf("Expected %d results, but got %d, List results: %v", len(users), 0, len(results)) + log.Printf("OpenURL error: %v", err) + return nil, err } - log.Println("Finish Parallel Requests!") - - //var found []string - //for _, u := range users { - // for _, result := range results { - // for _, value := range result.Cover { - // if value == u.ProcessNumber { - // found = append(found, value) - // } - // } - // } - //} - // - //if len(found) != len(users) { - // - //} - -} + err = nav.CheckRadioButton("#interna_NUMPROC > div > fieldset > label:nth-child(5)") + if err != nil { + log.Printf("CheckRadioButton error: %v", err) + return nil, err + } -func TestParallelRequests5(t *testing.T) { - users := []Requests{ - {ProcessNumber: "1017927-35.2023.8.26.0008"}, - {ProcessNumber: "0002396-75.2013.8.26.0201"}, - {ProcessNumber: "1551285-50.2021.8.26.0477"}, - {ProcessNumber: "0015386-82.2013.8.26.0562"}, - {ProcessNumber: "0007324-95.2015.8.26.0590"}, - {ProcessNumber: "1545639-85.2023.8.26.0090"}, - {ProcessNumber: "1557599-09.2021.8.26.0090"}, - {ProcessNumber: "1045142-72.2021.8.26.0002"}, - {ProcessNumber: "0208591-43.2009.8.26.0004"}, - {ProcessNumber: "1024511-70.2022.8.26.0003"}, - } - - numberOfWorkers := 5 - duration := 0 * time.Millisecond + err = nav.FillField("#nuProcessoAntigoFormatado", d) + if err != nil { + log.Printf("filling field error: %v", err) + return nil, err + } - results, err := ParallelRequests(users, numberOfWorkers, duration, Crawler) + err = nav.ClickButton("#botaoConsultarProcessos") if err != nil { - t.Errorf("Expected %d results, but got %d, List results: %v", len(users), 0, len(results)) + log.Printf("ClickButton error: %v", err) + return nil, err } - log.Println("Finish Parallel Requests!") + err = nav.WaitForElement("#tabelaUltimasMovimentacoes > tr:nth-child(1) > td.dataMovimentacao", 15*time.Second) + if err != nil { + log.Printf("WaitForElement error: %v", err) + return nil, err + } - //var found []string - //for _, u := range users { - // for _, result := range results { - // for _, value := range result.Cover { - // if value == u.ProcessNumber { - // found = append(found, value) - // } - // } - // } - //} - // - //if len(found) != len(users) { - // - //} + pageSource, err := nav.GetPageSource() + if err != nil { + log.Printf("GetPageSource error: %v", err) + return nil, err + } + return pageSource, nil } -func TestParallelRequests8(t *testing.T) { - users := []Requests{ - {ProcessNumber: "1017927-35.2023.8.26.0008"}, - {ProcessNumber: "0002396-75.2013.8.26.0201"}, - {ProcessNumber: "1551285-50.2021.8.26.0477"}, - {ProcessNumber: "0015386-82.2013.8.26.0562"}, - {ProcessNumber: "0007324-95.2015.8.26.0590"}, - {ProcessNumber: "1545639-85.2023.8.26.0090"}, - {ProcessNumber: "1557599-09.2021.8.26.0090"}, - 
{ProcessNumber: "1045142-72.2021.8.26.0002"}, - {ProcessNumber: "0208591-43.2009.8.26.0004"}, - {ProcessNumber: "1024511-70.2022.8.26.0003"}, - } - - numberOfWorkers := 8 - duration := 0 * time.Millisecond +type Cover struct { + Title string + Tag string + Class string + Subject string + Location string + Unit string + Judge string + InitialDate string + Control string + Field string + Value string + Error string +} - results, err := ParallelRequests(users, numberOfWorkers, duration, Crawler) +func extractDataCover(pageSource *html.Node, xpathTitle string, xpathTag string, xpathClass string, xpathSubject string, xpathLocation string, xpathUnit string, xpathJudge string, xpathInitDate string, xpathControl string, xpathField string, xpathValue string) (Cover, error) { + var i int //count errors + title, err := ExtractText(pageSource, xpathTitle, " ") if err != nil { - t.Errorf("Expected %d results, but got %d, List results: %v", len(users), 0, len(results)) + log.Println("error extracting title") } - log.Println("Finish Parallel Requests!") - - //var found []string - //for _, u := range users { - // for _, result := range results { - // for _, value := range result.Cover { - // if value == u.ProcessNumber { - // found = append(found, value) - // } - // } - // } - //} - // - //if len(found) != len(users) { - // - //} - -} - -func TestParallelRequests10(t *testing.T) { - users := []Requests{ - {ProcessNumber: "1017927-35.2023.8.26.0008"}, - {ProcessNumber: "0002396-75.2013.8.26.0201"}, - {ProcessNumber: "1551285-50.2021.8.26.0477"}, - {ProcessNumber: "0015386-82.2013.8.26.0562"}, - {ProcessNumber: "0007324-95.2015.8.26.0590"}, - {ProcessNumber: "1545639-85.2023.8.26.0090"}, - {ProcessNumber: "1557599-09.2021.8.26.0090"}, - {ProcessNumber: "1045142-72.2021.8.26.0002"}, - {ProcessNumber: "0208591-43.2009.8.26.0004"}, - {ProcessNumber: "1024511-70.2022.8.26.0003"}, + tag, err := ExtractText(pageSource, xpathTag, "") + if err != nil { + i++ + log.Println("error extracting tag") } - numberOfWorkers := 10 - duration := 0 * time.Millisecond + class, err := ExtractText(pageSource, xpathClass, "") + if err != nil { + i++ + log.Println("error extracting class") + } - results, err := ParallelRequests(users, numberOfWorkers, duration, Crawler) + subject, err := ExtractText(pageSource, xpathSubject, "") if err != nil { - t.Errorf("Expected %d results, but got %d, List results: %v", len(users), 0, len(results)) + i++ + log.Println("error extracting subject") } - log.Println("Finish Parallel Requests!") + location, err := ExtractText(pageSource, xpathLocation, "") + if err != nil { + i++ + log.Println("error extracting location") + } - //var found []string - //for _, u := range users { - // for _, result := range results { - // for _, value := range result.Cover { - // if value == u.ProcessNumber { - // found = append(found, value) - // } - // } - // } - //} - // - //if len(found) != len(users) { - // - //} + unit, err := ExtractText(pageSource, xpathUnit, "") + if err != nil { + i++ + log.Println("error extracting unit") + } -} + judge, err := ExtractText(pageSource, xpathJudge, "") + if err != nil { + i++ + log.Println("error extracting existJudge") + } -func TestParallelRequests11(t *testing.T) { - users := []Requests{ - {ProcessNumber: "1017927-35.2023.8.26.0008"}, - {ProcessNumber: "0002396-75.2013.8.26.0201"}, - {ProcessNumber: "1551285-50.2021.8.26.0477"}, - {ProcessNumber: "0015386-82.2013.8.26.0562"}, - {ProcessNumber: "0007324-95.2015.8.26.0590"}, - {ProcessNumber: "1545639-85.2023.8.26.0090"}, - 
{ProcessNumber: "1557599-09.2021.8.26.0090"}, - {ProcessNumber: "1045142-72.2021.8.26.0002"}, - {ProcessNumber: "0208591-43.2009.8.26.0004"}, - {ProcessNumber: "1024511-70.2022.8.26.0003"}, - } - - numberOfWorkers := 11 - duration := 0 * time.Millisecond + initDate, err := ExtractText(pageSource, xpathInitDate, "") + if err != nil { + i++ + log.Println("error extracting initDate") + } - results, err := ParallelRequests(users, numberOfWorkers, duration, Crawler) + control, err := ExtractText(pageSource, xpathControl, "") if err != nil { - t.Errorf("Expected %d results, but got %d, List results: %v", len(users), 0, len(results)) + i++ + log.Println("error extracting control") } - log.Println("Finish Parallel Requests!") + field, err := ExtractText(pageSource, xpathField, "") + if err != nil { + log.Println("error extracting field") + } - //var found []string - //for _, u := range users { - // for _, result := range results { - // for _, value := range result.Cover { - // if value == u.ProcessNumber { - // found = append(found, value) - // } - // } - // } - //} - // - //if len(found) != len(users) { - // - //} + value, err := ExtractText(pageSource, xpathValue, "R$ ") + if err != nil { + i++ + log.Println("error extracting field value") + } + var e string + if err != nil { + e = err.Error() + } + + if i >= 5 { + return Cover{}, fmt.Errorf("too many errors: %d", i) + } + + return Cover{ + Title: title, + Tag: tag, + Class: class, + Subject: subject, + Location: location, + Unit: unit, + Judge: judge, + InitialDate: initDate, + Control: control, + Field: field, + Value: value, + Error: e, + }, nil } -func Crawler(d string) (map[string]string, []map[int]map[string]interface{}, []map[int]map[string]interface{}, error) { - url := "https://esaj.tjsp.jus.br/cpopg/open.do" - nav := NewNavigator() +type Person struct { + Pole string + Name string + Lawyers []string +} - err := nav.OpenURL(url) +func extractDataPerson(pageSource *html.Node, xpathPeople string, xpathPole string, xpathLawyer string, dirt string) ([]Person, error) { + Pole, err := FindNodes(pageSource, xpathPeople) if err != nil { - log.Printf("OpenURL error: %v", err) - return nil, nil, nil, err - } + return nil, err + } + + var personas []Person + for i, person := range Pole { + pole, err := ExtractText(person, xpathPole, dirt) + if err != nil { + return nil, errors.New("error extract data person, pole not found: " + err.Error()) + } + + var name string + _, err = FindNodes(person, xpathPeople+"["+strconv.Itoa(i)+"]/td[2]") + if err != nil { + name, err = ExtractText(person, "td[2]/text()", dirt) + if err != nil { + return nil, errors.New("error extract data person, name not found: " + err.Error()) + } + } else { + name, err = ExtractText(person, "td[2]/text()["+strconv.Itoa(1)+"]", dirt) + if err != nil { + return nil, errors.New("error extract data person, name not found: " + err.Error()) + } + } + + var lawyers []string + ll, err := FindNodes(person, xpathLawyer) + if err != nil { + lawyers = append(lawyers, "no lawyer found") + } + for j, _ := range ll { + n, err := ExtractText(person, "td[2]/text()["+strconv.Itoa(j+1)+"]", dirt) + if err != nil { + return nil, errors.New("error extract data person, lawyer not found: " + err.Error()) + } + lawyers = append(lawyers, n) + } + + p := Person{ + Pole: pole, + Name: name, + Lawyers: lawyers, + } + + personas = append(personas, p) + } + + return personas, nil +} - err = nav.CheckRadioButton("#interna_NUMPROC > div > fieldset > label:nth-child(5)") - if err != nil { - 
log.Printf("CheckRadioButton error: %v", err) - return nil, nil, nil, err - } +type Movement struct { + Date string + Title string + Text string +} - err = nav.FillField("#nuProcessoAntigoFormatado", d) +func extractDataMovement(pageSource *html.Node, node string, dirt string) ([]Movement, error) { + xpathTable := node + + tableRows, err := ExtractTable(pageSource, xpathTable) if err != nil { - log.Printf("filling field error: %v", err) - return nil, nil, nil, err - } - - //err = nav.ClickButton("#botaoConsultarProcessos") - //if err != nil { - // log.Printf("ClickButton error: %v", err) - // return nil, nil, nil, err - //} - // - //err = nav.ClickElement("#linkmovimentacoes") - //if err != nil { - // log.Printf("ClickElement error: %v", err) - // return nil, nil, nil, err - //} - - //cover, err := nav.ExtractDivText("#containerDadosPrincipaisProcesso", "#maisDetalhes") - //if err != nil { - // log.Printf("ExtractDivText error: %v", err) - // return nil, nil, nil, err - //} - // - //people, err := nav.ExtractTableData("#tablePartesPrincipais") - //if err != nil { - // log.Printf("ExtractTableData error: %v", err) - // return nil, nil, nil, err - //} - // - //movements, err := nav.ExtractTableData("#tabelaTodasMovimentacoes") - //if err != nil { - // log.Printf("ExtractTableData error: %v", err) - // return nil, nil, nil, err - //} - - return nil, nil, nil, nil + return nil, err + } + + if len(tableRows) > 0 { + var allMovements []Movement + for _, row := range tableRows { + date, err := ExtractText(row, "td[1]", dirt) + if err != nil { + return nil, errors.New("error extracting table date: " + err.Error()) + } + title, err := ExtractText(row, "td[3]", dirt) + if err != nil { + return nil, errors.New("error extracting table title: " + err.Error()) + } + text, err := ExtractText(row, "td[3]/span", dirt) + if err != nil { + return nil, errors.New("error extracting table text: " + err.Error()) + } + + mv := Movement{ + Date: strings.ReplaceAll(date, "\t", ""), + Title: strings.ReplaceAll(strings.ReplaceAll(title, text, ""), dirt, ""), + Text: strings.TrimSpace(strings.ReplaceAll(text, "\t", "")), + } + + allMovements = append(allMovements, mv) + } + return allMovements, nil + } + + return nil, errors.New("error table: could not find any movements") }