diff --git a/go.mod b/go.mod
index 371ee94..67d04ed 100644
--- a/go.mod
+++ b/go.mod
@@ -3,13 +3,21 @@ module github.com/DanielFillol/goSpider
 go 1.20
 
 require (
-	github.com/chromedp/cdproto v0.0.0-20240524221637-55927c2a4565 // indirect
-	github.com/chromedp/chromedp v0.9.5 // indirect
+	github.com/antchfx/htmlquery v1.3.1
+	github.com/chromedp/cdproto v0.0.0-20240524221637-55927c2a4565
+	github.com/chromedp/chromedp v0.9.5
+	golang.org/x/net v0.7.0
+)
+
+require (
+	github.com/antchfx/xpath v1.3.0 // indirect
 	github.com/chromedp/sysutil v1.0.0 // indirect
 	github.com/gobwas/httphead v0.1.0 // indirect
 	github.com/gobwas/pool v0.2.1 // indirect
 	github.com/gobwas/ws v1.4.0 // indirect
+	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/mailru/easyjson v0.7.7 // indirect
 	golang.org/x/sys v0.20.0 // indirect
+	golang.org/x/text v0.7.0 // indirect
 )
diff --git a/go.sum b/go.sum
index 07f8b84..ff03886 100644
--- a/go.sum
+++ b/go.sum
@@ -1,3 +1,7 @@
+github.com/antchfx/htmlquery v1.3.1 h1:wm0LxjLMsZhRHfQKKZscDf2COyH4vDYA3wyH+qZ+Ylc=
+github.com/antchfx/htmlquery v1.3.1/go.mod h1:PTj+f1V2zksPlwNt7uVvZPsxpKNa7mlVliCRxLX6Nx8=
+github.com/antchfx/xpath v1.3.0 h1:nTMlzGAK3IJ0bPpME2urTuFL76o4A96iYvoKFHRXJgc=
+github.com/antchfx/xpath v1.3.0/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
 github.com/chromedp/cdproto v0.0.0-20240202021202-6d0b6a386732/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
 github.com/chromedp/cdproto v0.0.0-20240524221637-55927c2a4565 h1:sa5vT8UuQvHPysVV0o3sGJBIYp3sBZCFOowqGDE8Qwo=
 github.com/chromedp/cdproto v0.0.0-20240524221637-55927c2a4565/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
@@ -12,13 +16,46 @@ github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6Wezm
 github.com/gobwas/ws v1.3.2/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY=
 github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
 github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
+github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
+github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
 github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
 github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
+github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo=
 github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
 github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
 github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
+github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw=
 github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0=
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g=
+golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
 golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y=
 golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.7.0 h1:4BRB4x83lYWy72KwLD/qYDuTu7q9PjSagHvijDw7cLo=
+golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
diff --git a/goSpider.go b/goSpider.go
index a75eb0c..23b6f37 100644
--- a/goSpider.go
+++ b/goSpider.go
@@ -2,10 +2,12 @@ package goSpider
 
 import (
 	"context"
+	"errors"
 	"fmt"
-	"github.com/chromedp/cdproto/cdp"
+	"github.com/antchfx/htmlquery"
 	"github.com/chromedp/cdproto/page"
 	"github.com/chromedp/chromedp"
+	"golang.org/x/net/html"
 	"io/ioutil"
 	"log"
 	"os"
@@ -21,19 +23,6 @@ type Navigator struct {
 	Logger *log.Logger
 }
 
-// Requests structure to hold user data
-type Requests struct {
-	ProcessNumber string
-}
-
-// ResponseBody structure to hold response data
-type ResponseBody struct {
-	Cover     map[string]string
-	Movements []map[int]map[string]interface{}
-	People    []map[int]map[string]interface{}
-	Error     error
-}
-
 // NewNavigator creates a new Navigator instance.
 // Example:
 //
@@ -159,30 +148,29 @@ func (nav *Navigator) CaptureScreenshot() error {
 	return nil
 }
 
-// GetElement retrieves the text content of an element specified by the selector.
+// GetPageSource captures the full HTML of the current page.
+// Returns the parsed page as an *html.Node and an error, if any.
 // Example:
 //
-//	text, err := nav.GetElement("#elementID")
-func (nav *Navigator) GetElement(selector string) (string, error) {
-	var content string
-
-	err := nav.WaitForElement(selector, 3*time.Second)
+//	pageSource, err := nav.GetPageSource()
+func (nav *Navigator) GetPageSource() (*html.Node, error) {
+	nav.Logger.Println("Getting the HTML content of the page")
+	var pageHTML string
+	err := chromedp.Run(nav.Ctx,
+		chromedp.OuterHTML("html", &pageHTML),
+	)
 	if err != nil {
-		nav.Logger.Printf("Failed waiting for element: %v\n", err)
-		return "", fmt.Errorf("failed waiting for element: %v", err)
+		nav.Logger.Printf("Failed to get page HTML: %v\n", err)
+		return nil, fmt.Errorf("failed to get page HTML: %v", err)
 	}
 
-	err = chromedp.Run(nav.Ctx,
-		chromedp.Text(selector, &content, chromedp.ByQuery, chromedp.NodeVisible),
-	)
-	if err != nil && err.Error() != "could not find node" {
-		nav.Logger.Printf("Failed to get element: %v\n", err)
-		return "", fmt.Errorf("failed to get element: %v", err)
-	}
-	if content == "" {
-		return "", nil // Element not found or empty
+	htmlPgSrc, err := htmlquery.Parse(strings.NewReader(pageHTML))
+	if err != nil {
+		return nil, fmt.Errorf("failed to convert page HTML: %v", err)
 	}
-	return content, nil
+
+	//nav.Logger.Println("Page HTML retrieved successfully")
+	return htmlPgSrc, nil
 }
 
 // WaitForElement waits for an element specified by the selector to be visible within the given timeout.
@@ -315,149 +303,6 @@ func (nav *Navigator) FillField(selector string, value string) error {
 	return nil
 }
 
-// ExtractTableData extracts data from a table specified by the selector.
-// Example:
-//
-//	tableData, err := nav.ExtractTableData("#tableID")
-func (nav *Navigator) ExtractTableData(selector string) ([]map[int]map[string]interface{}, error) {
-	nav.Logger.Printf("Extracting table data with selector: %s\n", selector)
-	var rows []*cdp.Node
-	err := chromedp.Run(nav.Ctx,
-		chromedp.Nodes(selector+" tr", &rows, chromedp.ByQueryAll),
-	)
-	if err != nil {
-		nav.Logger.Printf("Failed to extract table rows: %v\n", err)
-		return nil, fmt.Errorf("failed to extract table rows: %v", err)
-	}
-
-	var tableData []map[int]map[string]interface{}
-	for _, row := range rows {
-		// nav.Logger.Printf("Processing row %d", rowIndex)
-		var cells []*cdp.Node
-		err = chromedp.Run(nav.Ctx,
-			chromedp.Nodes("td, th", &cells, chromedp.ByQueryAll, chromedp.FromNode(row)),
-		)
-		if err != nil {
-			nav.Logger.Printf("Failed to extract table cells: %v\n", err)
-			return nil, fmt.Errorf("failed to extract table cells: %v", err)
-		}
-
-		rowData := make(map[int]map[string]interface{})
-		for cellIndex, cell := range cells {
-			// nav.Logger.Printf("Processing cell %d in row %d", cellIndex, rowIndex)
-			cellData := make(map[string]interface{})
-
-			var cellText string
-			err = chromedp.Run(nav.Ctx,
-				chromedp.Text(cell.FullXPath(), &cellText, chromedp.NodeVisible),
-			)
-			if err != nil {
-				nav.Logger.Printf("Failed to get cell text: %v\n", err)
-				return nil, fmt.Errorf("failed to get cell text: %v", err)
-			}
-			cellData["text"] = cellText
-
-			// Check for any nested spans within the cell
-			var nestedSpans []*cdp.Node
-			nestedSpansErr := chromedp.Run(nav.Ctx,
-				chromedp.Nodes(cell.FullXPath()+"//span", &nestedSpans, chromedp.ByQueryAll),
-			)
-			if nestedSpansErr != nil {
-				// nav.Logger.Printf("No nested spans found in cell %d of row %d: %v\n", cellIndex, rowIndex, nestedSpansErr)
-				// No nested spans found, continue processing
-				nestedSpans = []*cdp.Node{}
-			}
-
-			spanData := make(map[int]string)
-			for spanIndex, span := range nestedSpans {
-				// nav.Logger.Printf("Processing span %d in cell %d of row %d", spanIndex, cellIndex, rowIndex)
-				var spanText string
-				err = chromedp.Run(nav.Ctx,
-					chromedp.Text(span.FullXPath(), &spanText, chromedp.NodeVisible),
-				)
-				if err != nil {
-					nav.Logger.Printf("Failed to get span text: %v\n", err)
-					return nil, fmt.Errorf("failed to get span text: %v", err)
-				}
-				spanData[spanIndex] = spanText
-			}
-
-			if len(spanData) > 0 {
-				cellData["spans"] = spanData
-			}
-
-			rowData[cellIndex] = cellData
-		}
-		tableData = append(tableData, rowData)
-	}
-	// nav.Logger.Println("Table data extracted successfully")
-	return tableData, nil
-}
-
-// ExtractDivText extracts text content from divs specified by the parent selectors.
-// Example:
-//
-//	textData, err := nav.ExtractDivText("#parent1", "#parent2")
-func (nav *Navigator) ExtractDivText(parentSelectors ...string) (map[string]string, error) {
-	nav.Logger.Println("Extracting text from divs")
-	data := make(map[string]string)
-	for _, parentSelector := range parentSelectors {
-		var nodes []*cdp.Node
-		err := chromedp.Run(nav.Ctx,
-			chromedp.Nodes(parentSelector+" span, "+parentSelector+" div", &nodes, chromedp.ByQueryAll),
-		)
-		if err != nil {
-			nav.Logger.Printf("Failed to extract nodes from %s: %v\n", parentSelector, err)
-			return nil, fmt.Errorf("failed to extract nodes from %s: %v", parentSelector, err)
-		}
-		for _, node := range nodes {
-			if node.NodeType == cdp.NodeTypeText {
-				continue
-			}
-			var text string
-			err = chromedp.Run(nav.Ctx,
-				chromedp.TextContent(node.FullXPath(), &text),
-			)
-			if err != nil {
-				nav.Logger.Printf("Failed to extract text content from %s: %v\n", node.FullXPath(), err)
-				return nil, fmt.Errorf("failed to extract text content from %s: %v", node.FullXPath(), err)
-			}
-			data[node.AttributeValue("id")] = strings.TrimSpace(text)
-		}
-	}
-	// nav.Logger.Println("Text extracted successfully from divs")
-	return data, nil
-}
-
-// Close closes the Navigator instance and releases resources.
-// Example:
-//
-//	nav.Close()
-func (nav *Navigator) Close() {
-	// nav.Logger.Println("Closing the Navigator instance")
-	nav.Cancel()
-	nav.Logger.Println("Navigator instance closed successfully")
-}
-
-// FetchHTML fetches the HTML content of the specified URL.
-// Example:
-//
-//	htmlContent, err := nav.FetchHTML("https://www.example.com")
-func (nav *Navigator) FetchHTML(url string) (string, error) {
-	nav.Logger.Printf("Fetching HTML content from URL: %s\n", url)
-	var htmlContent string
-	err := chromedp.Run(nav.Ctx,
-		chromedp.Navigate(url),
-		chromedp.OuterHTML("html", &htmlContent),
-	)
-	if err != nil {
-		nav.Logger.Printf("Failed to fetch URL: %v\n", err)
-		return "", fmt.Errorf("failed to fetch URL: %v", err)
-	}
-	nav.Logger.Println("HTML content fetched successfully")
-	return htmlContent, nil
-}
-
 // ExtractLinks extracts all links from the current page.
 // Example:
 //
@@ -554,14 +399,61 @@ func (nav *Navigator) SelectDropdown(selector, value string) error {
 	return nil
 }
 
+// Close closes the Navigator instance and releases resources.
+// Example:
+//
+//	nav.Close()
+func (nav *Navigator) Close() {
+	// nav.Logger.Println("Closing the Navigator instance")
+	nav.Cancel()
+	nav.Logger.Println("Navigator instance closed successfully")
+}
+
+// GetElement retrieves the text content of an element specified by the selector.
+// Example:
+//
+//	text, err := nav.GetElement("#elementID")
+func (nav *Navigator) GetElement(selector string) (string, error) {
+	var content string
+
+	err := nav.WaitForElement(selector, 3*time.Second)
+	if err != nil {
+		nav.Logger.Printf("Failed waiting for element: %v\n", err)
+		return "", fmt.Errorf("failed waiting for element: %v", err)
+	}
+
+	err = chromedp.Run(nav.Ctx,
+		chromedp.Text(selector, &content, chromedp.ByQuery, chromedp.NodeVisible),
+	)
+	if err != nil && err.Error() != "could not find node" {
+		nav.Logger.Printf("Failed to get element: %v\n", err)
+		return "", fmt.Errorf("failed to get element: %v", err)
+	}
+	if content == "" {
+		return "", nil // Element not found or empty
+	}
+	return content, nil
+}
+
+// Requests structure to hold user data
+type Requests struct {
+	SearchString string
+}
+
+// PageSource structure to hold the HTML data
+type PageSource struct {
+	Page  *html.Node
+	Error error
+}
+
 // ParallelRequests performs web scraping tasks concurrently with a specified number of workers and a delay between requests.
 // The crawlerFunc parameter allows for flexibility in defining the web scraping logic.
 //
 // Parameters:
 // - requests: A slice of Requests structures containing the data needed for each request.
 // - numberOfWorkers: The number of concurrent workers to process the requests.
-// - duration: The delay duration between each request to avoid overwhelming the target server.
-// - crawlerFunc: A user-defined function that takes a process number as input and returns cover data, movements, people, and an error.
+// - delay: The delay duration between each request to avoid overwhelming the target server.
+// - crawlerFunc: A user-defined function that takes a search string as input and returns the page HTML as an *html.Node and an error.
 //
 // Returns:
 // - A slice of ResponseBody structures containing the results of the web scraping tasks.
@@ -569,39 +461,31 @@ func (nav *Navigator) SelectDropdown(selector, value string) error {
 //
 // Example Usage:
 //
-//	results, err := asyncRequest(requests, numberOfWorkers, duration, crawlerFunc)
-func ParallelRequests(requests []Requests, numberOfWorkers int, duration time.Duration, crawlerFunc func(string) (map[string]string, []map[int]map[string]interface{}, []map[int]map[string]interface{}, error)) ([]ResponseBody, error) {
+//	results, err := ParallelRequests(requests, numberOfWorkers, delay, crawlerFunc)
+func ParallelRequests(requests []Requests, numberOfWorkers int, delay time.Duration, crawlerFunc func(string) (*html.Node, error)) ([]PageSource, error) {
 	done := make(chan struct{})
 	defer close(done)
 
 	inputCh := streamInputs(done, requests)
+	resultCh := make(chan PageSource, len(requests)) // Buffered channel to hold all results
+
 	var wg sync.WaitGroup
-	resultCh := make(chan ResponseBody, len(requests)) // Buffered channel to hold all results
-	k := 0
 
+	// Start workers
 	for i := 0; i < numberOfWorkers; i++ {
 		wg.Add(1)
-		go func() {
+		go func(workerID int) {
 			defer wg.Done()
-			for input := range inputCh {
-				k++
-				time.Sleep(duration)
-				cover, movements, people, err := crawlerFunc(input.ProcessNumber)
-				resultCh <- ResponseBody{
-					Cover:     cover,
-					Movements: movements,
-					People:    people,
-					Error:     err,
-				}
-				if err != nil {
-					log.Println(err)
-					continue
-				}
-				if k == len(requests)-1 {
-					break
+			for req := range inputCh {
+				log.Printf("Worker %d processing request: %s", workerID, req.SearchString)
+				time.Sleep(delay)
+				pageSource, err := crawlerFunc(req.SearchString)
+				resultCh <- PageSource{
+					Page:  pageSource,
+					Error: err,
 				}
 			}
-		}()
+		}(i)
 	}
 
 	// Close the result channel once all workers are done
@@ -610,10 +494,10 @@ func ParallelRequests(requests []Requests, numberOfWorkers int, duration time.Du
 		close(resultCh)
 	}()
 
-	var results []ResponseBody
+	// Collect results from the result channel
+	var results []PageSource
 	var errorOnApiRequests error
 
-	// Collect results from the result channel
 	for result := range resultCh {
 		if result.Error != nil {
 			errorOnApiRequests = result.Error
@@ -621,12 +505,6 @@ func ParallelRequests(requests []Requests, numberOfWorkers int, duration time.Du
 		results = append(results, result)
 	}
 
-	if k == len(requests)-1 {
-		l := log.New(os.Stdout, "goSpider: ", log.LstdFlags)
-		l.Printf("Finished processing %d requests\n", len(requests))
-		return results, errorOnApiRequests
-	}
-
 	return results, errorOnApiRequests
 }
 
@@ -634,21 +512,21 @@ func ParallelRequests(requests []Requests, numberOfWorkers int, duration time.Du
 //
 // Parameters:
 // - done: A channel to signal when to stop processing inputs.
-// - inputs: A slice of Requests structures containing the data needed for each request.
+// - requests: A slice of Requests structures containing the data needed for each request.
 //
 // Returns:
 // - A channel that streams the input requests.
 //
 // Example Usage:
 //
-//	inputCh := streamInputs(done, inputs)
-func streamInputs(done <-chan struct{}, inputs []Requests) <-chan Requests {
+//	inputCh := streamInputs(done, requests)
+func streamInputs(done <-chan struct{}, requests []Requests) <-chan Requests {
 	inputCh := make(chan Requests)
 	go func() {
 		defer close(inputCh)
-		for _, input := range inputs {
+		for _, req := range requests {
 			select {
-			case inputCh <- input:
+			case inputCh <- req:
 			case <-done:
 				return
 			}
@@ -656,3 +534,46 @@ func streamInputs(done <-chan struct{}, inputs []Requests) <-chan Requests {
 	}()
 	return inputCh
 }
+
+// ExtractTable extracts the table rows matched by the given XPath row expression.
+// Example:
+//
+//	tableData, err := goSpider.ExtractTable(pageSource, "//table/tbody/tr")
+func ExtractTable(pageSource *html.Node, tableRowsExpression string) ([]*html.Node, error) {
+	log.Printf("Extracting table data with expression: %s\n", tableRowsExpression)
+	rows := htmlquery.Find(pageSource, tableRowsExpression)
+	if len(rows) > 0 {
+		return rows, nil
+	}
+	// log.Printf("Table data extracted successfully")
+	return nil, errors.New("could not find any table rows")
+}
+
+// ExtractText extracts the text content of the node matched by the given XPath
+// expression, removing every occurrence of the dirt string from the result.
+// Example:
+//
+//	textData, err := goSpider.ExtractText(pageSource, "//div[@id='parent1']", "\n")
+func ExtractText(node *html.Node, nodeExpression string, dirt string) (string, error) {
+	//log.Print("Extracting text from node")
+	var text string
+	tt := htmlquery.Find(node, nodeExpression)
+	if len(tt) > 0 {
+		text = strings.TrimSpace(strings.Replace(htmlquery.InnerText(htmlquery.FindOne(node, nodeExpression)), dirt, "", -1))
+		return text, nil
+	}
+
+	//log.Printf("Text %v extracted successfully from node", nodeExpression)
+	return "", errors.New("could not find specified text")
+}
+
+// FindNodes returns all nodes matched by the given XPath expression.
+// Example:
+//
+//	nodes, err := goSpider.FindNodes(pageSource, "//div[@id='parent1']")
+func FindNodes(node *html.Node, nodeExpression string) ([]*html.Node, error) {
+	n := htmlquery.Find(node, nodeExpression)
+	if len(n) > 0 {
+		return n, nil
+	}
+	return nil, errors.New("could not find specified node")
+}
diff --git a/goSpider_test.go b/goSpider_test.go
index bde42b4..77295ed 100644
--- a/goSpider_test.go
+++ b/goSpider_test.go
@@ -1,9 +1,14 @@
 package goSpider
 
 import (
+	"errors"
+	"fmt"
+	"golang.org/x/net/html"
 	"log"
 	"net/http"
 	"os"
+	"strconv"
+	"strings"
 	"testing"
 	"time"
 )
@@ -37,11 +42,12 @@ func TestMain(m *testing.M) {
 
 // TestFetchHTML tests fetching the HTML content from a URL
 func TestFetchHTML(t *testing.T) {
-	htmlContent, err := nav.FetchHTML("http://localhost:8080")
+	nav.OpenURL("https://www.google.com")
+	htmlContent, err := nav.GetPageSource()
 	if err != nil {
 		t.Errorf("FetchHTML error: %v", err)
 	}
-	if htmlContent == "" {
+	if htmlContent == nil {
 		t.Error("FetchHTML returned empty content")
 	}
 }
@@ -116,12 +122,9 @@ func TestWaitForElement(t *testing.T) {
 
 // TestGetCurrentURL tests extracting the current URL from the browser
 func TestGetCurrentURL(t *testing.T) {
 	// Navigate to the main page
-	htmlContent, err := nav.FetchHTML("http://localhost:8080")
+	err := nav.OpenURL("http://localhost:8080")
 	if err != nil {
-		t.Errorf("FetchHTML error: %v", err)
-	}
-	if htmlContent == "" {
-		t.Error("FetchHTML returned empty content")
+		t.Errorf("OpenURL error: %v", err)
 	}
 
 	// Extract and verify the current URL
@@ -157,97 +160,346 @@ func TestGetCurrentURL(t *testing.T) {
 
 func TestParallelRequests(t *testing.T) {
 	users := []Requests{
-		{ProcessNumber: "1017927-35.2023.8.26.0008"},
-		{ProcessNumber: "0002396-75.2013.8.26.0201"},
-		{ProcessNumber: "1551285-50.2021.8.26.0477"},
-		{ProcessNumber: "0015386-82.2013.8.26.0562"},
-		{ProcessNumber: "0007324-95.2015.8.26.0590"},
-		{ProcessNumber: "1545639-85.2023.8.26.0090"},
-		{ProcessNumber: "1557599-09.2021.8.26.0090"},
-		{ProcessNumber: "1045142-72.2021.8.26.0002"},
-		{ProcessNumber: "0208591-43.2009.8.26.0004"},
-		{ProcessNumber: "1017927-35.2023.8.26.0008"},
+		{SearchString: "1017927-35.2023.8.26.0008"},
+		{SearchString: "0002396-75.2013.8.26.0201"},
+		{SearchString: "1551285-50.2021.8.26.0477"},
+		{SearchString: "0015386-82.2013.8.26.0562"},
+		{SearchString: "0007324-95.2015.8.26.0590"},
+		{SearchString: "1545639-85.2023.8.26.0090"},
+		{SearchString: "1557599-09.2021.8.26.0090"},
+		{SearchString: "1045142-72.2021.8.26.0002"},
+		{SearchString: "0208591-43.2009.8.26.0004"},
+		{SearchString: "1024511-70.2022.8.26.0003"},
+	}
+
+	numberOfWorkers := 10
+	duration := 0 * time.Millisecond
+
+	results, err := ParallelRequests(users, numberOfWorkers, duration, Crawler)
+	if err != nil {
+		log.Printf("ParallelRequests error: %v", err)
+	}
+
+	if len(results) != len(users) {
+		t.Errorf("Expected %d results, but got %d, error: %v", len(users), len(results), err)
+	}
+
+	log.Println("Finish Parallel Requests!")
+
+}
+
+func TestRequestsDataStruct(t *testing.T) {
+	users := []Requests{
+		{SearchString: "1017927-35.2023.8.26.0008"},
+		{SearchString: "0002396-75.2013.8.26.0201"},
+		{SearchString: "1551285-50.2021.8.26.0477"},
+		{SearchString: "0015386-82.2013.8.26.0562"},
+		{SearchString: "0007324-95.2015.8.26.0590"},
+		{SearchString: "1545639-85.2023.8.26.0090"},
+		{SearchString: "1557599-09.2021.8.26.0090"},
+		{SearchString: "1045142-72.2021.8.26.0002"},
+		{SearchString: "0208591-43.2009.8.26.0004"},
+		{SearchString: "1024511-70.2022.8.26.0003"},
 	}
 
-	numberOfWorkers := 3
-	duration := 2 * time.Second
+	numberOfWorkers := 1
+	duration := 0 * time.Millisecond
 
 	results, err := ParallelRequests(users, numberOfWorkers, duration, Crawler)
 	if err != nil {
-		log.Printf("GetCurrentURL error: %v", err)
+		t.Errorf("Expected %d results, but got %d, error: %v", len(users), len(results), err)
 	}
 
 	log.Println("Finish Parallel Requests!")
 
-	var found []string
-	for _, u := range users {
-		for _, result := range results {
-			for _, value := range result.Cover {
-				if value == u.ProcessNumber {
-					found = append(found, value)
-				}
+	type Lawsuit struct {
+		Cover     Cover
+		Persons   []Person
+		Movements []Movement
+	}
+	var lawsuits []Lawsuit
+	for _, result := range results {
+		// Cover
+		c, err := extractDataCover(result.Page, "//*[@id=\"numeroProcesso\"]", "//*[@id=\"labelSituacaoProcesso\"]", "//*[@id=\"classeProcesso\"]", "//*[@id=\"assuntoProcesso\"]", "//*[@id=\"foroProcesso\"]", "//*[@id=\"varaProcesso\"]", "//*[@id=\"juizProcesso\"]", "//*[@id=\"dataHoraDistribuicaoProcesso\"]", "//*[@id=\"numeroControleProcesso\"]", "//*[@id=\"areaProcesso\"]/span", "//*[@id=\"valorAcaoProcesso\"]")
+		if err != nil {
+			t.Errorf("extractDataCover error: %v", err)
+		}
+		// Persons
+		p, err := extractDataPerson(result.Page, "//*[@id=\"tableTodasPartes\"]/tbody/tr", "td[1]/span", "td[2]/text()", "\n")
+		if err != nil {
+			p, err = extractDataPerson(result.Page, "//*[@id=\"tablePartesPrincipais\"]/tbody/tr", "td[1]/text()", "td[2]/text()", "\n")
+			if err != nil {
+				t.Errorf("Expected some person but got none: %v", err.Error())
 			}
 		}
+		// Movements
+		m, err := extractDataMovement(result.Page, "//*[@id=\"tabelaTodasMovimentacoes\"]/tr", "\n")
+		if err != nil {
+			t.Errorf("Expected some movement but got none: %v", err.Error())
+		}
+
+		lawsuits = append(lawsuits, Lawsuit{
+			Cover:     c,
+			Persons:   p,
+			Movements: m,
+		})
 	}
 
-	if len(found) != len(users) {
-		t.Errorf("Expected %d results, but got %d, List results: %v", len(users), len(found), found)
+	if len(lawsuits) != len(users) {
+		t.Errorf("Expected %d lawsuits, but got %d", len(users), len(lawsuits))
 	}
+
+	fmt.Println(lawsuits)
+
 }
 
-func Crawler(d string) (map[string]string, []map[int]map[string]interface{}, []map[int]map[string]interface{}, error) {
+func Crawler(d string) (*html.Node, error) {
 	url := "https://esaj.tjsp.jus.br/cpopg/open.do"
 	nav := NewNavigator()
-	defer nav.Close()
 
 	err := nav.OpenURL(url)
 	if err != nil {
 		log.Printf("OpenURL error: %v", err)
-		return nil, nil, nil, err
+		return nil, err
 	}
 
 	err = nav.CheckRadioButton("#interna_NUMPROC > div > fieldset > label:nth-child(5)")
 	if err != nil {
 		log.Printf("CheckRadioButton error: %v", err)
-		return nil, nil, nil, err
+		return nil, err
 	}
 
 	err = nav.FillField("#nuProcessoAntigoFormatado", d)
 	if err != nil {
 		log.Printf("filling field error: %v", err)
-		return nil, nil, nil, err
+		return nil, err
 	}
 
 	err = nav.ClickButton("#botaoConsultarProcessos")
 	if err != nil {
 		log.Printf("ClickButton error: %v", err)
-		return nil, nil, nil, err
+		return nil, err
+	}
+
+	err = nav.WaitForElement("#tabelaUltimasMovimentacoes > tr:nth-child(1) > td.dataMovimentacao", 15*time.Second)
+	if err != nil {
+		log.Printf("WaitForElement error: %v", err)
+		return nil, err
+	}
+
+	pageSource, err := nav.GetPageSource()
+	if err != nil {
+		log.Printf("GetPageSource error: %v", err)
+		return nil, err
+	}
+
+	return pageSource, nil
+}
+
+type Cover struct {
+	Title       string
+	Tag         string
+	Class       string
+	Subject     string
+	Location    string
+	Unit        string
+	Judge       string
+	InitialDate string
+	Control     string
+	Field       string
+	Value       string
+	Error       string
+}
+
+func extractDataCover(pageSource *html.Node, xpathTitle string, xpathTag string, xpathClass string, xpathSubject string, xpathLocation string, xpathUnit string, xpathJudge string, xpathInitDate string, xpathControl string, xpathField string, xpathValue string) (Cover, error) {
+	var i int //count errors
+	title, err := ExtractText(pageSource, xpathTitle, " ")
+	if err != nil {
+		log.Println("error extracting title")
+	}
+
+	tag, err := ExtractText(pageSource, xpathTag, "")
+	if err != nil {
+		i++
+		log.Println("error extracting tag")
+	}
+
+	class, err := ExtractText(pageSource, xpathClass, "")
+	if err != nil {
+		i++
+		log.Println("error extracting class")
+	}
+
+	subject, err := ExtractText(pageSource, xpathSubject, "")
+	if err != nil {
+		i++
+		log.Println("error extracting subject")
+	}
+
+	location, err := ExtractText(pageSource, xpathLocation, "")
+	if err != nil {
+		i++
+		log.Println("error extracting location")
+	}
+
+	unit, err := ExtractText(pageSource, xpathUnit, "")
+	if err != nil {
+		i++
+		log.Println("error extracting unit")
+	}
+
+	judge, err := ExtractText(pageSource, xpathJudge, "")
+	if err != nil {
+		i++
+		log.Println("error extracting judge")
+	}
+
+	initDate, err := ExtractText(pageSource, xpathInitDate, "")
+	if err != nil {
+		i++
+		log.Println("error extracting initDate")
+	}
+
+	control, err := ExtractText(pageSource, xpathControl, "")
+	if err != nil {
+		i++
+		log.Println("error extracting control")
+	}
+
+	field, err := ExtractText(pageSource, xpathField, "")
+	if err != nil {
+		log.Println("error extracting field")
 	}
 
-	err = nav.ClickElement("#linkmovimentacoes")
+	value, err := ExtractText(pageSource, xpathValue, "R$ ")
 	if err != nil {
-		log.Printf("ClickElement error: %v", err)
-		return nil, nil, nil, err
+		i++
+		log.Println("error extracting field value")
 	}
 
-	people, err := nav.ExtractTableData("#tablePartesPrincipais")
+	var e string
 	if err != nil {
-		log.Printf("ExtractTableData error: %v", err)
-		return nil, nil, nil, err
+		e = err.Error()
+	}
+
+	if i >= 5 {
+		return Cover{}, fmt.Errorf("too many errors: %d", i)
 	}
 
-	movements, err := nav.ExtractTableData("#tabelaTodasMovimentacoes")
+	return Cover{
+		Title:       title,
+		Tag:         tag,
+		Class:       class,
+		Subject:     subject,
+		Location:    location,
+		Unit:        unit,
+		Judge:       judge,
+		InitialDate: initDate,
+		Control:     control,
+		Field:       field,
+		Value:       value,
+		Error:       e,
+	}, nil
+}
+
+type Person struct {
+	Pole    string
+	Name    string
+	Lawyers []string
+}
+
+func extractDataPerson(pageSource *html.Node, xpathPeople string, xpathPole string, xpathLawyer string, dirt string) ([]Person, error) {
+	people, err := FindNodes(pageSource, xpathPeople)
 	if err != nil {
-		log.Printf("ExtractTableData error: %v", err)
-		return nil, nil, nil, err
+		return nil, err
+	}
+
+	var personas []Person
+	for i, person := range people {
+		pole, err := ExtractText(person, xpathPole, dirt)
+		if err != nil {
+			return nil, errors.New("error extract data person, pole not found: " + err.Error())
+		}
+
+		var name string
+		_, err = FindNodes(person, xpathPeople+"["+strconv.Itoa(i)+"]/td[2]")
+		if err != nil {
+			name, err = ExtractText(person, "td[2]/text()", dirt)
+			if err != nil {
+				return nil, errors.New("error extract data person, name not found: " + err.Error())
+			}
+		} else {
+			name, err = ExtractText(person, "td[2]/text()[1]", dirt)
+			if err != nil {
+				return nil, errors.New("error extract data person, name not found: " + err.Error())
+			}
+		}
+
+		var lawyers []string
+		ll, err := FindNodes(person, xpathLawyer)
+		if err != nil {
+			lawyers = append(lawyers, "no lawyer found")
+		}
+		for j := range ll {
+			n, err := ExtractText(person, "td[2]/text()["+strconv.Itoa(j+1)+"]", dirt)
+			if err != nil {
+				return nil, errors.New("error extract data person, lawyer not found: " + err.Error())
+			}
+			lawyers = append(lawyers, n)
+		}
+
+		p := Person{
+			Pole:    pole,
+			Name:    name,
+			Lawyers: lawyers,
+		}
+
+		personas = append(personas, p)
 	}
 
-	cover, err := nav.ExtractDivText("#containerDadosPrincipaisProcesso", "#maisDetalhes")
+	return personas, nil
+}
+
+type Movement struct {
+	Date  string
+	Title string
+	Text  string
+}
+
+func extractDataMovement(pageSource *html.Node, node string, dirt string) ([]Movement, error) {
+	xpathTable := node
+
+	tableRows, err := ExtractTable(pageSource, xpathTable)
 	if err != nil {
-		log.Printf("ExtractDivText error: %v", err)
-		return nil, nil, nil, err
+		return nil, err
+	}
+
+	if len(tableRows) > 0 {
+		var allMovements []Movement
+		for _, row := range tableRows {
+			date, err := ExtractText(row, "td[1]", dirt)
+			if err != nil {
+				return nil, errors.New("error extracting table date: " + err.Error())
+			}
+			title, err := ExtractText(row, "td[3]", dirt)
+			if err != nil {
+				return nil, errors.New("error extracting table title: " + err.Error())
+			}
+			text, err := ExtractText(row, "td[3]/span", dirt)
+			if err != nil {
+				return nil, errors.New("error extracting table text: " + err.Error())
+			}
+
+			mv := Movement{
+				Date:  strings.ReplaceAll(date, "\t", ""),
+				Title: strings.ReplaceAll(strings.ReplaceAll(title, text, ""), dirt, ""),
+				Text:  strings.TrimSpace(strings.ReplaceAll(text, "\t", "")),
+			}
+
+			allMovements = append(allMovements, mv)
+		}
+		return allMovements, nil
 	}
 
-	return cover, movements, people, nil
+	return nil, errors.New("error table: could not find any movements")
 }
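For reference, a minimal usage sketch of the API after this change (not part of the diff: the target URL and the "//title" XPath are placeholder assumptions; only the goSpider identifiers come from the code above):

package main

import (
	"log"
	"time"

	"github.com/DanielFillol/goSpider"
	"golang.org/x/net/html"
)

// crawler matches the func(string) (*html.Node, error) signature that
// ParallelRequests expects: open a page and return its parsed DOM.
func crawler(query string) (*html.Node, error) {
	nav := goSpider.NewNavigator()
	defer nav.Close()
	// Placeholder URL; a real crawler would fill fields and click buttons here.
	if err := nav.OpenURL("https://example.com/?q=" + query); err != nil {
		return nil, err
	}
	return nav.GetPageSource()
}

func main() {
	requests := []goSpider.Requests{
		{SearchString: "first query"},
		{SearchString: "second query"},
	}
	// Two workers with a 500ms delay between requests per worker.
	results, err := goSpider.ParallelRequests(requests, 2, 500*time.Millisecond, crawler)
	if err != nil {
		log.Printf("last error seen: %v", err)
	}
	for _, r := range results {
		if r.Error != nil {
			continue
		}
		// "//title" is a placeholder XPath; the third argument is a "dirt"
		// string that ExtractText strips from the extracted text.
		title, err := goSpider.ExtractText(r.Page, "//title", "\n")
		if err == nil {
			log.Println("title:", title)
		}
	}
}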