Skip to content

Commit

Permalink
from-to date filters
Browse files Browse the repository at this point in the history
  • Loading branch information
karust committed Jun 4, 2023
1 parent f4f95f0 commit 46f330c
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 18 deletions.
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,18 +36,22 @@ gogetcrawl -h

#### Get URLs

* You can get multiple-domain archive data, flags will be applied to each. By default, you will get all results displayed in your terminal (use `--collapse` to get unique results):
* You can get archive data for multiple domains at once; flags are applied to each domain. By default, all results are displayed in your terminal (use `--collapse` to get **unique** results):
```
gogetcrawl url *.example.com *.tutorialspoint.com/* --collapse
```

* To limit the number of results, enable output to a file and select only Wayback as a source you can:
* You can **limit** the number of results, write the output to a file, and select only Wayback as a **source**:
```
gogetcrawl url *.tutorialspoint.com/* --limit 10 --sources wb -o ./urls.txt
```

* Set a **date range** with `--from` and `--to` (dates in `YYYYMMDD` format):
```
gogetcrawl url *.tutorialspoint.com/* --limit 10 --from 20140131 --to 20231231
```
#### Download files
* Download 5 `PDF` files to `./test` directory with 3 workers:
* Download 5 `PDF` files to the `./test` directory using 3 **workers**:
```
gogetcrawl download *.cia.gov/* --limit 5 -w 3 -d ./test -f "mimetype:application/pdf"
```
Expand Down
48 changes: 34 additions & 14 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"mime"
"os"
"strings"
"time"

"github.com/karust/gogetcrawl/common"
"github.com/karust/gogetcrawl/commoncrawl"
Expand All @@ -17,17 +18,19 @@ import (
const version = "1.1.2"

var (
filters []string
isCollapse bool
isSuccessful bool
isLogging bool
isVerbose bool
maxTimeout int
maxRetries int
maxResults uint
maxWorkers uint
extensions []string
sourceNames []string
filters []string
fromDateFilter string
toDateFilter string
isCollapse bool
isSuccessful bool
isLogging bool
isVerbose bool
maxTimeout int
maxRetries int
maxResults uint
maxWorkers uint
extensions []string
sourceNames []string
)

var rootCmd = &cobra.Command{
Expand Down Expand Up @@ -90,11 +93,25 @@ func getRequestConfigs(args []string) chan common.RequestConfig {
filters = append(filters, "statuscode:200")
}

if fromDateFilter != "" {
if _, err := time.Parse("20060102", fromDateFilter); err != nil {
log.Fatalln(fmt.Sprintf("Please check `--from` filter date: '%v', %v", fromDateFilter, err))
}
}

if toDateFilter != "" {
if _, err := time.Parse("20060102", toDateFilter); err != nil {
log.Fatalln(fmt.Sprintf("Please check `--to` filter date: '%v', %v", toDateFilter, err))
}
}

for _, domain := range args {
config := common.RequestConfig{
URL: domain,
Filters: filters,
Limit: maxResults,
URL: domain,
Filters: filters,
Limit: maxResults,
FromDate: fromDateFilter,
ToDate: toDateFilter,
}

if isCollapse {
Expand Down Expand Up @@ -145,4 +162,7 @@ func init() {
rootCmd.PersistentFlags().StringSliceVarP(&sourceNames, "sources", "s", []string{"wb", "cc"}, `Web archive sources to use. Example: --sources "wb" to use only the Wayback`)
rootCmd.PersistentFlags().BoolVarP(&isVerbose, "verbose", "v", false, `Use verbose output.`)
rootCmd.PersistentFlags().BoolVarP(&isLogging, "log", "", false, `Print logs to ./logs.txt.`)
rootCmd.PersistentFlags().StringVarP(&fromDateFilter, "from", "", "", "Filter from date, example: --from 20200131 (filter from 31 Jan 2020)")
rootCmd.PersistentFlags().StringVarP(&toDateFilter, "to", "", "", "Filter to date, example: --to 20230401 (filter to 1 Apr 2023)")
// TODO: rootCmd.PersistentFlags().BoolVarP(&isDisablePagination, "disable-pagination", "", "", "")
}
11 changes: 10 additions & 1 deletion common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ type RequestConfig struct {
Limit uint // Max number of results per page
CollapseColumn string // Which column to use to collapse results
SinglePage bool // Get results only from 1st page (mostly used for tests)
FromDate string // Filter results from Date
ToDate string // Filter results to Date
}

// GetUrl composes the request URL with the CDX server request parameters.
Expand All @@ -72,10 +74,17 @@ func (config RequestConfig) GetUrl(serverURL string, page int) string {
}
}

if config.FromDate != "" {
reqURL = fmt.Sprintf("%v&from=%v", reqURL, config.FromDate)
}

if config.ToDate != "" {
reqURL = fmt.Sprintf("%v&to=%v", reqURL, config.ToDate)
}

if !config.SinglePage {
reqURL = fmt.Sprintf("%v&page=%v", reqURL, page)
}

return reqURL
}

Expand Down

0 comments on commit 46f330c

Please sign in to comment.