From f35f4e38423c1cbefca3a7184453c38d3d8fdd78 Mon Sep 17 00:00:00 2001 From: Tim Bray Date: Mon, 3 Jun 2024 16:18:23 -0700 Subject: [PATCH] kaizen: prepare for 2.0 release addresses: #31 Signed-off-by: Tim Bray --- CONTRIBUTING.md | 2 +- INSTALLING.md | 2 +- Makefile | 18 +++++++++--------- README.md | 38 ++++++++++++++++++++------------------ doc/tf.1 | 16 ++++++++-------- internal/config.go | 12 ++++++------ internal/segmenter_test.go | 4 ++-- main.go | 2 +- 8 files changed, 48 insertions(+), 46 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5437904..d4b95fa 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,7 +4,7 @@ Topfew is hosted in this GitHub repository at `github.com/timbray/topfew` and welcomes contributions. -This is release 1.0 of Topfew, which is probably more +This is release 2.0 of Topfew, which is probably more or less complete. It is well-tested. Its performance at processing streams can keep up with most streams and it is dramatically faster when processing files, diff --git a/INSTALLING.md b/INSTALLING.md index 66d6b79..3d1ff1f 100644 --- a/INSTALLING.md +++ b/INSTALLING.md @@ -3,7 +3,7 @@ Each Topfew [release](https://github.com/timbray/topfew/releases) comes with binaries built for both the x86 and ARM flavors of Linux, MacOS, and Windows. -Topfew comes with a Makefile which is uncomplicated. Typing `make` will create an executable named `tf`, +Topfew comes with a Makefile which is uncomplicated. Typing `make` will create an executable named `topfew`, created by `go build` with no options, in the `./bin` directory. ## Arch Linux diff --git a/Makefile b/Makefile index 595358f..5b51ac6 100644 --- a/Makefile +++ b/Makefile @@ -1,18 +1,18 @@ .PHONY: test -all: test tf +all: test topfew test: main.go internal/*.go go test ./... && go vet ./... 
# local version you can run -tf: - go build -o bin/tf +topfew: + go build -o bin/topfew release: test - GOOS=darwin GOARCH=arm64 go build -o tf && gzip < tf > tf-macos-arm.gz - GOOS=darwin GOARCH=amd64 go build -o tf && gzip < tf > tf-macos-x86.gz - GOOS=linux GOARCH=amd64 go build -o tf && gzip < tf > tf-linux-x86.gz - GOOS=linux GOARCH=arm64 go build -o tf && gzip < tf > tf-linux-arm.gz - GOOS=windows GOARCH=amd64 go build -o tf && zip -mq tf-windows-x86.exe.zip tf - GOOS=windows GOARCH=arm64 go build -o tf && zip -mq tf-windows-arm.exe.zip tf + GOOS=darwin GOARCH=arm64 go build -o topfew && gzip < topfew>topfew-macos-arm.gz + GOOS=darwin GOARCH=amd64 go build -o topfew && gzip < topfew>topfew-macos-x86.gz + GOOS=linux GOARCH=amd64 go build -o topfew && gzip < topfew>topfew-linux-x86.gz + GOOS=linux GOARCH=arm64 go build -o topfew && gzip < topfew>topfew-linux-arm.gz + GOOS=windows GOARCH=amd64 go build -o topfew && zip -mq topfew-windows-x86.exe.zip topfew + GOOS=windows GOARCH=arm64 go build -o topfew && zip -mq topfew-windows-arm.exe.zip topfew diff --git a/README.md b/README.md index b4dcfbe..b2dad1d 100644 --- a/README.md +++ b/README.md @@ -8,38 +8,40 @@ A program that finds and prints out the top few records in which a certain field or combination of fields occurs most frequently. -This is release 1.0 of Topfew. +This is release 2.0 of Topfew. ## Examples To find the IP address that most commonly hits your web site, given an Apache logfile named `access_log`. -`tf --fields 1 access_log` +`topfew --fields 1 access_log` The same effect could be achieved with `awk '{print $1}' access_log | sort | uniq -c | sort -rn | head` -But **tf** is usually much faster. +But **topfew** is usually much faster. Do the same, but exclude high-traffic bots (omitting the filename). -`tf --fields 1 --vgrep googlebot --vgrep bingbot` +`topfew --fields 1 --vgrep googlebot --vgrep bingbot` Most popular IP addresses from May 2020. 
-`tf --fields 1 -grep '\[../May/2020'` +`topfew --fields 1 -grep '\[../May/2020'` Most popular hour/minute of the day for retrievals. -`tf --fields 4 --sed "\\[" "" --sed '^[^:]*:' '' --sed ':..$' ''` +`topfew --fields 4 --sed "\\[" "" --sed '^[^:]*:' '' --sed ':..$' ''` ## Usage ```shell -tf +topfew -n, --number (output line count) [default is 10] -f, --fields (field list) [default is the whole record] + -q, --quotedfields [respect "-delimited space-separated fields] + -p, --fieldseparator (regexp) [use provided regexp to separate fields] -g, --grep (regexp) [may repeat, default is accept all] -v, --vgrep (regexp) [may repeat, default is reject none] -s, --sed (regexp) (replacement) [may repeat, default is no changes] @@ -48,7 +50,7 @@ tf -h, -help, --help filename [default is stdin] -All the arguments are optional; if none are provided, tf will read records +All the arguments are optional; if none are provided, topfew will read records from the standard input and list the 10 which occur most often. ``` ## Options @@ -63,7 +65,7 @@ Specifies which fields should be extracted from incoming records and used in com The fieldlist must be a comma‐separated list of integers identifying field numbers, which start at one, for example 3 and 2,5,6. The fields must be provided in order, so 3,1,7 is an error. -If no fieldlist is provided, **tf** treats the whole input record as a single field. +If no fieldlist is provided, **topfew** treats the whole input record as a single field. `-p separator, --fieldseparator separator` @@ -74,13 +76,13 @@ This is likely to incur a significant performance cost. Some files, for example Apache httpd logs, use space-separation but also allow spaces within fields which are delimited by `"`. The -q/--quotedfields -argument allows **tf** to process these correctly. It is an error to specify both +argument allows **topfew** to process these correctly. It is an error to specify both -p and -q. 
`-g regexp`, `--grep regexp` The initial **g** suggests `grep`. -This option applies the provided regular expression to each record as it is read and if the regexp does not match the record, **tf** bypasses it. +This option applies the provided regular expression to each record as it is read and if the regexp does not match the record, **topfew** bypasses it. This option can be provided multiple times; the provided regular expressions will be applied in the order they appear on the command line. @@ -101,19 +103,19 @@ This option can be provided many times, and the replacement operations are perf `--sample` It can be tricky to get the regular expressions in the `−g`, `−v`, and `−s` options right. -Specifying `-−sample` causes **tf** to print lines to the standard output that display the filtering and field‐editing logic. +Specifying `--sample` causes **topfew** to print lines to the standard output that display the filtering and field‐editing logic. It can only be used when processing standard input, not a file. `-w integer`, `--width integer` -If a file name is specified then **tf**, rather than reading it from end to end, will divide it into segments and process it in multiple parallel threads. +If a file name is specified then **topfew**, rather than reading it from end to end, will divide it into segments and process it in multiple parallel threads. The optimal number of threads depends in a complicated way on how many cores your CPU has what kind of cores they are, and the storage architecture. The default is the result of the Go `runtime.NumCPU()` calls and often produces good results. `-h`, `-help`, `--help` -Describes the function and options of **tf**. +Describes the function and options of **topfew**. ## Records and fields @@ -142,10 +144,10 @@ summarizing the request and its result, is delimited by quote characters `"`. 
The fetch of `picInfo.xml` signals that this is an actual browser request, likely signifying that a human was involved; the URL following the `o=` is the resource the human looked at. Here is a -**tf** invocation that yields a list of the top 5 URLs that were fetched by a human: +**topfew** invocation that yields a list of the top 5 URLs that were fetched by a human: ```shell -tf -g picInfo.xml -f 6 -q -s '\?utm.*' '' -s " HTTP/..." "" -s "GET .*\/ongoing" "" +topfew -g picInfo.xml -f 6 -q -s '\?utm.*' '' -s " HTTP/..." "" -s "GET .*\/ongoing" "" ``` Note the `-g` to select only lines with `picInfo.xml`, the `-q` to request correct processing @@ -160,8 +162,8 @@ Therefore, the observed effects of combinations of options can vary dramatically For example, if I want to list the top records containing the string `example` from a file named `big-file` I could do either of the following: ```shell -tf -g example big-file -grep example big-file | tf +topfew -g example big-file +grep example big-file | topfew ``` When I benchmark topfew on a modern Apple-Silicon Mac and an elderly spinning-rust Linux VPS, I observe that the first option is faster on Mac, the second on Linux. diff --git a/doc/tf.1 b/doc/tf.1 index 6ff1b1d..f810ff7 100644 --- a/doc/tf.1 +++ b/doc/tf.1 @@ -5,7 +5,7 @@ A program that finds and prints out the top few records in which a certain field .PP To find the IP address that most commonly hits your web site, given an Apache logfile named \fB\fCaccess_log\fR\&. .PP -\fB\fCtf \-\-fields 1 access_log\fR +\fB\fCtopfew \-\-fields 1 access_log\fR .PP The same effect could be achieved with .PP @@ -15,20 +15,20 @@ But \fBtf\fP is usually much faster. .PP Do the same, but exclude high\-traffic bots (omitting the filename). .PP -\fB\fCtf \-\-fields 1 \-\-vgrep googlebot \-\-vgrep bingbot\fR +\fB\fCtopfew \-\-fields 1 \-\-vgrep googlebot \-\-vgrep bingbot\fR .PP Most popular IP addresses from May 2020. 
.PP -\fB\fCtf \-\-fields 1 \-grep '\\[../May/2020'\fR +\fB\fCtopfew \-\-fields 1 \-grep '\\[../May/2020'\fR .PP Most popular hour/minute of the day for retrievals. .PP -\fB\fCtf \-\-fields 4 \-\-sed "\\\\[" "" \-\-sed '^[^:]*:' '' \-\-sed ':..$' ''\fR +\fB\fCtopfew \-\-fields 4 \-\-sed "\\\\[" "" \-\-sed '^[^:]*:' '' \-\-sed ':..$' ''\fR .SH Usage .PP .RS .nf -tf +topfew \-n, \-\-number (output line count) [default is 10] \-f, \-\-fields (field list) [default is the whole record] \-g, \-\-grep (regexp) [may repeat, default is accept all] @@ -39,7 +39,7 @@ tf \-h, \-help, \-\-help filename [default is stdin] -All the arguments are optional; if none are provided, tf will read records +All the arguments are optional; if none are provided, topfew will read records from the standard input and list the 10 which occur most often. .fi .RE @@ -102,8 +102,8 @@ For example, if I want to list the top records containing the string \fB\fCexamp .PP .RS .nf -tf \-g example big\-file -grep example big\-file | tf +topfew \-g example big\-file +grep example big\-file | topfew .fi .RE .PP diff --git a/internal/config.go b/internal/config.go index c29452f..54fa07a 100644 --- a/internal/config.go +++ b/internal/config.go @@ -131,14 +131,14 @@ func parseFields(spec string) ([]uint, error) { } const instructions = ` -tf (short for "topfew") finds the most common values in a line-structured input +topfew finds the most common values in a line-structured input and prints the top few of them out, with their occurrence counts, in decreasing order of occurrences. 
-Usage: tf +Usage: topfew -n, --number (output line count) [default is 10] -f, --fields (field list) [default is the whole record] - -p, --fieldseparator (field separator regex) [default is white space] + -p, --fieldseparator (field separator regex) [default is white space] -q, --quotedfields [default is false] -g, --grep (regexp) [may repeat, default is accept all] -v, --vgrep (regexp) [may repeat, default is reject none] @@ -148,7 +148,7 @@ Usage: tf -h, -help, --help filename [default is stdin] -All the arguments are optional; if none are provided, tf will read records +All the arguments are optional; if none are provided, topfew will read records from the standard input and list the 10 which occur most often. Field list is comma-separated integers, e.g. -f 3 or --fields 1,3,7. The fields @@ -160,7 +160,7 @@ performance. Some files, for example Apache httpd logs, use space-separation but also allow spaces within fields which are quoted with ("). The -q/--quotedfields -allows tf to process these correctly. It is an error to specify both +allows topfew to process these correctly. It is an error to specify both -p and -q. The regexp-valued fields work as follows: @@ -171,7 +171,7 @@ The regexp-valued fields work as follows: The regexp-valued fields can be supplied multiple times; the filtering and substitution will be performed in the order supplied. -If the input is a named file, tf will process it in multiple parallel +If the input is a named file, topfew will process it in multiple parallel threads, which can dramatically improve performance. The --width argument allows you to specify the number of threads. 
The default value is not always optimal; experience with particular data on a particular computer may lead diff --git a/internal/segmenter_test.go b/internal/segmenter_test.go index 510ee21..366f145 100644 --- a/internal/segmenter_test.go +++ b/internal/segmenter_test.go @@ -68,7 +68,7 @@ func TestReadSegmentFiltering(t *testing.T) { t.Error("config!") } - tmpName := fmt.Sprintf("/tmp/tf-%d", os.Getpid()) + tmpName := fmt.Sprintf("/tmp/topfew-%d", os.Getpid()) tmpfile, err := os.Create(tmpName) if err != nil { t.Fatal("can't make tmpfile: " + err.Error()) @@ -90,7 +90,7 @@ func TestReadSegmentFiltering(t *testing.T) { // ErrBufferFull condition, had to create lines 80k long to execute that, so rather than clutter // up the filesystem with this junk, we create them synthetically func TestVeryLongLines(t *testing.T) { - tmpName := fmt.Sprintf("/tmp/tf-%d", os.Getpid()) + tmpName := fmt.Sprintf("/tmp/topfew-%d", os.Getpid()) tmpfile, err := os.Create(tmpName) if err != nil { t.Fatal("can't make tmpfile: " + err.Error()) diff --git a/main.go b/main.go index 440e98d..ea61ab4 100644 --- a/main.go +++ b/main.go @@ -11,7 +11,7 @@ func main() { config, err := topfew.Configure(os.Args[1:]) // skip whatever go puts in os.Args[0] if err != nil { - fmt.Println("Problem (tf -h for help): " + err.Error()) + fmt.Println("Problem (topfew -h for help): " + err.Error()) os.Exit(1) }