diff --git a/.gitignore b/.gitignore index cfb093e..d16e131 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ project-resources data/ statements +oss/ +experiments/ diff --git a/cmd/process_statements/main.go b/cmd/process_statements/main.go index 883e49a..fbe3215 100644 --- a/cmd/process_statements/main.go +++ b/cmd/process_statements/main.go @@ -1,12 +1,7 @@ package main -import ( - "fmt" - - "github.com/dslipak/pdf" -) - // TODOs +// 0. Port PyMuPDF, PyPdf, PdfPlumber to Go? // 1. Figure out which file actually contains the statement (-0 vs -1) // Approach: // 2. Figure out if the year matches the statement title @@ -17,35 +12,38 @@ import ( // 4. Build intelligence through analysis of past data & news reports & any other source func main() { - // pdf.DebugOn = true - content, err := readPdf("./statements/ASIANPAINT/2022-2023.pdf") // Read local pdf file - if err != nil { - panic(err) - } - fmt.Println(content) - return + // unipdf. } -func readPdf(path string) (string, error) { - r, err := pdf.Open(path) - if err != nil { - return "", err - } - totalPage := r.NumPage() +// // pdf.DebugOn = true +// content, err := readPdf("./statements/ASIANPAINT/2022-2023.pdf") // Read local pdf file +// if err != nil { +// panic(err) +// } +// fmt.Println(content) +// return +// } - for pageIndex := 1; pageIndex <= totalPage; pageIndex++ { - p := r.Page(pageIndex) - if p.V.IsNull() { - continue - } +// func readPdf(path string) (string, error) { +// r, err := pdf.Open(path) +// if err != nil { +// return "", err +// } +// totalPage := r.NumPage() - rows, _ := p.GetTextByRow() - for _, row := range rows { - println(">>>> row: ", row.Position) - for _, word := range row.Content { - fmt.Println(word.S) - } - } - } - return "", nil -} +// for pageIndex := 1; pageIndex <= totalPage; pageIndex++ { +// p := r.Page(pageIndex) +// if p.V.IsNull() { +// continue +// } + +// rows, _ := p.GetTextByRow() +// for _, row := range rows { +// println(">>>> row: ", row.Position) +// for 
_, word := range row.Content { +// fmt.Println(word.S) +// } +// } +// } +// return "", nil +// } diff --git a/go.mod b/go.mod index bf883d7..a7ee8fc 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ go 1.21.6 require github.com/sirupsen/logrus v1.9.3 require ( - github.com/dslipak/pdf v0.0.2 // indirect - golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8 // indirect + github.com/stretchr/testify v1.7.1 // indirect + golang.org/x/sys v0.15.0 // indirect + gopkg.in/yaml.v3 v3.0.0 // indirect ) diff --git a/go.sum b/go.sum index 63627de..9885dae 100644 --- a/go.sum +++ b/go.sum @@ -1,17 +1,18 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/dslipak/pdf v0.0.2 h1:djAvcM5neg9Ush+zR6QXB+VMJzR6TdnX766HPIg1JmI= -github.com/dslipak/pdf v0.0.2/go.mod h1:2L3SnkI9cQwnAS9gfPz2iUoLC0rUZwbucpbKi5R1mUo= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8 h1:0A+M6Uqn+Eje4kHMK80dtF3JCXC4ykBgQG4Fe06QRhQ= +github.com/stretchr/testify v1.7.1 h1:5TQK59W5E3v0r2duFAb7P95B6hEeOyEnHRa8MjYSMTY= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= +golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.0 h1:hjy8E9ON/egN1tAYqKb61G10WtihqetD4sz2H+8nIeA= +gopkg.in/yaml.v3 v3.0.0/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/notes/240206.md b/notes/240206.md new file mode 100644 index 0000000..781b901 --- /dev/null +++ b/notes/240206.md @@ -0,0 +1,9 @@ +# Notes (6th Feb 2024) + +- Extracting data from Annual Statements + +- Extremely information dense + - Highly visual data with legends and notes + - Some images need to be annotated, automagically! 
+ +- diff --git a/pkg/mathext/digits.go b/pkg/mathext/digits.go index d6bcc08..8c8bb04 100644 --- a/pkg/mathext/digits.go +++ b/pkg/mathext/digits.go @@ -1,8 +1,10 @@ package mathext func DigitCount(n int) int { - if n/10 == 0 { - return 1 + count := 1 + for n/10 != 0 { + count++ + n = n / 10 } - return 1 + DigitCount(n/10) + return count } diff --git a/pkg/mathext/digits_test.go b/pkg/mathext/digits_test.go new file mode 100644 index 0000000..3381314 --- /dev/null +++ b/pkg/mathext/digits_test.go @@ -0,0 +1,30 @@ +package mathext_test + +import ( + "testing" + + "codermana.com/go/pkg/value_analysis/pkg/mathext" +) + +func TestDigitCount(t *testing.T) { + testCases := []struct { + input int + expected int + }{ + {1123512, 7}, + {12, 2}, + {0, 1}, + {1, 1}, + {100, 3}, + } + + for _, testCase := range testCases { + actual := mathext.DigitCount(testCase.input) + + if testCase.expected != actual { + t.Log("Expected:", testCase.expected) + t.Log("Actual:", actual) + t.Fail() + } + } +} diff --git a/samples/2012-2013-0.pdf b/samples/2012-2013-0.pdf new file mode 100644 index 0000000..b9bbee9 Binary files /dev/null and b/samples/2012-2013-0.pdf differ diff --git a/samples/2012-2013-1.pdf b/samples/2012-2013-1.pdf new file mode 100644 index 0000000..e4a9a64 Binary files /dev/null and b/samples/2012-2013-1.pdf differ diff --git a/samples/2022-2023.pdf b/samples/2022-2023.pdf new file mode 100644 index 0000000..4d6178b Binary files /dev/null and b/samples/2022-2023.pdf differ