Skip to content

Commit

Permalink
feat: add pkg extractor (#5)
Browse files Browse the repository at this point in the history
* add extractor

* file refactor

* added walker log file discovery

* add whitespace

* WIP full refactor in progress

* fixed decoding

* add tests, updated README

* removed unused errors, removed args

* test fixes

* fix typo

* fix newlines

* rename function

* remove variable, expand err msg

* edit error message

* replace conditionals with require

* replace conditional with require#2

* add cleanup helper, replace defer with t.cleanup

* reorg args

* add t.Helper()

* replace t.fatal with require

* fix arg order

* modify random string func in testing

* replace t.Fatalf with require

* move global variables to smaller scopes

* rename error

Co-authored-by: Antonio Navarro Perez <[email protected]>

* remove unused anon vars

Co-authored-by: Antonio Navarro Perez <[email protected]>

* edit comment

Co-authored-by: Antonio Navarro Perez <[email protected]>

* rename error

Co-authored-by: Antonio Navarro Perez <[email protected]>

* update map storage for paths

Co-authored-by: Antonio Navarro Perez <[email protected]>

* update map storage for paths #2

Co-authored-by: Antonio Navarro Perez <[email protected]>

* add conversion check

Co-authored-by: Antonio Navarro Perez <[email protected]>

* add error check

Co-authored-by: Antonio Navarro Perez <[email protected]>

* format & handle errors

* move cleanup above err check

* Revert "move cleanup above err check"

This reverts commit 872fd12.

* swap arg order

* wip tests

* added more tests

* modify source path

* parallel test failing, save

* fix broken tests

---------

Co-authored-by: Antonio Navarro Perez <[email protected]>
  • Loading branch information
leohhhn and ajnavarro authored Dec 29, 2023
1 parent 0007bf3 commit ca7cf22
Show file tree
Hide file tree
Showing 8 changed files with 1,123 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
txexport.log
txexport.log.gz
.idea
6 changes: 6 additions & 0 deletions extractor/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
.idea
logs
extracted
.env
testdir
test
23 changes: 23 additions & 0 deletions extractor/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Gno Source Code Extractor

This tool is a simple parser to extract source code (packages & realms) from logs created by the [tx-archive](https://github.com/gnolang/tx-archive) tool for Gno chains.

## Running the extractor

The extractor takes in three arguments:
- the filetype of the archive files,
- output directory for the extracted packages,
- the root directory where the archive files are located.

```
USAGE
[flags]
The Gno / TM2 source code extractor service
FLAGS
-file-type .jsonl the file type for analysis, with a preceding period (ie .jsonl)
-output-dir ./extracted the output directory for the extracted Gno source code
-source-dir . the root folder containing transaction data
```

47 changes: 47 additions & 0 deletions extractor/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
module extractor

go 1.20

require (
github.com/gnolang/gno v0.0.0-20231006162410-fa8eb7753dc5
github.com/go-test/deep v1.1.0
github.com/peterbourgon/ff/v3 v3.4.0
github.com/stretchr/testify v1.8.4
golang.org/x/sync v0.4.0
)

require (
github.com/btcsuite/btcd v0.22.0-beta.0.20220111032746-97732e52810c // indirect
github.com/btcsuite/btcd/btcutil v1.0.0 // indirect
github.com/cespare/xxhash v1.1.0 // indirect
github.com/cespare/xxhash/v2 v2.1.1 // indirect
github.com/cockroachdb/apd v1.1.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/dgraph-io/badger/v3 v3.2103.4 // indirect
github.com/dgraph-io/ristretto v0.1.1 // indirect
github.com/dustin/go-humanize v1.0.0 // indirect
github.com/gnolang/goleveldb v0.0.9 // indirect
github.com/gnolang/overflow v0.0.0-20170615021017-4d914c927216 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b // indirect
github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6 // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/google/flatbuffers v1.12.1 // indirect
github.com/jmhodges/levigo v1.0.0 // indirect
github.com/klauspost/compress v1.12.3 // indirect
github.com/linxGnu/grocksdb v1.8.4 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c // indirect
go.etcd.io/bbolt v1.3.7 // indirect
go.opencensus.io v0.22.5 // indirect
go.uber.org/atomic v1.7.0 // indirect
go.uber.org/multierr v1.9.0 // indirect
golang.org/x/crypto v0.13.0 // indirect
golang.org/x/net v0.15.0 // indirect
golang.org/x/sys v0.12.0 // indirect
golang.org/x/tools v0.6.0 // indirect
google.golang.org/protobuf v1.31.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
226 changes: 226 additions & 0 deletions extractor/go.sum

Large diffs are not rendered by default.

311 changes: 311 additions & 0 deletions extractor/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,311 @@
package main

import (
"bufio"
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"github.com/gnolang/gno/gno.land/pkg/sdk/vm"
"github.com/gnolang/gno/tm2/pkg/amino"
"github.com/gnolang/gno/tm2/pkg/std"
"github.com/peterbourgon/ff/v3/ffcli"
"golang.org/x/sync/errgroup"
"io"
"os"
"path/filepath"
"strings"
)

// packageMetadataFile is the name of the JSON file that holds the package
// metadata; it is written into each extracted package directory.
const packageMetadataFile = "pkg_metadata.json"

// Sentinel errors returned by the extractor
var (
	// errInvalidSourceDir is returned when the source directory is empty
	errInvalidSourceDir = errors.New("invalid source directory")

	// errInvalidOutputDir is returned when the output directory is empty
	errInvalidOutputDir = errors.New("invalid output directory")

	// errInvalidFileType is returned when the file type is empty
	errInvalidFileType = errors.New("no file type specified")

	// errNoSourceFilesFound is returned when no file of the requested
	// type exists under the source directory
	errNoSourceFilesFound = errors.New("no source files found, exiting")
)

// extractorCfg holds the values of the extractor command line flags
type extractorCfg struct {
	// sourceDir is the root folder that is searched for source files
	sourceDir string

	// outputDir is the folder the extracted packages are written to
	outputDir string

	// fileType is the file name suffix source files must end with
	fileType string
}

// main builds the extractor command from the command line flags and runs it.
// Any error is printed to stderr and the process exits with status 1.
func main() {
	config := &extractorCfg{}
	flagSet := flag.NewFlagSet("root", flag.ExitOnError)

	// Bind the config fields to the flag set
	config.registerFlags(flagSet)

	// The command body only needs the parsed config
	run := func(ctx context.Context, _ []string) error {
		return execExtract(ctx, config)
	}

	command := &ffcli.Command{
		ShortUsage: "[flags]",
		LongHelp:   "The Gno / TM2 source code extractor service",
		FlagSet:    flagSet,
		Exec:       run,
	}

	// Parse the arguments and execute the command
	runErr := command.ParseAndRun(context.Background(), os.Args[1:])
	if runErr == nil {
		return
	}

	fmt.Fprintf(os.Stderr, "%+v", runErr)

	os.Exit(1)
}

// registerFlags binds the extractor config fields to string flags
// on the given flag set, together with their default values
func (c *extractorCfg) registerFlags(fs *flag.FlagSet) {
	// One entry per flag: destination, flag name, default value, usage text
	stringFlags := []struct {
		dest  *string
		name  string
		value string
		usage string
	}{
		{
			dest:  &c.fileType,
			name:  "file-type",
			value: ".jsonl",
			usage: "the file type for analysis, with a preceding period (ie .jsonl)",
		},
		{
			dest:  &c.sourceDir,
			name:  "source-dir",
			value: ".",
			usage: "the root folder containing transaction data",
		},
		{
			dest:  &c.outputDir,
			name:  "output-dir",
			value: "./extracted",
			usage: "the output directory for the extracted Gno source code",
		},
	}

	for _, f := range stringFlags {
		fs.StringVar(f.dest, f.name, f.value, f.usage)
	}
}

// execExtract runs the extract service for Gno source code.
//
// It validates the config, gathers every file under cfg.sourceDir whose name
// ends with cfg.fileType, and processes the files concurrently (one goroutine
// per file). For each add-package message found, the package files and a
// metadata file are written to a directory below cfg.outputDir that mirrors
// the package path without its "gno.land/" prefix.
//
// It returns one of the errInvalid* sentinels for an empty config value,
// errNoSourceFilesFound when nothing matches, or the first error reported
// by any worker.
func execExtract(ctx context.Context, cfg *extractorCfg) error {
	// Check the file type is valid
	if cfg.fileType == "" {
		return errInvalidFileType
	}

	// Check the source dir is valid
	if cfg.sourceDir == "" {
		return errInvalidSourceDir
	}

	// Check the output dir is valid
	if cfg.outputDir == "" {
		return errInvalidOutputDir
	}

	// Find the files that need to be analyzed
	sourceFiles, findErr := findFilePaths(cfg.sourceDir, cfg.fileType)
	if findErr != nil {
		return fmt.Errorf("unable to find file paths, %w", findErr)
	}

	if len(sourceFiles) == 0 {
		return errNoSourceFilesFound
	}

	// Concurrently process the source files
	g, ctx := errgroup.WithContext(ctx)

	for _, sourceFile := range sourceFiles {
		sourceFile := sourceFile

		g.Go(func() error {
			// Extract messages
			msgs, processErr := extractAddMessages(sourceFile)
			if processErr != nil {
				return processErr
			}

			// Process messages
			for _, msg := range msgs {
				// Stop early if the context was cancelled, which happens
				// as soon as any other worker returns an error
				if ctxErr := ctx.Err(); ctxErr != nil {
					return ctxErr
				}

				// TrimPrefix removes the literal domain prefix only.
				// TrimLeft would treat "gno.land/" as a set of characters
				// and could also strip the start of the actual path
				pkgPath := strings.TrimPrefix(msg.Package.Path, "gno.land/")
				outputDir := filepath.Join(cfg.outputDir, pkgPath)

				// Write dir before writing files
				if dirWriteErr := os.MkdirAll(outputDir, os.ModePerm); dirWriteErr != nil {
					return fmt.Errorf("unable to write dir, %w", dirWriteErr)
				}

				// Write the package source code
				if writeErr := writePackageFiles(msg, outputDir); writeErr != nil {
					return writeErr
				}

				// Write the package metadata
				if writeErr := writePackageMetadata(metadataFromMsg(msg), outputDir); writeErr != nil {
					return writeErr
				}
			}

			return nil
		})
	}

	return g.Wait()
}

// writePackageFiles stores every file of the package carried by msg
// in outputDir, using the file's own name. It stops at the first
// write failure and returns it wrapped with the file name.
//
// NOTE(review): file names are joined to outputDir unchecked; if they can
// contain path separators the write may land outside outputDir - confirm
// whether names are validated upstream.
func writePackageFiles(msg vm.MsgAddPackage, outputDir string) error {
	files := msg.Package.Files

	for i := range files {
		name := files[i].Name
		target := filepath.Join(outputDir, name)

		err := os.WriteFile(target, []byte(files[i].Body), 0644)
		if err != nil {
			return fmt.Errorf("unable to write file %s, %w", name, err)
		}
	}

	return nil
}

// writePackageMetadata JSON-encodes the metadata and stores it in
// outputDir under the packageMetadataFile name
func writePackageMetadata(metadata Metadata, outputDir string) error {
	// Encode first, so nothing is written when marshalling fails
	encoded, err := json.Marshal(metadata)
	if err != nil {
		return fmt.Errorf("unable to JSON marshal metadata, %w", err)
	}

	target := filepath.Join(outputDir, packageMetadataFile)

	if err = os.WriteFile(target, encoded, 0644); err != nil {
		return fmt.Errorf("unable to write package metadata, %w", err)
	}

	return nil
}

// extractAddMessages reads the file at filePath line by line, decodes each
// line as an amino JSON encoded std.Tx and returns the vm.MsgAddPackage
// messages it contains. Only the first message seen for a given package
// path is kept.
//
// Lines that cannot be decoded are reported on stderr and skipped; blank
// lines are skipped silently. An error is returned when the file cannot be
// opened, read or closed, when an "add_package" message has an unexpected
// type, or when its package is nil.
func extractAddMessages(filePath string) (msgs []vm.MsgAddPackage, err error) {
	file, err := os.Open(filePath)
	if err != nil {
		return nil, fmt.Errorf("unable to open file, %w", err)
	}

	// Close the file on every return path; surface the close error
	// only when there is no earlier error to report
	defer func() {
		if closeErr := file.Close(); closeErr != nil && err == nil {
			err = fmt.Errorf("unable to gracefully close file, %w", closeErr)
		}
	}()

	reader := bufio.NewReader(file)

	// Used to track which package paths were already collected
	seen := make(map[string]struct{})

	// Msg array to be returned for further processing
	msgs = make([]vm.MsgAddPackage, 0)

	// Accumulates the parts of a line that does not fit the reader buffer
	var pending []byte

	for {
		chunk, isPrefix, readErr := reader.ReadLine()

		// Exit if no more lines in file
		if errors.Is(readErr, io.EOF) {
			break
		}

		if readErr != nil {
			return nil, fmt.Errorf("error reading lines; %w", readErr)
		}

		// If line is too long, save the part in the buffer and continue reading
		if isPrefix {
			pending = append(pending, chunk...)

			continue
		}

		line := chunk

		if len(pending) != 0 {
			// Append last part of the long line to the saved parts
			line = append(pending, chunk...)

			// Reset before decoding, so a failed decode cannot leak
			// this line into the following ones
			pending = nil
		}

		// Nothing to decode
		if len(line) == 0 {
			continue
		}

		var tx std.Tx

		if unmarshalErr := amino.UnmarshalJSON(line, &tx); unmarshalErr != nil {
			// Best effort: report the undecodable line and move on
			fmt.Fprintf(os.Stderr, "error while parsing amino JSON in %s: %v\n", filePath, unmarshalErr)

			continue
		}

		for _, msg := range tx.Msgs {
			// Only MsgAddPkg should be parsed
			if msg.Type() != "add_package" {
				continue
			}

			msgAddPkg, ok := msg.(vm.MsgAddPackage)
			if !ok {
				return nil, errors.New("could not cast into MsgAddPackage")
			}

			if msgAddPkg.Package == nil {
				return nil, errors.New("MsgAddPackage is nil")
			}

			path := msgAddPkg.Package.Path

			if _, parsed := seen[path]; parsed {
				// Package already parsed
				continue
			}

			seen[path] = struct{}{}
			msgs = append(msgs, msgAddPkg)
		}
	}

	return msgs, nil
}

// findFilePaths walks startPath recursively and returns the paths of all
// non-directory entries whose name ends with fileType. Any error met while
// walking aborts the search and is returned wrapped.
func findFilePaths(startPath string, fileType string) ([]string, error) {
	matches := []string{}

	visit := func(path string, info os.FileInfo, err error) error {
		switch {
		case err != nil:
			return fmt.Errorf("error accessing file: %w", err)
		case info.IsDir(), !strings.HasSuffix(info.Name(), fileType):
			// Directories and files of another type are not collected
			return nil
		}

		matches = append(matches, path)

		return nil
	}

	// Walk the directory root recursively
	walkErr := filepath.Walk(startPath, visit)
	if walkErr != nil {
		return nil, fmt.Errorf("unable to walk directory, %w", walkErr)
	}

	return matches, nil
}
Loading

0 comments on commit ca7cf22

Please sign in to comment.