-
Notifications
You must be signed in to change notification settings - Fork 431
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #733 from google/magika-go
go: add a Go library for Magika
- Loading branch information
Showing
20 changed files
with
833 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Go library | ||
|
||
This directory contains the Go library for Magika. | ||
|
||
The inference relies on the [ONNX Runtime](https://onnxruntime.ai/), and it | ||
requires [cgo](https://go.dev/blog/cgo) for interfacing with the ONNX Runtime | ||
[C API](https://onnxruntime.ai/docs/api/c/). | ||
|
||
- [`docker`](./docker) contains a sample docker file that builds a | ||
container image that ties together a Magika CLI, an ONNX Runtime, | ||
and a [model](../assets/models/standard_v2_1). | ||
- [`cli`](./cli) contains a basic CLI that illustrates how | ||
the Magika Go library may be called from within an application. | ||
- [`magika`](./magika) contains the library that extracts | ||
features from a sequence of bytes. | ||
- [`onnx`](./onnx) wraps the C API of the ONNX Runtime to | ||
provide an inference engine. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
package main | ||
|
||
import ( | ||
"bytes" | ||
"fmt" | ||
"io" | ||
"os" | ||
|
||
"github.com/google/magika/magika" | ||
) | ||
|
||
const ( | ||
assetsDirEnv = "MAGIKA_ASSETS_DIR" | ||
modelNameEnv = "MAGIKA_MODEL" | ||
) | ||
|
||
// cli is a basic CLI that infers the content type of the files listed on | ||
// the command line. The assets dir and the model name are given via the | ||
// environment variable MAGIKA_ASSETS_DIR and MAGIKA_MODEL respectively. | ||
func cli(w io.Writer, args ...string) error { | ||
assetsDir := os.Getenv(assetsDirEnv) | ||
if assetsDir == "" { | ||
return fmt.Errorf("%s environment variable not set or empty", assetsDirEnv) | ||
} | ||
modelName := os.Getenv(modelNameEnv) | ||
if modelName == "" { | ||
return fmt.Errorf("%s environment variable not set or empty", modelNameEnv) | ||
} | ||
s, err := magika.NewScanner(assetsDir, modelName) | ||
if err != nil { | ||
return fmt.Errorf("create scanner: %w", err) | ||
} | ||
|
||
// For each filename given as argument, read the file and scan its content. | ||
for _, a := range args { | ||
fmt.Fprintf(w, "%s: ", a) | ||
b, err := os.ReadFile(a) | ||
if err != nil { | ||
fmt.Fprintf(w, "%v\n", err) | ||
continue | ||
} | ||
ct, err := s.Scan(bytes.NewReader(b), len(b)) | ||
if err != nil { | ||
fmt.Fprintf(w, "scan: %v\n", err) | ||
continue | ||
} | ||
fmt.Fprintf(w, "%s\n", ct.Label) | ||
} | ||
return nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
//go:build cgo && onnxruntime | ||
|
||
package main | ||
|
||
import ( | ||
"path" | ||
"strings" | ||
"testing" | ||
|
||
"github.com/google/go-cmp/cmp" | ||
) | ||
|
||
func TestCLI(t *testing.T) { | ||
const basicDir = "../../tests_data/basic" | ||
var ( | ||
files = []string{ | ||
path.Join(basicDir, "python/code.py"), | ||
path.Join(basicDir, "zip/magika_test.zip"), | ||
} | ||
b strings.Builder | ||
) | ||
if err := cli(&b, files...); err != nil { | ||
t.Fatal(err) | ||
} | ||
if d := cmp.Diff(strings.TrimSpace(b.String()), strings.Join([]string{ | ||
"../../tests_data/basic/python/code.py: python", | ||
"../../tests_data/basic/zip/magika_test.zip: zip", | ||
}, "\n")); d != "" { | ||
t.Errorf("mismatch (-want +got):\n%s", d) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
/* | ||
CLI is a simple command line interface for magika. | ||
It takes a list of files as argument, and infers their types in sequence. | ||
For example: | ||
$ magika test.go readme.md | ||
test.go: go | ||
readme.md: markdown | ||
The primary intent is to illustrate how the magika go library can be used | ||
and compiled, using cgo and the ONNX Runtime library. | ||
*/ | ||
package main | ||
|
||
import ( | ||
"fmt" | ||
"os" | ||
) | ||
|
||
func main() { | ||
if err := cli(os.Stdout, os.Args[1:]...); err != nil { | ||
fmt.Printf("Error: %v\n", err) | ||
os.Exit(1) | ||
} | ||
} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
This is a test for Magika! | ||
|
||
Very cool if this can be detected correctly! |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# Sample Dockerfile to build an image that ties together an ONNX Runtime,
# a Magika model, and a Magika CLI.
#
# It expects the root of the repository as build context:
# $ docker build -f go/docker/Dockerfile -t magika-go:latest .
#
# Then, to list the content type of the files in the current directory:
# $ docker run --rm --name magika-go -v $PWD:$PWD:ro -w $PWD magika-go:latest *

# Build stage for ONNX Runtime and magika.
FROM golang:latest AS build

# Work in a clean temp directory.
WORKDIR /tmp

# Download, check, and install ONNX Runtime (https://onnxruntime.ai/) in
# /opt/onnxruntime.
# Releases are located at https://github.com/microsoft/onnxruntime/releases.
# We need the SDK (/include) for compiling, and the library (/lib) for inference.
ARG ONNX_NAME=onnxruntime
ARG ONNX_ARCH=linux-x64
ARG ONNX_VERSION=1.19.2
ARG ONNX_FULLNAME=${ONNX_NAME}-${ONNX_ARCH}-${ONNX_VERSION}
ARG ONNX_TARBALL=${ONNX_FULLNAME}.tgz
ARG ONNX_DIGEST=eb00c64e0041f719913c4080e0fed7d9963dc3aa9b54664df6036d8308dbcd33

# curl flags: -f makes an HTTP error fail the build right at the download
# step (instead of saving the error page as the tarball), -sS stays quiet
# but still prints errors, -L follows the release redirect.
# The checksum line uses the "<digest>  <file>" layout read by sha256sum -c.
RUN curl -fsSL -O https://github.com/microsoft/${ONNX_NAME}/releases/download/v${ONNX_VERSION}/${ONNX_TARBALL} \
    && echo "${ONNX_DIGEST}  ${ONNX_TARBALL}" > checksum.txt \
    && sha256sum -c checksum.txt \
    && tar -xzf ${ONNX_TARBALL} -C /opt \
    && ln -s /opt/${ONNX_FULLNAME} /opt/onnxruntime

# Retrieve the magika go code from the build context, test, and build the cli.
COPY go go/
COPY tests_data tests_data/
COPY assets/content_types_kb.min.json assets/content_types_kb.min.json
COPY assets/models/standard_v2_1 assets/models/standard_v2_1/

ARG CGO_ENABLED=1
ARG CGO_CFLAGS=-I/opt/onnxruntime/include
ARG LD_LIBRARY_PATH=/opt/onnxruntime/lib

# Run the tests.
WORKDIR go
RUN MAGIKA_ASSETS_DIR=../../assets \
    MAGIKA_MODEL=standard_v2_1 \
    go test -tags onnxruntime -ldflags="-linkmode=external -extldflags=-L/opt/onnxruntime/lib" ./...

# Build the CLI.
WORKDIR cli
RUN go build -tags onnxruntime -ldflags="-linkmode=external -extldflags=-L/opt/onnxruntime/lib" .


# Final stage: copy resources from the build and set environment variables.
FROM debian:latest

# Add the ONNX Runtime.
ENV LD_LIBRARY_PATH=/opt/onnxruntime/lib
COPY --from=build /opt/onnxruntime/lib ${LD_LIBRARY_PATH}

# Magika model.
ENV MAGIKA_ASSETS_DIR=/opt/magika/assets
ENV MAGIKA_MODEL=standard_v2_1
COPY assets/models/${MAGIKA_MODEL} ${MAGIKA_ASSETS_DIR}/models/${MAGIKA_MODEL}/
COPY assets/content_types_kb.min.json ${MAGIKA_ASSETS_DIR}/content_types_kb.min.json

# Magika CLI.
COPY --from=build /tmp/go/cli/cli /usr/local/bin/magika
ENTRYPOINT ["magika"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
module github.com/google/magika | ||
|
||
go 1.22.3 | ||
|
||
require github.com/google/go-cmp v0.6.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= | ||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
package magika | ||
|
||
import ( | ||
"encoding/json" | ||
"fmt" | ||
"os" | ||
"path" | ||
) | ||
|
||
const ( | ||
configFile = "config.min.json" | ||
contentTypesKBFile = "content_types_kb.min.json" | ||
modelFile = "model.onnx" | ||
modelsDir = "models" | ||
) | ||
|
||
// Config is the subset of a Magika model configuration that inference
// needs. Every field is decoded from the JSON key named in its tag.
type Config struct {
	// Sizes of the beginning, middle and end parts of the input.
	// NOTE(review): presumably the number of features taken from each part;
	// confirm against the feature extraction code.
	BegSize int `json:"beg_size"`
	MidSize int `json:"mid_size"`
	EndSize int `json:"end_size"`

	UseInputsAtOffsets bool `json:"use_inputs_at_offsets"`

	// Confidence and size thresholds.
	// NOTE(review): how these gate the model output is decided by the
	// scanner, which is not visible from this file.
	MediumConfidenceThreshold float32 `json:"medium_confidence_threshold"`
	MinFileSizeForDl          int64   `json:"min_file_size_for_dl"`

	PaddingToken int `json:"padding_token"`
	BlockSize    int `json:"block_size"`

	// TargetLabelsSpace lists the labels of the model's target space.
	TargetLabelsSpace []string `json:"target_labels_space"`
	// Thresholds maps a key to a float threshold.
	// NOTE(review): keys are presumably content type labels; confirm.
	Thresholds map[string]float32 `json:"thresholds"`
}
|
||
// ReadConfig is a helper that reads and unmarshal a Config, given an assets | ||
// dir and a model name. | ||
func ReadConfig(assetsDir, name string) (Config, error) { | ||
var cfg Config | ||
p := configPath(assetsDir, name) | ||
b, err := os.ReadFile(p) | ||
if err != nil { | ||
return Config{}, fmt.Errorf("read %q: %w", p, err) | ||
} | ||
if err := json.Unmarshal(b, &cfg); err != nil { | ||
return Config{}, fmt.Errorf("unmarshal: %w", err) | ||
} | ||
return cfg, nil | ||
} | ||
|
||
// contentTypesKBPath returns the content types KB path for the given | ||
// asset folder. | ||
func contentTypesKBPath(assetDir string) string { | ||
return path.Join(assetDir, contentTypesKBFile) | ||
} | ||
|
||
// configPath returns the model config for the given asset folder and model | ||
// name. | ||
func configPath(assetDir, name string) string { | ||
return path.Join(assetDir, modelsDir, name, configFile) | ||
} | ||
|
||
// modelPath returns the Onnx model for the given asset folder and model name. | ||
func modelPath(assetDir, name string) string { | ||
return path.Join(assetDir, modelsDir, name, modelFile) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
package magika | ||
|
||
import ( | ||
"encoding/json" | ||
"fmt" | ||
"os" | ||
) | ||
|
||
const ( | ||
contentTypeLabelEmpty = "empty" | ||
contentTypeLabelTxt = "txt" | ||
contentTypeLabelUnknown = "unknown" | ||
) | ||
|
||
// ContentType describes one content type of the content types KB.
type ContentType struct {
	// Label is the key under which the content type is stored in the
	// content types KB.
	Label string

	// The remaining fields are decoded from the KB entry, each from the
	// JSON key named in its tag.
	MimeType    string   `json:"mime_type"`
	Group       string   `json:"group"`
	Description string   `json:"description"`
	Extensions  []string `json:"extensions"`
	IsText      bool     `json:"is_text"`
}
|
||
// readContentTypesKB is a helper that reads and unmarshal a content types KB, | ||
// given the assets dir. | ||
// It returns a dictionary that maps a label as defined in the model config | ||
// target label space to a content type. | ||
func readContentTypesKB(assetsDir string) (map[string]ContentType, error) { | ||
var ckb map[string]ContentType | ||
p := contentTypesKBPath(assetsDir) | ||
b, err := os.ReadFile(p) | ||
if err != nil { | ||
return nil, fmt.Errorf("read %q: %w", p, err) | ||
} | ||
if err := json.Unmarshal(b, &ckb); err != nil { | ||
return nil, fmt.Errorf("unmarshal: %w", err) | ||
} | ||
for label, ct := range ckb { | ||
ct.Label = label | ||
ckb[label] = ct | ||
} | ||
return ckb, nil | ||
} |
Oops, something went wrong.