Skip to content

Commit

Permalink
Merge pull request #733 from google/magika-go
Browse files Browse the repository at this point in the history
go: add a Go library for Magika
  • Loading branch information
reyammer authored Dec 17, 2024
2 parents 372aa1e + b783f42 commit 43871b1
Show file tree
Hide file tree
Showing 20 changed files with 833 additions and 0 deletions.
17 changes: 17 additions & 0 deletions go/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Go library

This directory contains the Go library for Magika.

The inference relies on the [ONNX Runtime](https://onnxruntime.ai/), and it
requires [cgo](https://go.dev/blog/cgo) for interfacing with the ONNX Runtime
[C API](https://onnxruntime.ai/docs/api/c/).

- [`docker`](./docker) contains a sample docker file that builds a
container image that ties together a Magika CLI, an ONNX Runtime,
and a [model](../assets/models/standard_v2_1).
- [`cli`](./cli) contains a basic CLI that illustrates how
the Magika Go library may be called from within an application.
- [`magika`](./magika) contains the library, which extracts
features from a sequence of bytes.
- [`onnx`](./onnx) wraps the C API of the ONNX Runtime to
provide an inference engine.
50 changes: 50 additions & 0 deletions go/cli/cli.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package main

import (
"bytes"
"fmt"
"io"
"os"

"github.com/google/magika/magika"
)

// Names of the environment variables through which cli receives its
// configuration.
const (
// assetsDirEnv names the variable holding the Magika assets directory.
assetsDirEnv = "MAGIKA_ASSETS_DIR"
// modelNameEnv names the variable holding the name of the model to load.
modelNameEnv = "MAGIKA_MODEL"
)

// cli infers the content type of every file named in args and writes the
// outcome for each of them to w as "<name>: <label>".
//
// The assets directory and the model name are taken from the
// MAGIKA_ASSETS_DIR and MAGIKA_MODEL environment variables respectively. An
// error is returned when either variable is unset or empty, or when the
// scanner cannot be created. A file that cannot be read or scanned has its
// error written to w in place of the label, and processing moves on to the
// next file.
func cli(w io.Writer, args ...string) error {
	assetsDir, modelName := os.Getenv(assetsDirEnv), os.Getenv(modelNameEnv)
	switch {
	case assetsDir == "":
		return fmt.Errorf("%s environment variable not set or empty", assetsDirEnv)
	case modelName == "":
		return fmt.Errorf("%s environment variable not set or empty", modelNameEnv)
	}

	scanner, err := magika.NewScanner(assetsDir, modelName)
	if err != nil {
		return fmt.Errorf("create scanner: %w", err)
	}

	for _, name := range args {
		// The name is written first so that any failure below ends up on the
		// same output line as the file it concerns.
		fmt.Fprintf(w, "%s: ", name)
		if content, err := os.ReadFile(name); err != nil {
			fmt.Fprintf(w, "%v\n", err)
		} else if ct, err := scanner.Scan(bytes.NewReader(content), len(content)); err != nil {
			fmt.Fprintf(w, "scan: %v\n", err)
		} else {
			fmt.Fprintf(w, "%s\n", ct.Label)
		}
	}
	return nil
}
31 changes: 31 additions & 0 deletions go/cli/cli_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
//go:build cgo && onnxruntime

package main

import (
"path"
"strings"
"testing"

"github.com/google/go-cmp/cmp"
)

// TestCLI runs cli over two files from the shared basic test data and checks
// that each one is reported on its own line as "<path>: <label>".
func TestCLI(t *testing.T) {
	const basicDir = "../../tests_data/basic"
	files := []string{
		path.Join(basicDir, "python/code.py"),
		path.Join(basicDir, "zip/magika_test.zip"),
	}
	want := strings.Join([]string{
		"../../tests_data/basic/python/code.py: python",
		"../../tests_data/basic/zip/magika_test.zip: zip",
	}, "\n")

	var b strings.Builder
	if err := cli(&b, files...); err != nil {
		t.Fatal(err)
	}
	got := strings.TrimSpace(b.String())

	// cmp.Diff(x, y) prefixes lines only in x with "-" and lines only in y
	// with "+", so want must be passed first to match the "(-want +got)"
	// header printed below.
	if d := cmp.Diff(want, got); d != "" {
		t.Errorf("mismatch (-want +got):\n%s", d)
	}
}
26 changes: 26 additions & 0 deletions go/cli/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/*
CLI is a simple command line interface for magika.
It takes a list of files as argument, and infers their types in sequence.
For example:
$ magika test.go readme.md
test.go: go
readme.md: markdown
The primary intent is to illustrate how the magika go library can be used
and compiled, using cgo and the ONNX Runtime library.
*/
package main

import (
"fmt"
"os"
)

// main runs the CLI over the command-line arguments, writing the results to
// standard output. If cli returns an error, it is reported on standard error
// and the process exits with status 1.
func main() {
	if err := cli(os.Stdout, os.Args[1:]...); err != nil {
		// Diagnostics go to stderr so that stdout carries only results.
		fmt.Fprintf(os.Stderr, "Error: %v\n", err)
		os.Exit(1)
	}
}
Binary file added go/cli/tests_data/magika_test.zip
Binary file not shown.
3 changes: 3 additions & 0 deletions go/cli/tests_data/magika_test_pptx.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This is a test for Magika!

Very cool if this can be detected correctly!
69 changes: 69 additions & 0 deletions go/docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Sample Dockerfile to build an image that ties together an ONNX Runtime,
# a Magika model, and a Magika CLI.
#
# It expects the root of the repository as build context:
# $ docker build -f go/docker/Dockerfile -t magika-go:latest .
#
# Then, to list the content type of the files in the current directory:
# $ docker run --rm --name magika-go -v $PWD:$PWD:ro -w $PWD magika-go:latest *

# Build stage for ONNX Runtime and magika.
FROM golang:latest AS build

# Work in a clean temp directory.
WORKDIR /tmp

# Download, check, and install ONNX Runtime (https://onnxruntime.ai/) in
# /opt/onnxruntime.
# Releases are located at https://github.com/microsoft/onnxruntime/releases.
# We need the SDK (/include) for compiling, and the library (/lib) for inference.
ARG ONNX_NAME=onnxruntime
ARG ONNX_ARCH=linux-x64
ARG ONNX_VERSION=1.19.2
ARG ONNX_FULLNAME=${ONNX_NAME}-${ONNX_ARCH}-${ONNX_VERSION}
ARG ONNX_TARBALL=${ONNX_FULLNAME}.tgz
# Expected SHA-256 of ONNX_TARBALL. The tarball name is derived from
# ONNX_ARCH and ONNX_VERSION, so this digest must be updated whenever either
# of them changes.
ARG ONNX_DIGEST=eb00c64e0041f719913c4080e0fed7d9963dc3aa9b54664df6036d8308dbcd33

# The tarball is verified against ONNX_DIGEST before it is extracted. The
# symlink gives the install a version-independent path (/opt/onnxruntime) for
# the rest of this file; it assumes the tarball unpacks to a directory named
# ${ONNX_FULLNAME}.
RUN curl -sL -O https://github.com/microsoft/${ONNX_NAME}/releases/download/v${ONNX_VERSION}/${ONNX_TARBALL} \
&& echo "${ONNX_DIGEST} ${ONNX_TARBALL}" > checksum.txt \
&& sha256sum -c checksum.txt \
&& tar -xzf ${ONNX_TARBALL} -C /opt \
&& ln -s /opt/${ONNX_FULLNAME} /opt/onnxruntime

# Retrieve the magika go code from the build context, test, and build the cli.
COPY go go/
COPY tests_data tests_data/
COPY assets/content_types_kb.min.json assets/content_types_kb.min.json
COPY assets/models/standard_v2_1 assets/models/standard_v2_1/

# cgo settings for the RUN steps below: the headers come from the ONNX Runtime
# SDK, and LD_LIBRARY_PATH presumably lets the test binaries locate the shared
# library when they run.
ARG CGO_ENABLED=1
ARG CGO_CFLAGS=-I/opt/onnxruntime/include
ARG LD_LIBRARY_PATH=/opt/onnxruntime/lib

# Run the tests.
# WORKDIR is relative here, so it resolves against /tmp, i.e. /tmp/go.
# MAGIKA_ASSETS_DIR is relative too: for packages directly under go/ (whose
# tests run from the package directory) it points at /tmp/assets copied above.
WORKDIR go
RUN MAGIKA_ASSETS_DIR=../../assets \
MAGIKA_MODEL=standard_v2_1 \
go test -tags onnxruntime -ldflags="-linkmode=external -extldflags=-L/opt/onnxruntime/lib" ./...

# Build the CLI.
# The resulting binary is /tmp/go/cli/cli, which the final stage copies.
WORKDIR cli
RUN go build -tags onnxruntime -ldflags="-linkmode=external -extldflags=-L/opt/onnxruntime/lib" .


# Final stage: copy resources from the build and set environment variables.
FROM debian:latest

# Add the ONNX Runtime.
# Only the lib directory is copied; the headers are needed at build time only.
ENV LD_LIBRARY_PATH=/opt/onnxruntime/lib
COPY --from=build /opt/onnxruntime/lib ${LD_LIBRARY_PATH}

# Magika model.
# These two variables are the ones the CLI reads at startup. The assets are
# copied from the build context, not from the build stage.
ENV MAGIKA_ASSETS_DIR=/opt/magika/assets
ENV MAGIKA_MODEL=standard_v2_1
COPY assets/models/${MAGIKA_MODEL} ${MAGIKA_ASSETS_DIR}/models/${MAGIKA_MODEL}/
COPY assets/content_types_kb.min.json ${MAGIKA_ASSETS_DIR}/content_types_kb.min.json

# Magika CLI.
# The binary built in the build stage is installed under the name "magika".
COPY --from=build /tmp/go/cli/cli /usr/local/bin/magika
ENTRYPOINT ["magika"]
5 changes: 5 additions & 0 deletions go/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
module github.com/google/magika

go 1.22.3

// go-cmp is imported directly by the tests (cli/cli_test.go), so it is a
// direct dependency and must not carry the "indirect" marker.
require github.com/google/go-cmp v0.6.0
2 changes: 2 additions & 0 deletions go/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
62 changes: 62 additions & 0 deletions go/magika/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package magika

import (
"encoding/json"
"fmt"
"os"
"path"
)

// File and directory names that make up the layout of an assets directory.
const (
// configFile is the name of a model's configuration file, inside the
// model's own directory.
configFile = "config.min.json"
// contentTypesKBFile is the name of the content types KB file, directly
// inside the assets directory.
contentTypesKBFile = "content_types_kb.min.json"
// modelFile is the name of a model's ONNX file, inside the model's own
// directory.
modelFile = "model.onnx"
// modelsDir is the sub-directory of the assets directory that holds one
// directory per model.
modelsDir = "models"
)

// Config holds the portion of Magika's model configuration that is relevant
// for inference.
//
// Every field is decoded from the JSON key named in its tag; ReadConfig fills
// a Config from a model's configuration file.
//
// NOTE(review): the per-field descriptions below are inferred from the field
// names only and should be confirmed against the feature extraction and
// inference code.
type Config struct {
// BegSize, MidSize and EndSize presumably give the sizes of the features
// taken from the beginning, middle and end of the content.
BegSize int `json:"beg_size"`
MidSize int `json:"mid_size"`
EndSize int `json:"end_size"`
UseInputsAtOffsets bool `json:"use_inputs_at_offsets"`
// MediumConfidenceThreshold is presumably a fallback score threshold for
// labels that have no entry in Thresholds.
MediumConfidenceThreshold float32 `json:"medium_confidence_threshold"`
// MinFileSizeForDl is presumably the smallest content size for which the
// model is used at all.
MinFileSizeForDl int64 `json:"min_file_size_for_dl"`
// PaddingToken is presumably the value used to fill features when the
// content is shorter than the requested sizes.
PaddingToken int `json:"padding_token"`
BlockSize int `json:"block_size"`
// TargetLabelsSpace presumably lists the labels in the order of the
// model's output scores.
TargetLabelsSpace []string `json:"target_labels_space"`
// Thresholds presumably maps a label to its own score threshold.
Thresholds map[string]float32 `json:"thresholds"`
}

// ReadConfig reads and unmarshals the Config of the model with the given
// name, located under the given assets dir.
//
// It returns the zero Config and a wrapped error if the configuration file
// cannot be read, or if its content cannot be unmarshaled into a Config. Both
// errors name the path of the file.
func ReadConfig(assetsDir, name string) (Config, error) {
	p := configPath(assetsDir, name)
	b, err := os.ReadFile(p)
	if err != nil {
		return Config{}, fmt.Errorf("read %q: %w", p, err)
	}
	var cfg Config
	if err := json.Unmarshal(b, &cfg); err != nil {
		// Name the file: this package unmarshals more than one JSON asset,
		// and a bare "unmarshal" error would not say which one is malformed.
		return Config{}, fmt.Errorf("unmarshal %q: %w", p, err)
	}
	return cfg, nil
}

// contentTypesKBPath builds the path of the content types KB file, which
// sits directly inside assetDir.
func contentTypesKBPath(assetDir string) string {
	kbPath := path.Join(assetDir, contentTypesKBFile)
	return kbPath
}

// configPath builds the path of the configuration file of the model called
// name, that is <assetDir>/models/<name>/config.min.json.
func configPath(assetDir, name string) string {
	modelDir := path.Join(assetDir, modelsDir, name)
	return path.Join(modelDir, configFile)
}

// modelPath builds the path of the ONNX file of the model called name, that
// is <assetDir>/models/<name>/model.onnx.
func modelPath(assetDir, name string) string {
	modelDir := path.Join(assetDir, modelsDir, name)
	return path.Join(modelDir, modelFile)
}
44 changes: 44 additions & 0 deletions go/magika/content.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package magika

import (
"encoding/json"
"fmt"
"os"
)

// Labels of content types that this package refers to by name.
// NOTE(review): their uses are outside this view; presumably they are the
// results returned without, or instead of, a model prediction — confirm.
const (
contentTypeLabelEmpty = "empty"
contentTypeLabelTxt = "txt"
contentTypeLabelUnknown = "unknown"
)

// ContentType holds the definition of a content type.
//
// All fields except Label are decoded from the JSON keys named in their tags.
type ContentType struct {
// Label has no JSON tag: readContentTypesKB sets it to the key under which
// the entry is stored in the content types KB.
Label string // As keyed in the content types KB.
MimeType string `json:"mime_type"`
Group string `json:"group"`
Description string `json:"description"`
Extensions []string `json:"extensions"`
IsText bool `json:"is_text"`
}

// readContentTypesKB reads and unmarshals the content types KB found in the
// given assets dir.
//
// It returns a dictionary that maps a label, as defined in the model config
// target label space, to a content type whose Label field is set to that same
// label. It returns a nil map and a wrapped error if the KB file cannot be
// read or cannot be unmarshaled. Both errors name the path of the file.
func readContentTypesKB(assetsDir string) (map[string]ContentType, error) {
	p := contentTypesKBPath(assetsDir)
	b, err := os.ReadFile(p)
	if err != nil {
		return nil, fmt.Errorf("read %q: %w", p, err)
	}
	var ckb map[string]ContentType
	if err := json.Unmarshal(b, &ckb); err != nil {
		// Name the file: this package unmarshals more than one JSON asset,
		// and a bare "unmarshal" error would not say which one is malformed.
		return nil, fmt.Errorf("unmarshal %q: %w", p, err)
	}
	// The label is only present as the map key in the KB, so copy it into
	// each entry. Map values are not addressable, hence the write-back.
	for label, ct := range ckb {
		ct.Label = label
		ckb[label] = ct
	}
	return ckb, nil
}
Loading

0 comments on commit 43871b1

Please sign in to comment.