Skip to content

Commit

Permalink
Make mso detection work similar to what file/file does
Browse files Browse the repository at this point in the history
https://github.com/file/file/blob/7c62d696b06e53fc5be015c41a57513278ac6c54/magic/Magdir/msooxml
The algorithm is not 100% reliable. For example, a
zero compression zip containing a docx will still sometimes be detected
as docx instead of zip (it depends on how many files and the order of
files in the zip)

The second change in this PR is the removal of some test data fixtures.
From now on, I'll try as much as possible to write regular unit tests
without relying on test file fixtures. #575 (comment)
related #550 #575
closes #400
  • Loading branch information
gabriel-vasile committed Oct 10, 2024
1 parent c78cb11 commit c6c5e4f
Show file tree
Hide file tree
Showing 8 changed files with 204 additions and 79 deletions.
10 changes: 10 additions & 0 deletions internal/magic/magic.go
Original file line number Diff line number Diff line change
Expand Up @@ -239,3 +239,13 @@ func min(a, b int) int {
}
return b
}

// readBuf is a byte slice that is consumed from the front.
type readBuf []byte

// advance drops the first n bytes of the buffer and reports whether it could
// do so. When n is negative or larger than the number of remaining bytes the
// buffer is left untouched and false is returned.
func (b *readBuf) advance(n int) bool {
	remaining := *b
	if n >= 0 && n <= len(remaining) {
		*b = remaining[n:]
		return true
	}
	return false
}
45 changes: 3 additions & 42 deletions internal/magic/ms_office.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,58 +5,19 @@ import (
"encoding/binary"
)

var (
xlsxSigFiles = [][]byte{
[]byte("xl/worksheets/"),
[]byte("xl/drawings/"),
[]byte("xl/theme/"),
[]byte("xl/_rels/"),
[]byte("xl/styles.xml"),
[]byte("xl/workbook.xml"),
[]byte("xl/sharedStrings.xml"),
}
docxSigFiles = [][]byte{
[]byte("word/media/"),
[]byte("word/_rels/document.xml.rels"),
[]byte("word/document.xml"),
[]byte("word/styles.xml"),
[]byte("word/fontTable.xml"),
[]byte("word/settings.xml"),
[]byte("word/numbering.xml"),
[]byte("word/header"),
[]byte("word/footer"),
}
pptxSigFiles = [][]byte{
[]byte("ppt/slides/"),
[]byte("ppt/media/"),
[]byte("ppt/slideLayouts/"),
[]byte("ppt/theme/"),
[]byte("ppt/slideMasters/"),
[]byte("ppt/tags/"),
[]byte("ppt/notesMasters/"),
[]byte("ppt/_rels/"),
[]byte("ppt/handoutMasters/"),
[]byte("ppt/notesSlides/"),
[]byte("ppt/presentation.xml"),
[]byte("ppt/tableStyles.xml"),
[]byte("ppt/presProps.xml"),
[]byte("ppt/viewProps.xml"),
}
)

// Xlsx matches a Microsoft Excel 2007 file.
func Xlsx(raw []byte, limit uint32) bool {
return zipContains(raw, xlsxSigFiles...)
return zipContains(raw, []byte("xl/"), true)
}

// Docx matches a Microsoft Word 2007 file.
func Docx(raw []byte, limit uint32) bool {
return zipContains(raw, docxSigFiles...)
return zipContains(raw, []byte("word/"), true)
}

// Pptx matches a Microsoft PowerPoint 2007 file.
func Pptx(raw []byte, limit uint32) bool {
return zipContains(raw, pptxSigFiles...)
return zipContains(raw, []byte("ppt/"), true)
}

// Ole matches an Open Linking and Embedding file.
Expand Down
89 changes: 55 additions & 34 deletions internal/magic/zip.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,50 +42,71 @@ func Zip(raw []byte, limit uint32) bool {

// Jar matches a Java archive file.
func Jar(raw []byte, limit uint32) bool {
return zipContains(raw, []byte("META-INF/MANIFEST.MF"))
return zipContains(raw, []byte("META-INF/MANIFEST.MF"), false)
}

// zipTokenizer holds the source zip file and scanned index.
type zipTokenizer struct {
in []byte
i int // current index
}
func zipContains(raw, sig []byte, msoCheck bool) bool {
b := readBuf(raw)
pk := []byte("PK\003\004")
if len(b) < 0x1E {
return false
}

// next returns the next file name from the zip headers.
// https://web.archive.org/web/20191129114319/https://users.cs.jmu.edu/buchhofp/forensics/formats/pkzip.html
func (t *zipTokenizer) next() (fileName []byte) {
if t.i > len(t.in) {
return
if !b.advance(0x1E) {
return false
}
in := t.in[t.i:]
// pkSig is the signature of the zip local file header.
pkSig := []byte("PK\003\004")
pkIndex := bytes.Index(in, pkSig)
// 30 is the offset of the file name in the header.
fNameOffset := pkIndex + 30
// end if signature not found or file name offset outside of file.
if pkIndex == -1 || fNameOffset > len(in) {
return
if bytes.HasPrefix(b, sig) {
return true
}

fNameLen := int(binary.LittleEndian.Uint16(in[pkIndex+26 : pkIndex+28]))
if fNameLen <= 0 || fNameOffset+fNameLen > len(in) {
return
}
t.i += fNameOffset + fNameLen
return in[fNameOffset : fNameOffset+fNameLen]
}
if msoCheck {
skipFiles := [][]byte{
[]byte("[Content_Types].xml"),
[]byte("_rels/.rels"),
[]byte("docProps"),
[]byte("customXml"),
[]byte("[trash]"),
}

// zipContains returns true if the zip file headers from in contain any of the paths.
func zipContains(in []byte, paths ...[]byte) bool {
t := zipTokenizer{in: in}
for tok := t.next(); len(tok) != 0; tok = t.next() {
for p := range paths {
if bytes.HasPrefix(tok, paths[p]) {
return true
hasSkipFile := false
for _, sf := range skipFiles {
if bytes.HasPrefix(b, sf) {
hasSkipFile = true
break
}
}
if !hasSkipFile {
return false
}
}

searchOffset := binary.LittleEndian.Uint32(raw[18:]) + 49
if !b.advance(int(searchOffset)) {
return false
}

nextHeader := bytes.Index(raw[searchOffset:], pk)
if !b.advance(nextHeader) {
return false
}
if bytes.HasPrefix(b, sig) {
return true
}

for i := 0; i < 4; i++ {
if !b.advance(0x1A) {
return false
}
nextHeader = bytes.Index(b, pk)
if nextHeader == -1 {
return false
}
if !b.advance(nextHeader + 0x1E) {
return false
}
if bytes.HasPrefix(b, sig) {
return true
}
}
return false
}
136 changes: 136 additions & 0 deletions internal/magic/zip_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
package magic

import (
"archive/zip"
"bytes"
"fmt"
"io"
"testing"
)

// createZip builds an in-memory zip archive holding one entry per name in
// files, in the given order. Nothing is written to the entries, so each one
// is empty. The archive is finalized before it is returned; the error, if
// any, comes from creating an entry or from closing the writer.
func createZip(files []string) (*bytes.Buffer, error) {
	var archive bytes.Buffer
	zw := zip.NewWriter(&archive)

	for i := range files {
		if _, err := zw.Create(files[i]); err != nil {
			return nil, err
		}
	}

	err := zw.Close()
	return &archive, err
}

// createZipUncompressed returns an in-memory zip archive made of five entries
// named file0 to file4, all using the Store method (no compression).
//
// content is copied into each entry in turn. Because the copy reads content
// until it is exhausted, the first entry receives everything that was left in
// content and the following entries are written empty; content itself is
// drained when the function returns.
func createZipUncompressed(content *bytes.Buffer) (*bytes.Buffer, error) {
	const entries = 5

	out := new(bytes.Buffer)
	zw := zip.NewWriter(out)

	for n := 0; n < entries; n++ {
		hdr := zip.FileHeader{
			Name:   fmt.Sprintf("file%d", n),
			Method: zip.Store, // Store keeps the bytes as is, without compression.
		}
		entry, err := zw.CreateHeader(&hdr)
		if err != nil {
			return nil, err
		}
		if _, err = io.Copy(entry, content); err != nil {
			return nil, err
		}
	}

	err := zw.Close()
	return out, err
}

func TestZeroZip(t *testing.T) {
tcases := []struct {
name string
files []string
xlsx bool
docx bool
pptx bool
jar bool
}{{
name: "empty zip",
files: nil,
}, {
name: "no customXml",
files: []string{"foo", "word/"},
}, {
name: "customXml, but no word/",
files: []string{"customXml"},
}, {
name: "customXml, and other files, but no word/",
files: []string{"customXml", "1", "2", "3"},
}, {
name: "customXml, and other files, but word/ is the 7th file", // we only check until 6th file
files: []string{"customXml", "1", "2", "3", "4", "5", "word/"},
}, {
name: "customXml, word/ xl/ pptx/ after 5 files",
files: []string{"1", "2", "3", "4", "5", "customXml", "word/", "xl/", "ppt/"},
}, {
name: "customXml, word/",
files: []string{"customXml", "word/"},
docx: true,
}, {
name: "customXml, word/with_suffix",
files: []string{"customXml", "word/with_suffix"},
docx: true,
}, {
name: "customXml, word/",
files: []string{"customXml", "word/media"},
docx: true,
}, {
name: "customXml, xl/",
files: []string{"customXml", "xl/media"},
xlsx: true,
}, {
name: "customXml, ppt/",
files: []string{"customXml", "ppt/media"},
pptx: true,
}, {
name: "META-INF",
files: []string{"META-INF/MANIFEST.MF"},
jar: true,
}, {
name: "1 2 3 4 5 6 META-INF", // we only check first 6 files
files: []string{"1", "2", "3", "4", "5", "6", "META-INF/MANIFEST.MF"},
jar: false,
}}

for _, tc := range tcases {
t.Run(tc.name, func(t *testing.T) {
buf, err := createZip(tc.files)
if err != nil {
t.Fatal(err)
}

docx := Docx(buf.Bytes(), 0)
xlsx := Xlsx(buf.Bytes(), 0)
pptx := Pptx(buf.Bytes(), 0)
jar := Jar(buf.Bytes(), 0)

if tc.docx != docx || tc.xlsx != xlsx || tc.pptx != pptx || tc.jar != jar {
t.Errorf(`expected %t %t %t %t;
got %t %t %t %t`, tc.docx, tc.xlsx, tc.pptx, tc.jar, docx, xlsx, pptx, jar)
}

// #400 - xlsx, docx, pptx put as is (compression lvl 0) inside a zip
// It should continue to get detected as regular zip, not xlsx or docx or pptx.
uncompressedZip, err := createZipUncompressed(buf)
if err != nil {
t.Fatal(err)
}

docx = Docx(uncompressedZip.Bytes(), 0)
xlsx = Xlsx(uncompressedZip.Bytes(), 0)
pptx = Pptx(uncompressedZip.Bytes(), 0)
jar = Jar(uncompressedZip.Bytes(), 0)

if docx || xlsx || pptx || jar {
t.Errorf(`uncompressedZip: expected false, false, false;
got %t %t %t %t`, docx, xlsx, pptx, jar)
}
})
}
}
3 changes: 0 additions & 3 deletions mimetype_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ var files = map[string]string{
"deb.deb": "application/vnd.debian.binary-package",
"djvu.djvu": "image/vnd.djvu",
"doc.doc": "application/msword",
"docx.1.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"docx.docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"drpm.rpm": "application/x-rpm",
"dwg.1.dwg": "image/vnd.dwg",
Expand Down Expand Up @@ -222,8 +221,6 @@ var files = map[string]string{
"xfdf.xfdf": "application/vnd.adobe.xfdf",
"xlf.xlf": "application/x-xliff+xml",
"xls.xls": "application/vnd.ms-excel",
"xlsx.1.xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"xlsx.2.xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"xlsx.xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"xml.xml": "text/xml; charset=utf-8",
"xml.withbr.xml": "text/xml; charset=utf-8",
Expand Down
Binary file removed testdata/docx.1.docx
Binary file not shown.
Binary file removed testdata/xlsx.1.xlsx
Binary file not shown.
Binary file removed testdata/xlsx.2.xlsx
Binary file not shown.

0 comments on commit c6c5e4f

Please sign in to comment.