-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilehandler.go
149 lines (130 loc) · 3.91 KB
/
filehandler.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
package main
import (
"bytes"
"errors"
"fmt"
"net/http"
"os"
"regexp"
"strings"
"time"
"github.com/c2h5oh/datasize"
"github.com/ledongthuc/pdf"
log "github.com/sirupsen/logrus"
)
// File describes one file on disk together with the metadata and derived
// content that the worker collects about it.
type File struct {
// Name is the file name; CheckType matches it against the configured
// PDF file-name filter.
Name string
// Path is the location that GetContentType and ReadPdf open.
Path string
// Size is the file size; ReadPdf refuses files above its size limit.
Size datasize.ByteSize
// ModTime is presumably the file's last modification time; it is not used
// in this file. TODO confirm against the code that fills it.
ModTime time.Time
// Locked is not read or written in this file.
// NOTE(review): meaning is defined by the caller, confirm there.
Locked bool
// ContentType is the sniffed MIME type, filled by GetContentType.
ContentType string
// ContentPDF is the plain text extracted by ReadPdf and searched by
// DetectTags.
ContentPDF string
// Tags collects the tags appended by DetectTags.
Tags []string
}
// GetContentType sniffs the MIME type of the file at f.Path and stores it in
// f.ContentType.
//
// At most the first 512 bytes are read, because that is all
// http.DetectContentType inspects. Only the bytes that were actually read are
// handed to the detector: passing the zero padding of a partially filled
// buffer would make short text files look like binary data.
//
// It returns the error from opening or reading the file. An empty file yields
// the read error (end of file) and leaves f.ContentType untouched.
func (f *File) GetContentType() error {
	openFile, err := os.Open(f.Path)
	if err != nil {
		log.Errorln("could not open file: ", err)
		return err
	}
	defer openFile.Close()

	// Only the first 512 bytes are used to sniff the content type.
	buffer := make([]byte, 512)
	n, err := openFile.Read(buffer)
	if err != nil {
		log.Errorln("could not read file: ", err)
		return err
	}

	// DetectContentType always returns a valid content type; it falls back to
	// "application/octet-stream" when nothing more specific matches.
	f.ContentType = http.DetectContentType(buffer[:n])
	return nil
}
// CheckType reports whether the file matches the requested type.
//
// expectedType is compared case-insensitively; the only type handled is "pdf".
// For "pdf" both the sniffed content type and the file name must match the
// regular expressions configured in Conf.FileHandling.FileTypePDF.
//
// If f.ContentType is still empty it is determined via GetContentType first.
//
// matched is true only when every check succeeds. reason is non-empty only
// when the check could not be carried out: the content type could not be
// obtained, or a configured filter is not a valid regular expression. It is
// empty for a plain match or mismatch and for unknown types.
func (f *File) CheckType(expectedType string) (matched bool, reason string) {
	expectedType = strings.ToLower(expectedType)

	// Make sure ContentType is set before it is matched.
	if f.ContentType == "" {
		if err := f.GetContentType(); err != nil {
			log.Debugln("somthing went wrong calling GetContentType within checkType")
			return false, "ContentType cloud not be optained"
		}
	}

	switch expectedType {
	case "pdf":
		// regexp.MatchString only fails when the pattern itself cannot be
		// parsed, so an error here is a configuration problem rather than a
		// mismatch.
		typeOK, err := regexp.MatchString(Conf.FileHandling.FileTypePDF.ContentTypeFilter, f.ContentType)
		if err != nil {
			log.Debugln("content type filter for type pdf is not a valid regular expression: ", err)
			return false, "content type filter is not a valid regular expression"
		}
		nameOK, err := regexp.MatchString(Conf.FileHandling.FileTypePDF.FileNameFilter, f.Name)
		if err != nil {
			log.Debugln("file name filter for type pdf is not a valid regular expression: ", err)
			return false, "file name filter is not a valid regular expression"
		}
		// Both the content type and the file name have to match.
		return typeOK && nameOK, ""
	default:
		log.Warningln("func checkType: no typ maches!, bad scenario")
		return false, ""
	}
}
// ReadPdf extracts the plain text of the PDF at f.Path into f.ContentPDF.
//
// It returns an error when the file does not pass CheckType("pdf"), when
// f.Size exceeds 10 MB, when the PDF cannot be opened, or when its text cannot
// be extracted or read. f.ContentPDF is only assigned after the whole text has
// been read successfully.
func (f *File) ReadPdf() error {
	// Guard: this method is a PDF-only feature.
	if result, _ := f.CheckType("pdf"); !result {
		log.Errorln("This should only be called for pdf files")
		return errors.New("this should only be called for pdf files")
	}

	// Refuse files that are too big to be read into memory as text.
	if f.Size.MBytes() > 10.0 {
		return errors.New("to big to read as PFD")
	}

	openFile, r, err := pdf.Open(f.Path)
	// The close is registered before the error check, as in the original
	// ordering, so that a file handle returned alongside an error is still
	// released.
	// NOTE(review): whether pdf.Open can return both is library behavior,
	// confirm there.
	if openFile != nil {
		defer openFile.Close()
	}
	if err != nil {
		log.Errorln(err)
		return err
	}

	b, err := r.GetPlainText()
	if err != nil {
		return err
	}

	var buf bytes.Buffer
	// A failed read must not leave truncated text in f.ContentPDF.
	if _, err := buf.ReadFrom(b); err != nil {
		return err
	}
	f.ContentPDF = buf.String()
	return nil
}
// DetectTags matches the configured tagging rules against f.ContentPDF and
// appends the tags of the first matching rule to f.Tags.
//
// The rules in Conf.Tagging are tried in order. As soon as one rule's
// SearchExpression matches, its Tag and AdditionalTags are appended and the
// search stops; later rules are not evaluated. If no rule matches, f.Tags is
// left unchanged and nil is returned.
//
// It returns an error when the file does not pass CheckType("pdf") or when a
// rule's SearchExpression is not a valid regular expression; the latter error
// wraps the underlying regexp error.
func (f *File) DetectTags() error {
	if result, _ := f.CheckType("pdf"); !result {
		// Tag detection is only defined for PDF content.
		log.Errorln("can not detect tags on a unknown type:", f.Name)
		return errors.New("can not detect tags on a unknown type: " + f.Name)
	}

	// Try the potential tags in order; the first match wins.
	for _, potentialTag := range Conf.Tagging {
		matched, err := regexp.MatchString(potentialTag.SearchExpression, f.ContentPDF)
		if err != nil {
			return fmt.Errorf("detecting tag %q in file %q: %w", potentialTag.Tag, f.Name, err)
		}
		if !matched {
			continue
		}

		log.Debug("the potential tag '" + potentialTag.Tag + "' matches the file '" + f.Name + "'")
		f.Tags = append(f.Tags, potentialTag.Tag)
		f.Tags = append(f.Tags, potentialTag.AdditionalTags...)
		log.Debugf(" adding the following tags : %s, %s", potentialTag.Tag, potentialTag.AdditionalTags)
		return nil
	}
	return nil
}