-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmaybank-pdf-to-csv.groovy
144 lines (115 loc) · 3.95 KB
/
maybank-pdf-to-csv.groovy
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/**
* <h2>Usage examples</h2>
* <code>
* C:\tmp> groovy path/to/maybank-pdf-to-csv.groovy --input.folder path/to/pdf/folder --output.file path/to/result.csv
*
* C:\path\to\pdf\folder> groovy path/to/maybank-pdf-to-csv.groovy --output.file path/to/result.csv
* </code>
*/
@Grab('org.apache.pdfbox:pdfbox:2.0.8')
@Grab('com.opencsv:opencsv:4.1')
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripper
import java.util.regex.Pattern
import com.opencsv.CSVWriter
// ---------------------------------------------------------------------------
// Script entry point: parse the command line, extract the text of every PDF
// in the input folder, collect the transaction records found in that text and
// write them all to a single CSV file.
// ---------------------------------------------------------------------------
def cli = new CliBuilder(usage: "${getClass().simpleName} [options]")
cli.h(longOpt: "help", "Show usage")
cli.i(longOpt: "input.folder", args: 1, argName: "Folder path containing PDF files", "e.g. C:\\tmp\\my-savings (defaults to current folder)")
cli.o(longOpt: "output.file", required: true, args: 1, argName: "File path", "e.g. savings.csv")

def options = cli.parse(args)
if (options == null) {
    // Nothing usable came back from the parser (e.g. the required option is missing).
    return
}
if (options.h) {
    // usage() prints the help text itself; wrapping it in println would add a stray "null".
    cli.usage()
    return
}

File sourceFolder = options.i ? new File(options.i) : new File(".")
File outputFile = new File(options.o)

if (!sourceFolder.directory) {
    System.err.println("[ERROR] Input folder not found or not a directory: ${sourceFolder.canonicalPath}")
    return
}

// listFiles() may return null on an I/O error and gives no ordering guarantee,
// so fall back to an empty list and sort by name to keep the CSV row order stable.
def pdfFiles = (sourceFolder.listFiles() ?: [])
        .findAll { it.isFile() && getExtension(it).equalsIgnoreCase("pdf") }
        .sort { it.name }
if (pdfFiles.empty) {
    System.err.println("[ERROR] No PDF file found in ${sourceFolder.canonicalPath}")
    return
}

// A transaction starts on a line beginning with a dd/dd/dd date.
def txStartPattern = Pattern.compile("^\\d\\d/\\d\\d/\\d\\d.*")
def records = []

pdfFiles.each { pdfFile ->
    def text = PDDocument.load(pdfFile.bytes).withCloseable { new PDFTextStripper().getText(it) }
    // The text stripper terminates lines with the platform line separator, so accept
    // both "\r\n" and "\n"; splitting on "\r\n" alone finds nothing outside Windows.
    def lines = text.split("\r?\n") as List
    def currentRecord = null
    for (line in lines) {
        if (line.matches(txStartPattern)) {
            currentRecord = new TxRecord()
            currentRecord.data = line
            records << currentRecord
            // Progress indicator, rewritten in place on the same console line.
            print "Records found: ${records.size()}\r"
        } else if (line.startsWith(" ") && currentRecord != null) {
            // Lines starting with a space directly after a transaction are extra details of it.
            currentRecord.addInfo(line)
        }
    }
}

def csvFile = outputFile
new CSVWriter(new FileWriter(csvFile)).withCloseable { writer ->
    TxRecord.writeTo(writer, records)
}
println "Successfully converted to CSV at ${csvFile.canonicalPath}"
/**
 * One transaction parsed from the extracted statement text: the date, a free-text
 * action, the amount with its sign moved to the front, the balance, and any
 * number of additional info lines.
 */
class TxRecord {
    String date
    String action
    String signedAmount
    String balance
    List<String> infos = []
    // Not assigned or read anywhere in this class; kept so the properties stay available.
    String info1
    String info2
    String info3

    /**
     * Populates date, action, signed amount and balance from a transaction line.
     * Tokens are separated by whitespace: the first token is the date, the last one
     * the balance, the one before it the amount with a trailing sign character, and
     * everything in between is the action.
     *
     * @param line the transaction line
     * @throws IllegalArgumentException if the line cannot be parsed
     */
    void setData(line) {
        try {
            // Split on whitespace runs so repeated spaces do not produce empty tokens.
            def entries = line.trim().split("\\s+")
            if (entries.length < 3) {
                throw new IllegalArgumentException(
                        "Expected at least a date, an amount and a balance but found ${entries.length} token(s)")
            }
            this.date = entries[0]
            this.balance = entries[-1]

            // The sign is the last character of the amount token; move it to the front.
            def rawAmount = entries[-2]
            def sign = rawAmount.substring(rawAmount.length() - 1)
            def amount = rawAmount.substring(0, rawAmount.length() - 1)
            this.signedAmount = sign + amount

            // With exactly three tokens 1..-3 would be a reversed range (1..0) and
            // yield "amount date", so only slice when there really is an action.
            this.action = entries.length > 3 ? entries[1..-3].join(" ") : ""
        } catch (ex) {
            throw new IllegalArgumentException("Failed to parse ${line}", ex)
        }
    }

    /** Appends a trimmed additional info line to this record. */
    void addInfo(String line) {
        infos << line.trim()
    }

    /**
     * @param index zero-based position of the info line
     * @return the info line at that position, or null when there are not that many
     */
    String getInfo(int index) {
        return infos.size() <= index ? null : infos.get(index)
    }

    /**
     * Writes this record as one CSV row (date, action, amount, balance and the
     * first three info lines) without quoting every field.
     */
    void writeTo(CSVWriter writer) {
        writer.writeNext(
                [date, action, signedAmount, balance, getInfo(0), getInfo(1), getInfo(2)] as String[],
                false)
    }

    @Override
    String toString() {
        def sb = new StringBuilder("${date} (${action}): ${signedAmount} => ${balance}\r\n")
        infos.each {
            sb.append(" - ${it}\r\n")
        }
        return sb.toString()
    }

    /**
     * Writes a header row followed by one row per record.
     *
     * @param writer  destination CSV writer
     * @param records records to write, in iteration order
     */
    static void writeTo(CSVWriter writer, Collection<TxRecord> records) {
        writer.writeNext(
                ["Date", "Action", "Amount", "Balance", "Info 1", "Info 2", "Info 3"] as String[],
                false)
        records.each { TxRecord record ->
            record.writeTo(writer)
        }
    }
}
/**
 * Returns the extension of the given file: the text after the last dot of its name.
 *
 * @param file the file to inspect
 * @return the extension without the dot, or an empty string when the name has no dot
 */
def getExtension(File file) {
    // File.name is already the last path segment, so no further path splitting is needed.
    def fileName = file.name
    // Use the last dot so names such as "statement.2018.01.pdf" still yield "pdf".
    def lastDot = fileName.lastIndexOf(".")
    return lastDot < 0 ? "" : fileName.substring(lastDot + 1)
}