-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleanRawSongs.js
62 lines (47 loc) · 1.56 KB
/
cleanRawSongs.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import { readFileSync } from 'fs'
import { COLUMN_JOIN_STRING, ROW_JOIN_STRING } from './joinStrings.js'
import { getRawLines, processRawLines } from './utils/cleaning.js'
import { createTable } from './db'
/**
 * Synchronously loads a delimited text file and breaks it into rows of
 * column values.
 *
 * Rows are separated by ROW_JOIN_STRING and, within each row, columns are
 * separated by COLUMN_JOIN_STRING.
 *
 * @param {string} path - Location of the file to read.
 * @returns {string[][]} One array of column strings per row.
 */
function readCSV (path) {
  const contents = readFileSync(path).toString()
  const table = []
  for (const line of contents.split(ROW_JOIN_STRING)) {
    table.push(line.split(COLUMN_JOIN_STRING))
  }
  return table
}
/**
 * Turns a page string into cleaned text.
 *
 * The input is handed to getRawLines, the resulting lines are passed through
 * processRawLines, and whatever lines come back are joined with newlines.
 *
 * @param {string} htmlStr - Page content (presumably HTML, judging by the name).
 * @returns {string} The processed lines joined by '\n'.
 */
function cleanPage (htmlStr) {
  return processRawLines(getRawLines(htmlStr)).join('\n')
}
// Matches four digits at the very end of a string (a trailing year).
const yearRegex = /\d{4}$/

/**
 * Extracts a trailing four-digit year from a '---'-separated string.
 *
 * The input is split on '---'. The first segment is checked for a trailing
 * year, then the second; the first hit wins. Segments beyond the second are
 * ignored.
 *
 * @param {string} locationDateStr - Text made of up to two '---'-separated segments.
 * @returns {string|undefined} The four-digit year, or undefined when neither
 *   of the first two segments ends in one (or the input is not a string).
 */
function cleanDate (locationDateStr) {
  // Rows with a missing column arrive as undefined; treat them as "no date".
  if (typeof locationDateStr !== 'string') return undefined
  const segments = locationDateStr.split('---').slice(0, 2)
  for (const segment of segments) {
    // match() yields null (not undefined) when nothing matches, so test for
    // truthiness before indexing into the result.
    const found = segment.match(yearRegex)
    if (found) return found[0]
  }
  return undefined
}
/**
 * Reads the raw song rows, cleans each one, and inserts the rows that yield
 * a date into the store returned by createTable('songs', ...).
 *
 * Each row is expected to hold, in order: artist, url, raw page text, and a
 * raw location/date string. Rows for which cleanDate returns a falsy value
 * are skipped. Progress is logged as a whole-number percentage every time
 * the rounded value changes.
 *
 * @returns {Promise<void>} Resolves once every row has been processed.
 */
async function main () {
  const db = await createTable('songs', ['artist', 'url', 'lyrics', 'date'])
  const songsRaw = readCSV('./output/songsRaw.csv')
  const total = songsRaw.length
  let currentPercentage = '0%'
  for (let i = 0; i < total; i++) {
    const [artist, url, rawText, rawLocationDate] = songsRaw[i]
    const lyrics = cleanPage(rawText)
    const date = cleanDate(rawLocationDate)
    if (date) {
      // Inserts are awaited one at a time, so rows are written in file order.
      await db.insert({
        artist,
        url,
        lyrics,
        date
      })
    }
    // Divide by the actual row count so the final row reports 100%.
    const percentage = `${Math.round(((i + 1) / total) * 100)}%`
    if (currentPercentage !== percentage) {
      currentPercentage = percentage
      console.log(currentPercentage)
    }
  }
}
// Kick off the script. Report any failure and exit with a non-zero status
// instead of leaving the returned promise's rejection unhandled.
main().catch(err => {
  console.error(err)
  process.exitCode = 1
})